In [142]:
import numpy as np
import pandas as pd
import seaborn as sns
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('seaborn-whitegrid')
from sklearn.impute import KNNImputer
import array
import time
import math
import os
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

In [143]:
# Define functions here

def isNaN(string):
    return string != string

In [144]:
# Load csv and print header and values

df = pd.read_csv('pems_output_air_quality.csv',header=0)
print(df.head)

<bound method NDFrame.head of             DateTime  Sensor_Data  Feature_1  Feature_2  Actual_Data
0     3/10/2004 0:00          1.2         62         77          1.2
1     3/10/2004 1:00          1.0         62         76          1.0
2     3/10/2004 2:00          0.9         45         60          0.9
3     3/10/2004 3:00          0.6          1          2          0.6
4     3/10/2004 4:00          1.0         21         34          1.0
...              ...          ...        ...        ...          ...
9370  4/4/2005 10:00          3.1        472        190          3.1
9371  4/4/2005 11:00          2.4        353        179          2.4
9372  4/4/2005 12:00          2.4        293        175          2.4
9373  4/4/2005 13:00          2.1        235        156          2.1
9374  4/4/2005 14:00          2.2        265        168          2.2

[9375 rows x 5 columns]>


In [145]:
# Assign K value here

KNNImputation_value = 15

# Reduce the dataset to use 100 days of the data
df = df[:2400]

# pick datetime and sensor data with missing values

df_simplified = df[['DateTime', 'Sensor_Data']]

# Split the datetime column into date column and hour column

df_simplified.loc[:,"Date"] = pd.to_datetime(df_simplified['DateTime']).dt.date
df_simplified.loc[:,"Hour_of_Day"] = pd.to_datetime(df_simplified['DateTime']).dt.strftime("%H")
df_simplified["Hour_of_Day"] = pd.to_numeric(df_simplified["Hour_of_Day"]) + 1

# Generate a unique number for each date for simplicity of the program 
df_simplified.loc[:,"Date_Id"] = df_simplified.groupby(['Date'], sort=False).ngroup() + 1

# Do the same for original table by making sure date and hour columns are separte we will use these columns
# later for joining

df.loc[:,"Date"] = pd.to_datetime(df['DateTime']).dt.date
df.loc[:,"Hour_of_Day"] = pd.to_datetime(df['DateTime']).dt.strftime("%H")
df["Hour_of_Day"] = pd.to_numeric(df["Hour_of_Day"]) + 1
df.loc[:,"Date_Id"] = df.groupby(['Date'], sort=False).ngroup() + 1

# Now that we have split datetime into date and hour we will drop datetime column
df_simplified = df_simplified.drop(['DateTime'],axis=1)

# Pivot the table by transforming datetime row into column this way we have a new column for each date is generated for 24 hours of the day
# so the values of each column represent sensor data for each hour
df_pivot = df_simplified.pivot(index='Hour_of_Day', columns='Date_Id', values='Sensor_Data')
print(df_pivot)
# Filter all missing sensor data value into a separate data frameFind all missing values
df_missing = df_simplified[df_simplified['Sensor_Data'].isna()] 
df_missing["Window_Id"] = np.nan

# We need to loop thru the each missing value and create window. Each window will have one or more missing values along with the sensor data 
# before and after the missing value(s). If number of missing values within in the window is less then or equal 5, we take 5 sensor values before and 5 sensor 
# values after the missing value(s). However, if there are more than 5 missing values for example if there are 10 missing values it will take 
# 10 values before and 10 values after the missing values(s).

# min window imputing size is set to 5 as per above explaination
min_window_imputing_size = 20

# Create an empty data frame which will later be used to form windows
df_windows = pd.DataFrame(columns=['Window_Id', 'Hour_of_Day', 'Date_Id', 'Sensor_Data'])
window_id = 0
print(df_missing)
for index, row in df_missing.iterrows():
    
    col_index = row['Date_Id'] - 1
    row_index = row['Hour_of_Day'] - 1
   
    # check if the previous missing value is NaN, this check will help add all consecutive missing values into one window. 
    # When previous missing value is NaN then it means a window is already build in the previous iteration
    if (isNaN(df_pivot.iat[row_index,col_index-1])):
        continue
        
    # check if current value is not null
    #if(isNaN(df_pivot.iat[row_index,col_index]) == False):
        #continue
        
    counter = 1
    while True:
        # In the pivot table check if the next day for same hour has a NaN values if it is then continue to loop to find total consecutive 
        # missing values else break
        if(isNaN(df_pivot.iat[row_index,col_index + counter])):
            counter = counter + 1
        else:
            break
            
    # Find the total size of the window
    no_of_imputation_in_gap = min_window_imputing_size if counter <= min_window_imputing_size else counter
    
    # Find window start index
    start_index_of_window = (col_index - no_of_imputation_in_gap) if (col_index - no_of_imputation_in_gap) > 0 else 0       
    
    # Find window end index
    end_index_of_window = (col_index + counter + no_of_imputation_in_gap) if (col_index + counter + no_of_imputation_in_gap) < (df_pivot.shape[1] -1) else df_pivot.shape[1] -1
    
    # From the pivot table, filter the data frame to fetch window which contains both missing values and sensor data to use for imputation  
    ds_missing = df_pivot.iloc[row_index, start_index_of_window : end_index_of_window]
    
    # Update window column with a unique window id so later this column is used to filter to fetch window specific sensore data
    window_id = window_id + 1
    df_missing.loc[index, 'Window_Id'] = window_id
    
    # Create a new dataframe df_windows which holds all window data
    for date_id, value in ds_missing.items():
        df_windows = df_windows.append({'Window_Id': window_id, 'Hour_of_Day': row['Hour_of_Day'], 'Date_Id': date_id, 'Sensor_Data': value}, ignore_index=True)

# Now that we have all the windows data in df_windows, next part of the code will loop thru each window to impute data

# Create new dataframe df_merged which is an inner join of df_windows and original df dataframe to get some additional columns
# Merge will rename the columns if both the merged dataframe has same column names and we need to correct them and drop additionals
df_merged = pd.merge(df_windows, df, on=['Hour_of_Day', 'Date_Id'], how='inner')

df_merged = df_merged.drop(['Sensor_Data_y'], axis=1)
df_merged.rename(columns = {'Sensor_Data_x':'Sensor_Data'}, inplace = True)
   
# impute_column_name will hold the column name of the sensor data to impute
impute_column_name = "Sensor_Data"

# Create two new columns Forward_Impute, Backward_Impute which initially has same data as sensor data column but further these columns will 
# populate itself with the forward imputed and backward imputed values
df_merged["Forward_Impute"] = df_merged[impute_column_name]
df_merged["Backward_Impute"] = df_merged[impute_column_name]


# Loop each window
for i in range(1,window_id + 1):
   
    # Filter data specific to the window, this wil fetch all the records of a specific window which includes both NaN and sensor data before 
    # and after gap
    df_window = df_merged[df_merged['Window_Id'] == i]
    # Since the df_window now has filter records, its index column of dataframe will not be ordinal, reset will help get is reordered
    df_window.reset_index(drop=True, inplace=True)
   
    # declare some variables to used in the inner loop
    
    # Hour of the day
    window_hour = df_window['Hour_of_Day'].values[0]
    
    # First Date Id of the window
    first_date_id = df_window['Date_Id'].values[0]
  
    # Last Date Id of the window
    last_date_id = df_window['Date_Id'].values[-1]
    
    # Find the first occurrence of NaN in the window
    first_missing_occurrence_date_id = 0
    for index, row in df_window.iterrows():
        if(isNaN(row[impute_column_name])):
            first_missing_occurrence_date_id = row["Date_Id"]
            break;
        else:
            continue;
        
    # Find total missing values
    total_missing_values = df_window[impute_column_name].isna().sum()
    
    # Last missing occurance of NaN
    last_missing_occurrence_date_id = first_missing_occurrence_date_id + total_missing_values - 1
    
    
    # Forward Imputation - Loop for the sliding window for Forward imputation and populate the imputed values into "Forward_Impute" column
    for j in range(0,total_missing_values):
        
        # filter to fetch the sliding window from the main window. 
        # Sliding window in the forward impute will start from the begining and impute one missing value at a time so each sliding window
        # will have a missing sensor value at the end of the window
        df_window_subset = df_window[(df_window['Date_Id'] >= (first_date_id + j)) & (df_window['Date_Id'] <= (first_missing_occurrence_date_id + j))]
        cols = ['Date_Id','Forward_Impute', 'Feature_1', 'Feature_2']
        
        # From the sliding window, create a new dataframe to be used by KNN algorithm which requires feature columns and no Nan values in 
        # the data. So those are replaced by 0
        df_knn_window_subset = df_window_subset[cols].copy()
        df_knn_window_subset[cols] = df_knn_window_subset[cols].fillna(0).astype(int)
        df_knn_window_subset[cols] = df_knn_window_subset[cols].replace({'0':np.nan, 0:np.nan})
        
        # Perform KNN imputation on the resultant dataframe "df_knn_window_subset"
        # Define imputer
        imputer = KNNImputer(n_neighbors=KNNImputation_value, weights='uniform', metric='nan_euclidean')
        # fit on the dataset
        imputer.fit(df_knn_window_subset)
        # transform the dataset
        df_knn_window_subset_transformed = imputer.transform(df_knn_window_subset)
        df_result = pd.DataFrame(df_knn_window_subset_transformed, columns =cols)
        
        # Retrieve the imputed value from resultant data frame and update df_window
        # Missing row index
        missing_row_index = first_missing_occurrence_date_id - first_date_id
        df_window.at[missing_row_index+j, 'Forward_Impute'] = df_result.at[missing_row_index,"Forward_Impute"]
        j=j+1
    
    
    # Backward Imputation - Loop for the sliding window for Forward imputation and populate the imputed values into "Backward_Impute" column    
    for j in range(0,total_missing_values):
        
        # filter to fetch the sliding window from the main window. 
        # Sliding window in the backward impute will start from the end and impute one missing value at a time to the center so each sliding window
        # will have a missing sensor value at the begining of the window
        df_window_subset = df_window[(df_window['Date_Id'] <= (last_date_id - j)) & (df_window['Date_Id'] >= (last_missing_occurrence_date_id - j))]
        cols = ['Date_Id','Backward_Impute', 'Feature_1', 'Feature_2']
        df_knn_window_subset = df_window_subset[cols].copy()  
        df_knn_window_subset[cols] = df_knn_window_subset[cols].fillna(0).astype(int)
        df_knn_window_subset[cols] = df_knn_window_subset[cols].replace({'0':np.nan, 0:np.nan})
        
        # Perform KNN imputation on the resultant dataframe "df_knn_window_subset"
        # Define imputer
        imputer = KNNImputer(n_neighbors=KNNImputation_value, weights='uniform', metric='nan_euclidean')
        # fit on the dataset
        imputer.fit(df_knn_window_subset)
        # transform the dataset
        df_knn_window_subset_transformed = imputer.transform(df_knn_window_subset)
        df_result = pd.DataFrame(df_knn_window_subset_transformed, columns =cols)
        
        # Retrieve the imputed value from result data frame and update df_window
        # Missing row index
        missing_row_index = 0
        df_window.at[last_missing_occurrence_date_id-first_date_id - j, 'Backward_Impute'] = df_result.at[missing_row_index,"Backward_Impute"]
        

    # After forward and backward sliding, we will have columns "Forward_Impute", Backward_Impute" populated with imputed values
    # Now merge those values into df_merged table
    with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
        print( df_window)
    df_merged = pd.merge(df_merged, df_window, on=['Hour_of_Day', 'Date_Id'], how='left')
   
    df_merged['Backward_Impute_x'] = df_merged['Backward_Impute_y'].fillna(df_merged['Backward_Impute_x'])
    df_merged['Forward_Impute_x'] = df_merged['Forward_Impute_y'].fillna(df_merged['Forward_Impute_x'])
    drop_cols = ['Window_Id_y', 'Sensor_Data_y', 'DateTime_y', 'Feature_1_y', 'Feature_2_y', 'Date_y', 'Forward_Impute_y', 'Backward_Impute_y']
    df_merged = df_merged.drop(drop_cols, axis=1)    
    # Rename columns created with merge
    df_merged.rename(columns = {'Window_Id_x':'Window_Id', 'Sensor_Data_x':'Sensor_Data', 'DateTime_x':'DateTime', 'Feature_1_x':'Feature_1','Feature_2_x':'Feature_2', 'Date_x':'Date', 'Forward_Impute_x':'Forward_Impute', 'Backward_Impute_x':'Backward_Impute'}, inplace = True)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_simplified["Hour_of_Day"] = pd.to_numeric(df_simplified["Hour_of_Day"]) + 1


Date_Id      1    2    3    4    5    6    7    8    9    10   ...  91   92   \
Hour_of_Day                                                    ...             
1            1.2  1.2  1.7  2.7  2.9  1.8  2.1  1.7  2.3  2.0  ...  1.3  1.6   
2            1.0  1.0  NaN  NaN  NaN  1.8  1.2  1.2  1.4  1.6  ...  0.7  1.0   
3            0.9  0.9  1.4  1.6  2.5  1.8  0.8  0.9  1.0  0.9  ...  0.5  0.9   
4            0.6  0.6  0.8  1.7  2.4  1.1  0.7  0.7  0.7  0.7  ...  0.4  0.5   
5            1.0  1.0  1.0  1.0  1.0  1.0  1.0  1.0  1.0  1.0  ...  0.3  0.5   
6            0.7  0.7  0.6  1.0  1.2  1.0  0.6  0.5  0.6  0.5  ...  0.5  0.5   
7            0.7  0.7  0.8  1.2  1.0  1.4  0.9  0.5  0.7  0.7  ...  1.4  1.0   
8            1.1  1.1  1.4  1.5  0.9  2.2  1.3  1.6  1.5  1.5  ...  3.3  3.5   
9            2.0  2.0  4.4  2.7  1.4  5.5  3.4  4.1  4.7  4.8  ...  5.8  6.4   
10           2.2  2.2  1.0  3.7  1.6  8.1  3.7  6.6  6.6  6.2  ...  5.0  2.9   
11           1.7  1.7  3.1  3.2  2.2  5.

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_missing["Window_Id"] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[ro

    Window_Id  Hour_of_Day  Date_Id  Sensor_Data        DateTime  Feature_1  \
0         1.0          2.0      1.0          1.0  3/10/2004 1:00         62   
1         1.0          2.0      2.0          1.0  3/11/2004 1:00         62   
2         1.0          2.0      3.0          NaN  3/12/2004 1:00        133   
3         1.0          2.0      4.0          NaN  3/13/2004 1:00        139   
4         1.0          2.0      5.0          NaN  3/14/2004 1:00        174   
5         1.0          2.0      6.0          1.8  3/15/2004 1:00        106   
6         1.0          2.0      7.0          1.2  3/16/2004 1:00         79   
7         1.0          2.0      8.0          1.2  3/17/2004 1:00         95   
8         1.0          2.0      9.0          1.4  3/18/2004 1:00         92   
9         1.0          2.0     10.0          1.6  3/19/2004 1:00        103   
10        1.0          2.0     11.0          1.6  3/20/2004 1:00         86   
11        1.0          2.0     12.0          2.1  3/

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try us

In [147]:
# "Forward_Impute", "Backward_Impute" are further averaged into "Prediction" column



df_merged_actual= pd.merge(df_merged,df[['DateTime','Actual_Data']],on='DateTime', how='inner')
df_merged["Transform_Prediction"] = df_merged["Sensor_Data"].apply(lambda x: x if x!=x else 0)
df_merged['Prediction'] = df_merged["Transform_Prediction"].fillna((df_merged['Forward_Impute'] + df_merged['Backward_Impute'])/2)

df_merged_actual['Actual_Data'] = df_merged["Transform_Prediction"].fillna(df_merged_actual['Actual_Data'])



# Acutal data and Predictions columns are concatenated into df_Results
y_true = df_merged_actual[df_merged_actual['Actual_Data'] != 0]['Actual_Data']
y_pred = df_merged[df_merged['Prediction'] != 0]['Prediction']


# Calculate R2 score
print(y_true)
print(y_pred)
print("R2 Score:")
print(r2_score(y_true, y_pred))

# Calculate RMSE

MSE = np.square(np.subtract(y_true,y_pred)).mean() 
 
RMSE = math.sqrt(MSE)
print("\n Root Mean Square Error:")
print(RMSE)

# Calculate MAE

print("\n MAE:")
MAE = mean_absolute_error(y_true, y_pred)
print(MAE)

# Concatenating all the results
df_Results = pd.concat([y_true, y_pred], axis=1)
with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    print(df_Results)
    
# All columns
df_all_result = pd.concat([df_Results, df_merged], axis=1)
with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    print(df_all_result[['Forward_Impute', 'Backward_Impute', 'Sensor_Data', 'Prediction']])
    

    

2     1.9
3     1.9
4     2.8
36    1.7
37    1.8
Name: Actual_Data, dtype: float64
2     1.066667
3     1.066667
4     1.066667
36    2.478788
37    2.569697
Name: Prediction, dtype: float64
R2 Score:
-6.096798627716949

 Root Mean Square Error:
1.0575705478729025

 MAE:
0.9896969696969695
    Actual_Data  Prediction
2           1.9    1.066667
3           1.9    1.066667
4           2.8    1.066667
36          1.7    2.478788
37          1.8    2.569697
    Forward_Impute  Backward_Impute  Sensor_Data  Prediction  Prediction
0         1.000000         1.000000          1.0         NaN    0.000000
1         1.000000         1.000000          1.0         NaN    0.000000
2         1.000000         1.133333          NaN    1.066667    1.066667
3         1.000000         1.133333          NaN    1.066667    1.066667
4         1.000000         1.133333          NaN    1.066667    1.066667
5         1.800000         1.800000          1.8         NaN    0.000000
6         1.200000         1.