In [1]:
import pandas as pd
import numpy as np
# from scipy.stats import zscore
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
import plotly.express as px

# Show every column and every row when a DataFrame is displayed (None = no limit).
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [2]:
# Load the intermediate dataset; 'date' is parsed as datetime so that the
# date-range filters used later can compare against it.
df = pd.read_csv(
    "../data/processed/df_intermediate.csv",
    parse_dates=['date'],
)

# Anomalous Observations Detection
- Outliers are identified manually and treated as NA:
    - 'check_point5': the drop from mid-Aug to Nov seems anomalous and needs to be handled.
    - 'check_point6' has an outlier value of 9999 that needs to be handled.
    - 'check_point15' has zero values from Apr to mid-May, which seems anomalous and needs to be handled.
    - 'check_point15' has one zero value that needs to be handled.
- The outliers are then imputed with a modern missing-value handling technique, MICE.

In [3]:
# NOTE(review): disabled experiment, kept for reference only. It cannot be
# re-enabled as written: `df_cp_afterMay` is not defined in any visible cell and
# the `zscore` import at the top of the notebook is commented out as well.
# # removing outliers using sliding window and z-threshold

# df_cp = df.iloc[:,0:22] # filtering checkpoints dataset
# z_score_threshold = 1
# window_size = 30
# for column in df_cp_afterMay.columns[1:]:
#     z_scores = zscore(df_cp_afterMay[column])
#     outlier_flags = np.zeros(len(df_cp_afterMay))
#     for i in range(window_size, len(df_cp_afterMay) - window_size):
#         window_z_scores = z_scores[i - window_size:i + window_size + 1]
#         if all(np.abs(window_z_scores) > z_score_threshold):
#             outlier_flags[i] = 1
#     df_cp_afterMay.loc[outlier_flags.astype(bool), column] = np.nan

In [4]:
# Mark the manually identified anomalies as missing (NaN) so they can be imputed.
#
# `Series.mask` is used instead of assigning `np.nan` through `.loc`: it returns a
# new column that is upcast to float when needed, so this also works when a
# checkpoint column was read from the CSV as integers. Writing NaN into an
# integer column via `.loc` relies on silent upcasting, which is deprecated
# since pandas 2.1.

# check_point5: anomalous drop between 2015-08-20 and 2015-11-10 (both bounds exclusive).
df['check_point5'] = df['check_point5'].mask(
    (df['date'] > '2015-08-20') & (df['date'] < '2015-11-10')
)

# check_point6: 9999 is an outlier value.
df['check_point6'] = df['check_point6'].mask(df['check_point6'] == 9999)

# check_point15: zero readings from 2015-04-02 to 2015-05-14 (both bounds inclusive).
df['check_point15'] = df['check_point15'].mask(
    (df['date'] >= '2015-04-02') & (df['date'] <= '2015-05-14')
)

# check_point9: zero readings are treated as missing.
# NOTE(review): the markdown notes above attribute the single zero value to
# 'check_point15', while this line masks 'check_point9' — confirm which is intended.
df['check_point9'] = df['check_point9'].mask(df['check_point9'] == 0)

# Multivariate Imputation by Chained Equations (MICE)
- To compute the target variable (the sum of all checkpoint values), the missing values have to be imputed first. Traditional approaches such as mean/median imputation are not used, because a number of research papers advise against them. Instead, a more advanced yet straightforward and robust technique, MICE, is implemented. MICE can be applied if the data is missing completely at random (MCAR) or missing at random (MAR). Multiple imputation is considered a good approach for data sets with a large amount of missing data, and it can produce statistically valid results even when there is a small sample size or a large amount of missing data.
- The weather-related variables are used as predictors when imputing the missing values with MICE.

In [5]:
# Impute all missing values with MICE (scikit-learn's IterativeImputer).
# 'date' is set aside before fitting and re-attached to the result afterwards.
df_toimpute = df.drop(columns=['date'])

# random_state=0 fixes the seed; min_value=0 is the lower bound for imputed values.
# NOTE(review): min_value applies to every column being imputed, not only the
# checkpoint counts — confirm that is acceptable for the other variables.
imputer = IterativeImputer(random_state=0, min_value=0, max_iter=100)

# fit_transform returns a plain array, so the column labels are restored here.
df_imputed = pd.DataFrame(
    imputer.fit_transform(df_toimpute),
    columns=df_toimpute.columns,
).assign(date=df['date'])

# Before and After MICE implementation
## check_point5
![a](../images/1_after_MICE.png)
![b](../images/1_before_MICE.png)
## check_point6
![c](../images/2_after_MICE.png)
![d](../images/2_before_MICE.png)
## check_point15
![e](../images/3_after_MICE.png)
![f](../images/3_before_MICE.png)

In [6]:
# Target variable: row-wise total of the first 21 columns.
# assumes those 21 columns are the checkpoint counts — TODO confirm against the input schema
df_imputed['sum_checkpointCount'] = df_imputed.iloc[:, :21].sum(axis=1)

In [7]:
# Write the imputed dataset (including the target column) to disk without the row index.
df_imputed.to_csv("../data/processed/df_imputed.csv", index=False)