In [None]:
!pip install missingno

In [None]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import missingno as msno
from sklearn.impute import SimpleImputer
import seaborn as sns
# Apply the "fivethirtyeight" matplotlib style sheet to the figures drawn below.
plt.style.use("fivethirtyeight")

In [None]:
# Load the trip log and index it by one timestamp built from the separate
# 'Date' and 'StartTime' columns. The nested-list form
# parse_dates=[['Date', 'StartTime']] is deprecated in recent pandas releases,
# so the two columns are combined explicitly: they are removed from the frame,
# joined with a space and parsed, and the result becomes the index
# 'Date_StartTime' (the same name the nested-list form produced).
df = pd.read_csv('travel-times.csv')
df.index = pd.DatetimeIndex(
    pd.to_datetime(df.pop('Date').astype(str) + ' ' + df.pop('StartTime').astype(str)),
    name='Date_StartTime',
)

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Summary statistics of the frame's columns.
df.describe()

In [None]:
# Artificially knock out roughly 10% of the MaxSpeed values so the imputation
# techniques below have something to fill in.
# A locally seeded generator makes the missing positions reproducible, and also
# makes this cell idempotent: re-running it masks the same rows again instead
# of piling up additional NaNs.
rng = np.random.default_rng(42)
mask = rng.choice([True, False], size=df['MaxSpeed'].shape, p=[0.1, 0.9])
# Guard against the degenerate draw where every value would be removed:
# always keep at least the last observation.
if mask.all():
    mask[-1] = False
df['MaxSpeed'] = df['MaxSpeed'].mask(mask)

In [None]:
# Preview the frame again now that some MaxSpeed values have been masked out.
df.head()

In [None]:
# Number of missing entries in each column.
df.isna().sum()

In [None]:
# Share of missing entries in each column, expressed as a percentage.
df.isna().mean().mul(100)

In [None]:
# Draw missingno's nullity matrix for the frame to see where values are missing.
msno.matrix(df)

In [None]:
from sklearn.impute import SimpleImputer

In [None]:
# Mean Imputation

# Impute on a deep copy so the frame that still contains the NaNs stays
# untouched for the other strategies.
df_mean = df.copy(deep=True)
mean_imputer = SimpleImputer(strategy="mean")
# The imputer works on 2-D input, so the single column is reshaped to (n, 1).
max_speed_2d = df_mean['MaxSpeed'].values.reshape(-1, 1)
df_mean['MaxSpeed'] = mean_imputer.fit_transform(max_speed_2d)

In [None]:
# Scatter AvgSpeed against MaxSpeed after mean imputation. Points are coloured
# by whether MaxSpeed was missing in `df` before imputation.
null_values = df["MaxSpeed"].isnull()
# Create the figure and axes explicitly and hand the axes to pandas; the former
# `plt.Figure()` call built a detached Figure that was never drawn on.
fig, ax = plt.subplots()
df_mean.plot(x="AvgSpeed", y="MaxSpeed", kind="scatter", c=null_values, cmap='winter', title='Mean Imputation', colorbar=False, ax=ax);

In [None]:
# Mode Imputation

# Impute on a deep copy so the frame that still contains the NaNs stays
# untouched for the other strategies.
df_mode = df.copy(deep=True)
mode_imputer = SimpleImputer(strategy="most_frequent")
# The imputer works on 2-D input, so the single column is reshaped to (n, 1).
max_speed_2d = df_mode['MaxSpeed'].values.reshape(-1, 1)
df_mode['MaxSpeed'] = mode_imputer.fit_transform(max_speed_2d)

In [None]:
# Scatter AvgSpeed against MaxSpeed after mode imputation. Points are coloured
# by whether MaxSpeed was missing in `df` before imputation.
null_values = df["MaxSpeed"].isnull()
# Create the figure and axes explicitly and hand the axes to pandas; the former
# `plt.Figure()` call built a detached Figure that was never drawn on.
fig, ax = plt.subplots()
df_mode.plot(x="AvgSpeed", y="MaxSpeed", kind="scatter", c=null_values, cmap='winter', colorbar=False, title='Mode Imputation', ax=ax);

In [None]:
# Time Series Imputation

# Look at a ten-row window near the end of the series (positions -50 to -41),
# selected by position.
df['MaxSpeed'].iloc[-50:-40]

In [None]:
# Ffill method to replace nans with the last observed value
# Series.ffill() is used because fillna(method='ffill') is deprecated in
# current pandas releases.
df['MaxSpeed'].ffill().iloc[-50:-40]

In [None]:
# Bfill method to replace nans with the next observed value
# Series.bfill() is used because fillna(method='bfill') is deprecated in
# current pandas releases.
df['MaxSpeed'].bfill().iloc[-50:-40]

In [None]:
# Plot the first 100 MaxSpeed readings with a marker on every observation.
max_speed_head = df["MaxSpeed"].iloc[:100]
max_speed_head.plot(title="MaxSpeed", marker="o", figsize=(30,10))

In [None]:
# Ffill imputation
# DataFrame.ffill() is used because fillna(method="ffill") is deprecated in
# current pandas releases.
ffill_imp = df.ffill()
# Draw the forward-filled series first (dotted red), then the series that still
# has the gaps.
ffill_imp["MaxSpeed"][:100].plot(color="red", marker="o", linestyle="dotted", figsize=(30,10))
df["MaxSpeed"][:100].plot(title="MaxSpeed", marker="o")

In [None]:
# Bfill imputation
# DataFrame.bfill() is used because fillna(method="bfill") is deprecated in
# current pandas releases.
bfill_imp = df.bfill()
# Draw the back-filled series first (dotted red), then the series that still
# has the gaps.
bfill_imp["MaxSpeed"][:100].plot(color="red", marker="o", linestyle="dotted", figsize=(30,10))
df["MaxSpeed"][:100].plot(title="MaxSpeed", marker="o")

In [None]:
# Linear Interpolation imputation
# method="linear" fills each gap on a straight line between the neighbouring
# observations.
linear_int = df.interpolate(method="linear")
interpolated_head = linear_int["MaxSpeed"].iloc[:100]
observed_head = df["MaxSpeed"].iloc[:100]
# Interpolated series in dotted red first, then the series that still has the gaps.
interpolated_head.plot(color="red", marker="o", linestyle="dotted", figsize=(30,10))
observed_head.plot(title="MaxSpeed", marker="o")

In [None]:
# Advanced Techniques

# Imputing with KNNImputer

from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler

# Restrict the frame to the columns used as features for the neighbour search.
df_knn = df.filter(['Distance', 'MaxSpeed', 'AvgSpeed', 'AvgMovingSpeed'], axis=1).copy()

# Scale every feature to [0, 1] so no single column dominates the distance.
# The original index is carried over so rows stay aligned with `df`.
scaler = MinMaxScaler(feature_range=(0, 1))
df_knn = pd.DataFrame(scaler.fit_transform(df_knn), columns=df_knn.columns, index=df_knn.index)

knn_imputer = KNNImputer(n_neighbors=5, weights='uniform', metric='nan_euclidean')

# Impute in the scaled space, then map the result back to the original units
# so the imputed values are comparable with the other imputation methods.
df_knn_imputed = pd.DataFrame(
    scaler.inverse_transform(knn_imputer.fit_transform(df_knn)),
    columns=df_knn.columns,
    index=df_knn.index,
)

In [None]:
# Scatter AvgSpeed against MaxSpeed after KNN imputation. Points are coloured
# by whether MaxSpeed was missing in `df` before imputation.
null_values = df["MaxSpeed"].isnull()
# Create the figure and axes explicitly and hand the axes to pandas; the former
# `plt.Figure()` call built a detached Figure that was never drawn on.
fig, ax = plt.subplots()
df_knn_imputed.plot(x="AvgSpeed", y="MaxSpeed", kind="scatter", c=null_values, cmap='winter', title='KNN Imputation', colorbar=False, ax=ax);

In [None]:
# Imputing with MICE
# enable_iterative_imputer must be imported before IterativeImputer: the
# estimator is still experimental and is only exposed after this import.
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn import linear_model

# Restrict the frame to the numeric columns the imputer models jointly.
df_mice = df.filter(['Distance', 'MaxSpeed', 'AvgSpeed', 'AvgMovingSpeed'], axis=1).copy()

mice_imputer = IterativeImputer(estimator=linear_model.BayesianRidge(), n_nearest_features=None, imputation_order='ascending')

# fit_transform returns a bare array; rebuild the frame with the original
# columns AND index so the rows stay aligned with `df`.
df_mice_imputed = pd.DataFrame(
    mice_imputer.fit_transform(df_mice),
    columns=df_mice.columns,
    index=df_mice.index,
)

In [None]:
# Scatter AvgSpeed against MaxSpeed after MICE imputation. Points are coloured
# by whether MaxSpeed was missing in `df` before imputation.
null_values = df["MaxSpeed"].isnull()
# Create the figure and axes explicitly and hand the axes to pandas; the former
# `plt.Figure()` call built a detached Figure that was never drawn on.
fig, ax = plt.subplots()
df_mice_imputed.plot(x="AvgSpeed", y="MaxSpeed", kind="scatter", c=null_values, cmap='winter', title='MICE Imputation', colorbar=False, ax=ax);