## This notebook imputes the missing values using Missing Indicators

In [1]:
import pandas as pd
from datetime import datetime
import numpy as np
from sklearn.impute import MissingIndicator

#### Choose whether the imputation should be done on the data set with daily or weekly granularity


In [2]:
# Granularity of the data set to impute: 'daily' or 'weekly'.
GRANULARITY = 'weekly'

# Base names only: the '_training_FE.csv' / '_test_FE.csv' suffixes are appended
# where the files are read and written, so these must not carry a '.csv'
# extension (otherwise the paths become e.g. 'CompleteIndexes.csv_training_FE.csv').
# NOTE(review): the daily base names are assumed to follow the same naming
# pattern as the weekly ones -- confirm the files exist under these names.
_FILE_PREFIXES = {
    'daily': ('CompleteIndexes', 'MIImputed'),
    'weekly': ('CompleteWeeklyIndexes', 'MIIWeeklyImputed'),
}

# An unknown GRANULARITY raises KeyError here instead of failing later on a bad path.
INPUT, OUTPUT = _FILE_PREFIXES[GRANULARITY]

In [3]:
# Load the feature-engineered training and test sets (training first, then test).
# The first CSV column is used as the index and pandas is asked to parse it as dates.
df, df_test = (
    pd.read_csv(INPUT + suffix, index_col=0, parse_dates=True)
    for suffix in ('_training_FE.csv', '_test_FE.csv')
)

In [4]:
# Indicator transformer: NaN is the missing-value marker, and features="all"
# requests one mask column per input feature.
indicator = MissingIndicator(
    features="all",
    missing_values=np.nan,
)

In [5]:
# Fit on the training frame only, then compute the missing-value masks for
# the training frame and for the test frame with that same fitted indicator.
mask_all = indicator.fit(df).transform(df)
mask_test_all = indicator.transform(df_test)

In [6]:
# Wrap each mask array in a DataFrame that reuses the index and the column
# labels of the frame it was computed from.
df_masks, df_test_masks = (
    pd.DataFrame(data=mask, index=frame.index, columns=frame.columns)
    for mask, frame in ((mask_all, df), (mask_test_all, df_test))
)

In [7]:
# Prefix every mask column with 'MI_' so the indicator columns cannot collide
# with the original feature names once both are placed in the same frame.
df_masks, df_test_masks = (
    masks.add_prefix('MI_') for masks in (df_masks, df_test_masks)
)

In [8]:
# Place the indicator columns to the right of the original features,
# aligning rows on the index (outer join).
df_final = pd.concat(
    [df, df_masks],
    axis='columns',
    join='outer',
)
df_test_final = pd.concat(
    [df_test, df_test_masks],
    axis='columns',
    join='outer',
)

In [9]:
# Preview the first five rows of the combined features + indicators frame.
df_final.head(n=5)

Unnamed: 0,sp500_Close,sp500_Volume,vix_Close,vix_Volume,dji_Close,dji_Volume,ndx_Close,ndx_Volume,n255_Close,n255_Volume,...,MI_vix_ROC2_13_52,MI_vix_ROC2_13_52_diff1_period3,MI_vix_ROC2_5_26,MI_vix_ROC2_5_26_diff1_period3,MI_vix_ROC2_13_26,MI_vix_ROC2_13_26_diff1_period3,MI_vix_52_Week_High,MI_vix_52_Week_High_diff1_period3,MI_vix_52_Week_Low,MI_vix_52_Week_Low_diff1_period3
2000-01-07,1441.469971,1225200000.0,21.719999,0.0,11522.55957,184900000.0,3529.600098,1634930000.0,18193.410156,0.0,...,True,True,True,True,True,True,False,True,False,True
2000-01-14,1465.150024,1085900000.0,19.66,0.0,11722.980469,266830000.0,3704.73999,1656630000.0,18956.550781,0.0,...,True,True,True,True,True,True,False,True,False,True
2000-01-21,1441.359985,1209800000.0,20.82,0.0,11251.709961,205840000.0,3849.959961,1923680000.0,18878.089844,0.0,...,True,True,True,True,True,True,False,True,False,True
2000-01-28,1360.160034,1095800000.0,26.139999,0.0,10738.870117,183090000.0,3446.129883,1616370000.0,19434.779297,0.0,...,True,True,True,True,True,True,False,False,False,False
2000-02-04,1424.369995,1045100000.0,21.540001,0.0,10963.799805,166590000.0,3874.370117,1751450000.0,19763.130859,0.0,...,True,True,True,True,True,True,False,False,False,False


In [10]:
# Preview the first five rows of the training frame as loaded (NaNs still present).
df.head(n=5)

Unnamed: 0,sp500_Close,sp500_Volume,vix_Close,vix_Volume,dji_Close,dji_Volume,ndx_Close,ndx_Volume,n255_Close,n255_Volume,...,vix_ROC2_13_52,vix_ROC2_13_52_diff1_period3,vix_ROC2_5_26,vix_ROC2_5_26_diff1_period3,vix_ROC2_13_26,vix_ROC2_13_26_diff1_period3,vix_52_Week_High,vix_52_Week_High_diff1_period3,vix_52_Week_Low,vix_52_Week_Low_diff1_period3
2000-01-07,1441.469971,1225200000.0,21.719999,0.0,11522.55957,184900000.0,3529.600098,1634930000.0,18193.410156,0.0,...,,,,,,,0.0,,0.0,
2000-01-14,1465.150024,1085900000.0,19.66,0.0,11722.980469,266830000.0,3704.73999,1656630000.0,18956.550781,0.0,...,,,,,,,-0.099647,,0.0,
2000-01-21,1441.359985,1209800000.0,20.82,0.0,11251.709961,205840000.0,3849.959961,1923680000.0,18878.089844,0.0,...,,,,,,,-0.042319,,0.057328,
2000-01-28,1360.160034,1095800000.0,26.139999,0.0,10738.870117,183090000.0,3446.129883,1616370000.0,19434.779297,0.0,...,,,,,,,0.0,0.0,0.284881,0.284881
2000-02-04,1424.369995,1045100000.0,21.540001,0.0,10963.799805,166590000.0,3874.370117,1751450000.0,19763.130859,0.0,...,,,,,,,-0.193555,-0.093908,0.091326,0.091326


In [11]:
# Replace every NaN in both combined frames with 0, then show the first
# five rows of the filled training frame.
df_final, df_test_final = (
    frame.fillna(value=0) for frame in (df_final, df_test_final)
)
df_final.head(n=5)

Unnamed: 0,sp500_Close,sp500_Volume,vix_Close,vix_Volume,dji_Close,dji_Volume,ndx_Close,ndx_Volume,n255_Close,n255_Volume,...,MI_vix_ROC2_13_52,MI_vix_ROC2_13_52_diff1_period3,MI_vix_ROC2_5_26,MI_vix_ROC2_5_26_diff1_period3,MI_vix_ROC2_13_26,MI_vix_ROC2_13_26_diff1_period3,MI_vix_52_Week_High,MI_vix_52_Week_High_diff1_period3,MI_vix_52_Week_Low,MI_vix_52_Week_Low_diff1_period3
2000-01-07,1441.469971,1225200000.0,21.719999,0.0,11522.55957,184900000.0,3529.600098,1634930000.0,18193.410156,0.0,...,True,True,True,True,True,True,False,True,False,True
2000-01-14,1465.150024,1085900000.0,19.66,0.0,11722.980469,266830000.0,3704.73999,1656630000.0,18956.550781,0.0,...,True,True,True,True,True,True,False,True,False,True
2000-01-21,1441.359985,1209800000.0,20.82,0.0,11251.709961,205840000.0,3849.959961,1923680000.0,18878.089844,0.0,...,True,True,True,True,True,True,False,True,False,True
2000-01-28,1360.160034,1095800000.0,26.139999,0.0,10738.870117,183090000.0,3446.129883,1616370000.0,19434.779297,0.0,...,True,True,True,True,True,True,False,False,False,False
2000-02-04,1424.369995,1045100000.0,21.540001,0.0,10963.799805,166590000.0,3874.370117,1751450000.0,19763.130859,0.0,...,True,True,True,True,True,True,False,False,False,False


In [12]:
# Write the imputed training and test frames to CSV, keeping the index
# as the first column of each file.
df_final.to_csv(path_or_buf=OUTPUT + '_training_FE.csv', index=True)
df_test_final.to_csv(path_or_buf=OUTPUT + '_test_FE.csv', index=True)