## This notebook imputes missing values using Missing Indicators

In [1]:
import pandas as pd
from datetime import datetime
import numpy as np
from sklearn.impute import MissingIndicator

#### Choose whether the imputation should be done on the data set with daily or weekly granularity


In [2]:
# Base paths (without the '.csv' extension) of the data set to read and of the
# imputed files to write. Point these at other files to change the granularity
# that is processed.
#
# Only forward slashes are used on purpose: a backslash followed by a letter
# (e.g. '\C' or '\M') is an invalid escape sequence in a normal string literal.
# It triggers a SyntaxWarning on recent Python versions and ends up as a literal
# backslash in the file name on non-Windows systems.
INPUT = 'output/CompleteWeeklyIndexes'
OUTPUT = 'output/MIIWeeklyImputed'

In [3]:
# Load the full table; the first CSV column becomes the index and pandas is
# asked to parse it as dates so that the date-string slices below work.
df = pd.read_csv(
    f'{INPUT}.csv',
    index_col=0,
    parse_dates=True,
)

# Chronological split. Both slices are evaluated against the full frame before
# `df` is rebound to the training window.
df_test, df = (
    df.loc['2015-01-02':],               # test data
    df.loc['2000-01-01':'2015-01-01'],   # training data
)

In [4]:
# Flag NaN entries; features="all" requests one indicator column per input
# column, regardless of whether that column has missing values.
indicator = MissingIndicator(features="all", missing_values=np.nan)

In [5]:
# Fit on the training window only and build its mask in one step, then reuse
# the fitted indicator to build the mask for the test window.
mask_all = indicator.fit_transform(df)
mask_test_all = indicator.transform(df_test)

In [6]:
# Wrap the raw mask arrays in DataFrames that reuse the index and column labels
# of the frames they were computed from, so they line up when joined later.
df_masks = pd.DataFrame(mask_all, index=df.index, columns=df.columns)
df_test_masks = pd.DataFrame(
    mask_test_all,
    index=df_test.index,
    columns=df_test.columns,
)

In [7]:
# Give every indicator column an 'MI_' prefix so it cannot clash with the value
# column it describes.
# Note: the names are rebound to the prefixed frames, so re-running this cell
# on its own stacks the prefix a second time.
df_masks, df_test_masks = (
    frame.add_prefix('MI_') for frame in (df_masks, df_test_masks)
)

In [8]:
# Place each indicator frame to the right of its value frame, aligning rows on
# the index (outer join keeps every index label from both sides).
df_final, df_test_final = (
    pd.concat([values, masks], axis='columns', join='outer')
    for values, masks in ((df, df_masks), (df_test, df_test_masks))
)

In [9]:
# Preview the first five training rows: value columns followed by the MI_ columns.
df_final.head(5)

Unnamed: 0,sp500_Close,sp500_Volume,vix_Close,vix_Volume,dji_Close,dji_Volume,ndx_Close,ndx_Volume,n225_Close,n225_Volume,...,MI_hsi_Close,MI_hsi_Volume,MI_n100_Close,MI_n100_Volume,MI_Overall EMV Tracker,MI_infectious_daily_infect_emv_index,MI_GPR,MI_trade_US Trade Policy Uncertainty,MI_trade_Japanese Trade Policy Uncertainty,MI_trade_Trade Policy EMV Fraction
2000-01-07,1441.469971,1225200000.0,21.719999,0.0,11522.55957,184900000.0,3529.600098,1634930000.0,18193.410156,0.0,...,False,False,False,False,True,False,True,True,True,True
2000-01-14,1465.150024,1085900000.0,19.66,0.0,11722.980469,266830000.0,3704.73999,1656630000.0,18956.550781,0.0,...,False,False,False,False,True,False,True,True,True,True
2000-01-21,1441.359985,1209800000.0,20.82,0.0,11251.709961,205840000.0,3849.959961,1923680000.0,18878.089844,0.0,...,False,False,False,False,True,False,True,True,True,True
2000-01-28,1360.160034,1095800000.0,26.139999,0.0,10738.870117,183090000.0,3446.129883,1616370000.0,19434.779297,0.0,...,False,False,False,False,False,False,True,False,False,False
2000-02-04,1424.369995,1045100000.0,21.540001,0.0,10963.799805,166590000.0,3874.370117,1751450000.0,19763.130859,0.0,...,True,True,False,False,True,False,True,True,True,True


In [10]:
# Same five rows of the training values alone, with the NaNs still in place.
df.head(5)

Unnamed: 0,sp500_Close,sp500_Volume,vix_Close,vix_Volume,dji_Close,dji_Volume,ndx_Close,ndx_Volume,n225_Close,n225_Volume,...,hsi_Close,hsi_Volume,n100_Close,n100_Volume,Overall EMV Tracker,infectious_daily_infect_emv_index,GPR,trade_US Trade Policy Uncertainty,trade_Japanese Trade Policy Uncertainty,trade_Trade Policy EMV Fraction
2000-01-07,1441.469971,1225200000.0,21.719999,0.0,11522.55957,184900000.0,3529.600098,1634930000.0,18193.410156,0.0,...,15405.629883,0.0,943.880005,0.0,,0.0,,,,
2000-01-14,1465.150024,1085900000.0,19.66,0.0,11722.980469,266830000.0,3704.73999,1656630000.0,18956.550781,0.0,...,15542.230469,0.0,973.859985,0.0,,0.0,,,,
2000-01-21,1441.359985,1209800000.0,20.82,0.0,11251.709961,205840000.0,3849.959961,1923680000.0,18878.089844,0.0,...,15108.410156,0.0,951.51001,0.0,,0.0,,,,
2000-01-28,1360.160034,1095800000.0,26.139999,0.0,10738.870117,183090000.0,3446.129883,1616370000.0,19434.779297,0.0,...,16185.94043,0.0,948.210022,0.0,24.412899,0.0,,71.914247,117.577146,0.033573
2000-02-04,1424.369995,1045100000.0,21.540001,0.0,10963.799805,166590000.0,3874.370117,1751450000.0,19763.130859,0.0,...,,,1010.429993,0.0,,0.75,,,,


In [11]:
# Replace every remaining NaN with 0 in both splits; the MI_ columns still
# record which entries were originally missing.
df_final, df_test_final = (
    frame.fillna(0) for frame in (df_final, df_test_final)
)
df_final.head()

Unnamed: 0,sp500_Close,sp500_Volume,vix_Close,vix_Volume,dji_Close,dji_Volume,ndx_Close,ndx_Volume,n225_Close,n225_Volume,...,MI_hsi_Close,MI_hsi_Volume,MI_n100_Close,MI_n100_Volume,MI_Overall EMV Tracker,MI_infectious_daily_infect_emv_index,MI_GPR,MI_trade_US Trade Policy Uncertainty,MI_trade_Japanese Trade Policy Uncertainty,MI_trade_Trade Policy EMV Fraction
2000-01-07,1441.469971,1225200000.0,21.719999,0.0,11522.55957,184900000.0,3529.600098,1634930000.0,18193.410156,0.0,...,False,False,False,False,True,False,True,True,True,True
2000-01-14,1465.150024,1085900000.0,19.66,0.0,11722.980469,266830000.0,3704.73999,1656630000.0,18956.550781,0.0,...,False,False,False,False,True,False,True,True,True,True
2000-01-21,1441.359985,1209800000.0,20.82,0.0,11251.709961,205840000.0,3849.959961,1923680000.0,18878.089844,0.0,...,False,False,False,False,True,False,True,True,True,True
2000-01-28,1360.160034,1095800000.0,26.139999,0.0,10738.870117,183090000.0,3446.129883,1616370000.0,19434.779297,0.0,...,False,False,False,False,False,False,True,False,False,False
2000-02-04,1424.369995,1045100000.0,21.540001,0.0,10963.799805,166590000.0,3874.370117,1751450000.0,19763.130859,0.0,...,True,True,False,False,True,False,True,True,True,True


In [12]:
# Write the imputed training and test splits next to each other, keeping the
# index as the first CSV column.
df_final.to_csv(f'{OUTPUT}_training.csv', index=True)
df_test_final.to_csv(f'{OUTPUT}_test.csv', index=True)