## This notebook imputes the missing values using Missing Indicators

In [1]:
import pandas as pd
from datetime import datetime
import numpy as np
from sklearn.impute import MissingIndicator

#### Choose whether the imputation should be done on the data set with daily or weekly granularity


In [2]:
# Path stems (without the ".csv" extension) of the data set to read and of the
# imputed files to write. Later cells append ".csv", "_training.csv" and
# "_test.csv" to these stems.
# The stray backslash after "output/" was removed: "\C" and "\M" are invalid
# escape sequences (SyntaxWarning on recent Python) and on POSIX systems the
# backslash became a literal part of the file name. Forward slashes work on
# every platform, including Windows.
INPUT, OUTPUT = 'output/CompleteWeeklyIndexes', 'output/MIIWeeklyImputed'

In [3]:
# Read the whole data set; the first CSV column becomes the (date-parsed) index.
df = pd.read_csv(f"{INPUT}.csv", index_col=0, parse_dates=True)
# Label-based slices are inclusive on both ends:
#   training data -> 2000-01-01 through 2015-01-01
#   test data     -> 2015-01-02 onwards
# The right-hand side is evaluated in full before `df` is rebound, so both
# slices are taken from the complete frame.
df, df_test = df.loc['2000-01-01':'2015-01-01'], df.loc['2015-01-02':]

In [4]:
# Transformer that produces a boolean mask of NaN entries. With features="all"
# a mask column is emitted for every input column, not only for those that
# contained missing values when fitted.
indicator = MissingIndicator(
    features="all",
    missing_values=np.nan,
)

In [5]:
# Fit on the training frame only, then build the missing-value masks for the
# training and the test split with that same fitted indicator.
mask_all = indicator.fit(df).transform(df)
mask_test_all = indicator.transform(df_test)

In [6]:
# Wrap the raw mask arrays in DataFrames that reuse the index and column
# labels of the split they were computed from.
df_masks = pd.DataFrame(mask_all, columns=df.columns, index=df.index)
df_test_masks = pd.DataFrame(
    mask_test_all,
    columns=df_test.columns,
    index=df_test.index,
)

In [7]:
# Tag every indicator column with "MI_" so it cannot clash with the value
# column of the same name once both frames are combined.
df_masks, df_test_masks = (
    masks.add_prefix('MI_') for masks in (df_masks, df_test_masks)
)

In [8]:
# Put the indicator columns next to the original value columns (column-wise
# concatenation aligned on the index) for each split.
df_final = pd.concat(
    [df, df_masks],
    join='outer',
    axis=1,
)
df_test_final = pd.concat(
    [df_test, df_test_masks],
    join='outer',
    axis=1,
)

In [9]:
# Preview the first rows of the combined training frame (values + MI_ masks).
df_final.head(n=5)

Unnamed: 0,sp500_Close,sp500_Volume,vix_Close,vix_Volume,dji_Close,dji_Volume,ndx_Close,ndx_Volume,n225_Close,n225_Volume,...,MI_hsi_Close,MI_hsi_Volume,MI_n100_Close,MI_n100_Volume,MI_Overall EMV Tracker,MI_infectious_daily_infect_emv_index,MI_GPR,MI_trade_US Trade Policy Uncertainty,MI_trade_Japanese Trade Policy Uncertainty,MI_trade_Trade Policy EMV Fraction
2000-01-07,1420.333984,1068760000.0,25.016,0.0,11250.781836,182562000.0,3542.894043,1598166000.0,18476.772461,0.0,...,False,False,False,False,True,False,True,True,True,True
2000-01-14,1448.648023,1033940000.0,21.684,0.0,11587.958008,196256000.0,3611.343994,1609134000.0,18829.544922,0.0,...,False,False,False,False,True,False,True,True,True,True
2000-01-21,1449.492493,1113750000.0,21.4475,0.0,11413.272461,197595000.0,3810.092468,1753105000.0,19083.530078,0.0,...,False,False,False,False,True,False,True,True,True,True
2000-01-28,1394.874023,1106420000.0,23.96,0.0,10967.58789,195636000.0,3616.111963,1772902000.0,19141.585938,0.0,...,False,False,False,False,False,False,True,False,False,False
2000-02-04,1412.43999,1041000000.0,23.014001,0.0,10992.404102,176876000.0,3744.364014,1581506000.0,19618.308203,0.0,...,False,False,False,False,True,False,False,True,True,True


In [10]:
# Preview the first rows of the training frame before the NaNs are filled,
# for comparison with the mask columns shown in the previous cell.
df.head(n=5)

Unnamed: 0,sp500_Close,sp500_Volume,vix_Close,vix_Volume,dji_Close,dji_Volume,ndx_Close,ndx_Volume,n225_Close,n225_Volume,...,hsi_Close,hsi_Volume,n100_Close,n100_Volume,Overall EMV Tracker,infectious_daily_infect_emv_index,GPR,trade_US Trade Policy Uncertainty,trade_Japanese Trade Policy Uncertainty,trade_Trade Policy EMV Fraction
2000-01-07,1420.333984,1068760000.0,25.016,0.0,11250.781836,182562000.0,3542.894043,1598166000.0,18476.772461,0.0,...,16169.60625,0.0,949.868006,0.0,,0.252857,,,,
2000-01-14,1448.648023,1033940000.0,21.684,0.0,11587.958008,196256000.0,3611.343994,1609134000.0,18829.544922,0.0,...,15720.128125,0.0,957.357996,0.0,,0.214286,,,,
2000-01-21,1449.492493,1113750000.0,21.4475,0.0,11413.272461,197595000.0,3810.092468,1753105000.0,19083.530078,0.0,...,15392.563867,0.0,958.146008,0.0,,0.11,,,,
2000-01-28,1394.874023,1106420000.0,23.96,0.0,10967.58789,195636000.0,3616.111963,1772902000.0,19141.585938,0.0,...,15560.411914,0.0,948.61001,0.0,24.412899,0.49,,71.914247,117.577146,0.033573
2000-02-04,1412.43999,1041000000.0,23.014001,0.0,10992.404102,176876000.0,3744.364014,1581506000.0,19618.308203,0.0,...,15736.035157,0.0,969.562,0.0,,0.741429,34.391162,,,


In [11]:
# Replace every remaining NaN with 0 in both splits; the MI_ columns keep the
# record of which entries were originally missing.
df_final, df_test_final = df_final.fillna(value=0), df_test_final.fillna(value=0)
df_final.head(n=5)

Unnamed: 0,sp500_Close,sp500_Volume,vix_Close,vix_Volume,dji_Close,dji_Volume,ndx_Close,ndx_Volume,n225_Close,n225_Volume,...,MI_hsi_Close,MI_hsi_Volume,MI_n100_Close,MI_n100_Volume,MI_Overall EMV Tracker,MI_infectious_daily_infect_emv_index,MI_GPR,MI_trade_US Trade Policy Uncertainty,MI_trade_Japanese Trade Policy Uncertainty,MI_trade_Trade Policy EMV Fraction
2000-01-07,1420.333984,1068760000.0,25.016,0.0,11250.781836,182562000.0,3542.894043,1598166000.0,18476.772461,0.0,...,False,False,False,False,True,False,True,True,True,True
2000-01-14,1448.648023,1033940000.0,21.684,0.0,11587.958008,196256000.0,3611.343994,1609134000.0,18829.544922,0.0,...,False,False,False,False,True,False,True,True,True,True
2000-01-21,1449.492493,1113750000.0,21.4475,0.0,11413.272461,197595000.0,3810.092468,1753105000.0,19083.530078,0.0,...,False,False,False,False,True,False,True,True,True,True
2000-01-28,1394.874023,1106420000.0,23.96,0.0,10967.58789,195636000.0,3616.111963,1772902000.0,19141.585938,0.0,...,False,False,False,False,False,False,True,False,False,False
2000-02-04,1412.43999,1041000000.0,23.014001,0.0,10992.404102,176876000.0,3744.364014,1581506000.0,19618.308203,0.0,...,False,False,False,False,True,False,False,True,True,True


In [12]:
# Persist both imputed splits as CSV files, keeping the date index as the
# first column.
df_final.to_csv(f"{OUTPUT}_training.csv", index=True)
df_test_final.to_csv(f"{OUTPUT}_test.csv", index=True)