## This notebook imputes the missing values using regression (scikit-learn's `IterativeImputer`)

In [1]:
import pandas as pd
from datetime import datetime
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn import preprocessing

#### Choose whether the imputation should be done on the data set with daily or weekly granularity

In [2]:
# Granularity of the data set to impute: 'daily' or 'weekly'.
GRANULARITY = 'weekly'

# Base file names (input, output) for each granularity, WITHOUT extension.
# The '.csv' suffix is appended at the point where the files are read and
# written, so a name that already carried it would yield a path such as
# 'CompleteIndexes.csv.csv'.
DATASETS = {
    'daily': ('CompleteIndexes', 'RegressionImputed'),
    'weekly': ('CompleteWeeklyIndexes', 'RegressionWeeklyImputed'),
}

# An unknown GRANULARITY fails here with a KeyError instead of later with a
# confusing file-not-found error.
INPUT, OUTPUT = DATASETS[GRANULARITY]

In [3]:
# Read the selected data set. The first CSV column becomes the index and pandas
# is asked to parse it as dates, so the label slices below operate on dates.
df = pd.read_csv('{}.csv'.format(INPUT), index_col=0, parse_dates=True)

# Chronological split; label-based slices include both end points.
# Rows from 2015-01-02 onwards are held out as test data ...
df_test = df.loc['2015-01-02':]
# ... and rows from 2000-01-01 through 2015-01-01 are the training data.
df = df.loc['2000-01-01':'2015-01-01']

In [4]:
# Standardise the columns (StandardScaler: centre to zero mean, scale to unit
# variance). The scaler is fitted on the training rows only; that same fitted
# scaler is then applied to the test rows.
scaler = preprocessing.StandardScaler().fit(df)

# Training set: scaled values wrapped back into a frame with the original labels.
array = scaler.transform(df)
df_scaled = pd.DataFrame(array, index=df.index, columns=df.columns)

# Test set: same treatment, using the training-set statistics.
array_test = scaler.transform(df_test)
df_test_scaled = pd.DataFrame(array_test, index=df_test.index, columns=df_test.columns)

df_scaled.head()

Unnamed: 0,sp500_Close,sp500_Volume,vix_Close,vix_Volume,dji_Close,dji_Volume,ndx_Close,ndx_Volume,n255_Close,n255_Volume,...,hsi_Close,hsi_Volume,n100_Close,n100_Volume,Overall EMV Tracker,infectious_daily_infect_emv_index,GPR,trade_US Trade Policy Uncertainty,trade_Japanese Trade Policy Uncertainty,trade_Trade Policy EMV Fraction
2000-01-07,0.588334,-1.002211,0.099912,0.0,0.020227,-0.443957,1.707031,-0.537214,1.945019,-1.565826,...,-0.455167,-1.135995,1.241361,-1.046107,,-0.472653,,,,
2000-01-14,0.676196,-1.085193,-0.125931,0.0,0.109104,0.33604,1.914791,-0.494165,2.196016,-1.565826,...,-0.427369,-1.135995,1.43219,-1.046107,,-0.472653,,,,
2000-01-21,0.587926,-1.011385,0.001243,0.0,-0.099881,-0.244602,2.087059,0.035615,2.17021,-1.565826,...,-0.515651,-1.135995,1.289928,-1.046107,,-0.472653,,,,
2000-01-28,0.286643,-1.079295,0.584489,0.0,-0.3273,-0.461189,1.608014,-0.574034,2.353305,-1.565826,...,-0.296374,-1.135995,1.268922,-1.046107,0.239136,-0.472653,,-0.038733,0.104335,1.056953
2000-02-04,0.524886,-1.109498,0.080178,0.0,-0.227555,-0.618274,2.116016,-0.306059,2.4613,-1.565826,...,,,1.664966,-1.046107,,0.309479,,,,


In [5]:
# Fit the iterative (regression-based) imputer on the scaled training data
# only; the fixed random_state keeps the run reproducible.
imp = IterativeImputer(max_iter=10, random_state=0).fit(df_scaled)

# Fill the gaps of the training set with the fitted imputer.
df_imputed = imp.transform(df_scaled)
df_final = pd.DataFrame(df_imputed, index=df.index, columns=df.columns)

# Fill the gaps of the test set with the same fitted imputer.
df_test_imputed = imp.transform(df_test_scaled)
df_test_final = pd.DataFrame(df_test_imputed, index=df_test.index, columns=df_test.columns)



In [6]:
# Undo the standardisation so that the imputed frames are expressed on the
# original scale of each column again.
array_2 = scaler.inverse_transform(df_final)
array_test_2 = scaler.inverse_transform(df_test_final)

# Re-attach the original row and column labels to the descaled arrays.
df_final2 = pd.DataFrame(array_2, index=df.index, columns=df.columns)
df_test_final2 = pd.DataFrame(array_test_2, index=df_test.index, columns=df_test.columns)

df_final2.head()

Unnamed: 0,sp500_Close,sp500_Volume,vix_Close,vix_Volume,dji_Close,dji_Volume,ndx_Close,ndx_Volume,n255_Close,n255_Volume,...,hsi_Close,hsi_Volume,n100_Close,n100_Volume,Overall EMV Tracker,infectious_daily_infect_emv_index,GPR,trade_US Trade Policy Uncertainty,trade_Japanese Trade Policy Uncertainty,trade_Trade Policy EMV Fraction
2000-01-07,1441.469971,1225200000.0,21.719999,0.0,11522.55957,184900000.0,3529.600098,1634930000.0,18193.410156,0.0,...,15405.629883,0.0,943.880005,2.980232e-08,25.449135,0.0,85.08444,73.677708,75.376046,0.028165
2000-01-14,1465.150024,1085900000.0,19.66,0.0,11722.980469,266830000.0,3704.73999,1656630000.0,18956.550781,0.0,...,15542.230469,0.0,973.859985,2.980232e-08,24.782856,0.0,85.067248,73.619851,66.06679,0.028227
2000-01-21,1441.359985,1209800000.0,20.82,0.0,11251.709961,205840000.0,3849.959961,1923680000.0,18878.089844,0.0,...,15108.410156,0.0,951.51001,2.980232e-08,25.447923,0.0,85.083162,73.661115,72.308703,0.028414
2000-01-28,1360.160034,1095800000.0,26.139999,0.0,10738.870117,183090000.0,3446.129883,1616370000.0,19434.779297,0.0,...,16185.94043,0.0,948.210022,2.980232e-08,24.412899,0.0,85.094398,71.914247,117.577146,0.033573
2000-02-04,1424.369995,1045100000.0,21.540001,0.0,10963.799805,166590000.0,3874.370117,1751450000.0,19763.130859,0.0,...,16501.644187,215924000.0,1010.429993,2.980232e-08,23.91644,0.75,85.189301,73.59499,43.911194,0.031909


In [7]:
# Write the imputed training and test sets next to each other; the index is
# kept and written as the first column of each file.
df_final2.to_csv('{}_training.csv'.format(OUTPUT), index=True)
df_test_final2.to_csv('{}_test.csv'.format(OUTPUT), index=True)