In [1]:
from imputationLibrary import decompose, forwardFilling, hotDeck, meanImputation, movingAverage, splineInterpolation, randomSampleImputation
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy import signal
from sklearn import preprocessing
from datetime import datetime, timedelta

In [2]:
# Base path (without the ".csv" extension) of the weekly index table loaded below.
# A single forward slash is used as separator: the former 'output/\C...' literal
# contained the invalid escape sequence '\C' and a stray backslash, which is a
# literal filename character on POSIX systems. Forward slashes work on Windows too.
INPUT = 'output/CompleteWeeklyIndexes'

In [3]:
# Read the index table; the first CSV column becomes the (date-parsed) row index.
df = pd.read_csv(INPUT + '.csv', index_col=0, parse_dates=True)

# Chronological split: rows through 2015-01-01 are training data, later rows are test data.
df_train = df.loc['2000-01-01':'2015-01-01']
df_test = df.loc['2015-01-02':]

# Zero-filled copies of both slices; these are the frames handed to the decomposition loops.
df_filled = df_train.fillna(0)
df_filled_test = df_test.fillna(0)

In [4]:
# Empty containers for the decomposition components (training period);
# the decomposition loop adds one column per input series.
df_decomposed_resid, df_decomposed_trend, df_decomposed_seasonal = (
    pd.DataFrame() for _ in range(3)
)

# Same three containers for the test period.
df_decomposed_resid_test, df_decomposed_trend_test, df_decomposed_seasonal_test = (
    pd.DataFrame() for _ in range(3)
)

# Boolean masks marking the cells that are missing in the raw train/test slices.
df_nan = df_train.isnull()
df_nan_test = df_test.isnull()

In [5]:
def plot_ac(df, name):
    """Plot the normalised autocorrelation of a series for every lag.

    The previous implementation wrapped the values in a list before calling
    ``preprocessing.scale``, which standardised a (1, n) matrix column by
    column and therefore turned every value into 0 (a flat plot). The series
    is now standardised along its own axis and the correlation is divided by
    the number of observations so that the zero-lag value is 1.

    Parameters
    ----------
    df : pandas.Series
        Series to analyse (a single-column frame is flattened). Missing
        values are replaced by 0 before standardising.
    name : str
        Label used as prefix of the plot title.

    Raises
    ------
    ValueError
        If ``df`` holds no observations.
    """
    values = np.asarray(df.fillna(0), dtype=float).ravel()
    n_obs = values.size
    if n_obs == 0:
        raise ValueError("plot_ac needs at least one observation")

    centered = values - values.mean()
    spread = centered.std()
    # A constant series has zero spread; keep it as all zeros instead of dividing by zero.
    normalized = centered / spread if spread > 0 else centered

    # Dividing by n_obs puts the curve on the [-1, 1] scale that the
    # +/- 2/sqrt(n) reference lines below are drawn on.
    corr = signal.correlate(normalized, normalized, mode='full') / n_obs
    lags = np.arange(-(n_obs - 1), n_obs)

    plt.plot(lags, corr, 'o-', markersize=2)
    bound = 2 / np.sqrt(n_obs)
    plt.axhline(bound, ls=':')
    plt.axhline(-bound, ls=':')
    plt.title(name + ' auto-correlation')
    plt.show()

In [6]:
def decompose_and_plot(ts, flag_plot = False):
    """Decompose a series and optionally show diagnostics for its components.

    Parameters
    ----------
    ts : series passed unchanged to ``decompose.decompose``.
    flag_plot : bool, default False
        When true, plot the decomposition, print the ``nature`` checks on the
        residual and draw the autocorrelation of each component.

    Returns
    -------
    tuple
        ``(resid, trend, seasonal)`` taken from the decomposition result.
    """
    ts_decomposed = decompose.decompose(ts)
    if flag_plot:
        # `nature` is not among the notebook's imports, so this branch used to
        # fail with a NameError. It is imported here, inside the branch, so the
        # default (non-plotting) path is unaffected.
        # NOTE(review): assumed to live in imputationLibrary next to `decompose` - confirm.
        from imputationLibrary import nature

        ts_decomposed.plot()
        plt.show()
        print("Resid is white noise? ", nature.isWhiteNoise(ts_decomposed.resid))
        print("Resid is seasonal noise? ", nature.isSeasonal(ts_decomposed.resid))
        print("Resid is trended noise? ", nature.isTrended(ts_decomposed.resid))
        print("Resid is seasonal and trended noise? ", nature.isTrendedAndSeasonal(ts_decomposed.resid))
        plot_ac(ts_decomposed.resid, "Resid")
        plot_ac(ts_decomposed.trend, "Trend")
        plot_ac(ts_decomposed.seasonal, "Seasonal")
    return ts_decomposed.resid, ts_decomposed.trend, ts_decomposed.seasonal
    

In [7]:
# Decompose each zero-filled training column and file the three components
# under the same column name in their respective frames.
for column_name in df_filled:
    components = decompose_and_plot(df_filled[column_name])
    (
        df_decomposed_resid[column_name],
        df_decomposed_trend[column_name],
        df_decomposed_seasonal[column_name],
    ) = components

In [8]:
# Decompose each zero-filled test column and file the three components
# under the same column name in the test-period frames.
for column_name in df_filled_test:
    components = decompose_and_plot(df_filled_test[column_name])
    (
        df_decomposed_resid_test[column_name],
        df_decomposed_trend_test[column_name],
        df_decomposed_seasonal_test[column_name],
    ) = components

In [9]:
# Put NaN back wherever the raw training data was missing, so the zero
# placeholders used for the decomposition are not mistaken for observations.
# (`mask` replaces the flagged cells with NaN by default.)
df_final_resid = df_decomposed_resid.mask(df_nan)
df_final_trend = df_decomposed_trend.mask(df_nan)
df_final_seasonal = df_decomposed_seasonal.mask(df_nan)

# Same re-masking for the test period, using the test-period NaN mask.
df_final_resid_test = df_decomposed_resid_test.mask(df_nan_test)
df_final_trend_test = df_decomposed_trend_test.mask(df_nan_test)
df_final_seasonal_test = df_decomposed_seasonal_test.mask(df_nan_test)

In [10]:
# Summary statistics of the training residuals after the originally missing cells were set back to NaN.
df_final_resid.describe()

Unnamed: 0,sp500_Close,sp500_Volume,vix_Close,vix_Volume,dji_Close,dji_Volume,ndx_Close,ndx_Volume,n225_Close,n225_Volume,...,hsi_Close,hsi_Volume,n100_Close,n100_Volume,Overall EMV Tracker,infectious_daily_infect_emv_index,GPR,trade_US Trade Policy Uncertainty,trade_Japanese Trade Policy Uncertainty,trade_Trade Policy EMV Fraction
count,707.0,707.0,707.0,707.0,707.0,707.0,707.0,707.0,697.0,697.0,...,687.0,687.0,709.0,709.0,25.0,730.0,24.0,25.0,25.0,25.0
mean,31.520689,82101650.0,0.512391,0.0,289.243539,6704290.0,44.757206,55202030.0,447.022516,4202.613414,...,841.955498,58430900.0,15.889874,4573705.0,20.257319,-0.006521,74.817213,69.938125,105.83281,0.019893
std,99.058609,774229400.0,5.423991,0.0,877.47509,82997150.0,188.244001,438219300.0,1120.341098,35491.714212,...,2137.166798,492867500.0,61.735075,79614120.0,6.903183,0.85157,35.803494,41.372648,68.535009,0.010112
min,-221.168286,-2891955000.0,-16.230384,0.0,-1938.036097,-183163500.0,-592.751217,-1317365000.0,-2382.626743,-80782.471999,...,-5582.148859,-2179870000.0,-174.931935,-269913600.0,10.992357,-1.718898,30.05577,19.401226,22.679564,0.006184
25%,-24.864021,-339380100.0,-2.549696,0.0,-209.870293,-45596740.0,-58.311797,-196803000.0,-347.109257,-18750.091047,...,-554.759976,-236842700.0,-17.193426,-33724540.0,15.159727,-0.376253,56.789393,32.764978,66.660433,0.010815
50%,16.048851,-52837580.0,-0.459174,0.0,133.476078,-6410131.0,22.713524,-2205822.0,328.706128,340.673605,...,414.149348,-12573450.0,8.266563,-5069997.0,18.202432,-0.122785,64.942739,56.271474,81.57624,0.019595
75%,62.598499,409875600.0,2.625513,0.0,561.759328,44600360.0,121.472766,266266000.0,1024.002095,20653.9291,...,2001.898123,212838300.0,38.352098,32314600.0,23.465207,0.176837,90.307426,81.458473,116.46083,0.026684
max,533.817031,5623223000.0,37.894347,0.0,4739.816728,430299200.0,905.695825,2106075000.0,5427.616492,237253.448331,...,10902.639054,3800258000.0,335.322171,602331000.0,38.904223,15.883698,197.477275,167.837522,299.885719,0.039206


In [11]:
# Summary statistics of the raw (un-imputed) training slice.
df_train.describe()

Unnamed: 0,sp500_Close,sp500_Volume,vix_Close,vix_Volume,dji_Close,dji_Volume,ndx_Close,ndx_Volume,n225_Close,n225_Volume,...,hsi_Close,hsi_Volume,n100_Close,n100_Volume,Overall EMV Tracker,infectious_daily_infect_emv_index,GPR,trade_US Trade Policy Uncertainty,trade_Japanese Trade Policy Uncertainty,trade_Trade Policy EMV Fraction
count,757.0,757.0,757.0,757.0,757.0,757.0,757.0,757.0,747.0,747.0,...,736.0,736.0,760.0,760.0,28.0,782.0,25.0,28.0,28.0,28.0
mean,1282.90576,2907586000.0,20.808666,0.0,11476.946036,231532700.0,2090.59152,1905727000.0,12279.704607,113937.751004,...,17642.327716,1155053000.0,748.857302,189492800.0,22.428534,0.453235,85.251996,73.614479,110.105627,0.022047
std,269.692241,1679785000.0,9.127394,0.0,2256.533097,105108200.0,843.54667,504410700.0,3042.473486,72814.03795,...,4917.36064,1017467000.0,157.207398,181260300.0,8.450329,0.95953,45.389471,44.702038,72.924769,0.011105
min,683.380005,356070000.0,10.02,0.0,6626.939941,19950000.0,815.400024,31740000.0,7173.100098,0.0,...,8409.009766,0.0,436.23999,0.0,11.617693,0.0,31.498811,18.482397,23.492212,0.006231
25%,1107.77002,1440500000.0,14.33,0.0,10139.780273,161120000.0,1493.079956,1615340000.0,9765.495117,68650.0,...,13494.907714,273420400.0,629.179993,0.0,16.303862,0.0,61.481388,34.994613,67.198094,0.011663
50%,1260.310059,2686480000.0,18.530001,0.0,10850.360352,218010000.0,1814.790039,1855510000.0,11360.400391,119900.0,...,17920.894532,1105677000.0,719.955017,219326900.0,20.04791,0.0,70.311863,63.463323,83.018032,0.021982
75%,1418.160034,3917450000.0,24.42,0.0,12681.160156,282930000.0,2571.22998,2168420000.0,14802.774902,156950.0,...,21917.960449,1810017000.0,852.002502,321470200.0,26.287055,0.59,97.519973,86.634818,122.046196,0.03285
max,2088.77002,11456230000.0,79.129997,0.0,18053.710938,738440000.0,4691.609863,4227720000.0,20434.679688,477400.0,...,30468.339844,6690249000.0,1134.329956,1095260000.0,43.429073,18.0,214.361523,185.267663,331.094112,0.042553


In [12]:
# Summary statistics of the test-period residuals after the originally missing cells were set back to NaN.
df_final_resid_test.describe()

Unnamed: 0,sp500_Close,sp500_Volume,vix_Close,vix_Volume,dji_Close,dji_Volume,ndx_Close,ndx_Volume,n225_Close,n225_Volume,...,hsi_Close,hsi_Volume,n100_Close,n100_Volume,Overall EMV Tracker,infectious_daily_infect_emv_index,GPR,trade_US Trade Policy Uncertainty,trade_Japanese Trade Policy Uncertainty,trade_Trade Policy EMV Fraction
count,213.0,213.0,213.0,213.0,213.0,213.0,213.0,213.0,212.0,212.0,...,208.0,208.0,214.0,214.0,7.0,220.0,8.0,7.0,7.0,7.0
mean,42.059701,99054760.0,0.576973,0.0,382.868523,10945060.0,92.438502,47312750.0,365.976295,2510.885533,...,999.5472,75229060.0,17.202183,6328578.0,16.807196,-0.028638,102.875075,299.650213,246.609426,0.105668
std,185.485405,643354900.0,3.718382,0.0,1649.132049,143374200.0,458.374582,410095200.0,1796.672848,28650.116428,...,2590.889191,534064300.0,73.218034,87956030.0,7.308045,0.423412,26.831599,377.194791,158.283427,0.08809
min,-220.879748,-1741794000.0,-6.728044,0.0,-1915.244451,-404014600.0,-607.786985,-1479630000.0,-2348.636559,-78403.198964,...,-2834.805186,-1419004000.0,-82.424398,-256437700.0,10.230143,-1.425292,61.590481,27.812459,96.350788,0.024504
25%,-52.548195,-250202100.0,-1.839246,0.0,-484.815557,-35492140.0,-164.441821,-163020000.0,-641.658601,-15274.605214,...,-830.464763,-247118900.0,-22.644552,-43281540.0,12.095908,-0.268599,84.834891,73.841265,126.378326,0.030521
50%,-9.818855,6061369.0,0.146283,0.0,-67.293213,4897355.0,-33.181403,-902694.3,16.835719,-727.838388,...,292.108099,374391.7,2.123282,4812520.0,13.328997,-0.048306,104.026847,123.561955,184.570754,0.069909
75%,67.640966,363569200.0,2.323495,0.0,506.831886,50099610.0,171.751001,218271500.0,694.953395,17499.721709,...,1511.822874,334214600.0,24.893745,46606060.0,20.459965,0.175797,117.540244,360.253014,337.678167,0.173995
max,781.975406,4602307000.0,15.909745,0.0,7204.819373,1544979000.0,1921.737392,2481641000.0,15025.145409,112751.368343,...,9110.703124,2230054000.0,290.855186,440103400.0,28.979487,1.654776,149.673751,1077.988521,517.231453,0.236229


In [13]:
# Training-residual summary again; this repeats an earlier cell's call unchanged.
df_final_resid.describe()

Unnamed: 0,sp500_Close,sp500_Volume,vix_Close,vix_Volume,dji_Close,dji_Volume,ndx_Close,ndx_Volume,n225_Close,n225_Volume,...,hsi_Close,hsi_Volume,n100_Close,n100_Volume,Overall EMV Tracker,infectious_daily_infect_emv_index,GPR,trade_US Trade Policy Uncertainty,trade_Japanese Trade Policy Uncertainty,trade_Trade Policy EMV Fraction
count,707.0,707.0,707.0,707.0,707.0,707.0,707.0,707.0,697.0,697.0,...,687.0,687.0,709.0,709.0,25.0,730.0,24.0,25.0,25.0,25.0
mean,31.520689,82101650.0,0.512391,0.0,289.243539,6704290.0,44.757206,55202030.0,447.022516,4202.613414,...,841.955498,58430900.0,15.889874,4573705.0,20.257319,-0.006521,74.817213,69.938125,105.83281,0.019893
std,99.058609,774229400.0,5.423991,0.0,877.47509,82997150.0,188.244001,438219300.0,1120.341098,35491.714212,...,2137.166798,492867500.0,61.735075,79614120.0,6.903183,0.85157,35.803494,41.372648,68.535009,0.010112
min,-221.168286,-2891955000.0,-16.230384,0.0,-1938.036097,-183163500.0,-592.751217,-1317365000.0,-2382.626743,-80782.471999,...,-5582.148859,-2179870000.0,-174.931935,-269913600.0,10.992357,-1.718898,30.05577,19.401226,22.679564,0.006184
25%,-24.864021,-339380100.0,-2.549696,0.0,-209.870293,-45596740.0,-58.311797,-196803000.0,-347.109257,-18750.091047,...,-554.759976,-236842700.0,-17.193426,-33724540.0,15.159727,-0.376253,56.789393,32.764978,66.660433,0.010815
50%,16.048851,-52837580.0,-0.459174,0.0,133.476078,-6410131.0,22.713524,-2205822.0,328.706128,340.673605,...,414.149348,-12573450.0,8.266563,-5069997.0,18.202432,-0.122785,64.942739,56.271474,81.57624,0.019595
75%,62.598499,409875600.0,2.625513,0.0,561.759328,44600360.0,121.472766,266266000.0,1024.002095,20653.9291,...,2001.898123,212838300.0,38.352098,32314600.0,23.465207,0.176837,90.307426,81.458473,116.46083,0.026684
max,533.817031,5623223000.0,37.894347,0.0,4739.816728,430299200.0,905.695825,2106075000.0,5427.616492,237253.448331,...,10902.639054,3800258000.0,335.322171,602331000.0,38.904223,15.883698,197.477275,167.837522,299.885719,0.039206


In [14]:
# Index range, column dtypes and non-null counts of the masked training residuals.
df_final_resid.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 782 entries, 2000-01-07 to 2014-12-26
Data columns (total 22 columns):
sp500_Close                                707 non-null float64
sp500_Volume                               707 non-null float64
vix_Close                                  707 non-null float64
vix_Volume                                 707 non-null float64
dji_Close                                  707 non-null float64
dji_Volume                                 707 non-null float64
ndx_Close                                  707 non-null float64
ndx_Volume                                 707 non-null float64
n225_Close                                 697 non-null float64
n225_Volume                                697 non-null float64
ftse_Close                                 665 non-null float64
ftse_Volume                                665 non-null float64
hsi_Close                                  687 non-null float64
hsi_Volume                                 6

In [15]:
# Index range, column dtypes and non-null counts of the masked test residuals.
df_final_resid_test.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 272 entries, 2015-01-02 to 2020-03-13
Data columns (total 22 columns):
sp500_Close                                213 non-null float64
sp500_Volume                               213 non-null float64
vix_Close                                  213 non-null float64
vix_Volume                                 213 non-null float64
dji_Close                                  213 non-null float64
dji_Volume                                 213 non-null float64
ndx_Close                                  213 non-null float64
ndx_Volume                                 213 non-null float64
n225_Close                                 212 non-null float64
n225_Volume                                212 non-null float64
ftse_Close                                 212 non-null float64
ftse_Volume                                212 non-null float64
hsi_Close                                  208 non-null float64
hsi_Volume                                 2

In [16]:
# Fill the gaps of the residual component with the project's meanImputation helper.
# NOTE(review): `input` presumably derives its statistics from the first (train) frame
# and applies them to both frames - confirm in imputationLibrary.
df_white_noise_train, df_white_noise_test =  meanImputation.input(df_final_resid, df_final_resid_test)

In [17]:
# Fill the gaps of the seasonal component with the project's splineInterpolation helper.
# NOTE(review): exact train/test handling inside `input` is not visible here - confirm in imputationLibrary.
df_seasonal_train, df_seasonal_test = splineInterpolation.input(df_final_seasonal, df_final_seasonal_test)

In [18]:
# Fill the gaps of the trend component with the project's forwardFilling helper.
# NOTE(review): exact train/test handling inside `input` is not visible here - confirm in imputationLibrary.
df_trended_train, df_trended_test = forwardFilling.input(df_final_trend, df_final_trend_test)

In [19]:
# Recombine the three imputed components (residual + seasonal + trend) into one
# frame each for the training and the test period.
# NOTE(review): the sum replaces every cell, not only the originally missing ones -
# the displayed result differs from df_train on observed values (and contains
# negative volumes); confirm this is intended.
df_final_train = df_white_noise_train + df_seasonal_train + df_trended_train
df_final_test = df_white_noise_test + df_seasonal_test + df_trended_test

In [20]:
# Display the recombined (imputed) training frame.
df_final_train

Unnamed: 0,sp500_Close,sp500_Volume,vix_Close,vix_Volume,dji_Close,dji_Volume,ndx_Close,ndx_Volume,n225_Close,n225_Volume,...,hsi_Close,hsi_Volume,n100_Close,n100_Volume,Overall EMV Tracker,infectious_daily_infect_emv_index,GPR,trade_US Trade Policy Uncertainty,trade_Japanese Trade Policy Uncertainty,trade_Trade Policy EMV Fraction
2000-01-07,1302.759115,1.053365e+08,20.313193,0.0,9643.063345,1.589524e+08,3335.360666,1.387053e+09,12919.706843,-50297.881620,...,13138.976698,-3.586002e+08,945.772328,-9.200717e+07,21.353774,0.708640,79.894781,72.070390,107.529366,0.021449
2000-01-14,1392.538401,3.368326e+08,21.674478,0.0,10440.301008,1.687793e+08,3514.037333,1.582127e+09,13623.026315,-37682.634367,...,14250.868194,-2.666304e+08,986.891974,-8.010566e+07,21.211218,0.521497,79.709465,71.560113,106.791898,0.021185
2000-01-21,1479.006707,1.132773e+09,22.117307,0.0,11202.079570,2.030456e+08,3667.996984,1.921738e+09,15692.398811,1802.667831,...,17453.691286,1.886116e+08,1086.506214,-1.448692e+07,21.088289,0.705069,79.528987,71.121006,106.159413,0.020958
2000-01-28,1469.721456,1.486310e+09,22.997761,0.0,11102.617134,2.625860e+08,3653.057405,2.042156e+09,16731.444549,21800.195303,...,15221.359112,2.656471e+08,1079.020054,3.010646e+07,20.984220,0.493640,79.353310,70.750115,105.627189,0.020767
2000-02-04,1465.731384,1.504249e+09,23.736854,0.0,11118.321386,2.719860e+08,3627.539720,1.990107e+09,16646.704492,21401.981018,...,17300.878223,4.233838e+08,1077.763867,5.152582e+07,20.898242,0.422747,79.182396,70.444486,105.190500,0.020609
2000-02-11,1462.635332,1.438858e+09,23.753241,0.0,11104.459008,2.225883e+08,3605.272494,2.031833e+09,16167.023997,27471.257045,...,17110.575915,2.657649e+08,1076.131318,1.729532e+07,20.829589,0.316346,79.016207,70.201163,104.844624,0.020484
2000-02-18,1464.446933,1.466994e+09,23.563709,0.0,11129.253869,2.165908e+08,3611.575601,1.950463e+09,15748.600274,22075.538710,...,11196.034489,-3.227866e+08,1076.210174,3.002427e+07,20.777493,0.384938,78.854707,70.017193,104.584835,0.020390
2000-02-25,1463.647242,1.050740e+09,23.215673,0.0,11136.204302,2.213977e+08,3607.017127,1.833320e+09,15840.977662,1869.700798,...,17057.467708,-4.741663e+07,1073.182974,1.770158e+07,20.741184,0.476181,78.697856,69.889620,104.406409,0.020325
2000-03-03,1469.314120,1.030734e+09,22.767774,0.0,11172.568665,2.405142e+08,3609.419957,1.902344e+09,16631.880889,22187.077171,...,16993.181924,-1.022106e+08,1078.792393,3.835954e+06,20.719896,0.394080,78.545618,69.815490,104.304623,0.020287
2000-03-10,1460.723057,1.354553e+09,23.633441,0.0,11094.399635,2.623964e+08,3593.290980,1.950526e+09,16740.021672,11798.752996,...,16940.550720,7.825305e+06,1074.728243,1.546442e+07,20.712861,0.379918,78.397955,69.791849,104.274753,0.020276


In [21]:
# Display the raw training slice (still contains the original NaNs).
df_train

Unnamed: 0,sp500_Close,sp500_Volume,vix_Close,vix_Volume,dji_Close,dji_Volume,ndx_Close,ndx_Volume,n225_Close,n225_Volume,...,hsi_Close,hsi_Volume,n100_Close,n100_Volume,Overall EMV Tracker,infectious_daily_infect_emv_index,GPR,trade_US Trade Policy Uncertainty,trade_Japanese Trade Policy Uncertainty,trade_Trade Policy EMV Fraction
2000-01-07,1441.469971,1.225200e+09,21.719999,0.0,11522.559570,184900000.0,3529.600098,1.634930e+09,18193.410156,0.0,...,15405.629883,0.000000e+00,943.880005,0.0,,0.00,,,,
2000-01-14,1465.150024,1.085900e+09,19.660000,0.0,11722.980469,266830000.0,3704.739990,1.656630e+09,18956.550781,0.0,...,15542.230469,0.000000e+00,973.859985,0.0,,0.00,,,,
2000-01-21,1441.359985,1.209800e+09,20.820000,0.0,11251.709961,205840000.0,3849.959961,1.923680e+09,18878.089844,0.0,...,15108.410156,0.000000e+00,951.510010,0.0,,0.00,,,,
2000-01-28,1360.160034,1.095800e+09,26.139999,0.0,10738.870117,183090000.0,3446.129883,1.616370e+09,19434.779297,0.0,...,16185.940430,0.000000e+00,948.210022,0.0,24.412899,0.00,,71.914247,117.577146,0.033573
2000-02-04,1424.369995,1.045100e+09,21.540001,0.0,10963.799805,166590000.0,3874.370117,1.751450e+09,19763.130859,0.0,...,,,1010.429993,0.0,,0.75,,,,
2000-02-11,1387.119995,1.025700e+09,24.420000,0.0,10425.209961,193540000.0,3968.889893,1.738590e+09,,,...,17380.300781,0.000000e+00,1014.169983,0.0,,0.74,,,,
2000-02-18,1346.089966,1.042300e+09,26.000000,0.0,10219.519531,208930000.0,3965.750000,1.898410e+09,19789.029297,0.0,...,16599.160156,0.000000e+00,992.530029,0.0,,0.00,,,,
2000-02-25,1333.359985,1.065200e+09,25.200001,0.0,9862.120117,195240000.0,4178.580078,1.825500e+09,19817.880859,0.0,...,17200.980469,0.000000e+00,1004.719971,0.0,,0.00,,,,
2000-03-03,1409.170044,1.150300e+09,19.209999,0.0,10367.200195,210460000.0,4442.870117,2.136530e+09,19927.539063,0.0,...,17285.240234,0.000000e+00,1042.400024,0.0,,0.00,,,,
2000-03-10,1395.069946,1.138800e+09,21.240000,0.0,9928.820313,197440000.0,4587.160156,1.992170e+09,19750.400391,0.0,...,17831.859375,0.000000e+00,1045.270020,0.0,,0.00,,,,


In [22]:
# Subsample to weekly

# Generating weekly sampled dataset
def generate_weekly(df, initial_friday):
    """Return the rows of ``df`` dated ``initial_friday`` and every 7th day after it.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a datetime index; its last index entry is the upper bound
        of the sampling grid.
    initial_friday : datetime-like
        First date to pick. The weekday itself is not checked; the grid simply
        advances in steps of seven days from this date.

    Returns
    -------
    pandas.DataFrame
        The selected rows, in grid order, with the columns of ``df``. Empty
        (same columns) when ``df`` is empty or starts after its last entry.

    Raises
    ------
    KeyError
        If a date of the 7-day grid is absent from ``df.index``.
    """
    # Nothing to sample from: hand back an empty frame with the same columns.
    if len(df.index) == 0:
        return df.iloc[0:0].copy()

    sample_dates = pd.date_range(start=initial_friday, end=df.index[-1], freq='7D')
    if len(sample_dates) == 0:
        return df.iloc[0:0].copy()

    # One label-based selection replaces the former row-by-row DataFrame.append
    # loop, which was quadratic and no longer exists in pandas >= 2.0.
    return df.loc[sample_dates]

In [23]:
# Sample the recombined frames on a 7-day grid, starting at the first date of each period.
df_train_weekly = generate_weekly(df_final_train, initial_friday=datetime(2000, 1, 7))
df_test_weekly = generate_weekly(df_final_test, initial_friday=datetime(2015, 1, 2))

In [1]:
# Write the weekly train/test frames (index included) to the output folder.
# The paths use a single forward slash: the former r'output\/...' literals carried a
# literal backslash, which only resolves to the intended folder on Windows.
# This cell requires the cells defining df_train_weekly / df_test_weekly to have
# been run in the same kernel session; run in a fresh kernel it raises NameError.
df_train_weekly.to_csv('output/CompleteIndexesWeeklyTrainDecomp.csv', index=True)
df_test_weekly.to_csv('output/CompleteIndexesWeeklyTestDecomp.csv', index=True)

NameError: name 'df_train_weekly' is not defined