In [1]:
from imputationLibrary import decompose, forwardFilling, hotDeck, meanImputation, movingAverage, splineInterpolation, randomSampleImputation
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy import signal
from sklearn import preprocessing
from datetime import datetime, timedelta
from tsmoothie.smoother import *

In [2]:
# Base path (without extension) of the weekly index table loaded below.
# The previous literal 'output/\CompleteWeeklyIndexes' contained the invalid escape
# sequence '\C' and a stray backslash that only resolved on Windows; a plain
# forward slash addresses the same file there and also works on POSIX systems.
INPUT = 'output/CompleteWeeklyIndexes'

In [3]:
# Load the weekly table; the first CSV column is used as the index and pandas is
# asked to parse it as dates.
csv_path = INPUT + '.csv'
df = pd.read_csv(csv_path, index_col=0, parse_dates=True)

# Chronological split by index label: training window first, then everything after it.
df_train = df.loc['2000-01-01':'2015-01-01']  # training data
df_test = df.loc['2015-01-02':]  # test data

# Zero-filled copies of both splits (these are what gets decomposed further down).
df_filled = df_train.fillna(0)
df_filled_test = df_test.fillna(0)

In [4]:
# Empty frames that the decomposition loops fill column by column (training split).
df_decomposed_resid, df_decomposed_trend, df_decomposed_seasonal = (
    pd.DataFrame() for _ in range(3)
)

# Same three containers for the test split.
df_decomposed_resid_test, df_decomposed_trend_test, df_decomposed_seasonal_test = (
    pd.DataFrame() for _ in range(3)
)

# Boolean masks marking where the original (un-filled) data was missing.
df_nan = df_train.isna()
df_nan_test = df_test.isna()

In [5]:
def plot_ac(df, name):
    """Plot the normalized auto-correlation of a series.

    Missing values are replaced by 0, the series is standardized (zero mean,
    unit population standard deviation), and its full auto-correlation divided
    by the number of observations is plotted against the lag, together with
    dotted reference lines at +/- 2/sqrt(n).

    Parameters
    ----------
    df : pandas.Series
        One-dimensional series to analyse.
    name : str
        Label prepended to the plot title.

    Raises
    ------
    ValueError
        If ``df`` contains no observations.
    """
    values = np.asarray(df.fillna(0), dtype=float)
    n_obs = len(values)
    if n_obs == 0:
        raise ValueError("plot_ac requires a non-empty series")

    # Standardize along the time axis. The previous implementation wrapped the
    # series in a list, producing a (1, n) array; sklearn's `scale` standardizes
    # along axis 0 by default, so every entry became 0 and the plot was always flat.
    centered = values - values.mean()
    spread = centered.std()
    # A constant series has zero spread; leave it centered (all zeros) instead of dividing by 0.
    normalized = centered / spread if spread > 0 else centered

    # Dividing by n makes the lag-0 value equal to 1, which is the scale on which
    # the +/- 2/sqrt(n) bands are meaningful.
    corr = signal.correlate(normalized, normalized, mode='full') / n_obs
    lags = np.arange(-(n_obs - 1), n_obs)

    plt.plot(lags, corr, 'o-', markersize=2)
    band = 2 / np.sqrt(n_obs)
    plt.axhline(band, ls=':')
    plt.axhline(-band, ls=':')
    plt.title(name + ' auto-correlation')
    plt.show()

In [6]:
def decompose_and_plot(ts, flag_plot = False):
    """Decompose a series and return its residual, trend and seasonal parts.

    Parameters
    ----------
    ts : pandas.Series
        Series handed to ``decompose.decompose``.
    flag_plot : bool, default False
        When True, also plot the decomposition, print the results of the
        ``nature`` checks on the residual, and plot the auto-correlation of
        each component.

    Returns
    -------
    tuple
        ``(resid, trend, seasonal)`` attributes of the decomposition result.
    """
    ts_decomposed = decompose.decompose(ts)
    if flag_plot:
        # NOTE(review): `nature` was never imported in the visible notebook, so this
        # branch raised NameError. It is presumably a module of imputationLibrary,
        # like `decompose` -- confirm the location. Imported locally so that the
        # default (flag_plot=False) path does not depend on it.
        from imputationLibrary import nature

        ts_decomposed.plot()
        plt.show()
        print("Resid is white noise? ", nature.isWhiteNoise(ts_decomposed.resid))
        print("Resid is seasonal noise? ", nature.isSeasonal(ts_decomposed.resid))
        print("Resid is trended noise? ", nature.isTrended(ts_decomposed.resid))
        print("Resid is seasonal and trended noise? ", nature.isTrendedAndSeasonal(ts_decomposed.resid))
        plot_ac(ts_decomposed.resid, "Resid")
        plot_ac(ts_decomposed.trend, "Trend")
        plot_ac(ts_decomposed.seasonal, "Seasonal")
    return ts_decomposed.resid, ts_decomposed.trend, ts_decomposed.seasonal
    

In [7]:
# Decompose every zero-filled training column and store each component in its own frame.
for column_name in df_filled.columns:
    components = decompose_and_plot(df_filled.loc[:, column_name])
    (
        df_decomposed_resid[column_name],
        df_decomposed_trend[column_name],
        df_decomposed_seasonal[column_name],
    ) = components

In [8]:
# Decompose every zero-filled test column and store each component in its own frame.
for column_name in df_filled_test.columns:
    components = decompose_and_plot(df_filled_test.loc[:, column_name])
    (
        df_decomposed_resid_test[column_name],
        df_decomposed_trend_test[column_name],
        df_decomposed_seasonal_test[column_name],
    ) = components

In [9]:
# Put NaN back wherever the original training data was missing, so the
# zero placeholders used for the decomposition are not treated as real values.
df_final_resid = df_decomposed_resid.mask(df_nan, other=np.nan)
df_final_trend = df_decomposed_trend.mask(df_nan, other=np.nan)
df_final_seasonal = df_decomposed_seasonal.mask(df_nan, other=np.nan)

# Same re-masking for the test split, using its own missing-value mask.
df_final_resid_test = df_decomposed_resid_test.mask(df_nan_test, other=np.nan)
df_final_trend_test = df_decomposed_trend_test.mask(df_nan_test, other=np.nan)
df_final_seasonal_test = df_decomposed_seasonal_test.mask(df_nan_test, other=np.nan)

In [10]:
# Residual component: impute the re-masked train/test frames with meanImputation.
df_white_noise_train, df_white_noise_test = meanImputation.input(
    df_final_resid,
    df_final_resid_test,
)

In [11]:
# Seasonal component: impute the re-masked train/test frames with splineInterpolation.
df_seasonal_train, df_seasonal_test = splineInterpolation.input(
    df_final_seasonal,
    df_final_seasonal_test,
)

In [12]:
# Trend component: impute the re-masked train/test frames with forwardFilling.
df_trended_train, df_trended_test = forwardFilling.input(
    df_final_trend,
    df_final_trend_test,
)

In [13]:
# Rebuild each split by summing its imputed residual, seasonal and trend components.
df_final_train = df_white_noise_train.add(df_seasonal_train).add(df_trended_train)
df_final_test = df_white_noise_test.add(df_seasonal_test).add(df_trended_test)

In [14]:
# Display the recombined training frame (sum of the imputed residual, seasonal and trend parts).
df_final_train

Unnamed: 0,sp500_Close,sp500_Volume,vix_Close,vix_Volume,dji_Close,dji_Volume,ndx_Close,ndx_Volume,n225_Close,n225_Volume,...,hsi_Close,hsi_Volume,n100_Close,n100_Volume,Overall EMV Tracker,infectious_daily_infect_emv_index,GPR,trade_US Trade Policy Uncertainty,trade_Japanese Trade Policy Uncertainty,trade_Trade Policy EMV Fraction
2000-01-07,1302.759115,1.053365e+08,20.313193,0.0,9643.063345,1.589524e+08,3335.360666,1.387053e+09,12919.706843,-50297.881620,...,13138.976698,-3.586002e+08,945.772328,-9.200717e+07,21.353774,0.708640,79.894781,72.070390,107.529366,0.021449
2000-01-14,1392.538401,3.368326e+08,21.674478,0.0,10440.301008,1.687793e+08,3514.037333,1.582127e+09,13623.026315,-37682.634367,...,14250.868194,-2.666304e+08,986.891974,-8.010566e+07,21.211218,0.521497,79.709465,71.560113,106.791898,0.021185
2000-01-21,1479.006707,1.132773e+09,22.117307,0.0,11202.079570,2.030456e+08,3667.996984,1.921738e+09,15692.398811,1802.667831,...,17453.691286,1.886116e+08,1086.506214,-1.448692e+07,21.088289,0.705069,79.528987,71.121006,106.159413,0.020958
2000-01-28,1469.721456,1.486310e+09,22.997761,0.0,11102.617134,2.625860e+08,3653.057405,2.042156e+09,16731.444549,21800.195303,...,15221.359112,2.656471e+08,1079.020054,3.010646e+07,20.984220,0.493640,79.353310,70.750115,105.627189,0.020767
2000-02-04,1465.731384,1.504249e+09,23.736854,0.0,11118.321386,2.719860e+08,3627.539720,1.990107e+09,16646.704492,21401.981018,...,17300.878223,4.233838e+08,1077.763867,5.152582e+07,20.898242,0.422747,79.182396,70.444486,105.190500,0.020609
2000-02-11,1462.635332,1.438858e+09,23.753241,0.0,11104.459008,2.225883e+08,3605.272494,2.031833e+09,16167.023997,27471.257045,...,17110.575915,2.657649e+08,1076.131318,1.729532e+07,20.829589,0.316346,79.016207,70.201163,104.844624,0.020484
2000-02-18,1464.446933,1.466994e+09,23.563709,0.0,11129.253869,2.165908e+08,3611.575601,1.950463e+09,15748.600274,22075.538710,...,11196.034489,-3.227866e+08,1076.210174,3.002427e+07,20.777493,0.384938,78.854707,70.017193,104.584835,0.020390
2000-02-25,1463.647242,1.050740e+09,23.215673,0.0,11136.204302,2.213977e+08,3607.017127,1.833320e+09,15840.977662,1869.700798,...,17057.467708,-4.741663e+07,1073.182974,1.770158e+07,20.741184,0.476181,78.697856,69.889620,104.406409,0.020325
2000-03-03,1469.314120,1.030734e+09,22.767774,0.0,11172.568665,2.405142e+08,3609.419957,1.902344e+09,16631.880889,22187.077171,...,16993.181924,-1.022106e+08,1078.792393,3.835954e+06,20.719896,0.394080,78.545618,69.815490,104.304623,0.020287
2000-03-10,1460.723057,1.354553e+09,23.633441,0.0,11094.399635,2.623964e+08,3593.290980,1.950526e+09,16740.021672,11798.752996,...,16940.550720,7.825305e+06,1074.728243,1.546442e+07,20.712861,0.379918,78.397955,69.791849,104.274753,0.020276


In [15]:
### USE KALMAN FILTER TO SMOOTH ALL DATA (ONLY VISUALIZATION PURPOSE) ###
# NOTE(review): despite the header, the smoothed values are later wrapped in
# DataFrames, subsampled and written to CSV -- confirm whether that is intended.
# NOTE(review): n_longseasons=365 looks like a daily-data setting while the index
# appears to be weekly -- confirm.

def _new_kalman_smoother():
    """Build a KalmanSmoother with the level + long-season setup shared by both splits."""
    return KalmanSmoother(
        component='level_longseason',
        component_noise={'level': 0.1, 'longseason': 0.1},
        n_longseasons=365,
    )

# Each frame is transposed before smoothing, so every column is passed as one row.
smoother_train = _new_kalman_smoother()
smoother_train.smooth(df_final_train.T)

smoother_test = _new_kalman_smoother()
smoother_test.smooth(df_final_test.T)

<tsmoothie.smoother.KalmanSmoother>

In [16]:
# Wrap the smoothed arrays (transposed back to rows = dates) in frames that reuse
# the original index and column labels.
# NOTE: this rebinds df_filled / df_filled_test, which previously held the
# zero-filled inputs of the decomposition.
df_filled = pd.DataFrame(
    smoother_train.smooth_data.T,
    index=df_train.index,
    columns=df_train.columns,
)
df_filled_test = pd.DataFrame(
    smoother_test.smooth_data.T,
    index=df_test.index,
    columns=df_test.columns,
)

In [17]:
# Display the Kalman-smoothed training frame.
df_filled

Unnamed: 0,sp500_Close,sp500_Volume,vix_Close,vix_Volume,dji_Close,dji_Volume,ndx_Close,ndx_Volume,n225_Close,n225_Volume,...,hsi_Close,hsi_Volume,n100_Close,n100_Volume,Overall EMV Tracker,infectious_daily_infect_emv_index,GPR,trade_US Trade Policy Uncertainty,trade_Japanese Trade Policy Uncertainty,trade_Trade Policy EMV Fraction
2000-01-07,1380.426986,6.434459e+08,21.408708,0.0,10347.846457,1.892585e+08,3478.821666,1.651520e+09,14293.742325,-20599.037444,...,14614.797819,-1.109422e+08,1000.661368,-4.607964e+07,20.928042,0.590220,79.011763,70.613827,105.517056,0.020852
2000-01-14,1402.447219,7.594384e+08,21.784139,0.0,10539.453272,1.966982e+08,3520.959260,1.714725e+09,14645.822150,-14796.524443,...,14983.388198,-6.034297e+07,1017.149159,-3.660867e+07,20.971787,0.567612,79.157177,70.746507,105.680419,0.020864
2000-01-21,1426.328260,9.599312e+08,22.180850,0.0,10750.174953,2.097164e+08,3564.016076,1.804350e+09,15201.088208,-4413.478391,...,15497.052958,3.152078e+07,1039.593171,-1.843868e+07,20.966970,0.554178,79.188670,70.714467,105.617220,0.020811
2000-01-28,1439.544104,1.125756e+09,22.589475,0.0,10869.725672,2.240585e+08,3585.794759,1.870361e+09,15656.506444,4725.502109,...,15617.783237,9.193822e+07,1052.551303,-1.066798e+06,20.937244,0.510525,79.148716,70.599233,105.441477,0.020729
2000-02-04,1446.589877,1.219303e+09,22.915509,0.0,10941.862611,2.306793e+08,3593.629387,1.901851e+09,15895.168969,10444.844353,...,15816.114065,1.175594e+08,1060.107111,1.005535e+07,20.897522,0.470226,79.064572,70.452092,105.224700,0.020638
2000-02-11,1449.670551,1.255651e+09,23.076237,0.0,10977.850231,2.290204e+08,3594.187025,1.915516e+09,15981.656615,13965.466365,...,15715.758361,8.195206e+07,1064.020250,1.286352e+07,20.857101,0.439417,78.953726,70.304909,105.011102,0.020553
2000-02-18,1450.021087,1.255132e+09,23.100487,0.0,10987.652512,2.286301e+08,3592.032582,1.905738e+09,16029.167162,14776.151462,...,15334.759371,9.532826e+06,1065.398763,1.476365e+07,20.821676,0.433229,78.827392,70.177084,104.827372,0.020482
2000-02-25,1447.350202,1.212019e+09,23.031036,0.0,10968.277410,2.306302e+08,3585.476085,1.886844e+09,16130.870227,14117.771049,...,15779.923387,3.552980e+06,1064.502663,1.358929e+07,20.794627,0.436711,78.692753,70.080004,104.688939,0.020428
2000-03-03,1441.285913,1.200954e+09,22.923661,0.0,10914.479314,2.344587e+08,3574.121269,1.878490e+09,16288.596187,15899.864473,...,15967.889184,7.749685e+06,1061.759233,1.157047e+07,20.777847,0.432311,78.554408,70.019916,104.603998,0.020395
2000-03-10,1429.485880,1.223733e+09,22.846545,0.0,10808.261368,2.370571e+08,3555.222187,1.865208e+09,16375.659250,16414.904334,...,15949.029817,3.391863e+07,1055.499927,1.107726e+07,20.772276,0.435569,78.415289,69.999756,104.576091,0.020384


In [18]:
# Subsample to weekly frequency

# Generating weekly sampled dataset
def generate_weekly(df, initial_friday):
    """Return the rows of ``df`` that fall on ``initial_friday`` and every 7 days after it.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame indexed by dates; its last index label bounds the selection (inclusive).
    initial_friday : datetime.datetime
        First date to select, e.g. ``datetime(2000, 1, 7)``.

    Returns
    -------
    pandas.DataFrame
        The selected rows with the same columns as ``df``. The result is empty
        when ``df`` is empty or ``initial_friday`` lies after the last index label.

    Raises
    ------
    KeyError
        If one of the expected dates is missing from ``df``'s index.
    """
    # Nothing to select from; also avoids indexing the last label of an empty index.
    if len(df.index) == 0:
        return pd.DataFrame(columns=df.columns)

    # Build all target dates at once and select them in a single lookup. The previous
    # loop grew the result with DataFrame.append, which was removed in pandas 2.0
    # and re-copied the whole frame on every iteration.
    fridays = pd.date_range(start=initial_friday, end=df.index[-1], freq='7D')
    return df.loc[fridays]

In [19]:
# Take one row every 7 days from each smoothed split, starting at its first sampling date.
df_train_weekly = generate_weekly(df_filled, datetime(year=2000, month=1, day=7))
df_test_weekly = generate_weekly(df_filled_test, datetime(year=2015, month=1, day=2))

In [20]:
# Persist the weekly train/test frames, keeping the date index as the first CSV column.
# The previous raw literals (r'output\/...') embedded a backslash in the path, which
# only resolves on Windows; a forward slash targets the same file there and is portable.
df_train_weekly.to_csv('output/CompleteIndexesWeeklyTrainDecompKalman.csv', index = True)
df_test_weekly.to_csv('output/CompleteIndexesWeeklyTestDecompKalman.csv', index = True)

In [21]:
# Preview up to the first 50 rows of the weekly training subsample.
df_train_weekly.head(50)

Unnamed: 0,sp500_Close,sp500_Volume,vix_Close,vix_Volume,dji_Close,dji_Volume,ndx_Close,ndx_Volume,n225_Close,n225_Volume,...,hsi_Close,hsi_Volume,n100_Close,n100_Volume,Overall EMV Tracker,infectious_daily_infect_emv_index,GPR,trade_US Trade Policy Uncertainty,trade_Japanese Trade Policy Uncertainty,trade_Trade Policy EMV Fraction
2000-01-07,1380.426986,643445900.0,21.408708,0.0,10347.846457,189258500.0,3478.821666,1651520000.0,14293.742325,-20599.037444,...,14614.797819,-110942200.0,1000.661368,-46079640.0,20.928042,0.59022,79.011763,70.613827,105.517056,0.020852
2000-01-14,1402.447219,759438400.0,21.784139,0.0,10539.453272,196698200.0,3520.95926,1714725000.0,14645.82215,-14796.524443,...,14983.388198,-60342970.0,1017.149159,-36608670.0,20.971787,0.567612,79.157177,70.746507,105.680419,0.020864
2000-01-21,1426.32826,959931200.0,22.18085,0.0,10750.174953,209716400.0,3564.016076,1804350000.0,15201.088208,-4413.478391,...,15497.052958,31520780.0,1039.593171,-18438680.0,20.96697,0.554178,79.18867,70.714467,105.61722,0.020811
2000-01-28,1439.544104,1125756000.0,22.589475,0.0,10869.725672,224058500.0,3585.794759,1870361000.0,15656.506444,4725.502109,...,15617.783237,91938220.0,1052.551303,-1066798.0,20.937244,0.510525,79.148716,70.599233,105.441477,0.020729
2000-02-04,1446.589877,1219303000.0,22.915509,0.0,10941.862611,230679300.0,3593.629387,1901851000.0,15895.168969,10444.844353,...,15816.114065,117559400.0,1060.107111,10055350.0,20.897522,0.470226,79.064572,70.452092,105.2247,0.020638
2000-02-11,1449.670551,1255651000.0,23.076237,0.0,10977.850231,229020400.0,3594.187025,1915516000.0,15981.656615,13965.466365,...,15715.758361,81952060.0,1064.02025,12863520.0,20.857101,0.439417,78.953726,70.304909,105.011102,0.020553
2000-02-18,1450.021087,1255132000.0,23.100487,0.0,10987.652512,228630100.0,3592.032582,1905738000.0,16029.167162,14776.151462,...,15334.759371,9532826.0,1065.398763,14763650.0,20.821676,0.433229,78.827392,70.177084,104.827372,0.020482
2000-02-25,1447.350202,1212019000.0,23.031036,0.0,10968.27741,230630200.0,3585.476085,1886844000.0,16130.870227,14117.771049,...,15779.923387,3552980.0,1064.502663,13589290.0,20.794627,0.436711,78.692753,70.080004,104.688939,0.020428
2000-03-03,1441.285913,1200954000.0,22.923661,0.0,10914.479314,234458700.0,3574.121269,1878490000.0,16288.596187,15899.864473,...,15967.889184,7749685.0,1061.759233,11570470.0,20.777847,0.432311,78.554408,70.019916,104.603998,0.020395
2000-03-10,1429.48588,1223733000.0,22.846545,0.0,10808.261368,237057100.0,3555.222187,1865208000.0,16375.65925,16414.904334,...,15949.029817,33918630.0,1055.499927,11077260.0,20.772276,0.435569,78.415289,69.999756,104.576091,0.020384
