In [1]:
from imputationLibrary import decompose, forwardFilling, hotDeck, meanImputation, movingAverage, splineInterpolation, randomSampleImputation
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy import signal
from sklearn import preprocessing
from datetime import datetime, timedelta
from tsmoothie.smoother import *

In [2]:
# Base path (without extension) of the weekly index table read by the load cell.
# Uses a plain forward slash: the previous 'output/\C...' contained an invalid
# escape sequence and a stray backslash, so it only resolved on Windows.
INPUT = 'output/CompleteWeeklyIndexes'

In [3]:
# Load the weekly index table; the first CSV column becomes the (date-parsed) index.
df = pd.read_csv(INPUT + '.csv', index_col=0, parse_dates=True)

# Chronological split by index label: everything up to 2015-01-01 is training
# data, everything from 2015-01-02 onwards is test data.
df_train = df.loc['2000-01-01':'2015-01-01']
df_test = df.loc['2015-01-02':]

# Zero-filled versions of both splits; these are what the decomposition cells consume.
df_filled = df_train.fillna(0)
df_filled_test = df_test.fillna(0)

In [4]:
# Empty per-component frames for the training split; the decomposition loop
# adds one column per input series to each of them.
df_decomposed_resid = pd.DataFrame()
df_decomposed_trend = pd.DataFrame()
df_decomposed_seasonal = pd.DataFrame()

# Same three containers for the test split.
df_decomposed_resid_test = pd.DataFrame()
df_decomposed_trend_test = pd.DataFrame()
df_decomposed_seasonal_test = pd.DataFrame()

# Boolean masks marking the cells that were missing in the raw data. They are
# taken before any filling so the gaps can be re-opened after decomposition.
# (A commented-out sketch of per-component masks used to live here; it wrote to
# an undefined name and was never used, so it has been dropped.)
df_nan = df_train.isna()
df_nan_test = df_test.isna()

In [5]:
def plot_ac(df, name):
    """Plot the normalised autocorrelation of a series with +/-2/sqrt(n) bounds.

    Parameters
    ----------
    df : pandas.Series
        Series to analyse. Missing values are replaced by 0 before the
        correlation is computed.
    name : str
        Label prepended to the plot title.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    values = np.asarray(df.fillna(0), dtype=float)
    n_obs = len(values)

    # Standardise the series as ONE feature with n samples. The previous code
    # wrapped the array in a list, giving shape (1, n); scale() then
    # standardised every column over a single sample and returned all zeros,
    # so the plotted autocorrelation was always a flat line.
    normalized = preprocessing.scale(values.reshape(-1, 1)).ravel()

    # Dividing by n turns the raw cross-products into correlation coefficients
    # (1.0 at lag 0), which is the scale the +/-2/sqrt(n) bounds refer to.
    corr = signal.correlate(normalized, normalized, mode='full') / n_obs
    lags = np.arange(-(n_obs - 1), n_obs)

    plt.plot(lags, corr, 'o-', markersize=2)
    plt.axhline(2/np.sqrt(n_obs), ls=':')
    plt.axhline(-2/np.sqrt(n_obs), ls=':')
    plt.xlabel('lag')
    plt.title(name + ' auto-correlation')
    plt.show()

In [6]:
def decompose_and_plot(ts, flag_plot = False):
    """Decompose a series and return its residual, trend and seasonal parts.

    Parameters
    ----------
    ts : pandas.Series
        Series handed to ``decompose.decompose``.
    flag_plot : bool, default False
        When true, also plot the decomposition, print the ``nature`` checks on
        the residual and plot the autocorrelation of each component.

    Returns
    -------
    tuple
        ``(resid, trend, seasonal)`` taken from the decomposition result.
    """
    ts_decomposed = decompose.decompose(ts)
    if flag_plot:
        # `nature` is used below but is not among the notebook's imports, so
        # this branch previously failed with NameError.
        # NOTE(review): assumed to live in imputationLibrary next to
        # `decompose` -- confirm the module path.
        from imputationLibrary import nature

        ts_decomposed.plot()
        plt.show()
        print("Resid is white noise? ", nature.isWhiteNoise(ts_decomposed.resid))
        print("Resid is seasonal noise? ", nature.isSeasonal(ts_decomposed.resid))
        print("Resid is trended noise? ", nature.isTrended(ts_decomposed.resid))
        print("Resid is seasonal and trended noise? ", nature.isTrendedAndSeasonal(ts_decomposed.resid))
        plot_ac(ts_decomposed.resid, "Resid")
        plot_ac(ts_decomposed.trend, "Trend")
        plot_ac(ts_decomposed.seasonal, "Seasonal")
    return ts_decomposed.resid, ts_decomposed.trend, ts_decomposed.seasonal
    

In [7]:
# Decompose every zero-filled training series and file each component under
# the original column name in its matching frame.
train_component_frames = (df_decomposed_resid, df_decomposed_trend, df_decomposed_seasonal)
for col, series in df_filled.items():
    components = decompose_and_plot(series)  # (resid, trend, seasonal)
    for frame, component in zip(train_component_frames, components):
        frame[col] = component

In [8]:
# Same decomposition for the test split: one residual / trend / seasonal
# column per input series.
test_component_frames = (df_decomposed_resid_test, df_decomposed_trend_test, df_decomposed_seasonal_test)
for col, series in df_filled_test.items():
    components = decompose_and_plot(series)  # (resid, trend, seasonal)
    for frame, component in zip(test_component_frames, components):
        frame[col] = component

In [9]:
# Re-open the original gaps: wherever the raw data was missing, the decomposed
# value (computed from a zero-filled input) is replaced by NaN so that each
# component can be imputed separately.
df_final_resid = df_decomposed_resid.mask(df_nan)
df_final_trend = df_decomposed_trend.mask(df_nan)
df_final_seasonal = df_decomposed_seasonal.mask(df_nan)

# Same masking for the test split, using the test-side missing-value mask.
df_final_resid_test = df_decomposed_resid_test.mask(df_nan_test)
df_final_trend_test = df_decomposed_trend_test.mask(df_nan_test)
df_final_seasonal_test = df_decomposed_seasonal_test.mask(df_nan_test)

In [10]:
# Residual component: hand the masked train and test frames to the project's
# meanImputation helper, which returns a (train, test) pair.
# NOTE(review): presumably fills the NaNs with a mean value -- confirm whether
# the test split is filled from training statistics.
df_white_noise_train, df_white_noise_test =  meanImputation.input(df_final_resid, df_final_resid_test)

In [11]:
# Seasonal component: hand the masked train and test frames to the project's
# splineInterpolation helper, which returns a (train, test) pair.
# NOTE(review): presumably interpolates the gaps with a spline -- confirm the
# order and the edge handling in the helper.
df_seasonal_train, df_seasonal_test = splineInterpolation.input(df_final_seasonal, df_final_seasonal_test)

In [12]:
# Trend component: hand the masked train and test frames to the project's
# forwardFilling helper, which returns a (train, test) pair.
# NOTE(review): presumably carries the last observed value forward -- confirm
# how leading gaps are handled.
df_trended_train, df_trended_test = forwardFilling.input(df_final_trend, df_final_trend_test)

In [13]:
# Recombine the three separately imputed components by element-wise addition
# to obtain the gap-free train and test frames.
# NOTE(review): this assumes an additive decomposition -- confirm against
# decompose.decompose.
df_final_train = df_white_noise_train + df_seasonal_train + df_trended_train
df_final_test = df_white_noise_test + df_seasonal_test + df_trended_test

In [14]:
# Preview the recombined (imputed) training frame; a bounded head keeps the
# rendered output small instead of displaying the whole frame.
df_final_train.head(10)

Unnamed: 0,sp500_Close,sp500_Volume,vix_Close,vix_Volume,dji_Close,dji_Volume,ndx_Close,ndx_Volume,n225_Close,n225_Volume,...,hsi_Close,hsi_Volume,n100_Close,n100_Volume,Overall EMV Tracker,infectious_daily_infect_emv_index,GPR,trade_US Trade Policy Uncertainty,trade_Japanese Trade Policy Uncertainty,trade_Trade Policy EMV Fraction
2000-01-07,1431.932646,3.031014e+08,22.570398,0.0,10816.675933,1.548123e+08,3586.881407,1.432710e+09,17188.782154,-21406.021766,...,16242.906597,-2.818552e+08,1042.028792,-7.058666e+07,16.140358,1.136238,-110.262523,31.331588,55.846886,0.007242
2000-01-14,1435.148573,3.024809e+08,22.639629,0.0,10827.929670,1.714830e+08,3602.407249,1.543878e+09,16161.755809,-26798.384403,...,16367.703556,-2.972095e+08,1046.013542,-8.287447e+07,16.311955,0.662445,-27.079003,38.325080,57.573267,0.011234
2000-01-21,1440.525857,1.029892e+09,22.065281,0.0,10839.458979,2.058204e+08,3625.594743,1.855342e+09,17253.873291,5221.817062,...,16525.255664,1.114493e+08,1050.418406,-1.853869e+07,16.713200,0.714108,34.370857,44.447182,60.156857,0.014532
2000-01-28,1433.677528,1.164465e+09,22.230324,0.0,10748.429743,2.287507e+08,3616.337331,1.929922e+09,17137.333726,17909.014864,...,16345.384885,1.696251e+08,1043.948590,1.163269e+07,17.292280,0.462206,76.801110,49.716085,63.375871,0.017196
2000-02-04,1426.321809,1.350010e+09,23.291117,0.0,10706.555429,2.506210e+08,3598.309555,1.940536e+09,17022.205888,10175.594535,...,16186.679641,1.803921e+08,1038.258474,3.599756e+07,17.997382,0.822584,102.925807,54.149977,67.008525,0.019283
2000-02-11,1426.205604,1.204788e+09,23.003351,0.0,10733.604949,2.139375e+08,3578.562110,1.843676e+09,17010.017341,7592.222282,...,16114.800673,5.843230e+07,1040.523292,1.053733e+07,18.776695,0.937660,115.458998,57.767049,70.833035,0.020853
2000-02-18,1422.236451,1.147484e+09,23.055603,0.0,10712.265940,2.046200e+08,3564.045927,1.842670e+09,16998.354957,14629.742886,...,16138.481873,5.321275e+07,1039.376059,1.499686e+07,19.578405,0.451994,117.114736,60.585491,74.627617,0.021962
2000-02-25,1420.085998,9.953825e+08,23.186741,0.0,10713.372821,2.048085e+08,3554.555219,1.782469e+09,16974.006185,19794.578051,...,15986.909341,-2.763798e+07,1037.322393,1.652448e+07,20.350699,0.500413,110.607072,62.623493,78.170487,0.022670
2000-03-03,1424.362131,8.268907e+08,22.811845,0.0,10735.154622,2.124979e+08,3561.444241,1.760846e+09,17041.926500,9147.558820,...,16015.986340,-2.005553e+08,1038.924380,-1.392092e+06,21.041765,0.518807,98.650057,63.899243,81.239860,0.023035
2000-03-10,1423.807927,1.029769e+09,22.987135,0.0,10725.354799,2.270570e+08,3558.591194,1.838785e+09,17188.372206,13943.822557,...,15986.560884,-1.191403e+08,1040.252369,2.117907e+07,21.599791,0.450210,83.957743,64.430933,83.613952,0.023115


In [15]:
### KALMAN-SMOOTH EVERY SERIES (LEVEL + LONG-SEASON MODEL) ###
# NOTE(review): the original header said "only visualization purpose", but the
# smoothed output is what later cells rebuild into frames and write to CSV.
# NOTE(review): the index is weekly, so n_longseasons=365 may have been meant
# for daily data -- confirm the intended season length.

def _new_kalman_smoother():
    """Return a fresh KalmanSmoother with the settings shared by both splits."""
    return KalmanSmoother(
        component='level_longseason',
        component_noise={'level': 0.1, 'longseason': 0.1},
        n_longseasons=365,
    )

# The frames are transposed before smoothing, so each column becomes one row.
smoother_train = _new_kalman_smoother()
smoother_train.smooth(df_final_train.T)

smoother_test = _new_kalman_smoother()
smoother_test.smooth(df_final_test.T)

<tsmoothie.smoother.KalmanSmoother>

In [16]:
# Turn the smoother output back into labelled frames: transpose it back to
# time-by-series and reattach the original index and column names.
# NOTE: this rebinds df_filled / df_filled_test, which earlier held the
# zero-filled inputs; re-running the decomposition cells after this one would
# therefore start from the smoothed data instead.
df_filled = pd.DataFrame(
    smoother_train.smooth_data.T,
    index=df_train.index,
    columns=df_train.columns,
)
df_filled_test = pd.DataFrame(
    smoother_test.smooth_data.T,
    index=df_test.index,
    columns=df_test.columns,
)

In [17]:
# Preview the Kalman-smoothed training frame; a bounded head keeps the
# rendered output small instead of displaying the whole frame.
df_filled.head(10)

Unnamed: 0,sp500_Close,sp500_Volume,vix_Close,vix_Volume,dji_Close,dji_Volume,ndx_Close,ndx_Volume,n225_Close,n225_Volume,...,hsi_Close,hsi_Volume,n100_Close,n100_Volume,Overall EMV Tracker,infectious_daily_infect_emv_index,GPR,trade_US Trade Policy Uncertainty,trade_Japanese Trade Policy Uncertainty,trade_Trade Policy EMV Fraction
2000-01-07,1421.900770,6.288783e+08,22.297591,0.0,10713.504249,1.829247e+08,3575.477734,1.622835e+09,16818.618359,-9046.908821,...,16183.878687,-1.266724e+08,1034.669935,-4.289938e+07,16.646474,0.827331,-17.366801,40.387866,59.540621,0.012198
2000-01-14,1426.197628,7.021191e+08,22.399208,0.0,10741.746309,1.898701e+08,3586.223670,1.671181e+09,16810.117793,-6802.144493,...,16238.019621,-9.476578e+07,1038.489302,-3.710442e+07,16.885609,0.766818,2.188999,42.642008,60.785507,0.013359
2000-01-21,1428.583145,8.552925e+08,22.452107,0.0,10752.037430,2.004889e+08,3593.262425,1.744903e+09,16929.562317,-555.925460,...,16264.631121,-2.235443e+07,1040.707889,-2.215454e+07,17.239439,0.727145,27.612028,45.759935,62.673060,0.014945
2000-01-28,1428.459039,9.734933e+08,22.581740,0.0,10744.136054,2.100335e+08,3593.361965,1.796429e+09,16982.387369,4534.574607,...,16237.524734,2.328519e+07,1040.888222,-7.932834e+06,17.698360,0.690065,51.686448,49.139570,65.063373,0.016613
2000-02-04,1427.171181,1.053396e+09,22.781004,0.0,10734.680544,2.158230e+08,3588.394036,1.821125e+09,17002.448342,6947.884523,...,16187.272441,3.962600e+07,1040.360871,2.364680e+06,18.238171,0.698556,70.730960,52.401690,67.789780,0.018164
2000-02-11,1425.934673,1.073842e+09,22.877550,0.0,10730.168772,2.146392e+08,3580.973094,1.821800e+09,17016.780283,8712.147046,...,16135.594640,2.777565e+07,1040.159247,5.920155e+06,18.825609,0.682243,83.321464,55.310607,70.669961,0.019491
2000-02-18,1424.527051,1.067952e+09,22.948228,0.0,10724.301220,2.135824e+08,3573.567961,1.817961e+09,17030.688481,10696.061609,...,16086.563075,9.759941e+06,1039.790874,8.534719e+06,19.422074,0.614858,89.464128,57.723510,73.513910,0.020543
2000-02-25,1423.462219,1.046016e+09,22.996725,0.0,10720.184698,2.143054e+08,3567.604371,1.809046e+09,17049.288196,11888.122551,...,16025.666661,-1.697191e+07,1039.412343,9.838240e+06,19.986288,0.580081,90.053844,59.558324,76.130375,0.021309
2000-03-03,1422.958748,1.034074e+09,23.006527,0.0,10716.786449,2.169152e+08,3563.790620,1.805317e+09,17081.167676,11493.327985,...,15971.074700,-4.158479e+07,1039.359473,9.785320e+06,20.476420,0.561290,86.510026,60.773702,78.333053,0.021800
2000-03-10,1422.061983,1.063443e+09,23.054596,0.0,10709.082238,2.203951e+08,3559.988216,1.810357e+09,17119.112321,11562.171857,...,15906.087014,-3.440878e+07,1039.301973,1.194845e+07,20.852101,0.551059,80.517072,61.357129,79.947762,0.022041


In [18]:
# Subsample to weekly

# Generating weekly sampled dataset
def generate_weekly(df, initial_friday):
    """Return the rows of ``df`` that fall on a 7-day grid starting at ``initial_friday``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame indexed by timestamps, in chronological order.
    initial_friday : datetime.datetime
        First date to keep; further dates are taken every 7 days up to and
        including the last index entry of ``df``.

    Returns
    -------
    pandas.DataFrame
        The selected rows with the same columns as ``df``. The result is empty
        when ``df`` is empty or ends before ``initial_friday``.

    Raises
    ------
    KeyError
        If one of the weekly dates is absent from ``df.index``.
    """
    # An empty frame has no "last date"; return an empty frame with the same columns.
    if df.empty:
        return df.iloc[0:0].copy()

    # Build the whole weekly grid at once and select it in a single lookup.
    # This replaces the row-by-row DataFrame.append loop, which was quadratic
    # and no longer exists in pandas >= 2.0.
    weekly_dates = pd.date_range(start=initial_friday, end=df.index[-1], freq='7D')
    if len(weekly_dates) == 0:
        return df.iloc[0:0].copy()
    return df.loc[weekly_dates]

In [19]:
# Resample both smoothed splits onto a 7-day grid, each starting at the first
# date of its split.
df_train_weekly = generate_weekly(df=df_filled, initial_friday=datetime(2000, 1, 7))
df_test_weekly = generate_weekly(df=df_filled_test, initial_friday=datetime(2015, 1, 2))

In [20]:
# Persist the weekly train/test frames, keeping the date index as the first column.
# The paths use a plain forward slash: the previous r'output\/...' form only
# resolved on Windows, because on POSIX the backslash becomes part of the
# directory name.
df_train_weekly.to_csv('output/CompleteIndexesWeeklyTrainDecompKalman.csv', index = True)
df_test_weekly.to_csv('output/CompleteIndexesWeeklyTestDecompKalman.csv', index = True)

In [21]:
# Show the first 50 weekly rows of the exported training frame.
df_train_weekly.iloc[:50]

Unnamed: 0,sp500_Close,sp500_Volume,vix_Close,vix_Volume,dji_Close,dji_Volume,ndx_Close,ndx_Volume,n225_Close,n225_Volume,...,hsi_Close,hsi_Volume,n100_Close,n100_Volume,Overall EMV Tracker,infectious_daily_infect_emv_index,GPR,trade_US Trade Policy Uncertainty,trade_Japanese Trade Policy Uncertainty,trade_Trade Policy EMV Fraction
2000-01-07,1421.90077,628878300.0,22.297591,0.0,10713.504249,182924700.0,3575.477734,1622835000.0,16818.618359,-9046.908821,...,16183.878687,-126672400.0,1034.669935,-42899380.0,16.646474,0.827331,-17.366801,40.387866,59.540621,0.012198
2000-01-14,1426.197628,702119100.0,22.399208,0.0,10741.746309,189870100.0,3586.22367,1671181000.0,16810.117793,-6802.144493,...,16238.019621,-94765780.0,1038.489302,-37104420.0,16.885609,0.766818,2.188999,42.642008,60.785507,0.013359
2000-01-21,1428.583145,855292500.0,22.452107,0.0,10752.03743,200488900.0,3593.262425,1744903000.0,16929.562317,-555.92546,...,16264.631121,-22354430.0,1040.707889,-22154540.0,17.239439,0.727145,27.612028,45.759935,62.67306,0.014945
2000-01-28,1428.459039,973493300.0,22.58174,0.0,10744.136054,210033500.0,3593.361965,1796429000.0,16982.387369,4534.574607,...,16237.524734,23285190.0,1040.888222,-7932834.0,17.69836,0.690065,51.686448,49.13957,65.063373,0.016613
2000-02-04,1427.171181,1053396000.0,22.781004,0.0,10734.680544,215823000.0,3588.394036,1821125000.0,17002.448342,6947.884523,...,16187.272441,39626000.0,1040.360871,2364680.0,18.238171,0.698556,70.73096,52.40169,67.78978,0.018164
2000-02-11,1425.934673,1073842000.0,22.87755,0.0,10730.168772,214639200.0,3580.973094,1821800000.0,17016.780283,8712.147046,...,16135.59464,27775650.0,1040.159247,5920155.0,18.825609,0.682243,83.321464,55.310607,70.669961,0.019491
2000-02-18,1424.527051,1067952000.0,22.948228,0.0,10724.30122,213582400.0,3573.567961,1817961000.0,17030.688481,10696.061609,...,16086.563075,9759941.0,1039.790874,8534719.0,19.422074,0.614858,89.464128,57.72351,73.51391,0.020543
2000-02-25,1423.462219,1046016000.0,22.996725,0.0,10720.184698,214305400.0,3567.604371,1809046000.0,17049.288196,11888.122551,...,16025.666661,-16971910.0,1039.412343,9838240.0,19.986288,0.580081,90.053844,59.558324,76.130375,0.021309
2000-03-03,1422.958748,1034074000.0,23.006527,0.0,10716.786449,216915200.0,3563.79062,1805317000.0,17081.167676,11493.327985,...,15971.0747,-41584790.0,1039.359473,9785320.0,20.47642,0.56129,86.510026,60.773702,78.333053,0.0218
2000-03-10,1422.061983,1063443000.0,23.054596,0.0,10709.082238,220395100.0,3559.988216,1810357000.0,17119.112321,11562.171857,...,15906.087014,-34408780.0,1039.301973,11948450.0,20.852101,0.551059,80.517072,61.357129,79.947762,0.022041


In [22]:
# Preview the raw (un-imputed) input for comparison with the imputed output; a
# bounded head keeps the rendered output small instead of displaying the whole frame.
df.head(10)

Unnamed: 0,sp500_Close,sp500_Volume,vix_Close,vix_Volume,dji_Close,dji_Volume,ndx_Close,ndx_Volume,n225_Close,n225_Volume,...,hsi_Close,hsi_Volume,n100_Close,n100_Volume,Overall EMV Tracker,infectious_daily_infect_emv_index,GPR,trade_US Trade Policy Uncertainty,trade_Japanese Trade Policy Uncertainty,trade_Trade Policy EMV Fraction
2000-01-07,1420.333984,1.068760e+09,25.016000,0.0,11250.781836,1.825620e+08,3542.894043,1.598166e+09,18476.772461,0.000000,...,16169.606250,0.000000e+00,949.868006,0.000000e+00,,0.252857,,,,
2000-01-14,1448.648023,1.033940e+09,21.684000,0.0,11587.958008,1.962560e+08,3611.343994,1.609134e+09,18829.544922,0.000000,...,15720.128125,0.000000e+00,957.357996,0.000000e+00,,0.214286,,,,
2000-01-21,1449.492493,1.113750e+09,21.447500,0.0,11413.272461,1.975950e+08,3810.092468,1.753105e+09,19083.530078,0.000000,...,15392.563867,0.000000e+00,958.146008,0.000000e+00,,0.110000,,,,
2000-01-28,1394.874023,1.106420e+09,23.960000,0.0,10967.587890,1.956360e+08,3616.111963,1.772902e+09,19141.585938,0.000000,...,15560.411914,0.000000e+00,948.610010,0.000000e+00,24.412899,0.490000,,71.914247,117.577146,0.033573
2000-02-04,1412.439990,1.041000e+09,23.014001,0.0,10992.404102,1.768760e+08,3744.364014,1.581506e+09,19618.308203,0.000000,...,15736.035157,0.000000e+00,969.562000,0.000000e+00,,0.741429,34.391162,,,
2000-02-11,1416.323975,1.020160e+09,22.886000,0.0,10726.277930,1.666060e+08,4004.691992,1.784898e+09,19883.024902,0.000000,...,16818.415528,0.000000e+00,1012.094006,0.000000e+00,,0.311429,,,,
2000-02-18,1382.802002,1.023060e+09,23.996000,0.0,10506.685938,1.837200e+08,4014.451953,1.799752e+09,19620.780078,0.000000,...,16900.180469,0.000000e+00,999.685999,0.000000e+00,,0.235714,,,,
2000-02-25,1349.912506,1.063475e+09,24.832500,0.0,10121.330078,2.031000e+08,4142.712402,1.858660e+09,19568.640234,0.000000,...,16642.794141,0.000000e+00,986.443994,0.000000e+00,,0.315714,,,,
2000-03-03,1376.918018,1.170760e+09,21.992000,0.0,10167.401953,2.076680e+08,4283.041895,2.078572e+09,19950.787500,0.000000,...,17043.903906,0.000000e+00,1018.281995,0.000000e+00,19.913437,0.561429,54.098793,50.352413,58.499365,0.026471
2000-03-10,1382.071972,1.161580e+09,22.616000,0.0,9952.522266,2.117760e+08,4493.422070,2.038220e+09,19784.024219,0.000000,...,17808.887500,0.000000e+00,1039.794019,0.000000e+00,,0.348571,,,,
