In [117]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd
import numpy as np
import datetime
from sklearn.pipeline import FeatureUnion

# Split into training and testing datasets

In [118]:
# Load the dataset from a pickle file given by a path relative to the working directory.
# NOTE(review): unpickling can execute arbitrary code, so data.pkl must come from
# a trusted source — confirm its provenance before running this cell.
data=pd.read_pickle('data.pkl')

In [119]:
# Chronological hold-out split on the index: rows dated on or before the cutoff
# form the training set, rows dated strictly after it form the test set.
SPLIT_DATE = datetime.datetime(2015, 5, 4)
train = data[data.index <= SPLIT_DATE]
test = data[data.index > SPLIT_DATE]

In [120]:
# Preview the first five training rows as a quick check on the split.
train.head(n=5)

Unnamed: 0_level_0,CPI,NOS,PCR,OIL,SI,BDI,resid,IC,TERM,VRP
AsOfDate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1990-06-08,0.046737,0.016941,-1.622923,,,1278.0,-0.013576,,0.5,7.197727
1990-06-11,0.046737,0.016941,-1.615366,,,1264.0,-0.013576,,0.49,5.607431
1990-06-12,0.046737,0.016941,-1.618868,,,1254.0,-0.013576,,0.49,6.403252
1990-06-13,0.046737,0.016941,-1.625316,,,1243.0,-0.013576,,0.47,7.793147
1990-06-14,0.046737,0.016941,-1.620454,,,1232.0,-0.013576,,0.46,7.665179


# Preprocess

In [121]:
# Transformer used as the first step of each pipeline to pick its columns.
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Select ``attribute_names`` from the input using ``[]`` indexing.

    Parameters
    ----------
    attribute_names
        Key(s) passed to ``X[...]`` in :meth:`transform`. The notebook passes
        lists of column names.
    """

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Learn nothing; return ``self`` so the estimator can be chained."""
        return self

    def transform(self, X):
        """Return ``X[self.attribute_names]``."""
        selected = X[self.attribute_names]
        return selected
#de-median
class DeMedian(BaseEstimator, TransformerMixin):
    """Subtract from each column the median learned in :meth:`fit`.

    Attributes
    ----------
    median
        ``0`` until :meth:`fit` is called, then the per-column median of the
        data passed to :meth:`fit`.
    """
    def __init__(self): # no *args or **kargs
        # Placeholder: transform() before fit() subtracts 0, i.e. is a no-op.
        self.median=0
    def fit(self, X, y=None):
        """Store the column medians of ``X``.

        Parameters
        ----------
        X
            pandas object (anything exposing ``.median()``) or array-like.
        y
            Ignored. Accepted because ``Pipeline.fit`` and ``FeatureUnion.fit``
            call ``fit(X, y)`` positionally; without it they raise ``TypeError``.

        Returns
        -------
        self
        """
        if hasattr(X, "median"):
            # pandas objects: .median() skips NaN by default.
            self.median = X.median()
        else:
            # Plain arrays/lists have no .median(); use the NaN-skipping
            # NumPy equivalent along the row axis to match the pandas result.
            self.median = np.nanmedian(np.asarray(X, dtype=float), axis=0)
        return self
    def transform(self, X):
        """Return ``X`` with the stored median subtracted."""
        return X-self.median
#take log
class Log(BaseEstimator, TransformerMixin):
    """Stateless transformer that applies ``np.log`` element-wise.

    It has no hyperparameters, so no ``__init__`` is defined; ``Log()`` is
    still constructed with no arguments.
    """
    def fit(self, X, y=None):
        """Learn nothing; return ``self``.

        ``y`` is ignored. It is accepted because ``Pipeline.fit`` and
        ``FeatureUnion.fit`` call ``fit(X, y)`` positionally; without it they
        raise ``TypeError``.
        """
        return self
    def transform(self, X):
        """Return the natural logarithm of ``X``.

        Input validation is left to ``np.log``; values are not checked for
        positivity here.
        """
        return np.log(X)

In [122]:
# Column groups, by the preprocessing each one receives.
demedian_list = ['VRP', 'IC']   # subtract the median
log_median_list = ['BDI']       # take the log, then subtract the median
# Every remaining column of the training frame is passed through unchanged.
specially_handled = demedian_list + log_median_list
other_list = [column for column in train.columns if column not in specially_handled]

demedian_pipeline = Pipeline(steps=[
    ('selector', DataFrameSelector(demedian_list)),
    ('demedian', DeMedian()),
])
log_median_pipeline = Pipeline(steps=[
    ('selector', DataFrameSelector(log_median_list)),
    ('log', Log()),
    ('demedian', DeMedian()),
])
other_pipeline = Pipeline(steps=[
    ('selector', DataFrameSelector(other_list)),
])

# Branch outputs are joined column-wise in the order listed here.
# The "demedian_pipline" spelling is kept because it is the registered step name.
full_pipeline = FeatureUnion([
    ("demedian_pipline", demedian_pipeline),
    ("log_median_pipeline", log_median_pipeline),
    ("other_pipeline", other_pipeline),
])

In [123]:
# Fit all pipeline branches on the training data and transform it in one call.
# The result is displayed below as a NumPy array, so column labels are not kept.
train_prepared=full_pipeline.fit_transform(train)

In [124]:
# Display the prepared array; the repr below abbreviates it with "...".
train_prepared

array([[ 0.73708797,         nan, -0.1528084 , ...,         nan,
        -0.01357599,  0.5       ],
       [-0.85320834,         nan, -0.16382346, ...,         nan,
        -0.01357599,  0.49      ],
       [-0.0573871 ,         nan, -0.17176631, ...,         nan,
        -0.01357599,  0.49      ],
       ...,
       [-0.64222115, -4.3       , -0.92404402, ...,  0.98136784,
        -0.01269891,  2.04      ],
       [-2.57190703, -6.98      , -0.93083521, ...,  0.53942159,
        -0.01269891,  2.11      ],
       [-2.33401527, -5.21      , -0.93083521, ...,  0.48065865,
        -0.01269891,  2.14      ]])

# Tables

In [125]:
# Re-attach labels to the bare array returned by the pipeline.
# Column order must mirror the FeatureUnion branch order: de-medianed columns,
# then log + de-medianed columns, then the untouched remainder.
prepared_columns = demedian_list + log_median_list + other_list
# Reuse the training frame's index so each prepared row keeps its AsOfDate
# label; without index= pandas would assign a fresh 0..n-1 RangeIndex.
df_prepared = pd.DataFrame(train_prepared, index=train.index, columns=prepared_columns)

## Table 1: Summary statistics

In [136]:
# Descriptive statistics of the prepared training features.
summary = df_prepared.describe()
# Extra rows appended below the describe() output, in this order.
# 'Stdev' repeats the value describe() already reports in its 'std' row.
extra_rows = (
    ('Stdev', df_prepared.std),
    ('Skewness', df_prepared.skew),
    ('Kurtosis', df_prepared.kurt),
)
for row_label, statistic in extra_rows:
    summary.loc[row_label] = statistic()
summary

Unnamed: 0,VRP,IC,BDI,CPI,NOS,PCR,OIL,SI,resid,TERM
count,6275.0,1841.0,6275.0,6275.0,6275.0,6275.0,6101.0,499.0,6275.0,6272.0
mean,0.43035,-0.450951,0.15752,0.025519,-0.016805,-1.36111,0.239927,0.537009,0.00523,1.865515
std,3.458404,10.93876,0.617765,0.012576,0.038633,0.363751,0.390506,0.234051,0.025816,1.161928
min,-24.511136,-39.68,-1.073412,-0.019588,-0.181167,-2.260664,-0.818794,0.105541,-0.052577,-0.95
25%,-1.740792,-7.35,-0.278546,0.017475,-0.039262,-1.609558,0.000371,0.378742,-0.009389,0.91
50%,0.0,0.0,0.0,0.026211,-0.017843,-1.412,0.211595,0.493194,0.004489,2.0
75%,2.131014,5.98,0.461827,0.031899,0.002775,-1.14089,0.510608,0.63271,0.018684,2.79
max,23.839293,46.33,2.069401,0.063796,0.196459,-0.3331,1.172076,1.636367,0.091062,3.87
Stdev,3.458404,10.93876,0.617765,0.012576,0.038633,0.363751,0.390506,0.234051,0.025816,1.161928
Skewness,0.703677,0.13775,0.950532,-0.162354,0.143467,0.648741,0.027659,1.497838,0.601011,-0.290813


## Table 2: Correlation matrix

In [128]:
# Pairwise Pearson correlations between the prepared training features.
df_prepared.corr(method='pearson')

Unnamed: 0,VRP,IC,BDI,CPI,NOS,PCR,OIL,SI,resid,TERM
VRP,1.0,0.351058,-0.108147,-0.098929,-0.209388,0.031323,-0.174315,0.454565,0.075844,0.092356
IC,0.351058,1.0,-0.409923,-0.162414,-0.235873,0.040777,-0.430414,0.061569,0.341942,0.398803
BDI,-0.108147,-0.409923,1.0,0.249523,0.224709,-0.442579,0.46645,-0.052131,0.302608,-0.071969
CPI,-0.098929,-0.162414,0.249523,1.0,0.390285,-0.53457,0.396217,0.004746,-0.324996,-0.23853
NOS,-0.209388,-0.235873,0.224709,0.390285,1.0,-0.141374,0.37283,0.024798,-0.161129,-0.238081
PCR,0.031323,0.040777,-0.442579,-0.53457,-0.141374,1.0,-0.33052,0.010744,-0.313343,-0.23859
OIL,-0.174315,-0.430414,0.46645,0.396217,0.37283,-0.33052,1.0,-0.033448,0.013124,-0.231822
SI,0.454565,0.061569,-0.052131,0.004746,0.024798,0.010744,-0.033448,1.0,-0.069644,-0.053362
resid,0.075844,0.341942,0.302608,-0.324996,-0.161129,-0.313343,0.013124,-0.069644,1.0,0.542635
TERM,0.092356,0.398803,-0.071969,-0.23853,-0.238081,-0.23859,-0.231822,-0.053362,0.542635,1.0
