In [22]:
import numpy as np
import pandas as pd
import scipy.sparse
import xgboost as xgb
import mix_pandas as mix
import predict as predict_mix
import db_column_name as db

%matplotlib inline
import matplotlib.pyplot as plt
%config InlineBackend.figure_format = 'svg'

from pylab import rcParams
rcParams['figure.figsize'] = 10, 7


cn = db.ColumnName()

target_minT = pd.read_csv('./data/31286_103.csv')
mix.set_index_date(target_minT, cn.date)

X = pd.read_csv('./data/character_31286.csv')
mix.set_index_date(X, cn.date)

X = X.drop([cn.point], axis=1)
X = X[[x for x in X.columns if 'avg' in x or 
       x == cn.offset]]
    
X = mix.mean_day(X)
target_minT.index = target_minT.index.round('D')

X = X.drop([cn.offset], axis=1)

target_minT = target_minT.reindex(X.index)
target_minT = mix.clean(target_minT)
X = X.reindex(target_minT.index)
X = mix.clean(X)

target_minT = mix.winsorized(target_minT, cn.value, [0.05, 0.95])
X = X.reindex(target_minT.index)
print(X.shape)


(1517, 54)


In [76]:
from sklearn.feature_selection import SelectFromModel

params = {
    'verbosity':0,
    'max_depth': 4,
}
reg = xgb.XGBRegressor(**params)

predict = predict_mix.predict_model_split(reg, X, target_minT, cn.value, 5)

thresholds = np.sort(reg.feature_importances_)
print(thresholds)

kbestThresh = thresholds[-6]
print(kbestThresh)

selection = SelectFromModel(reg, threshold=kbestThresh, prefit=True)
X_select = pd.DataFrame(selection.transform(X), index=X.index)
X_select.head()

[0.         0.         0.         0.         0.00065515 0.00130412
 0.00139251 0.00155481 0.00172957 0.00192759 0.00198138 0.00230405
 0.00269075 0.00274006 0.00274344 0.00284488 0.00286665 0.00288078
 0.00294297 0.00304642 0.00308306 0.00316551 0.00339382 0.00343858
 0.00364127 0.00368519 0.00385468 0.00400265 0.00407761 0.00424604
 0.00443598 0.00444396 0.00457574 0.00473974 0.00477609 0.00493203
 0.00495371 0.00509614 0.00523624 0.00539967 0.0056315  0.00647573
 0.00767317 0.00814446 0.01003198 0.01219271 0.01243705 0.01317914
 0.01560092 0.01833217 0.03214768 0.05609927 0.07790106 0.60337025]
0.015600917


Unnamed: 0,0,1,2,3,4,5
2013-01-03,0.0,-28.59545,17058.8,59.68625,-34.6015,-29.6305
2013-01-04,0.0,-26.42975,17146.1,54.69745,-32.32505,-27.20305
2013-01-05,0.0,-29.51275,17178.3,69.9306,-35.31215,-31.8754
2013-01-06,0.000681,-31.34385,17171.85,71.6305,-37.154,-33.3226
2013-01-08,0.117,-30.326,17182.0,71.39275,-35.5122,-31.78385


In [77]:
params = {
    'verbosity':0,
    'max_depth': 4,
#     'learning_rate': 0.06,
#     'min_child_weight': 4,
    'gamma': 6
    
}

reg = xgb.XGBRegressor(**params)

predict = predict_mix.predict_model_split(reg, X_select, target_minT, cn.value, 5)
for train, test in predict:
    print("Train size {}".format(train.shape[0]))
    predict_mix.print_mean(train[[cn.value]], train[['prediction']])
    print("Test size {}".format(test.shape[0]))
    predict_mix.print_mean(test[[cn.value]], test[['prediction']])
    print()
# 22 17 33 47 14    

Train size 257
Mean squared error 1.92321
Mean absolute error 1.08901
Median absolute error 0.91240
Test size 252
Mean squared error 22.28215
Mean absolute error 3.63695
Median absolute error 2.88305

Train size 509
Mean squared error 3.13758
Mean absolute error 1.34990
Median absolute error 1.10754
Test size 252
Mean squared error 16.68820
Mean absolute error 3.02522
Median absolute error 2.14326

Train size 761
Mean squared error 3.82657
Mean absolute error 1.48280
Median absolute error 1.16335
Test size 252
Mean squared error 33.48372
Mean absolute error 4.22160
Median absolute error 3.15712

Train size 1013
Mean squared error 4.78092
Mean absolute error 1.65181
Median absolute error 1.27971
Test size 252
Mean squared error 46.74152
Mean absolute error 5.53315
Median absolute error 4.37094

Train size 1265
Mean squared error 5.53013
Mean absolute error 1.77586
Median absolute error 1.41037
Test size 252
Mean squared error 13.68868
Mean absolute error 3.02491
Median absolute error 2.

In [78]:
from sklearn.preprocessing import PolynomialFeatures

pl = PolynomialFeatures(degree=2, include_bias=False)
X_polynomial = pd.DataFrame(pl.fit_transform(X_select), index=X.index)
X_polynomial.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,17,18,19,20,21,22,23,24,25,26
2013-01-03,0.0,-28.59545,17058.8,59.68625,-34.6015,-29.6305,0.0,-0.0,0.0,0.0,...,291002700.0,1018176.0,-590260.0682,-505460.7734,3562.448439,-2065.233779,-1768.533431,1197.263802,1025.259746,877.96653
2013-01-04,0.0,-26.42975,17146.1,54.69745,-32.32505,-27.20305,0.0,-0.0,0.0,0.0,...,293988700.0,937847.9,-554248.539805,-466426.215605,2991.811037,-1768.097806,-1487.937467,1044.908858,879.339951,740.005929
2013-01-05,0.0,-29.51275,17178.3,69.9306,-35.31215,-31.8754,0.0,-0.0,0.0,0.0,...,295094000.0,1201289.0,-606602.706345,-547565.18382,4890.288816,-2469.399837,-2229.065847,1246.947938,1125.588906,1016.041125
2013-01-06,0.000681,-31.34385,17171.85,71.6305,-37.154,-33.3226,4.642719e-07,-0.021357,11.700469,0.048807,...,294872400.0,1230028.0,-638002.9149,-572210.68881,5130.92853,-2661.359597,-2386.914499,1380.419716,1238.06788,1110.395671
2013-01-08,0.117,-30.326,17182.0,71.39275,-35.5122,-31.78385,0.01368906,-3.54815,2010.298295,8.35297,...,295221100.0,1226670.0,-610170.6204,-546110.1107,5096.924753,-2535.313617,-2269.136457,1261.116349,1128.714438,1010.213121


In [79]:
params = {
    'verbosity':0,
    'max_depth': 4,
    'gamma': 6    
}

reg = xgb.XGBRegressor(**params)
predict = predict_mix.predict_model_split(reg, X_polynomial, target_minT, cn.value, 5)
for train, test in predict:
    print("Train size {}".format(train.shape[0]))
    predict_mix.print_mean(train[[cn.value]], train[['prediction']])
    print("Test size {}".format(test.shape[0]))
    predict_mix.print_mean(test[[cn.value]], test[['prediction']])
    print()
# 19 16 35 42 14


Train size 257
Mean squared error 1.33591
Mean absolute error 0.91817
Median absolute error 0.80333
Test size 252
Mean squared error 19.00174
Mean absolute error 3.37584
Median absolute error 2.71621

Train size 509
Mean squared error 2.30760
Mean absolute error 1.16512
Median absolute error 0.96220
Test size 252
Mean squared error 16.94259
Mean absolute error 3.05445
Median absolute error 2.15434

Train size 761
Mean squared error 3.27873
Mean absolute error 1.36792
Median absolute error 1.09143
Test size 252
Mean squared error 35.07128
Mean absolute error 4.22955
Median absolute error 2.98648

Train size 1013
Mean squared error 3.92738
Mean absolute error 1.49350
Median absolute error 1.13158
Test size 252
Mean squared error 41.95081
Mean absolute error 5.33069
Median absolute error 4.76062

Train size 1265
Mean squared error 4.94430
Mean absolute error 1.66522
Median absolute error 1.27465
Test size 252
Mean squared error 14.39850
Mean absolute error 3.08426
Median absolute error 2.