In [1]:
import numpy as np
import pandas as pd
import scipy.sparse
import xgboost as xgb
import mix_pandas as mix
import predict as predict_mix
import db_column_name as db

%matplotlib inline
import matplotlib.pyplot as plt
%config InlineBackend.figure_format = 'svg'

from pylab import rcParams
rcParams['figure.figsize'] = 10, 7


# Column-name registry used for every column lookup in this notebook.
cn = db.ColumnName()

# Target series. set_index_date's return value is ignored, so it presumably
# installs the date index in place -- TODO confirm against mix_pandas.
target_minT = pd.read_csv('./data/31286_103.csv')
mix.set_index_date(target_minT, cn.date)

# Feature table, indexed the same way as the target.
X = pd.read_csv('./data/character_31286.csv')
mix.set_index_date(X, cn.date)

# Drop the point column, then keep only the columns whose name contains 'avg'
# together with the offset column.
X = X.drop(columns=[cn.point])
X = X.loc[:, [name for name in X.columns if 'avg' in name or
              name == cn.offset]]

# Aggregate the features with mix.mean_day (presumably one row per day --
# verify in mix_pandas); the offset column is dropped right after that call.
X = mix.mean_day(X).drop(columns=[cn.offset])

# NOTE(review): round('D') rounds to the NEAREST day, so a timestamp later
# than noon lands on the following date; confirm this is intended rather
# than floor('D').
target_minT.index = target_minT.index.round('D')

# Align the target to the feature dates and clean it, then align the features
# back to the target rows that remain and clean those as well.
target_minT = mix.clean(target_minT.reindex(X.index))
X = mix.clean(X.reindex(target_minT.index))

print(X.shape)

# Apply mix.winsorized to the target value column with the [0.05, 0.95] bounds.
# The recorded output shows the row count falling from 1810 to 1517, so this
# helper evidently removes rows; X is re-aligned to the rows that survive.
target_minT = mix.winsorized(target_minT, cn.value, [0.05, 0.95])
X = X.reindex(target_minT.index)
print(X.shape)

(1810, 54)
(1517, 54)


In [3]:
# Baseline model: XGBoost regressor with depth-3 trees and a 0.03 learning rate.
default_params = dict(
    verbosity=0,
    max_depth=3,
    min_child_weight=3.01,
    learning_rate=0.03,
)
reg = xgb.XGBRegressor(**default_params)

# predict_model_split is iterated as (train, test) frame pairs; each frame is
# read below through its target column and a 'prediction' column. The trailing
# 5 is presumably the number of splits -- confirm in predict.py.
predict = predict_mix.predict_model_split(reg, X, target_minT, cn.value, 5)

for train, test in predict:
    # Report the train metrics first, then the test metrics, for this pair.
    for caption, part in (("Train size {}", train), ("Test size {}", test)):
        print(caption.format(part.shape[0]))
        predict_mix.print_mean(part[[cn.value]], part[['prediction']])
    print()
    
    # 14.7 7.3 17.7 11.1 14.1
    

Train size 257
Mean squared error 3.31215
Mean absolute error 1.45067
Median absolute error 1.13293
Test size 252
Mean squared error 19.88791
Mean absolute error 3.36513
Median absolute error 2.46425

Train size 509
Mean squared error 5.60569
Mean absolute error 1.82454
Median absolute error 1.39542
Test size 252
Mean squared error 16.63750
Mean absolute error 3.06315
Median absolute error 2.39016

Train size 761
Mean squared error 7.36498
Mean absolute error 2.07253
Median absolute error 1.68023
Test size 252
Mean squared error 34.89189
Mean absolute error 4.15924
Median absolute error 2.86925

Train size 1013
Mean squared error 8.47457
Mean absolute error 2.23750
Median absolute error 1.79525
Test size 252
Mean squared error 49.86385
Mean absolute error 5.65444
Median absolute error 5.21088

Train size 1265
Mean squared error 9.52681
Mean absolute error 2.36544
Median absolute error 1.89568
Test size 252
Mean squared error 19.60786
Mean absolute error 3.69753
Median absolute error 3.

In [14]:
# Average features and target over 2-day windows, cleaning each result.
# NOTE(review): the names say "3mean" but the resampling window is '2D';
# confirm which of the two is intended.
X_3mean = mix.clean(X.resample('2D').mean())
target_3mean = mix.clean(target_minT.resample('2D').mean())

# Same shallow configuration as the daily model, except min_child_weight is 3.
params = dict(
    verbosity=0,
    max_depth=3,
    learning_rate=0.03,
    min_child_weight=3,
)
reg_mean = xgb.XGBRegressor(**params)

# Evaluate on the resampled data and print train/test metrics for each pair.
predict = predict_mix.predict_model_split(reg_mean, X_3mean, target_3mean, cn.value, 5)
for train, test in predict:
    for caption, part in (("Train size {}", train), ("Test size {}", test)):
        print(caption.format(part.shape[0]))
        predict_mix.print_mean(part[[cn.value]], part[['prediction']])
    print()

Train size 83
Mean squared error 1.91123
Mean absolute error 0.99251
Median absolute error 0.55147
Test size 81
Mean squared error 27.90297
Mean absolute error 4.80492
Median absolute error 5.19563

Train size 164
Mean squared error 2.71347
Mean absolute error 1.26867
Median absolute error 0.98728
Test size 81
Mean squared error 9.08681
Mean absolute error 2.25445
Median absolute error 1.78672

Train size 245
Mean squared error 3.17663
Mean absolute error 1.39574
Median absolute error 1.14819
Test size 81
Mean squared error 14.74562
Mean absolute error 3.05870
Median absolute error 2.53795

Train size 326
Mean squared error 4.09256
Mean absolute error 1.56394
Median absolute error 1.15385
Test size 81
Mean squared error 14.88930
Mean absolute error 2.73003
Median absolute error 2.23774

Train size 407
Mean squared error 4.92164
Mean absolute error 1.69798
Median absolute error 1.33657
Test size 81
Mean squared error 16.49371
Mean absolute error 3.29790
Median absolute error 2.61506

