In [1]:
import numpy as np
import pandas as pd
import scipy.sparse
import xgboost as xgb
import mix_pandas as mix
import predict as predict_mix
import db_column_name as db

%matplotlib inline
import matplotlib.pyplot as plt
%config InlineBackend.figure_format = 'svg'

from pylab import rcParams
rcParams['figure.figsize'] = 10, 7


# Column-name registry; every column lookup below goes through it.
cn = db.ColumnName()

# --- Target series ----------------------------------------------------------
target_minT = pd.read_csv('./data/31286_103.csv')
# Called for its effect on the frame (the return value is not used).
mix.set_index_date(target_minT, cn.date)

# --- Feature table ----------------------------------------------------------
X = pd.read_csv('./data/character_31286.csv')
mix.set_index_date(X, cn.date)

# Discard the point column, then keep only the columns whose name contains
# 'avg' plus the offset column (still needed for the row filter below).
X = X.drop(columns=[cn.point])
kept_columns = [name for name in X.columns if 'avg' in name or name == cn.offset]
X = X.loc[:, kept_columns]

# Row filter: offset equal to 69, and index timestamps whose hour is 21.
X = X.loc[X[cn.offset] == 69]
X = X.loc[X.index.hour == 21]
print(X.shape)

# Disabled alternative kept for reference (daily averaging of the features):
# X = mix.mean_day(X)
# target_minT.index = target_minT.index.round('D')

# The offset column has served its purpose as a filter key.
X = X.drop(columns=[cn.offset])

# Align the target to the feature index and clean it, then align the
# features back to whatever target rows remain and clean those as well.
target_minT = mix.clean(target_minT.reindex(X.index))
X = mix.clean(X.reindex(target_minT.index))
print(X.shape)

# Skip the first three target rows (original note: "remove on change").
target_minT = target_minT.iloc[3:]

# Winsorize the target value column, then re-align the features one last
# time so X and target_minT share the same index.
target_minT = mix.winsorized(target_minT, cn.value, [0.05, 0.95], 5)
X = X.reindex(target_minT.index)

(1245, 55)
(1195, 54)


In [4]:
# Baseline regressor settings.
default_params = dict(
    verbosity=0,
    max_depth=3,
    min_child_weight=3.01,
)
reg = xgb.XGBRegressor(**default_params)

# predict_model_split yields (train, test) frames; each frame is read below
# through its cn.value column and its 'prediction' column.
# NOTE(review): the trailing 5 is presumably the number of splits; confirm
# against predict_mix.
predict = predict_mix.predict_model_split(reg, X, target_minT, cn.value, 5)

for train, test in predict:
    # Same report for both halves of the split: row count, then metrics.
    for size_message, fold in (("Train size {}", train), ("Test size {}", test)):
        print(size_message.format(fold.shape[0]))
        predict_mix.print_mean(fold[[cn.value]], fold[['prediction']])
    print()

Train size 164
Mean squared error 0.26606
Mean absolute error 0.38018
Median absolute error 0.27665
Explained variance score 0.99749
Coefficient of determination 0.99749
Test size 163
Mean squared error 24.13153
Mean absolute error 3.95416
Median absolute error 3.56162
Explained variance score 0.88677
Coefficient of determination 0.88086

Train size 327
Mean squared error 1.41685
Mean absolute error 0.88860
Median absolute error 0.69560
Explained variance score 0.99096
Coefficient of determination 0.99096
Test size 163
Mean squared error 39.66218
Mean absolute error 4.32107
Median absolute error 2.33082
Explained variance score 0.68895
Coefficient of determination 0.66909

Train size 490
Mean squared error 2.56432
Mean absolute error 1.19866
Median absolute error 0.91133
Explained variance score 0.98365
Coefficient of determination 0.98365
Test size 163
Mean squared error 71.18797
Mean absolute error 7.16050
Median absolute error 7.32545
Explained variance score 0.45944
Coefficient of 

In [3]:
# Average features and target over 2-day bins, cleaning each result.
# Note: despite the `_3mean` suffix, the resample rule used here is '2D'.
# NOTE(review): the two frames are cleaned independently and not re-aligned
# afterwards; confirm that predict_model_split tolerates differing indexes.
X_3mean = mix.clean(X.resample('2D').mean())
target_3mean = mix.clean(target_minT.resample('2D').mean())

# Same tree shape as the baseline, with an explicit learning rate.
params = dict(
    verbosity=0,
    max_depth=3,
    learning_rate=0.03,
    min_child_weight=3,
)
reg_mean = xgb.XGBRegressor(**params)

predict = predict_mix.predict_model_split(reg_mean, X_3mean, target_3mean, cn.value, 5)
for train, test in predict:
    # Same report for both halves of the split: row count, then metrics.
    for size_message, fold in (("Train size {}", train), ("Test size {}", test)):
        print(size_message.format(fold.shape[0]))
        predict_mix.print_mean(fold[[cn.value]], fold[['prediction']])
    print()

Train size 107
Mean squared error 2.90922
Mean absolute error 1.21958
Median absolute error 0.66536
Explained variance score 0.97310
Coefficient of determination 0.96983
Test size 102
Mean squared error 32.07604
Mean absolute error 4.91214
Median absolute error 5.45391
Explained variance score 0.86535
Coefficient of determination 0.84334

Train size 209
Mean squared error 4.08306
Mean absolute error 1.56119
Median absolute error 1.22743
Explained variance score 0.97466
Coefficient of determination 0.97366
Test size 102
Mean squared error 31.52651
Mean absolute error 4.14835
Median absolute error 3.02683
Explained variance score 0.77449
Coefficient of determination 0.76245

Train size 311
Mean squared error 5.14840
Mean absolute error 1.77927
Median absolute error 1.38862
Explained variance score 0.96760
Coefficient of determination 0.96708
Test size 102
Mean squared error 76.21677
Mean absolute error 7.48309
Median absolute error 8.49510
Explained variance score 0.53053
Coefficient of 