In [1]:
import numpy as np
import pandas as pd
import scipy.sparse
import xgboost as xgb
import mix_pandas as mix
import predict as predict_mix
import db_column_name as db

%matplotlib inline
import matplotlib.pyplot as plt
%config InlineBackend.figure_format = 'svg'

from pylab import rcParams
rcParams['figure.figsize'] = 10, 7


# Column-name registry used for every column lookup below.
cn = db.ColumnName()

# Target series read from CSV. The helper's return value is ignored, so it
# presumably installs the date index in place -- TODO confirm in mix_pandas.
target_minT = pd.read_csv('./data/31286_103.csv')
mix.set_index_date(target_minT, cn.date)

# Feature table, indexed by the same helper.
X = pd.read_csv('./data/character_31286.csv')
mix.set_index_date(X, cn.date)

# Discard the point column, then keep only the columns whose name contains
# 'avg' together with the offset column (still needed for the row filter).
X = X.drop(columns=[cn.point])
kept_columns = [name for name in X.columns if 'avg' in name or name == cn.offset]
X = X[kept_columns]

# Row filter: offset equal to 69 and index timestamps at hour 21.
# NOTE(review): 69 and 21 are unexplained magic numbers -- consider naming them.
X = X[(X[cn.offset] == 69) & (X.index.hour == 21)]
print(X.shape)

# Disabled alternative from the original cell: daily means via mix.mean_day(X)
# with the target index rounded to whole days.

# The offset column has served its purpose as a filter.
X = X.drop(columns=[cn.offset])

# Put the target on the feature index and clean it, then put the features on
# the cleaned target index and clean them in turn.
target_minT = mix.clean(target_minT.reindex(X.index))
X = mix.clean(X.reindex(target_minT.index))
print(X.shape)

# Skip the first three target rows ("remove on change" in the original),
# winsorize the target, and bring the features onto the resulting index.
# NOTE(review): if mix.clean(X) removed rows that the target still has, this
# final reindex re-creates them in X as all-NaN rows -- verify.
target_minT = mix.winsorized(target_minT.iloc[3:], cn.value, [0.05, 0.95], 5)
X = X.reindex(target_minT.index)
print(X.shape)

(1245, 55)
(1195, 54)
(979, 54)


In [16]:
# Baseline XGBoost regressor on the per-day features.
# ('learning_rate': 0.03 was tried and is disabled here.)
default_params = dict(
    verbosity=0,
    max_depth=3,
    min_child_weight=3.01,
)
reg = xgb.XGBRegressor(**default_params)

# predict_model_split yields (train, test) pairs; each frame carries the true
# value column (cn.value) and a 'prediction' column, as used below.
predict = predict_mix.predict_model_split(reg, X, target_minT, cn.value, 5)

for train, test in predict:
    # Same report for both halves of the split: size line, then error metrics.
    for size_line, part in (("Train size {}", train), ("Test size {}", test)):
        print(size_line.format(part.shape[0]))
        predict_mix.print_mean(part[[cn.value]], part[['prediction']])
    print()

# A commented-out GridSearchCV sweep over max_depth / min_child_weight
# (TimeSeriesSplit(5), fitted on the 2015 rows) used to live here.

Train size 164
Mean squared error 0.26606
Mean absolute error 0.38018
Median absolute error 0.27665
Test size 163
Mean squared error 24.13153
Mean absolute error 3.95416
Median absolute error 3.56162

Train size 327
Mean squared error 1.41685
Mean absolute error 0.88860
Median absolute error 0.69560
Test size 163
Mean squared error 39.66218
Mean absolute error 4.32107
Median absolute error 2.33082

Train size 490
Mean squared error 2.56432
Mean absolute error 1.19866
Median absolute error 0.91133
Test size 163
Mean squared error 71.18797
Mean absolute error 7.16050
Median absolute error 7.32545

Train size 653
Mean squared error 3.30974
Mean absolute error 1.38626
Median absolute error 1.11761
Test size 163
Mean squared error 9.96027
Mean absolute error 2.58002
Median absolute error 2.43907

Train size 816
Mean squared error 3.75057
Mean absolute error 1.49582
Median absolute error 1.17084
Test size 163
Mean squared error 12.44035
Mean absolute error 2.86262
Median absolute error 2.600

In [3]:
# Same model family as the baseline, trained on two-day means of the
# features and the target.
# NOTE(review): the `_3mean` suffix does not match the '2D' window; the names
# are kept because other cells may reference them.
X_3mean = mix.clean(X.resample('2D').mean())
target_3mean = mix.clean(target_minT.resample('2D').mean())

# The two frames are cleaned independently, so nothing guarantees they still
# share the same bins. Keep only the bins present in both, so every feature
# row is paired with the target of the same period (the first cell keeps X and
# the target aligned with reindex after each cleaning step). This is a no-op
# when the indexes already agree.
shared_bins = X_3mean.index.intersection(target_3mean.index)
X_3mean = X_3mean.loc[shared_bins]
target_3mean = target_3mean.loc[shared_bins]

params = {
    'verbosity': 0,
    'max_depth': 3,

    'learning_rate': 0.03,
    'min_child_weight': 3,
}
reg_mean = xgb.XGBRegressor(**params)

# predict_model_split yields (train, test) pairs; each frame carries the true
# value column (cn.value) and a 'prediction' column, as used below.
predict = predict_mix.predict_model_split(reg_mean, X_3mean, target_3mean, cn.value, 5)
for train, test in predict:
    print("Train size {}".format(train.shape[0]))
    predict_mix.print_mean(train[[cn.value]], train[['prediction']])
    print("Test size {}".format(test.shape[0]))
    predict_mix.print_mean(test[[cn.value]], test[['prediction']])
    print()

Train size 107
Mean squared error 2.90922
Mean absolute error 1.21958
Median absolute error 0.66536
Test size 102
Mean squared error 32.07604
Mean absolute error 4.91214
Median absolute error 5.45391

Train size 209
Mean squared error 4.08306
Mean absolute error 1.56119
Median absolute error 1.22743
Test size 102
Mean squared error 31.52651
Mean absolute error 4.14835
Median absolute error 3.02683

Train size 311
Mean squared error 5.14840
Mean absolute error 1.77927
Median absolute error 1.38862
Test size 102
Mean squared error 76.21677
Mean absolute error 7.48309
Median absolute error 8.49510

Train size 413
Mean squared error 6.69978
Mean absolute error 2.06240
Median absolute error 1.74114
Test size 102
Mean squared error 12.47893
Mean absolute error 2.95413
Median absolute error 2.86967

Train size 515
Mean squared error 6.60869
Mean absolute error 2.08390
Median absolute error 1.69185
Test size 102
Mean squared error 15.86181
Mean absolute error 3.34071
Median absolute error 2.97