In [1]:
import numpy as np
import pandas as pd
import scipy.sparse
import xgboost as xgb
import mix_pandas as mix
import predict as predict_mix
import db_column_name as db

%matplotlib inline
import matplotlib.pyplot as plt
%config InlineBackend.figure_format = 'svg'

from pylab import rcParams
rcParams['figure.figsize'] = 10, 10


# Column-name registry used by every cell below.
cn = db.ColumnName()

# Row filters applied to the feature table.
# NOTE(review): the meaning/units of these two values are not visible in this
# notebook — confirm before changing them.
OFFSET_TO_KEEP = 69
HOUR_TO_KEEP = 21

# --- Target series ---------------------------------------------------------
target_minT = pd.read_csv('./data/31286_103.csv')
# The return value is ignored, so this presumably re-indexes the frame by date
# in place — TODO confirm against mix_pandas.
mix.set_index_date(target_minT, cn.date)

# --- Feature table ---------------------------------------------------------
X = pd.read_csv('./data/character_31286.csv')
mix.set_index_date(X, cn.date)

X = X.drop(columns=[cn.point])
# Keep only the 'avg' columns, plus the offset column needed for the row filter.
kept_columns = [name for name in X.columns
                if 'avg' in name or name == cn.offset]
X = X[kept_columns]

X = X.loc[X[cn.offset] == OFFSET_TO_KEEP]
X = X.loc[X.index.hour == HOUR_TO_KEEP]
print(X.shape)

# Disabled alternative (daily aggregation), kept for reference:
# X = mix.mean_day(X)
# target_minT.index = target_minT.index.round('D')

# The offset column has served its purpose as a filter.
X = X.drop(columns=[cn.offset])

# Align target and features on a common index: the target is first restricted
# to the feature timestamps and cleaned, then the features are restricted to
# whatever target rows are left and cleaned in turn.
target_minT = mix.clean(target_minT.reindex(X.index))
X = mix.clean(X.reindex(target_minT.index))
print(X.shape)

target_minT = target_minT.iloc[3:] # remove on change

target_minT = mix.winsorized(target_minT, cn.value, [0.05, 0.95], 5)
# Re-align the features with the target rows returned by winsorized().
X = X.reindex(target_minT.index)
print(X.shape)

(1245, 55)
(1195, 54)
(979, 54)


In [2]:
# Prune redundant features: a column is dropped when its correlation with any
# column that precedes it in X exceeds the threshold.
CORRELATION_THRESHOLD = 0.9

corr = X.corr()

# Strict upper triangle (diagonal excluded) of the correlation matrix, so each
# pair is inspected once and a column is never compared with itself.
# Built with the builtin `bool`: the `np.bool` alias was deprecated in
# NumPy 1.20 and removed in 1.24, where it raises AttributeError.
upper_mask = np.triu(np.ones(corr.shape, dtype=bool), k=1)
upper = corr.where(upper_mask)

# Masked-out cells are NaN and compare as False, so they never trigger a drop.
# NOTE(review): the comparison is on the signed coefficient, so strongly
# negative correlations are not flagged — confirm that is intended.
to_drop = [col for col in upper.columns
           if (upper[col] > CORRELATION_THRESHOLD).any()]
print(to_drop)

X_drop = X.drop(to_drop, axis=1)
X_drop.head()

['avg tdd925', 'avg t925', 'avg slp', 'avg rainnc', 'avg t500', 'avg advTdd850', 'avg td2', 'avg td925', 'avg advRh850', 'avg t850', 'avg slptend', 'avg td850']


Unnamed: 0_level_0,avg td500,avg rh925,avg v10,avg wdir850,avg snownc,avg defV500,avg t2,avg u850,avg wdir10,avg u10,...,avg defSt500,avg w500,avg rainc,avg wspd850,avg w850,avg dudy500,avg u500,avg rot500,avg advRh2,avg tdd500
actual_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2014-06-20 21:00:00,-16.3832,57.4869,-2.13775,13.4974,0.0,8e-05,16.7912,-3.64175,316.37,2.04152,...,-3.5e-05,-0.063531,7.72829,15.6132,0.031968,9e-06,-2.82198,-2.5e-05,-0.000769,1.97802
2014-10-28 21:00:00,-53.211,33.1061,-3.02335,45.9554,0.0,3.3e-05,-15.389,-4.11547,333.192,1.51222,...,9e-06,-0.104186,0.0,5.96798,0.037972,-3.7e-05,-11.3359,8e-05,-0.003336,29.4622
2014-10-30 21:00:00,-38.1176,54.4846,-0.650154,306.213,0.011389,8.4e-05,-22.3096,14.1386,286.141,1.41981,...,-1.7e-05,0.026363,0.0,17.5158,0.045806,-1.3e-05,21.1847,-5.3e-05,-0.000783,6.96273
2014-11-01 21:00:00,-34.8647,84.8004,-2.2329,121.954,0.049867,5.7e-05,-9.84371,-3.49329,25.601,-1.05809,...,2.9e-05,0.001384,0.0,4.17964,0.035269,-3.8e-05,12.2031,4.3e-05,-0.000395,5.30676
2014-11-02 21:00:00,-43.3544,93.3471,-0.492943,135.109,4.4041,2.7e-05,-6.28352,-3.26421,215.005,0.362182,...,-1.5e-05,-0.016323,0.0,4.66506,0.019166,-1.3e-05,5.90434,4e-05,-0.000135,7.17927


In [4]:
# Evaluate an XGBoost regressor on the correlation-pruned features (X_drop).
# learning_rate is left at the library default (0.03 was tried and disabled).
params = dict(
    verbosity=0,
    max_depth=4,
    min_child_weight=6,
)

reg = xgb.XGBRegressor(**params)

predict = predict_mix.predict_model_split(reg, X_drop, target_minT, cn.value, 5)

# Each item is a (train, test) pair of frames carrying the observed value and
# a 'prediction' column; report the same metrics for both halves of the split.
for train, test in predict:
    for size_template, frame in (("Train size {}", train), ("Test size {}", test)):
        print(size_template.format(frame.shape[0]))
        predict_mix.print_mean(frame[[cn.value]], frame[['prediction']])
    print()

Train size 164
Mean squared error 0.35584
Mean absolute error 0.37387
Median absolute error 0.23496
Explained variance score 0.99664
Coefficient of determination 0.99664
Test size 163
Mean squared error 20.48908
Mean absolute error 3.58482
Median absolute error 3.00597
Explained variance score 0.91369
Coefficient of determination 0.89884

Train size 327
Mean squared error 0.49539
Mean absolute error 0.51687
Median absolute error 0.34488
Explained variance score 0.99684
Coefficient of determination 0.99684
Test size 163
Mean squared error 41.60720
Mean absolute error 4.52323
Median absolute error 2.69972
Explained variance score 0.67580
Coefficient of determination 0.65287

Train size 490
Mean squared error 0.98423
Mean absolute error 0.74821
Median absolute error 0.58799
Explained variance score 0.99373
Coefficient of determination 0.99373
Test size 163
Mean squared error 78.61994
Mean absolute error 7.53598
Median absolute error 7.80517
Explained variance score 0.48300
Coefficient of 

In [5]:
from sklearn.feature_selection import SelectFromModel



# Rank all features by XGBoost importance and keep the ten highest-scoring ones.
params = dict(max_depth=3, min_child_weight=3.01)
reg_importances = xgb.XGBRegressor(**params)

# NOTE(review): the result is not inspected here; the call is presumably made
# for its side effect of fitting reg_importances, whose feature_importances_
# are read just below — confirm against predict.predict_model_split.
predict = predict_mix.predict_model_split(reg_importances, X, target_minT, cn.value, 5)

# One 'Score' per feature column, highest first.
importances = pd.DataFrame(
    data=reg_importances.feature_importances_,
    index=X.columns,
    columns=['Score'],
).sort_values(by=['Score'], ascending=False)

# Disabled diagnostic plot, kept for reference:
# xgb.plot_importance(reg, importance_type='gain')

# Top ten rows of the ranking; their index labels are the feature names.
slice_importances = importances.head(10)
X_select = X.loc[:, slice_importances.index]
X_select.head()

Unnamed: 0_level_0,avg t2,avg t850,avg snownc,avg lfc,avg td2,avg rh2,avg td850,avg rh850,avg t925,avg td925
actual_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2014-06-20 21:00:00,16.7912,15.3813,0.0,2659.22,11.9033,73.5068,6.08178,54.2185,18.4872,9.81721
2014-10-28 21:00:00,-15.389,-5.31635,0.0,18037.1,-19.0136,73.8041,-20.8658,30.171,-4.0069,-18.1666
2014-10-30 21:00:00,-22.3096,-13.0799,0.011389,17846.5,-20.5491,100.0,-20.9479,52.5306,-13.2968,-20.6106
2014-11-01 21:00:00,-9.84371,-11.6014,0.049867,17805.3,-11.2649,89.4535,-14.1217,81.9789,-8.19739,-10.366
2014-11-02 21:00:00,-6.28352,-9.55655,4.4041,17782.2,-7.29548,92.6628,-10.7606,91.0236,-6.60454,-7.51845


In [6]:
# Evaluate an XGBoost regressor on the ten importance-selected features (X_select).
# learning_rate is left at the library default (0.03 was tried and disabled).
params = dict(
    verbosity=0,
    max_depth=4,
    min_child_weight=6,
)

reg = xgb.XGBRegressor(**params)

predict = predict_mix.predict_model_split(reg, X_select, target_minT, cn.value, 5)

# Each item is a (train, test) pair of frames carrying the observed value and
# a 'prediction' column; report the same metrics for both halves of the split.
for train, test in predict:
    for size_template, frame in (("Train size {}", train), ("Test size {}", test)):
        print(size_template.format(frame.shape[0]))
        predict_mix.print_mean(frame[[cn.value]], frame[['prediction']])
    print()

Train size 164
Mean squared error 0.97613
Mean absolute error 0.67949
Median absolute error 0.47932
Explained variance score 0.99080
Coefficient of determination 0.99080
Test size 163
Mean squared error 17.50278
Mean absolute error 3.26742
Median absolute error 2.82992
Explained variance score 0.92050
Coefficient of determination 0.91358

Train size 327
Mean squared error 1.88343
Mean absolute error 1.00000
Median absolute error 0.70793
Explained variance score 0.98798
Coefficient of determination 0.98798
Test size 163
Mean squared error 40.97219
Mean absolute error 5.12601
Median absolute error 4.11301
Explained variance score 0.65817
Coefficient of determination 0.65817

Train size 490
Mean squared error 2.51954
Mean absolute error 1.17088
Median absolute error 0.85887
Explained variance score 0.98394
Coefficient of determination 0.98394
Test size 163
Mean squared error 83.36572
Mean absolute error 7.76231
Median absolute error 8.24188
Explained variance score 0.45439
Coefficient of 

In [7]:
from sklearn.preprocessing import PolynomialFeatures

# Expand the selected features with all degree-2 terms (squares and pairwise
# products); the constant bias column is left out.
pl = PolynomialFeatures(include_bias=False, degree=2)
polynomial_values = pl.fit_transform(X_select)

# The transformer returns a bare array, so the frame is rebuilt on X's index.
# No column labels are supplied: the columns are numbered by position and the
# original feature names are not carried over.
X_polynomial = pd.DataFrame(data=polynomial_values, index=X.index)
X_polynomial.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,55,56,57,58,59,60,61,62,63,64
actual_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2014-06-20 21:00:00,16.7912,15.3813,0.0,2659.22,11.9033,73.5068,6.08178,54.2185,18.4872,9.81721,...,36.988048,329.744989,112.435083,59.706111,2939.645742,1002.348253,532.2744,341.776564,181.492725,96.377612
2014-10-28 21:00:00,-15.389,-5.31635,0.0,18037.1,-19.0136,73.8041,-20.8658,30.171,-4.0069,-18.1666,...,435.38161,-629.542052,83.607174,379.060642,910.289241,-120.89218,-548.104489,16.055248,72.79175,330.025356
2014-10-30 21:00:00,-22.3096,-13.0799,0.011389,17846.5,-20.5491,100.0,-20.9479,52.5306,-13.2968,-20.6106,...,438.814514,-1100.405756,278.540037,431.748788,2759.463936,-698.488882,-1082.687184,176.80489,274.055026,424.796832
2014-11-01 21:00:00,-9.84371,-11.6014,0.049867,17805.3,-11.2649,89.4535,-14.1217,81.9789,-8.19739,-10.366,...,199.422411,-1157.681432,115.761082,146.385542,6720.540045,-672.013015,-849.793277,67.197203,84.974145,107.453956
2014-11-02 21:00:00,-6.28352,-9.55655,4.4041,17782.2,-7.29548,92.6628,-10.7606,91.0236,-6.60454,-7.51845,...,115.790512,-979.46855,71.068813,80.903033,8285.295757,-601.169007,-684.356385,43.619949,49.655904,56.52709


In [8]:
# Evaluate an XGBoost regressor on the polynomial feature expansion (X_polynomial).
params = dict(
    verbosity=0,
    max_depth=4,
    gamma=6,
)

reg = xgb.XGBRegressor(**params)
predict = predict_mix.predict_model_split(reg, X_polynomial, target_minT, cn.value, 5)

# Each item is a (train, test) pair of frames carrying the observed value and
# a 'prediction' column; report the same metrics for both halves of the split.
for train, test in predict:
    for size_template, frame in (("Train size {}", train), ("Test size {}", test)):
        print(size_template.format(frame.shape[0]))
        predict_mix.print_mean(frame[[cn.value]], frame[['prediction']])
    print()

Train size 164
Mean squared error 0.94839
Mean absolute error 0.72187
Median absolute error 0.52208
Explained variance score 0.99106
Coefficient of determination 0.99106
Test size 163
Mean squared error 21.76859
Mean absolute error 3.77637
Median absolute error 3.36785
Explained variance score 0.89867
Coefficient of determination 0.89252

Train size 327
Mean squared error 1.68534
Mean absolute error 0.99900
Median absolute error 0.79305
Explained variance score 0.98924
Coefficient of determination 0.98924
Test size 163
Mean squared error 48.04042
Mean absolute error 5.95122
Median absolute error 5.60005
Explained variance score 0.60375
Coefficient of determination 0.59919

Train size 490
Mean squared error 1.94893
Mean absolute error 1.05282
Median absolute error 0.79942
Explained variance score 0.98757
Coefficient of determination 0.98757
Test size 163
Mean squared error 78.65001
Mean absolute error 7.57097
Median absolute error 7.75794
Explained variance score 0.48817
Coefficient of 