In [74]:
import numpy as np
import pandas as pd
import scipy.sparse
import xgboost as xgb
import mix_pandas as mix
import predict as predict_mix
import db_column_name as db

%matplotlib inline
import matplotlib.pyplot as plt
%config InlineBackend.figure_format = 'svg'

from pylab import rcParams
rcParams['figure.figsize'] = 10, 7


cn = db.ColumnName()

# --- Load -------------------------------------------------------------------
# Target series and feature table come from two CSV files. mix.set_index_date
# is called only for its effect on the frame it receives (its return value is
# not used here).
target_minT = pd.read_csv('./data/31286_103.csv')
mix.set_index_date(target_minT, cn.date)

X = pd.read_csv('./data/character_31286.csv')
mix.set_index_date(X, cn.date)

# --- Select features --------------------------------------------------------
# Drop the point column, then keep the columns whose name contains 'avg' and,
# for now, the offset column (it is removed again after mix.mean_day).
X = X.drop([cn.point], axis=1)
kept_columns = [col for col in X.columns if 'avg' in col or col == cn.offset]
X = X[kept_columns]

# mix.mean_day presumably aggregates the rows per day -- TODO confirm in
# mix_pandas. The offset column is no longer needed once it has run.
X = mix.mean_day(X).drop([cn.offset], axis=1)

# --- Align target and features ----------------------------------------------
# Round the target timestamps to the nearest day.
# NOTE(review): round('D') sends timestamps after noon to the following day;
# confirm this is intended rather than floor('D').
target_minT.index = target_minT.index.round('D')

# Reindex the target onto the feature index and clean it, then reindex the
# features onto whatever index the cleaned target ended up with and clean them.
target_minT = mix.clean(target_minT.reindex(X.index))
X = mix.clean(X.reindex(target_minT.index))

print(X.shape)

(1810, 54)


In [75]:
from sklearn.linear_model import LinearRegression, Lasso, RidgeCV

# Evaluate a plain LinearRegression on the full feature/target frames.
# predict_model_split yields (train, test) pairs; each frame carries the true
# value in column cn.value and the model output in column 'prediction'.
# The trailing 5 is presumably the number of splits -- TODO confirm in predict.
predict = predict_mix.predict_model_split(
    LinearRegression(), X, target_minT, cn.value, 5
)
for train, test in predict:
    # Same report for both halves of the split: row count, then error metrics.
    for size_message, part in (("Train size {}", train), ("Test size {}", test)):
        print(size_message.format(part.shape[0]))
        predict_mix.print_mean(part[[cn.value]], part[['prediction']])
    print()
    # Optional visual check of the held-out part:
    # test.plot(ylim=(-40, 40))

Train size 305
Mean squared error 11.31282
Mean absolute error 2.38519
Median absolute error 1.82815
Test size 301
Mean squared error 155514053310.51224
Mean absolute error 22733.47630
Median absolute error 2.42907

Train size 606
Mean squared error 12.94494
Mean absolute error 2.61230
Median absolute error 2.08921
Test size 301
Mean squared error 15.06739
Mean absolute error 3.02352
Median absolute error 2.46595

Train size 907
Mean squared error 13.18064
Mean absolute error 2.67868
Median absolute error 2.02059
Test size 301
Mean squared error 231.57739
Mean absolute error 11.19373
Median absolute error 7.32078

Train size 1208
Mean squared error 26.63799
Mean absolute error 3.85389
Median absolute error 3.05554
Test size 301
Mean squared error 726.64255
Mean absolute error 21.17420
Median absolute error 17.42243

Train size 1509
Mean squared error 49.57631
Mean absolute error 5.86920
Median absolute error 5.42753
Test size 301
Mean squared error 103.78405
Mean absolute error 8.23679

In [76]:
# Bounds handed to mix.winsorized; presumably lower/upper quantiles of the
# target column -- TODO confirm against mix_pandas.
winsor_bounds = [0.1, 0.80]
target_w = mix.winsorized(target_minT, cn.value, winsor_bounds)

# Keep the feature rows in step with the winsorized target's index.
X_w = X.reindex(index=target_w.index)

In [77]:
# Evaluate LinearRegression on the winsorized target (target_w) and the
# matching feature rows (X_w). Each yielded frame carries the true value in
# column cn.value and the model output in column 'prediction'.
predict = predict_mix.predict_model_split(
    LinearRegression(), X_w, target_w, cn.value, 5
)
for train, test in predict:
    # Same report for both halves of the split: row count, then error metrics.
    for size_message, part in (("Train size {}", train), ("Test size {}", test)):
        print(size_message.format(part.shape[0]))
        predict_mix.print_mean(part[[cn.value]], part[['prediction']])
    print()
    # Optional visual check of the held-out part:
    # test.plot(style=".")

Train size 229
Mean squared error 6.68808
Mean absolute error 2.07201
Median absolute error 1.68865
Test size 228
Mean squared error 15.29686
Mean absolute error 2.94277
Median absolute error 2.34924

Train size 457
Mean squared error 7.98147
Mean absolute error 2.19949
Median absolute error 1.79882
Test size 228
Mean squared error 13.03147
Mean absolute error 2.77998
Median absolute error 2.21132

Train size 685
Mean squared error 8.98231
Mean absolute error 2.30481
Median absolute error 1.83383
Test size 228
Mean squared error 1106953492697747.62500
Mean absolute error 24708827.95920
Median absolute error 44889322.43828

Train size 913
Mean squared error 20.78643
Mean absolute error 3.49343
Median absolute error 2.80129
Test size 228
Mean squared error 783.99248
Mean absolute error 22.11890
Median absolute error 18.64271

Train size 1141
Mean squared error 44.93565
Mean absolute error 5.68704
Median absolute error 5.41280
Test size 228
Mean squared error 115.76448
Mean absolute error

In [78]:
# L1-regularised linear model, evaluated on the winsorized target (target_w)
# and the matching feature rows (X_w).
lasso = Lasso(alpha=5, tol=0.001)

predict = predict_mix.predict_model_split(
    lasso, X_w, target_w, cn.value, 5
)
for train, test in predict:
    # Same report for both halves of the split: row count, then error metrics.
    for size_message, part in (("Train size {}", train), ("Test size {}", test)):
        print(size_message.format(part.shape[0]))
        predict_mix.print_mean(part[[cn.value]], part[['prediction']])
    print()
    # Optional visual check of the held-out part:
    # test.plot(style=".")



Train size 229
Mean squared error 10.93105
Mean absolute error 2.61451
Median absolute error 2.11203
Test size 228
Mean squared error 14.66785
Mean absolute error 2.94960
Median absolute error 2.39150

Train size 457
Mean squared error 11.53566
Mean absolute error 2.64032
Median absolute error 2.05755
Test size 228
Mean squared error 15.43740
Mean absolute error 2.96453
Median absolute error 2.35639

Train size 685
Mean squared error 12.53574
Mean absolute error 2.72179
Median absolute error 2.11641
Test size 228
Mean squared error 4434.41929
Mean absolute error 50.01550
Median absolute error 64.66043

Train size 913
Mean squared error 27.15005
Mean absolute error 3.98680
Median absolute error 3.23429
Test size 228
Mean squared error 622.77507
Mean absolute error 20.64348
Median absolute error 19.74771

Train size 1141
Mean squared error 50.56670
Mean absolute error 6.09361
Median absolute error 5.88121
Test size 228
Mean squared error 48.20229
Mean absolute error 5.66554
Median absolu

