In [1]:
import numpy as np
import pandas as pd
import scipy.sparse
import xgboost as xgb
import mix_pandas as mix
import predict as predict_mix
import db_column_name as db

# Render matplotlib figures inside the notebook output.
%matplotlib inline
import matplotlib.pyplot as plt
# Emit inline figures as SVG rather than the default raster format.
%config InlineBackend.figure_format = 'svg'

from pylab import rcParams
# Default figure size (width, height) for every plot in this notebook.
rcParams['figure.figsize'] = 10, 7


# Column-name constants used by the project helper modules.
cn = db.ColumnName()

# --- Target: load, index by date (set_index_date works in place), and snap
# --- the timestamps to whole days so they can be aligned with the features.
# NOTE(review): round('D') rounds to the *nearest* day, so afternoon
# timestamps move to the following day — confirm this is intended.
target_minT = pd.read_csv('./data/31286_103.csv')
mix.set_index_date(target_minT, cn.date)
target_minT.index = target_minT.index.round('D')

# --- Features: load and index by date the same way.
X = pd.read_csv('./data/character_31286.csv')
mix.set_index_date(X, cn.date)

# Drop the point column, then keep only the "avg" features plus the offset
# column (the offset is still needed by mean_day and is removed afterwards).
X = X.drop(columns=[cn.point])
kept_columns = [x for x in X.columns if 'avg' in x or x == cn.offset]
X = X[kept_columns]

# Presumably collapses the rows to one record per day — see mix.mean_day.
X = mix.mean_day(X)
X = X.drop(columns=[cn.offset])

# Align target and features on a common index, cleaning each side in turn
# (mix.clean presumably removes unusable rows — confirm in mix_pandas).
target_minT = mix.clean(target_minT.reindex(X.index))
X = mix.clean(X.reindex(target_minT.index))

# Winsorize the target value column using the [0.05, 0.95] limits, then
# realign the features to whatever index the target ends up with.
target_minT = mix.winsorized(target_minT, cn.value, [0.05, 0.95])
X = X.reindex(target_minT.index)
print(X.shape)


(1517, 54)


In [2]:
# Restrict both the features and the target to observations dated before 2016.
X = X.loc[X.index.year < 2016]
target_minT = target_minT.loc[target_minT.index.year < 2016]
print(X.shape)

(818, 54)


In [4]:
from sklearn.linear_model import LinearRegression, Lasso, RidgeCV

# Baseline: plain least squares on all features, evaluated with the 5-way
# split produced by predict_model_split. Each item yielded is a (train, test)
# pair of frames carrying the true value column and a 'prediction' column.
# NOTE(review): in the recorded run the second fold's test MSE explodes
# (~9e10) while its median absolute error stays near 2, i.e. a handful of
# extreme predictions — the unregularised fit looks unstable on this fold.
predict = predict_mix.predict_model_split(LinearRegression(), X, target_minT, cn.value, 5)
for train, test in predict:
    # Same report for both halves of the fold: size first, then error metrics.
    for size_msg, fold_part in (("Train size {}", train), ("Test size {}", test)):
        print(size_msg.format(fold_part.shape[0]))
        predict_mix.print_mean(fold_part[[cn.value]], fold_part[['prediction']])
    print()

Train size 138
Mean squared error 5.77232
Mean absolute error 1.92098
Median absolute error 1.71034
Test size 136
Mean squared error 21.93442
Mean absolute error 3.73301
Median absolute error 3.23564

Train size 274
Mean squared error 6.85471
Mean absolute error 2.09460
Median absolute error 1.79728
Test size 136
Mean squared error 91598697841.75485
Mean absolute error 25954.84901
Median absolute error 2.06724

Train size 410
Mean squared error 7.29691
Mean absolute error 2.10023
Median absolute error 1.64537
Test size 136
Mean squared error 19.86829
Mean absolute error 3.26155
Median absolute error 2.62334

Train size 546
Mean squared error 8.65934
Mean absolute error 2.29737
Median absolute error 1.84333
Test size 136
Mean squared error 12.94804
Mean absolute error 2.76527
Median absolute error 2.38496

Train size 682
Mean squared error 9.22692
Mean absolute error 2.33165
Median absolute error 1.86379
Test size 136
Mean squared error 14.07580
Mean absolute error 2.97560
Median absolu

In [8]:
# L1-regularised linear model, evaluated with the same 5-way split as above.
lasso = Lasso(alpha=5, tol=0.001)

predict = predict_mix.predict_model_split(lasso, X, target_minT, cn.value, 5)
for train, test in predict:
    # Same report for both halves of the fold: size first, then error metrics.
    for size_msg, fold_part in (("Train size {}", train), ("Test size {}", test)):
        print(size_msg.format(fold_part.shape[0]))
        predict_mix.print_mean(fold_part[[cn.value]], fold_part[['prediction']])
    print()
    # Optional per-fold scatter of the test frame (disabled):
    # test.plot(style=".")



Train size 138
Mean squared error 8.80328
Mean absolute error 2.37742
Median absolute error 2.02186
Test size 136
Mean squared error 24.90841
Mean absolute error 4.13225
Median absolute error 3.38528

Train size 274
Mean squared error 10.56489
Mean absolute error 2.58347
Median absolute error 2.11871
Test size 136
Mean squared error 12.82862
Mean absolute error 2.66833
Median absolute error 2.09462

Train size 410
Mean squared error 10.62899
Mean absolute error 2.53128
Median absolute error 1.98802
Test size 136
Mean squared error 18.63690
Mean absolute error 3.31074
Median absolute error 2.54739

Train size 546
Mean squared error 11.98781
Mean absolute error 2.67876
Median absolute error 2.03142
Test size 136
Mean squared error 13.79657
Mean absolute error 2.68313
Median absolute error 1.90922

Train size 682
Mean squared error 12.36259
Mean absolute error 2.67504
Median absolute error 2.04128
Test size 136
Mean squared error 22.17259
Mean absolute error 3.78020
Median absolute error 



In [9]:
# Stepwise feature selection: pick a subset of feature columns and build the
# reduced feature frame X_select from it.
# NOTE(review): SelectFromModel is imported but not used in this cell.
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LinearRegression, Lasso
from stepwise import stepwise_selection

# NOTE(review): selection runs on every pre-2016 row, including rows that the
# split evaluation of X_select later uses as test folds, so those test errors
# may be optimistic — confirm whether selection should be refit per fold.
# NOTE(review): the recorded output shows pandas FutureWarnings for
# argmin/argmax raised from inside stepwise_selection; any fix belongs in the
# stepwise module, not here.
selectColumns = stepwise_selection(X, target_minT)
X_select = X.loc[:, selectColumns]

# Preview the first rows of the reduced feature frame.
X_select.head()

will be corrected to return the positional minimum in the future.
Use 'series.values.argmin' to get the position of the minimum now.
  best_feature = new_pval.argmin()


Add  avg td2                        with p-value 0.0
Add  avg rh2                        with p-value 3.65094e-23
Add  avg dudy500                    with p-value 1.72192e-12
Add  avg v850                       with p-value 1.11523e-07
Add  avg u500                       with p-value 7.62448e-09
Add  avg v10                        with p-value 4.26742e-09
Add  avg advT2                      with p-value 4.3426e-06
Add  avg lfc                        with p-value 3.42287e-05
Drop avg v850                       with p-value 0.0795595


will be corrected to return the positional maximum in the future.
Use 'series.values.argmax' to get the position of the maximum now.
  worst_feature = pvalues.argmax()


Unnamed: 0,avg td2,avg rh2,avg dudy500,avg u500,avg v10,avg advT2,avg lfc
2013-01-03,-34.6015,56.202,2e-06,-2.346295,-3.449845,-0.000594,17058.8
2013-01-04,-32.32505,57.4688,-7e-06,-2.604466,-2.995815,-0.000611,17146.1
2013-01-05,-35.31215,57.06995,6e-06,-7.25115,-4.02915,-0.000278,17178.3
2013-01-06,-37.154,56.4544,-4.2e-05,-6.219925,-4.04663,-0.000392,17171.85
2013-01-08,-35.5122,60.43055,-1.5e-05,-4.10986,-3.67926,-0.000464,17182.0


In [10]:
# Plain least squares again, this time restricted to the stepwise-selected
# columns, evaluated with the same 5-way split.
predict = predict_mix.predict_model_split(LinearRegression(), X_select, target_minT, cn.value, 5)
for train, test in predict:
    # Same report for both halves of the fold: size first, then error metrics.
    for size_msg, fold_part in (("Train size {}", train), ("Test size {}", test)):
        print(size_msg.format(fold_part.shape[0]))
        predict_mix.print_mean(fold_part[[cn.value]], fold_part[['prediction']])
    print()

# Test MSE per fold from the recorded run (rounded): 16.3 8.4 16.5 12.3 15.3

Train size 138
Mean squared error 7.73377
Mean absolute error 2.13116
Median absolute error 1.68869
Test size 136
Mean squared error 16.29366
Mean absolute error 3.28792
Median absolute error 2.94934

Train size 274
Mean squared error 8.64822
Mean absolute error 2.29851
Median absolute error 1.83068
Test size 136
Mean squared error 8.36729
Mean absolute error 2.28118
Median absolute error 1.87624

Train size 410
Mean squared error 8.41540
Mean absolute error 2.25507
Median absolute error 1.73970
Test size 136
Mean squared error 16.54398
Mean absolute error 3.05361
Median absolute error 2.42193

Train size 546
Mean squared error 9.81954
Mean absolute error 2.41402
Median absolute error 1.91175
Test size 136
Mean squared error 12.33867
Mean absolute error 2.66652
Median absolute error 2.06319

Train size 682
Mean squared error 10.30514
Mean absolute error 2.46048
Median absolute error 1.93369
Test size 136
Mean squared error 15.30420
Mean absolute error 3.07858
Median absolute error 2.35