In [2]:
import numpy as np
import pandas as pd
import scipy.sparse
import xgboost as xgb
import mix_pandas as mix
import predict as predict_mix
import db_column_name as db

%matplotlib inline
import matplotlib.pyplot as plt
%config InlineBackend.figure_format = 'svg'

from pylab import rcParams
rcParams['figure.figsize'] = 10, 7


# Column-name registry shared by every cell below.
cn = db.ColumnName()

# --- Target series ---------------------------------------------------------
target_minT = pd.read_csv('./data/31286_103.csv')
# Return value is ignored, so set_index_date presumably re-indexes in place
# by the date column — TODO confirm against mix_pandas.
mix.set_index_date(target_minT, cn.date)

# --- Predictor matrix ------------------------------------------------------
X = pd.read_csv('./data/character_31286.csv')
mix.set_index_date(X, cn.date)

# Drop the point column, then keep only columns whose name contains 'avg'
# plus the offset column (still needed for the row filter just below).
X = X.drop(columns=[cn.point])
kept_columns = [name for name in X.columns if 'avg' in name or name == cn.offset]
X = X[kept_columns]

# Row filter, step 1: offset equal to 69.
offset_is_69 = X[cn.offset] == 69
X = X.loc[offset_is_69]
# Row filter, step 2: index timestamps whose hour is 21.
hour_is_21 = X.index.hour == 21
X = X.loc[hour_is_21]
print(X.shape)

# Alternative that was tried and left disabled:
# X = mix.mean_day(X)
# target_minT.index = target_minT.index.round('D')

# The offset column has served its purpose as a filter key.
X = X.drop(columns=[cn.offset])

# Align target to the predictors, clean it, then align and clean the
# predictors against whatever target rows are left.
target_minT = mix.clean(target_minT.reindex(X.index))
X = mix.clean(X.reindex(target_minT.index))
print(X.shape)

# Skip the first three target rows.
target_minT = target_minT.iloc[3:]  # remove on change

# Winsorize the target value column, then keep only the predictor rows that
# still have a target.
# NOTE(review): [0.05, 0.95] look like quantile limits; the meaning of the
# trailing 5 is not visible from this notebook — check mix.winsorized.
target_minT = mix.winsorized(target_minT, cn.value, [0.05, 0.95], 5)
X = X.reindex(target_minT.index)
print(X.shape)

(1245, 55)
(1195, 54)
(979, 54)


In [6]:
from sklearn.linear_model import LinearRegression, Lasso, RidgeCV

# Baseline: ordinary least squares on the full predictor matrix.
# The recorded output shows five (train, test) pairs with a growing train
# size, so the trailing 5 is presumably the number of splits — TODO confirm.
predict = predict_mix.predict_model_split(LinearRegression(), X, target_minT, cn.value, 5)
for train, test in predict:
    # Identical report for each half of the split: row count, then the error
    # metrics of the 'prediction' column against the target value column.
    for size_line, part in (("Train size {}", train), ("Test size {}", test)):
        print(size_line.format(len(part)))
        predict_mix.print_mean(part[[cn.value]], part[['prediction']])
    print()

Train size 164
Mean squared error 8.09336
Mean absolute error 2.19135
Median absolute error 1.62552
Test size 163
Mean squared error 25.62611
Mean absolute error 4.18874
Median absolute error 3.72096

Train size 327
Mean squared error 10.57242
Mean absolute error 2.56344
Median absolute error 2.15397
Test size 163
Mean squared error 5075411572713706.00000
Mean absolute error 65551310.19407
Median absolute error 77426546.89923

Train size 490
Mean squared error 26.66813
Mean absolute error 3.96104
Median absolute error 3.11944
Test size 163
Mean squared error 796.50021
Mean absolute error 24.65002
Median absolute error 24.54896

Train size 653
Mean squared error 38.79072
Mean absolute error 5.04624
Median absolute error 4.09525
Test size 163
Mean squared error 83.13400
Mean absolute error 5.35997
Median absolute error 3.37880

Train size 816
Mean squared error 35.37743
Mean absolute error 4.64149
Median absolute error 3.63699
Test size 163
Mean squared error 28.85482
Mean absolute error

In [8]:
# L1-regularised linear model, evaluated with the same splitting helper as
# the plain linear regression.
lasso = Lasso(alpha=5, tol=0.001)

predict = predict_mix.predict_model_split(lasso, X, target_minT, cn.value, 5)
for train, test in predict:
    # Identical report for each half of the split: row count, then the error
    # metrics of the 'prediction' column against the target value column.
    for size_line, part in (("Train size {}", train), ("Test size {}", test)):
        print(size_line.format(len(part)))
        predict_mix.print_mean(part[[cn.value]], part[['prediction']])
    print()
    # Optional visual check of the test part (disabled):
    # test.plot(style=".")



Train size 164
Mean squared error 12.37245
Mean absolute error 2.63223
Median absolute error 1.99496
Test size 163
Mean squared error 19.24634
Mean absolute error 3.61970
Median absolute error 3.37329

Train size 327
Mean squared error 13.24526
Mean absolute error 2.79441
Median absolute error 2.28381
Test size 163
Mean squared error 250.84859
Mean absolute error 12.69916
Median absolute error 11.22717

Train size 490
Mean squared error 34.80907
Mean absolute error 4.78556
Median absolute error 4.26727
Test size 163
Mean squared error 414.19956
Mean absolute error 18.50522
Median absolute error 19.00003

Train size 653
Mean squared error 41.46617
Mean absolute error 5.19374
Median absolute error 4.38162
Test size 163
Mean squared error 22.74767
Mean absolute error 3.55582
Median absolute error 2.47854

Train size 816
Mean squared error 36.70484
Mean absolute error 4.68980
Median absolute error 3.64193
Test size 163
Mean squared error 17.98522
Mean absolute error 3.57759
Median absolute

In [9]:
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LinearRegression, Lasso
from stepwise import stepwise_selection

# Stepwise feature selection over all predictors. The log it prints
# ("Add <feature> with p-value ...") shows features being added one at a time.
# NOTE(review): selection runs on the full X / target_minT, including rows that
# are later used as test folds when X_select is evaluated — possible selection
# leakage into those scores; confirm whether that is intended.
# NOTE(review): the recorded output carries a pandas deprecation warning for
# `new_pval.argmin()`; the fix (use `idxmin`) belongs in the stepwise module.
selectColumns = stepwise_selection(X, target_minT)
# Keep every row, but only the columns chosen above.
X_select = X.loc[:, selectColumns]

# Preview the reduced predictor matrix.
X_select.head()

  return getattr(obj, method)(*args, **kwds)
The current behaviour of 'Series.argmin' is deprecated, use 'idxmin'
instead.
The behavior of 'argmin' will be corrected to return the positional
minimum in the future. For now, use 'series.values.argmin' or
'np.argmin(np.array(values))' to get the position of the minimum
row.
  best_feature = new_pval.argmin()


Add  avg t2                         with p-value 2.30822e-264
Add  avg rh2                        with p-value 2.22681e-56
Add  avg v850                       with p-value 4.69371e-10
Add  avg u500                       with p-value 0.00231836
Add  avg rot500                     with p-value 0.00957776


Unnamed: 0_level_0,avg t2,avg rh2,avg v850,avg u500,avg rot500
actual_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2014-06-20 21:00:00,16.7912,73.5068,-15.08,-2.82198,-2.5e-05
2014-10-28 21:00:00,-15.389,73.8041,-4.27446,-11.3359,8e-05
2014-10-30 21:00:00,-22.3096,100.0,-10.2639,21.1847,-5.3e-05
2014-11-01 21:00:00,-9.84371,89.4535,2.18367,12.2031,4.3e-05
2014-11-02 21:00:00,-6.28352,92.6628,3.07034,5.90434,4e-05


In [10]:
# Ordinary least squares again, restricted to the stepwise-selected columns.
predict = predict_mix.predict_model_split(LinearRegression(), X_select, target_minT, cn.value, 5)
for train, test in predict:
    # Identical report for each half of the split: row count, then the error
    # metrics of the 'prediction' column against the target value column.
    for size_line, part in (("Train size {}", train), ("Test size {}", test)):
        print(size_line.format(len(part)))
        predict_mix.print_mean(part[[cn.value]], part[['prediction']])
    print()

# Numbers noted by the author (presumably per-fold scores from an earlier
# run — TODO confirm which metric):
# 16.3 8.4 16.5 12.3 15.3

Train size 164
Mean squared error 12.96202
Mean absolute error 2.64366
Median absolute error 1.87312
Test size 163
Mean squared error 16.18719
Mean absolute error 3.24427
Median absolute error 2.84261

Train size 327
Mean squared error 13.23825
Mean absolute error 2.77780
Median absolute error 2.12382
Test size 163
Mean squared error 627.24019
Mean absolute error 20.83204
Median absolute error 20.27015

Train size 490
Mean squared error 44.07482
Mean absolute error 5.53000
Median absolute error 5.02905
Test size 163
Mean squared error 190.35868
Mean absolute error 12.38325
Median absolute error 12.46096

Train size 653
Mean squared error 42.47259
Mean absolute error 5.29771
Median absolute error 4.52335
Test size 163
Mean squared error 20.90832
Mean absolute error 3.51831
Median absolute error 2.77283

Train size 816
Mean squared error 37.32745
Mean absolute error 4.76260
Median absolute error 3.90912
Test size 163
Mean squared error 16.33273
Mean absolute error 3.31838
Median absolute