In [1]:
import numpy as np
import pandas as pd
import scipy.sparse
import xgboost as xgb
import mix_pandas as mix
import predict as predict_mix
import db_column_name as db

%matplotlib inline
import matplotlib.pyplot as plt
%config InlineBackend.figure_format = 'svg'

from pylab import rcParams
rcParams['figure.figsize'] = 10, 7


# Column-name constants provided by the project's db_column_name module.
cn = db.ColumnName()

# --- Target series -------------------------------------------------------
target_minT = pd.read_csv('./data/31286_103.csv')
# NOTE(review): the return value is not used, so this helper presumably
# installs the date index in place -- confirm in mix_pandas.
mix.set_index_date(target_minT, cn.date)
# Snap target timestamps to whole days.
target_minT.index = target_minT.index.round('D')

# --- Candidate features --------------------------------------------------
X = pd.read_csv('./data/character_31286.csv')
mix.set_index_date(X, cn.date)

X = X.drop(columns=[cn.point])
# Keep only the 'avg' columns plus the offset column, which is still needed
# by mean_day and dropped right after it.
avg_or_offset = [name for name in X.columns
                 if 'avg' in name or name == cn.offset]
X = X[avg_or_offset]
X = mix.mean_day(X).drop(columns=[cn.offset])

# --- Align target and features on a common index -------------------------
# Order matters: each frame is reindexed onto the other and then cleaned, so
# that both end up with the same rows.
# NOTE(review): mix.clean presumably discards unusable rows -- confirm.
target_minT = mix.clean(target_minT.reindex(X.index))
X = mix.clean(X.reindex(target_minT.index))

# NOTE(review): [0.1, 0.80] look like lower/upper bounds for the winsorizing
# helper -- confirm their meaning in mix_pandas.
target_minT = mix.winsorized(target_minT, cn.value, [0.1, 0.80])
X = X.reindex(target_minT.index)
print(X.shape)


(1369, 54)


In [2]:
# Restrict features and target to observations dated before 2016.
X = X.loc[X.index.year < 2016]
target_minT = target_minT.loc[target_minT.index.year < 2016]
print(X.shape)

(736, 54)


In [3]:
import statsmodels.api as sm

# from https://datascience.stackexchange.com/questions/24405/how-to-do-stepwise-regression-using-sklearn?rq=1
def stepwise_selection(X, y, 
                       initial_list=[], 
                       threshold_in=0.01, 
                       threshold_out = 0.05, 
                       verbose=True):
    """Perform a forward-backward feature selection based on OLS p-values.

    Each iteration first tries to add the excluded feature with the smallest
    p-value (forward step), then tries to drop the included feature with the
    largest p-value (backward step). The loop stops once an iteration makes
    no change. Models are fitted with ``statsmodels.api.OLS`` on the chosen
    columns plus an intercept.

    Arguments:
        X - pandas.DataFrame with candidate features
        y - list-like with the target
        initial_list - list of features to start with (column names of X)
        threshold_in - include a feature if its p-value < threshold_in
        threshold_out - exclude a feature if its p-value > threshold_out
        verbose - whether to print the sequence of inclusions and exclusions
    Returns: list of selected features (column labels of X)
    Always set threshold_in < threshold_out to avoid infinite looping.
    See https://en.wikipedia.org/wiki/Stepwise_regression for the details
    """
    # Copy so that neither the caller's list nor the shared default is mutated.
    included = list(initial_list)
    while True:
        changed = False

        # forward step
        # Iterate in column order (not set order) so ties resolve deterministically.
        excluded = [column for column in X.columns if column not in included]
        if excluded:
            # Explicit dtype: an empty Series without dtype is object-typed
            # (and warns) on recent pandas.
            new_pval = pd.Series(index=excluded, dtype=float)
            for new_column in excluded:
                model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included + [new_column]]))).fit()
                new_pval[new_column] = model.pvalues[new_column]
            best_pval = new_pval.min()
            if best_pval < threshold_in:
                # idxmin returns the column *label*; argmin returns a position
                # on current pandas and would put an integer into `included`.
                best_feature = new_pval.idxmin()
                included.append(best_feature)
                changed = True
                if verbose:
                    print('Add  {:30} with p-value {:.6}'.format(best_feature, best_pval))

        # backward step
        # With nothing included there is nothing to drop, so skip the fit.
        if included:
            model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included]))).fit()
            # use all coefs except intercept
            pvalues = model.pvalues.iloc[1:]
            worst_pval = pvalues.max()
            if worst_pval > threshold_out:
                changed = True
                # idxmax for the same label-vs-position reason as above.
                worst_feature = pvalues.idxmax()
                included.remove(worst_feature)
                if verbose:
                    print('Drop {:30} with p-value {:.6}'.format(worst_feature, worst_pval))

        if not changed:
            break
    return included


In [4]:
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LinearRegression, Lasso

# Run stepwise selection with its default thresholds and keep only the
# chosen predictor columns.
selectColumns = stepwise_selection(X=X, y=target_minT)
X_select = X.loc[:, selectColumns]

# Preview the first rows of the reduced feature matrix.
X_select.head(5)

  return getattr(obj, method)(*args, **kwds)
The current behaviour of 'Series.argmin' is deprecated, use 'idxmin'
instead.
The behavior of 'argmin' will be corrected to return the positional
minimum in the future. For now, use 'series.values.argmin' or
'np.argmin(np.array(values))' to get the position of the minimum
row.


Add  avg t2                         with p-value 0.0
Add  avg tdd2                       with p-value 1.11962e-12
Add  avg dudy500                    with p-value 6.27055e-12
Add  avg v850                       with p-value 5.25785e-09
Add  avg u500                       with p-value 3.13872e-06
Add  avg v10                        with p-value 9.26007e-07
Add  avg advT2                      with p-value 2.0861e-05
Add  avg lfc                        with p-value 0.00016039
Add  avg t500                       with p-value 0.000103352
Add  avg rh2                        with p-value 0.000403877


Unnamed: 0,avg t2,avg tdd2,avg dudy500,avg v850,avg u500,avg v10,avg advT2,avg lfc,avg t500,avg rh2
2013-01-03,-28.59545,6.006125,2e-06,-13.8658,-2.346295,-3.449845,-0.000594,17058.8,-40.9627,56.202
2013-01-04,-26.42975,5.895285,-7e-06,-10.749225,-2.604466,-2.995815,-0.000611,17146.1,-40.88725,57.4688
2013-01-05,-29.51275,5.79934,6e-06,-14.56615,-7.25115,-4.02915,-0.000278,17178.3,-41.31615,57.06995
2013-01-06,-31.34385,5.81015,-4.2e-05,-15.01285,-6.219925,-4.04663,-0.000392,17171.85,-42.6275,56.4544
2013-01-08,-30.326,5.186235,-1.5e-05,-15.4542,-4.10986,-3.67926,-0.000464,17182.0,-42.7425,60.43055


In [5]:
# Evaluate a linear regression on the selected features.
# NOTE(review): the trailing 5 is presumably the number of splits, which
# matches the five train/test reports printed -- confirm in predict.py.
predict = predict_mix.predict_model_split(
    LinearRegression(), X_select, target_minT, cn.value, 5
)
for train, test in predict:
    # Same report for both parts of the split: row count, then error metrics
    # of the 'prediction' column against the observed value column.
    for size_message, part in (("Train size {}", train), ("Test size {}", test)):
        print(size_message.format(part.shape[0]))
        predict_mix.print_mean(part[[cn.value]], part[['prediction']])
    print()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item_labels[indexer[info_axis]]] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead



Train size 126
Mean squared error 7.55672
Mean absolute error 2.07618
Median absolute error 1.50859
Test size 122
Mean squared error 14.67905
Mean absolute error 2.98388
Median absolute error 2.52009

Train size 248
Mean squared error 8.52872
Mean absolute error 2.22596
Median absolute error 1.71028
Test size 122
Mean squared error 7.31216
Mean absolute error 2.17821
Median absolute error 1.97104

Train size 370
Mean squared error 7.95004
Mean absolute error 2.18135
Median absolute error 1.74207
Test size 122
Mean squared error 17.74742
Mean absolute error 3.21275
Median absolute error 2.51100

Train size 492
Mean squared error 9.54212
Mean absolute error 2.37486
Median absolute error 1.91452
Test size 122
Mean squared error 11.09497
Mean absolute error 2.50078
Median absolute error 2.19905

Train size 614
Mean squared error 9.82314
Mean absolute error 2.39900
Median absolute error 1.96894
Test size 122
Mean squared error 14.09876
Mean absolute error 2.93957
Median absolute error 2.248

In [10]:
# Hyperparameters for the gradient-boosted regressor.
# NOTE(review): the values look hand-tuned; their origin is not recorded here.
params = dict(
    verbosity=0,
    max_depth=3,
    min_child_weight=6.01,
    gamma=9,
)
reg = xgb.XGBRegressor(**params)

# Same evaluation protocol as for the linear model, now with XGBoost.
predict = predict_mix.predict_model_split(reg, X_select, target_minT, cn.value, 5)
for train, test in predict:
    # Same report for both parts of the split: row count, then error metrics
    # of the 'prediction' column against the observed value column.
    for size_message, part in (("Train size {}", train), ("Test size {}", test)):
        print(size_message.format(part.shape[0]))
        predict_mix.print_mean(part[[cn.value]], part[['prediction']])
    print()

Train size 126
Mean squared error 2.06264
Mean absolute error 1.07959
Median absolute error 0.84872
Test size 122
Mean squared error 26.97014
Mean absolute error 4.49231
Median absolute error 4.47534

Train size 248
Mean squared error 2.26541
Mean absolute error 1.15872
Median absolute error 0.89543
Test size 122
Mean squared error 11.85111
Mean absolute error 2.66610
Median absolute error 1.96814

Train size 370
Mean squared error 2.61948
Mean absolute error 1.21410
Median absolute error 0.95198
Test size 122
Mean squared error 16.71151
Mean absolute error 3.06579
Median absolute error 1.94675

Train size 492
Mean squared error 2.87275
Mean absolute error 1.27766
Median absolute error 0.98573
Test size 122
Mean squared error 16.76639
Mean absolute error 2.87383
Median absolute error 1.94189

Train size 614
Mean squared error 3.75187
Mean absolute error 1.48172
Median absolute error 1.11994
Test size 122
Mean squared error 17.14166
Mean absolute error 3.29303
Median absolute error 2.84