In [2]:
import numpy as np
import pandas as pd
import scipy.sparse
import xgboost as xgb
import mix_pandas as mix
import predict as predict_mix
import db_column_name as db

%matplotlib inline
import matplotlib.pyplot as plt
%config InlineBackend.figure_format = 'svg'

from pylab import rcParams
rcParams['figure.figsize'] = 10, 7


cn = db.ColumnName()

target_minT = pd.read_csv('./data/31286_103.csv')
mix.set_index_date(target_minT, cn.date)

X = pd.read_csv('./data/character_31286.csv')
mix.set_index_date(X, cn.date)

X = X.drop([cn.point], axis=1)
X = X[[x for x in X.columns if 'avg' in x or 
       x == cn.offset]]
    
X = mix.mean_day(X)
target_minT.index = target_minT.index.round('D')

X = X.drop([cn.offset], axis=1)

target_minT = target_minT.reindex(X.index)
target_minT = mix.clean(target_minT)
X = X.reindex(target_minT.index)
X = mix.clean(X)

target_minT = mix.winsorized(target_minT, cn.value, [0.05, 0.95])
X = X.reindex(target_minT.index)
print(X.shape)

(1517, 54)


In [3]:
X = X[X.index.year < 2016]
target_minT = target_minT[target_minT.index.year < 2016]
print(X.shape)

(818, 54)


In [4]:
import pywt

def idwt(cA, cD):
    return  pywt.idwt(cA, cD, wavelet='db1', mode='constant')

cA, cD = pywt.dwt(target_minT.loc[:, cn.value].values, 
                  wavelet='db1', mode='constant')
# Approximation and detail coefficients.

cA, cD = pd.DataFrame(cA, columns=['c']), pd.DataFrame(cD, columns=['c'])

XA = mix.mean_pair(X)

x = []
size = X.shape[0]
for i in range(0, size, 2):
    if i + 1 < size:
            save = X.iloc[i:i + 2].diff(-1).fillna(0)
            save = save.iloc[0].values
            # iloc not include end in slice
            x.append(save)
    else:
        x.append(np.zeros(X.shape[1]))

XD = pd.DataFrame(x, columns=X.columns)

# XD = mix.diff_pair(X)

print(XA.shape)
print(XD.shape)

(409, 54)
(409, 54)


In [8]:
from sklearn.model_selection import TimeSeriesSplit

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

params = {
    'verbosity':0,
    'max_depth': 4,

    'learning_rate': 0.05,
    'min_child_weight': 6,
    'gamma': 12,
}

reg = xgb.XGBRegressor(**params)
predict_xa = predict_mix.predict_model_split(reg, XA, cA, 'c', 5)
predict_xd = predict_mix.predict_model_split(reg, XD, cD, 'c', 5)

for a, d in zip(predict_xa, predict_xd):
    # foo
    train_size = a[0].size
    test_size = a[1].size
    target_train = target_minT.iloc[0:train_size]
    target_test = target_minT.iloc[train_size - 1:train_size + test_size - 1]
    
    target_train['prediction'] = idwt(a[0].loc[:, 'prediction'], d[0].loc[:, 'prediction'])

    target_test['prediction'] = idwt(a[1].loc[:, 'prediction'], d[1].loc[:, 'prediction'])
    
    print("Train")
    predict_mix.print_mean(target_train[[cn.value]], target_train[['prediction']])
    print("Test")
    predict_mix.print_mean(target_test[[cn.value]], target_test[['prediction']])
    print()

Train
Mean squared error 1.66698
Mean absolute error 0.96796
Median absolute error 0.72543
Test
Mean squared error 47.10500
Mean absolute error 6.06038
Median absolute error 6.05283

Train
Mean squared error 1.76473
Mean absolute error 0.96584
Median absolute error 0.74295
Test
Mean squared error 14.14659
Mean absolute error 2.77477
Median absolute error 2.18175

Train
Mean squared error 1.72371
Mean absolute error 0.95676
Median absolute error 0.69336
Test
Mean squared error 18.20758
Mean absolute error 3.40923
Median absolute error 3.00700

Train
Mean squared error 1.70499
Mean absolute error 0.97775
Median absolute error 0.75722
Test
Mean squared error 23.11527
Mean absolute error 3.37837
Median absolute error 2.26793

Train
Mean squared error 1.95549
Mean absolute error 1.02853
Median absolute error 0.78084
Test
Mean squared error 22.30374
Mean absolute error 3.73372
Median absolute error 3.18694

