In [1]:
import numpy as np
import pandas as pd
import scipy.sparse
import xgboost as xgb
import mix_pandas as mix
import predict as predict_mix
import db_column_name as db

%matplotlib inline
import matplotlib.pyplot as plt
%config InlineBackend.figure_format = 'svg'

from pylab import rcParams
rcParams['figure.figsize'] = 10, 7


cn = db.ColumnName()

target_minT = pd.read_csv('./data/31286_103.csv')
mix.set_index_date(target_minT, cn.date)

X = pd.read_csv('./data/character_31286.csv')
mix.set_index_date(X, cn.date)

X = X.drop([cn.point], axis=1)
X = X[[x for x in X.columns if 'avg' in x or 
       x == cn.offset]]
    
X = X[X[cn.offset] == 69]
X = X[X.index.hour == 21]
print(X.shape)

    
# X = mix.mean_day(X)
# target_minT.index = target_minT.index.round('D')

X = X.drop([cn.offset], axis=1)

target_minT = target_minT.reindex(X.index)
target_minT = mix.clean(target_minT)
X = X.reindex(target_minT.index)
X = mix.clean(X)

target_minT = mix.winsorized(target_minT, cn.value, [0.05, 0.95], 5)
X = X.reindex(target_minT.index)
print(X.shape)

(1245, 55)
(982, 54)


In [2]:
import pywt

def idwt(cA, cD):
    return  pywt.idwt(cA, cD, wavelet='db1', mode='constant')

cA, cD = pywt.dwt(target_minT.loc[:, cn.value].values, 
                  wavelet='db1', mode='constant')
# Approximation and detail coefficients.

cA, cD = pd.DataFrame(cA, columns=['c']), pd.DataFrame(cD, columns=['c'])

XA = mix.mean_pair(X)

x = []
size = X.shape[0]
for i in range(0, size, 2):
    if i + 1 < size:
            save = X.iloc[i:i + 2].diff(-1).fillna(0)
            save = save.iloc[0].values
            # iloc not include end in slice
            x.append(save)
    else:
        x.append(np.zeros(X.shape[1]))

XD = pd.DataFrame(x, columns=X.columns)

# XD = mix.diff_pair(X)

print(XA.shape)
print(XD.shape)

(491, 54)
(491, 54)


In [4]:
from sklearn.model_selection import TimeSeriesSplit

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

params = {
    'verbosity':0,
    'max_depth': 4,

    'learning_rate': 0.05,
    'min_child_weight': 6,
    'gamma': 12,
}

reg = xgb.XGBRegressor(**params)
predict_xa = predict_mix.predict_model_split(reg, XA, cA, 'c', 5)
predict_xd = predict_mix.predict_model_split(reg, XD, cD, 'c', 5)

for a, d in zip(predict_xa, predict_xd):
    # foo
    train_size = a[0].size
    test_size = a[1].size
    target_train = target_minT.iloc[0:train_size]
    target_test = target_minT.iloc[train_size - 1:train_size + test_size - 1]
    
    target_train['prediction'] = idwt(a[0].loc[:, 'prediction'], d[0].loc[:, 'prediction'])

    target_test['prediction'] = idwt(a[1].loc[:, 'prediction'], d[1].loc[:, 'prediction'])
    
    print("Train")
    predict_mix.print_mean(target_train[[cn.value]], target_train[['prediction']])
    print("Test")
    predict_mix.print_mean(target_test[[cn.value]], target_test[['prediction']])
    print()

Train
Mean squared error 2.01071
Mean absolute error 0.99006
Median absolute error 0.76604
Explained variance score 0.98252
Coefficient of determination 0.98249
Test
Mean squared error 25.68673
Mean absolute error 4.09719
Median absolute error 3.71314
Explained variance score 0.88563
Coefficient of determination 0.87269

Train
Mean squared error 1.69858
Mean absolute error 0.95618
Median absolute error 0.69611
Explained variance score 0.98923
Coefficient of determination 0.98922
Test
Mean squared error 53.14613
Mean absolute error 5.05543
Median absolute error 3.00534
Explained variance score 0.54678
Coefficient of determination 0.52256

Train
Mean squared error 1.78165
Mean absolute error 0.99462
Median absolute error 0.76952
Explained variance score 0.98864
Coefficient of determination 0.98863
Test
Mean squared error 91.16791
Mean absolute error 8.20762
Median absolute error 8.64484
Explained variance score 0.32603
Coefficient of determination -0.22537

Train
Mean squared error 2.170