In [5]:
import numpy as np
import pandas as pd
import scipy.sparse
import xgboost as xgb
import mix_pandas as mix
import predict as predict_mix
import db_column_name as db
import processing 

%matplotlib inline
import matplotlib.pyplot as plt
%config InlineBackend.figure_format = 'svg'

from pylab import rcParams
rcParams['figure.figsize'] = 10, 7


# Project-specific registry of column names used throughout the notebook.
cn = db.ColumnName()

# --- Load target and feature tables -------------------------------------
# set_index_date's return value is ignored, so it presumably re-indexes the
# frame in place by the date column -- TODO confirm against mix_pandas.
target_minT = pd.read_csv('./data/31286_103.csv')
mix.set_index_date(target_minT, cn.date)

X = pd.read_csv('./data/character_31286.csv')
mix.set_index_date(X, cn.date)

# --- Select feature columns ---------------------------------------------
# Discard the point column, then keep every column whose name contains
# 'avg' together with the offset column (still needed for the row filter).
X = X.drop(columns=[cn.point])
kept_columns = [name for name in X.columns if 'avg' in name or name == cn.offset]
X = X[kept_columns]

# --- Select rows ---------------------------------------------------------
# Only rows with offset 69 whose timestamp falls in hour 21 are used.
row_mask = (X[cn.offset] == 69) & (X.index.hour == 21)
X = X[row_mask]
print(X.shape)

# --- Bring both tables onto a daily index ---------------------------------
# mean_day is a project helper (by its name it aggregates per day -- verify);
# the target index is rounded to whole days so the two indexes can be matched.
X = mix.mean_day(X)
target_minT.index = target_minT.index.round('D')

# The offset column has served its purpose as a filter key.
X = X.drop(columns=[cn.offset])

# --- Align features and target -------------------------------------------
# Reindex each table on the other and pass it through mix.clean, so both
# end up sharing the index that survives cleaning.
target_minT = mix.clean(target_minT.reindex(X.index))
X = mix.clean(X.reindex(target_minT.index))

# Winsorize the target value column (arguments passed through unchanged),
# then re-align the features in case the helper altered the index.
target_minT = mix.winsorized(target_minT, cn.value, [0.05, 0.95], 5)
X = X.reindex(target_minT.index)

(1245, 55)


In [11]:
from scipy.fftpack import ifft, idct

# Frequency-domain features/target produced by the project helper.
fft_X, fft_target = processing.fft_target(X, target_minT, cn.value, 'index')

# Hyper-parameters of the gradient-boosted regressor.
params = dict(
    max_depth=4,
    learning_rate=0.09,
    min_child_weight=6,
)
reg = xgb.XGBRegressor(**params)

# One set of split predictions per spectral component; the same regressor
# object is handed to both calls.
predict_am = predict_mix.predict_model_split(reg, fft_X, fft_target, 'amplitude', 5)
predict_ph = predict_mix.predict_model_split(reg, fft_X, fft_target, 'phase', 5)

# Each element of predict_am / predict_ph is indexed as [0] -> train part,
# [1] -> test part; amplitude and phase results are walked in lockstep.
for am, ph in zip(predict_am, predict_ph):
    am_train, am_test = am[0], am[1]
    ph_train, ph_test = ph[0], ph[1]

    # Actual target rows matching the dates of each part.
    real_target_train = target_minT.loc[am_train[cn.date]]
    real_target_test = target_minT.loc[am_test[cn.date]]

    # Recombine predicted amplitude and phase, then invert the transform and
    # keep the real part as the reconstructed prediction.
    # NOTE(review): ifft transforms along the last axis by default -- confirm
    # the shape returned by reshapeToComplex is what is intended here.
    Y = processing.reshapeToComplex(am_train['prediction'], ph_train['prediction'])
    real_target_train['FFT prediction'] = ifft(Y).real

    Y = processing.reshapeToComplex(am_test['prediction'], ph_test['prediction'])
    real_target_test['FFT prediction'] = ifft(Y).real

    # Report the metrics for the train part first, then the test part.
    for label, frame in (("Train", real_target_train), ("Test", real_target_test)):
        print(label)
        predict_mix.print_mean(frame[[cn.value]], frame[['FFT prediction']])
    print()

Train
Mean squared error 0.12428
Mean absolute error 0.26724
Median absolute error 0.18159
Explained variance score 0.99877
Coefficient of determination 0.99877
Test
Mean squared error 35.59138
Mean absolute error 5.04989
Median absolute error 4.95448
Explained variance score 0.83457
Coefficient of determination 0.82174

Train
Mean squared error 0.57227
Mean absolute error 0.56389
Median absolute error 0.41179
Explained variance score 0.99628
Coefficient of determination 0.99628
Test
Mean squared error 45.71490
Mean absolute error 5.47925
Median absolute error 4.74163
Explained variance score 0.79855
Coefficient of determination 0.63373

Train
Mean squared error 1.08442
Mean absolute error 0.77121
Median absolute error 0.55925
Explained variance score 0.99305
Coefficient of determination 0.99303
Test
Mean squared error 81.66003
Mean absolute error 7.73220
Median absolute error 8.05464
Explained variance score 0.50179
Coefficient of determination 0.02240

Train
Mean squared error 1.9663

In [8]:
# DCT-domain features/target produced by the project helper; the dict is
# passed through as-is (it carries the 'ortho' normalisation setting).
dct_X, dct_target = processing.dct_target(X, target_minT, cn.value, {'norm':'ortho'})

# Hyper-parameters of the gradient-boosted regressor.
# Currently disabled alternatives: subsample=0.8, colsample_bytree=0.8, gamma=12.
params = dict(
    max_depth=4,
    learning_rate=0.09,
    min_child_weight=6,
)
reg = xgb.XGBRegressor(**params)

# NOTE: `idct` used below is not imported in this cell; it comes from the
# `from scipy.fftpack import ifft, idct` line of another cell.
predict = predict_mix.predict_model_split(reg, dct_X, dct_target, 'dct', 5)

# Every element of `predict` unpacks into a (train, test) pair.
for train, test in predict:
    # Actual target rows matching the dates of each part.
    target_train = target_minT.loc[train[cn.date]]
    target_test = target_minT.loc[test[cn.date]]

    # Invert the transform on the predicted coefficients.
    # NOTE(review): `[['prediction']]` selects a one-column frame, so Y is a
    # 2-D (n, 1) array, and scipy's idct works along the last axis by default.
    # Confirm this is the intended axis and that it mirrors what
    # processing.dct_target does on the forward side.
    Y = train[['prediction']].values
    target_train['DCT prediction'] = idct(Y, norm='ortho')

    Y = test[['prediction']].values
    target_test['DCT prediction'] = idct(Y, norm='ortho')

    # Report the metrics for the train part first, then the test part.
    for label, frame in (("Train", target_train), ("Test", target_test)):
        print(label)
        predict_mix.print_mean(frame[[cn.value]], frame[['DCT prediction']])
    print()

Train
Mean squared error 0.10370
Mean absolute error 0.23805
Median absolute error 0.16848
Explained variance score 0.99898
Coefficient of determination 0.99898
Test
Mean squared error 33.26360
Mean absolute error 4.87036
Median absolute error 4.69616
Explained variance score 0.84914
Coefficient of determination 0.83340

Train
Mean squared error 0.67315
Mean absolute error 0.60980
Median absolute error 0.48280
Explained variance score 0.99562
Coefficient of determination 0.99562
Test
Mean squared error 38.44508
Mean absolute error 4.59269
Median absolute error 3.51188
Explained variance score 0.74832
Coefficient of determination 0.69198

Train
Mean squared error 1.15373
Mean absolute error 0.78787
Median absolute error 0.56621
Explained variance score 0.99259
Coefficient of determination 0.99259
Test
Mean squared error 69.85124
Mean absolute error 7.07457
Median absolute error 7.12305
Explained variance score 0.51478
Coefficient of determination 0.16377

Train
Mean squared error 2.3005