In [92]:
import pandas as pd

import mix as mix
import db_column_name as db

import numpy as np
# Use the fully-qualified option name: the short 'precision' pattern is
# matched against every registered option and is rejected as ambiguous on
# pandas versions that also define 'styler.format.precision'.
pd.set_option('display.precision', 10)

import scipy.sparse
import pickle
import xgboost as xgb

%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt
%config InlineBackend.figure_format = 'svg'

from pylab import rcParams
# rcParams['figure.figsize'] = 7,5

# Valid offsets: 9, 21, 33, 45, 57, 69
cn = db.ColumnName()


def _read_indexed(path):
    """Read a CSV, let mix.set_index_date index it by cn.date, and sort by index."""
    frame = pd.read_csv(path)
    # set_index_date is called for its side effect on `frame` (its return
    # value is not used here).
    mix.set_index_date(frame, cn.date)
    return frame.sort_index()


target_minT = _read_indexed('./data/31286_103.csv')
X = _read_indexed('./data/character_31286.csv')

# Keep only the columns whose name contains 'avg', plus the offset column.
X = X.drop([cn.point], axis=1)
kept_columns = [x for x in X.columns if 'avg' in x or x == cn.offset]
X = X[kept_columns]

# Optional filters, currently disabled:
# X = X[X[cn.offset] == 69]
# X = mix.year_less_eq(X, 2016)

# Collapse all rows of the same calendar day into their mean.
calendar_day = [X.index.year, X.index.month, X.index.day]
X = X.groupby(calendar_day).mean()

# Rebuild a datetime index from the (year, month, day) group keys. The
# resulting index is unnamed; fft_ later relies on that when it reads the
# 'index' column produced by reset_index().
a = pd.DataFrame(X.index.tolist(), columns=['year', 'month', 'day'])
X.index = pd.to_datetime(a)

X = X.drop([cn.offset], axis=1)

# round('D') snaps each timestamp to the *nearest* day.
# NOTE(review): confirm nearest-day rounding (rather than floor) is intended.
target_minT.index = target_minT.index.round('D')

# Attach the target by index alignment, then hand the frame to
# mix.clean_dataset (presumably removes unusable rows - verify in mix).
X[cn.value] = target_minT
X = mix.clean_dataset(X)

print(X.shape)

(1810, 55)


In [93]:
# Keep an untouched copy of the target; a later cell copies it back.
X['winsorized'] = X[cn.value]

half_window = pd.to_timedelta(5, unit='day')

# The (date, value) pairs are snapshotted before the loop because rows are
# removed from X while looping. Each window is evaluated on the rows that
# are still present at that moment, so the result depends on visiting the
# dates in index order.
for day, value in list(X[cn.value].items()):
    in_window = (X.index >= day - half_window) & (X.index <= day + half_window)
    bounds = X.loc[in_window, [cn.value]].quantile([0.05, 0.95])
    lower, upper = bounds.iloc[0, 0], bounds.iloc[1, 0]

    # Drop the day when its value lies outside the local 5%-95% band.
    if value < lower or value > upper:
        X.drop([day], inplace=True)

print(X.shape)
# X[['winsorized', cn.value]].plot(style='.')

(1345, 56)


In [94]:
# Move the saved copy back into the target column; pop() removes the helper
# column and returns its values in one step.
X[cn.value] = X.pop('winsorized')

In [131]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import cross_validate

def fit(params, X, target, label_v):
    """Fit and score an XGBoost regressor on expanding time-series folds.

    For each of the 5 ``TimeSeriesSplit`` folds the model is trained on the
    fold's training rows and evaluated on *every* row outside the training
    set (not only the fold's own test slice).

    Parameters
    ----------
    params : dict
        Keyword arguments forwarded to ``xgb.XGBRegressor``.
    X : pandas.DataFrame
        Feature rows, indexed positionally by the fold indices.
    target : pandas.DataFrame
        Target frame with the same row order as ``X``; a column
        'XGB prediction' is added to per-fold copies of it.
    label_v
        Column label passed to ``mix.print_mean`` together with
        'XGB prediction'.
    """
    print("Fit timeSeriesSplit")
    n_rows = X.shape[0]
    for train_index, _split_test_index in TimeSeriesSplit(5).split(X):
        # Complement of the training rows. Set membership keeps this linear
        # instead of scanning the index array once per row.
        train_rows = set(train_index)
        test_index = [i for i in range(n_rows) if i not in train_rows]

        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        # Explicit copies: a prediction column is written below, and writing
        # to a slice triggers pandas' SettingWithCopyWarning.
        target_train = target.iloc[train_index].copy()
        target_test = target.iloc[test_index].copy()
        print("Train size : {} and test size : {}".format(X_train.shape[0], X_test.shape[0]))

        reg = xgb.XGBRegressor(**params)

        # Fit before the prediction column exists so it is not used as a target.
        reg.fit(X_train, target_train)

        target_test.loc[:, 'XGB prediction'] = reg.predict(X_test)
        target_train.loc[:, 'XGB prediction'] = reg.predict(X_train)

        mix.print_mean(target_test, target_train, label_v, 'XGB prediction')
        
def fit_on_year(params, X_clear, target_minT, year, label_v=None):
    """Train on the rows of a single calendar year and score on all others.

    Parameters
    ----------
    params : dict
        Keyword arguments forwarded to ``xgb.XGBRegressor``.
    X_clear : pandas.DataFrame
        Feature rows with a datetime index (``.index.year`` is used).
    target_minT : pandas.DataFrame
        Target frame sharing ``X_clear``'s index labels; a column
        'XGB prediction' is added to copies of its train/test parts.
    year : int
        Calendar year whose rows form the training set.
    label_v : optional
        Column label passed to ``mix.print_mean``. Defaults to ``cn.value``,
        which is what this function always used before the parameter existed.
    """
    if label_v is None:
        label_v = cn.value

    print("Fir on one year")
    in_year = X_clear.index.year == year
    X_train, X_test = X_clear[in_year], X_clear[~in_year]
    # Explicit copies: a prediction column is written below, and writing to
    # a slice triggers pandas' SettingWithCopyWarning.
    target_train = target_minT.loc[X_train.index].copy()
    target_test = target_minT.loc[X_test.index].copy()
    print("Train size : {} and test size : {}".format(X_train.shape[0], X_test.shape[0]))

    reg = xgb.XGBRegressor(**params)

    # Fit before the prediction column exists so it is not used as a target.
    reg.fit(X_train, target_train)

    target_test.loc[:, 'XGB prediction'] = reg.predict(X_test)
    target_train.loc[:, 'XGB prediction'] = reg.predict(X_train)
    mix.print_mean(target_test, target_train, label_v, 'XGB prediction')

#     target_test.plot(style=".")
    
    
# Baseline hyper-parameters for the (currently disabled) time-series-split
# run. Commented entries are alternatives left switched off.
default_params = dict(
    verbosity=0,
    max_depth=6,
    learning_rate=0.05,
    # min_child_weight=2,
    # subsample=0.8,
    # colsample_bytree=0.8,
    gamma=16,
)

# mix.getTarget returns a pair that is used below as (features, target).
X_clear, target_minT = mix.getTarget(X)
# fit(default_params, X_clear, target_minT, cn.value)

# Hyper-parameters for the single-training-year experiment.
params_one_year = dict(
    verbosity=0,
    max_depth=5,
    learning_rate=0.05,
    min_child_weight=6,
    # subsample=0.8,
    # colsample_bytree=0.8,
    # gamma=16,
)
fit_on_year(params_one_year, X_clear, target_minT, 2016)

Fir on one year
Train size : 275 and test size : 1070
Mean squared error on train 0.4149867168498243 and test 21.51979994918009


In [137]:
# Average the daily frame over consecutive 3-day bins and pass the result
# through mix.clean_dataset (presumably removes bins that came out empty -
# verify in mix).
X_ = mix.clean_dataset(X.resample('3D').mean())

# Hyper-parameters for the 3-day-mean experiments; commented entries are
# alternatives left switched off.
params_mean = dict(
    verbosity=0,
    max_depth=4,
    learning_rate=0.07,
    min_child_weight=10,
    # subsample=0.9,
    # colsample_bytree=0.8,
    # gamma=7,
)

X_3mean, target_3mean = mix.getTarget(X_)

# Expanding time-series folds first, then a single training year.
fit(params_mean, X_3mean, target_3mean, cn.value)

fit_on_year(params_mean, X_3mean, target_3mean, 2018)

Fit timeSeriesSplit
Train size : 104 and test size : 495
Mean squared error on train 0.15919259860779927 and test 47.04572816230289
Train size : 203 and test size : 396
Mean squared error on train 0.2956102540699622 and test 63.889006962874625
Train size : 302 and test size : 297
Mean squared error on train 0.5009030176122053 and test 57.86842328345396
Train size : 401 and test size : 198
Mean squared error on train 0.7500755935110011 and test 64.07858943910234
Train size : 500 and test size : 99
Mean squared error on train 1.1038015880288825 and test 16.022720140622383


In [149]:
def fft_(X):
    """Build frequency-indexed feature and target frames from ``X``.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame containing the ``cn.value`` column. Its index is expected to
        be unnamed, because the original index is read back from the
        'index' column that ``reset_index`` creates.

    Returns
    -------
    (fft_X, fft_target)
        ``fft_X`` is ``X`` (all columns, ``cn.value`` included) re-indexed
        by ``np.fft.fftfreq(N)``. ``fft_target`` has the columns
        'amplitude', 'phase' and ``cn.date`` (the original index values)
        and carries the same frequency index.
    """
    N = X.shape[0]
    # Single-column DataFrame; the original index becomes a column named 'index'.
    y = X[[cn.value]]
    y.reset_index(inplace=True)
    
    # NOTE(review): y[[cn.value]].values is an (N, 1) column and np.fft.fft
    # transforms along the last axis by default, so every transform covers
    # a single sample and Y is just the input values as complex numbers.
    # If a transform over time was intended this needs axis=0 (or a 1-D
    # input); the code that consumes 'amplitude'/'phase' currently relies
    # on the (N, 1) layout, so confirm before changing.
    Y = np.fft.fft(y[[cn.value]].values)
    # Sample frequencies for an N-point transform; used as the new index.
    index = np.fft.fftfreq(N)
    
    amplit_Y  = np.abs(Y)
    phase_Y = np.angle(Y)

    # Two (N, 1) columns placed side by side -> (N, 2).
    fft_target = pd.DataFrame(np.hstack((amplit_Y, phase_Y)), columns=['amplitude', 'phase']) 
    # Remember which original date each row came from.
    fft_target.loc[:, cn.date] = y['index']
    fft_target.set_index(index, inplace=True)
    # Features keep their row order; only the index labels change.
    fft_X = X.set_index(index)
    
    return fft_X, fft_target

# Frequency-indexed features/targets; the target column is removed from the
# feature frame so it cannot be used as a predictor.
fft_X, fft_target = fft_(X)
fft_X = fft_X.drop([cn.value], axis=1)

# Date-indexed pair from mix.getTarget; real_target is looked up by date later.
real_X, real_target = mix.getTarget(X)

def com(radii, angles):
    """Combine magnitudes and phase angles (radians) into complex numbers.

    The unit phasor ``exp(1j * angles)`` is rounded to 10 decimals before
    scaling, which removes floating-point residue such as the tiny
    imaginary part of ``exp(1j * pi)``.
    """
    unit_phasors = np.exp(1j * angles)
    cleaned = np.around(unit_phasors, 10)
    return radii * cleaned

def complex_(r, a):
    """Return ``com(r, a)`` as a complex column vector of shape (n, 1).

    ``r`` and ``a`` must combine into an object exposing ``.values``
    (e.g. pandas Series), since the result of ``com`` is unwrapped that way.
    """
    phasors = com(r, a)
    return np.array(phasors.values).reshape((-1, 1))

# Hyper-parameters for the amplitude regressor; commented entries are
# alternatives left switched off.
params = {
    'verbosity':0,
    'max_depth': 5,
    
    'learning_rate': 0.05,
    'min_child_weight': 6,
#     'subsample':0.8, 
#     'colsample_bytree':0.8,
#     'gamma': 12,
}

n_rows = fft_X.shape[0]
for train_index, _split_test_index in TimeSeriesSplit(5).split(fft_X):
    # Evaluate on every row outside the training fold. Set membership keeps
    # this linear instead of scanning the index array once per row.
    train_rows = set(train_index)
    test_index = [i for i in range(n_rows) if i not in train_rows]

    fft_X_train, fft_X_test = fft_X.iloc[train_index], fft_X.iloc[test_index]
    # Explicit copies: prediction columns are written below, and writing to
    # a slice triggers pandas' SettingWithCopyWarning.
    fft_target_train = fft_target.iloc[train_index].copy()
    fft_target_test = fft_target.iloc[test_index].copy()
    print("Train size : {} and test size : {}".format(fft_X_train.shape[0], fft_X_test.shape[0]))

    # Look up the date-indexed targets for the same rows via the stored dates.
    real_target_train = real_target.loc[fft_target_train[cn.date]].copy()
    real_target_test = real_target.loc[fft_target_test[cn.date]].copy()

    # The model predicts the amplitude only; the known phase is reused below.
    reg = xgb.XGBRegressor(**params)
    reg.fit(fft_X_train, fft_target_train.loc[:, 'amplitude'])

    fft_target_test['Predict am'] = reg.predict(fft_X_test)
    fft_target_train['Predict am'] = reg.predict(fft_X_train)

    # NOTE(review): complex_ returns a (k, 1) column and np.fft.ifft works
    # along the last axis by default, so each inverse transform covers a
    # single element and returns it unchanged. 'FFT prediction' is therefore
    # the real part of amplitude * exp(1j * phase). Left as is, because
    # changing the axis changes the experiment - confirm the intent.
    Y = complex_(fft_target_train.loc[:, 'Predict am'], fft_target_train.loc[:, 'phase'])
    real_target_train['FFT prediction'] = np.fft.ifft(Y).real

    Y = complex_(fft_target_test.loc[:, 'Predict am'], fft_target_test.loc[:, 'phase'])
    real_target_test['FFT prediction'] = np.fft.ifft(Y).real

    mix.print_mean(real_target_test, real_target_train, cn.value, 'FFT prediction')



Train size : 225 and test size : 1120
Mean squared error on train 0.2828700925151236 and test 41.91398933297258
Train size : 449 and test size : 896


  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \


Mean squared error on train 0.8669398073666773 and test 53.052260189825624
Train size : 673 and test size : 672


  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \


Mean squared error on train 1.5309101866801662 and test 54.837362523954525
Train size : 897 and test size : 448


  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \


Mean squared error on train 1.7534929090738978 and test 71.320018620157
Train size : 1121 and test size : 224


  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \


Mean squared error on train 2.079495022239914 and test 14.770041930705961
