In [41]:
import pandas as pd

import mix as mix
import db_column_name as db

import numpy as np
pd.set_option('precision', 10)

import scipy.sparse
import pickle
import xgboost as xgb

%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt
%config InlineBackend.figure_format = 'svg'

from pylab import rcParams
rcParams['figure.figsize'] = 20,10

# offset can be 9, 21. 33, 45, 57, 69
cn = db.ColumnName()

target_minT = pd.read_csv('./data/31286_103.csv')
mix.set_index_date(target_minT, cn.date)
target_minT = target_minT.sort_index()

X = pd.read_csv('./data/character_31286.csv')
mix.set_index_date(X, cn.date)
X = X.sort_index()

X = X.drop([cn.point], axis=1)
X = X[[x for x in X.columns if 'avg' in x or 
       x == cn.offset]]

X = X.groupby([X.index.year, 
           X.index.month, 
           X.index.day]).mean()
a = pd.DataFrame(X.index.tolist(), 
                 columns=['year','month','day'])
X.index = pd.to_datetime(a)
X = X.drop([cn.offset], axis=1)

target_minT.index = target_minT.index.round('D')

X[cn.value] = target_minT
X = mix.clean_dataset(X)

print(X.shape)

(1810, 55)


In [42]:
X['winsorized'] = X[cn.value]
for index, row in X.iterrows(): 
    
    offset_day = pd.to_timedelta(5, unit='day')
    start_date = index - offset_day
    end_date = index + offset_day
    
    s = X.iloc[(X.index >= start_date) & (X.index <= end_date)]
    s = s[[cn.value]]
    
    q = s.quantile([0.1, 0.80])
    
    v = row[cn.value]
    if (v < q.iloc[0, 0]):
        X.loc[index, 'winsorized'] = q.iloc[0, 0]
    elif (v > q.iloc[1, 0]):
        X.loc[index, 'winsorized'] = q.iloc[1, 0]
    
# X[['winsorized', cn.value]].plot(style='.')

In [43]:
X[cn.value] = X['winsorized']
X = X.drop(['winsorized'], axis=1)

In [44]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import cross_validate

def fit(params, X, target):
    reg = xgb.XGBRegressor(**params)

    scores = cross_validate(reg, X, target, 
                        scoring=['neg_mean_squared_error', 'neg_median_absolute_error', 'neg_mean_absolute_error'],
                        cv=TimeSeriesSplit(10), 
                        return_train_score=True)
    print()
    print("Cross validate test mean squared error : {}".format(scores['test_neg_mean_squared_error']))
    print()
    print("Cross validate train mean squared error : {}".format(scores['train_neg_mean_squared_error']))
    print()
    print("Cross validate test mean median absolute error : {}".format(scores['test_neg_median_absolute_error']))
    print()
    print("Cross validate train mean median absolute error : {}".format(scores['train_neg_median_absolute_error']))

    
default_params = {
    'verbosity':0,
    'max_depth': 4,
    
    'learning_rate': 0.09,
    'min_child_weight': 6,
    'subsample':0.8, 
    'colsample_bytree':0.8,
    'gamma': 16,
}

X_clear, target_minT = mix.getTarget(X)
fit(default_params, X_clear, target_minT)



Cross validate test mean squared error : [-25.55635682 -10.43952444 -24.77250429 -11.78253331 -15.13415279
 -65.09807874 -52.51214934 -17.08560747 -13.41637466  -9.11633933]

Cross validate train mean squared error : [-1.34047901 -1.29065682 -1.48994949 -1.8859372  -2.10760312 -2.37743797
 -2.76512495 -3.12403464 -3.42772926 -3.85611925]

Cross validate test mean median absolute error : [-4.82776089 -1.65562139 -2.61124504 -2.08518081 -2.67502299 -2.95949335
 -5.2790432  -2.66671221 -2.73017421 -1.84698048]

Cross validate train mean median absolute error : [-0.63990345 -0.72693601 -0.79639465 -0.84213772 -0.88881044 -0.96324134
 -1.05097934 -1.07788138 -1.12365894 -1.19750862]


In [47]:
X_ = X.resample('3D').mean()
X_ = mix.clean_dataset(X_)

params_mean = {
    'verbosity':0,
    'max_depth': 4,
    
    'learning_rate': 0.09,
    'min_child_weight': 6,
#     'subsample':0.9, 
#     'colsample_bytree':0.8,
    'gamma': 12,
}

X_3mean, target_3mean = mix.getTarget(X_)
fit(params_mean, X_3mean, target_3mean)


Cross validate test mean squared error : [-24.96574706  -5.988681   -13.43240133  -5.6567297  -12.39570799
 -52.38259194 -53.07626979 -13.77429252 -14.87735063  -7.75142376]

Cross validate train mean squared error : [-1.28235769 -1.19283124 -1.16538668 -1.22758472 -1.17282559 -1.29687994
 -1.39568884 -1.49685979 -1.58953219 -1.57165541]

Cross validate test mean median absolute error : [-5.07988348 -1.91103319 -2.40177011 -1.84570805 -2.04781628 -2.28911222
 -4.68767765 -2.85968265 -2.53195186 -2.11628695]

Cross validate train mean median absolute error : [-0.47210641 -0.68983059 -0.67773809 -0.63064944 -0.69763279 -0.74054038
 -0.74304291 -0.78796549 -0.76853728 -0.76854785]
