In [1]:
import pandas as pd
import numpy as np

import matplotlib.pylab as plt
%matplotlib inline

from tqdm import tqdm, tqdm_notebook
pd.set_option("display.max_rows", 100)
pd.set_option('display.max_columns', 200)

from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression, ElasticNet, Ridge
fname = 'data.csv'

from ts_features import init_data, add_hand_feats, add_diffs, add_shifts
from ts_features import add_time_depended_rolling, add_full_history_diff
from ts_validation import validate_sklearn_model, validate_model_by_pentate, validate_model_by_triplets
from ts_validation import greedy_add_del_strategy, greedy_add_strategy
from helper import print_importances

In [2]:
# Rolling-window lengths shared by the feature builders in later cells.
# Each longer list starts with the shorter one and appends further horizons.
short_agg_periods = [6, 60, 360]
oneday_agg_periods = short_agg_periods + [720, 1410]
twoweeks_agg_periods = oneday_agg_periods + [2820, 7050, 14100]

# Multi-day horizons: every entry of month_days_periods is scaled by 1410
# (presumably the number of rows in one trading day -- TODO confirm).
month_days_periods = [1, 2, 3, 4, 5, 10, 15, 20]
month_agg_periods = [days * 1410 for days in month_days_periods]
print('month_agg_periods: {}'.format(month_agg_periods))

merged_agg_periods = oneday_agg_periods + [2820, 4230, 5640, 7050, 14100, 21150]


# Fractions handed to the validation / greedy-selection helpers.
valid_ratio = 0.25
test_ratio = 0.25
# Presumably (train, valid, test) fractions for validate_model_by_triplets;
# the disabled variant is kept for reference.
triplets = [
    (0.5, 0.25, 0.25),
    (0.6, 0.2, 0.2),
    (0.7, 0.15, 0.15),
    # (0.65, 0.2, 0.15)
]
# Forwarded as the `droprows` argument of every validation call below.
droprows = 7050
# droprows = 28200

month_agg_periods: [1410, 2820, 4230, 5640, 7050, 14100, 21150, 28200]


In [3]:
# Single Ridge estimator (alpha=10) that is passed to every validation and
# greedy feature-selection call in the cells below.
model = Ridge(alpha=10)

In [4]:
# usecols = []

# data = init_data(fname)

# hand_crafted_cols = add_hand_feats(data)
# usecols.extend(hand_crafted_cols)

# xcols = add_time_depended_rolling(data, 'xprice', oneday_agg_periods, np.mean, 'mean')
# for col in xcols:
#     data[col] = data.xprice - data[col]
# usecols.extend(xcols)

# ycols = add_time_depended_rolling(data, 'yprice', oneday_agg_periods, np.mean, 'mean')
# for col in ycols:
#     data[col] = data.yprice - data[col]
# usecols.extend(ycols)

# usecols.append(add_full_history_diff(data, 'xprice'))
# usecols.append(add_full_history_diff(data, 'yprice'))
# usecols.append(add_full_history_diff(data, 'yx_relation'))
# usecols.append(add_full_history_diff(data, 'xy_geom'))

# validate_sklearn_model(model, data, usecols, valid_ratio, test_ratio, droprows);
# data.to_pickle('init_data.pkl')

# Load the cached feature frame; the commented-out code above shows how
# 'init_data.pkl' was originally built and saved.
data = pd.read_pickle('init_data.pkl')

# Baseline feature set, part 1: distance of each price from the closing price.
closing_cols = [
    'ydiff_from_closing', 'xdiff_from_closing',
    'yrel_from_closing', 'xrel_from_closing',
]
# Part 2: x/y rolling-mean features for every one-day window. Per the
# commented-out build code these columns presumably hold
# "price minus its rolling mean" rather than the raw mean -- TODO confirm.
rolling_mean_cols = [
    '{}_time_mean_{}'.format(price, window)
    for window in (6, 60, 360, 720, 1410)
    for price in ('xprice', 'yprice')
]
usecols = closing_cols + rolling_mean_cols
# Candidates deliberately left out of the baseline:
#     'ydiff_from_opening', 'xdiff_from_opening',
#     'yrel_from_opening', 'xrel_from_opening',
#     'xprice_full_history_diff',
#     'yprice_full_history_diff',
#     'yx_relation_full_history_diff',
#     'xy_geom_full_history_diff'

# print(validate_model_by_triplets(model, data, usecols, triplets, droprows))
validate_model_by_pentate(model, data, usecols, droprows)

Unnamed: 0,train_50_percent,train_60_percent,train_70_percent,train_80_percent,train_90_percent,min_stats,max_stats,avg
mse,0.021624,0.020635,0.016504,0.021818,0.015049,0.015053,0.02182,0.018936
r2,0.326764,0.151728,-1.880555,0.824074,0.38594,-1.880859,0.824219,-0.178345


## zscore for yprice

In [5]:
# Rolling standard deviation of yprice over the one-day windows.
agg_col = 'yprice'
# Constant added to every std value so the later division never hits zero.
std_reg_const = 0.1
new_cols = add_time_depended_rolling(data, agg_col, oneday_agg_periods, np.std, 'std')

for std_col in new_cols:
    # Missing std values are treated as 0 before the regulariser is added.
    filled_std = data[std_col].fillna(0)
    data[std_col] = filled_std + std_reg_const

['yprice_time_std_6', 'yprice_time_std_60', 'yprice_time_std_360', 'yprice_time_std_720', 'yprice_time_std_1410']


In [6]:
# Build one z-score-like column per window: the `*_time_mean_*` feature divided
# by the regularised `*_time_std_*` feature of the same window.
zscore_cols = ['{}_time_zscore_{}'.format(agg_col, lag) for lag in oneday_agg_periods]
for lag, colname in zip(oneday_agg_periods, zscore_cols):
    mean_name = '{}_time_mean_{}'.format(agg_col, lag)
    std_name = '{}_time_std_{}'.format(agg_col, lag)
    data[colname] = data[mean_name] / data[std_name]

# Greedily add the new columns on top of the baseline, then re-validate.
# NOTE(review): `usecols` is passed without a copy, whereas a later cell passes
# `selected_cols.copy()` -- confirm whether greedy_add_strategy mutates its input.
selected_cols = greedy_add_strategy(
    model, data, usecols, zscore_cols,
    valid_ratio, test_ratio, droprows,
)
validate_model_by_pentate(model, data, selected_cols, droprows)

added yprice_time_zscore_360: r2: 0.3262
added yprice_time_zscore_1410: r2: 0.39983
added yprice_time_zscore_720: r2: 0.42076


Unnamed: 0,train_50_percent,train_60_percent,train_70_percent,train_80_percent,train_90_percent,min_stats,max_stats,avg
mse,0.021647,0.020572,0.016276,0.021585,0.014972,0.014969,0.021652,0.018814
r2,0.219368,0.457369,-0.472972,1.886111,0.897005,-0.4729,1.885742,0.628418


In [7]:
# Show the feature list produced by the greedy selection above.
print(selected_cols)

['ydiff_from_closing', 'xdiff_from_closing', 'yrel_from_closing', 'xrel_from_closing', 'xprice_time_mean_6', 'yprice_time_mean_6', 'xprice_time_mean_60', 'yprice_time_mean_60', 'xprice_time_mean_360', 'yprice_time_mean_360', 'xprice_time_mean_720', 'yprice_time_mean_720', 'xprice_time_mean_1410', 'yprice_time_mean_1410', 'yprice_time_zscore_360', 'yprice_time_zscore_1410', 'yprice_time_zscore_720']


In [20]:
# Report per-feature importance for `model`; judging by the saved output this
# prints feature name, percentage share and coefficient.
print_importances(model, selected_cols)

xprice_time_mean_6                       18.35%            0.23
yprice_time_mean_360                     16.00%            -0.2
xprice_time_mean_360                     10.87%            0.13
xprice_time_mean_720                     10.76%           -0.13
xprice_time_mean_1410                    9.64%            0.12
yprice_time_mean_6                       5.94%          -0.073
yprice_time_mean_1410                    5.84%          -0.072
yprice_time_mean_720                     5.66%            0.07
xprice_time_mean_60                      5.58%           0.069
yprice_time_mean_60                      4.69%          -0.058
xdiff_from_closing                       1.80%          -0.022
yprice_time_zscore_720                   1.77%           0.022
yprice_time_zscore_360                   1.41%           0.017
ydiff_from_closing                       0.56%          0.0069
yprice_time_zscore_1410                  0.37%          0.0046
yrel_from_closing                        0.36%     

## zscore for yx_rel

In [51]:
# Rolling mean of yx_relation, stored as "current value minus rolling mean".
agg_col = 'yx_relation'
new_cols = add_time_depended_rolling(data, agg_col, oneday_agg_periods, np.mean, 'mean')

for mean_col in new_cols:
    rolling_mean = data[mean_col]
    # Overwrite the raw mean with the deviation of the current value from it.
    data[mean_col] = data[agg_col] - rolling_mean

['yx_relation_time_mean_6', 'yx_relation_time_mean_60', 'yx_relation_time_mean_360', 'yx_relation_time_mean_720', 'yx_relation_time_mean_1410']


In [103]:
# Rolling standard deviation of yx_relation over windows four times longer than
# the one-day mean windows.
agg_col = 'yx_relation'
# Materialise the periods as a list: a bare map() object is a one-shot iterator
# and would appear empty if the helper iterates over it more than once.
std_periods = [lag * 4 for lag in oneday_agg_periods]
new_cols = add_time_depended_rolling(data, agg_col, std_periods, np.std, 'std')
# Constant added to every std value so the later division never hits zero.
std_reg_const = 0.001

for col in new_cols:
    # Missing std values are treated as 0 before the regulariser is added.
    data[col] = data[col].fillna(0) + std_reg_const

['yx_relation_time_std_24', 'yx_relation_time_std_240', 'yx_relation_time_std_1440', 'yx_relation_time_std_2880', 'yx_relation_time_std_5640']


In [104]:
# z-score-like columns for `agg_col` (set in the previous cell): the mean
# feature of window `lag` divided by the std feature of window `lag * 4`.
zscore_cols = ['{}_time_zscore_{}'.format(agg_col, lag) for lag in oneday_agg_periods]
for lag, colname in zip(oneday_agg_periods, zscore_cols):
    mean_name = '{}_time_mean_{}'.format(agg_col, lag)
    std_name = '{}_time_std_{}'.format(agg_col, lag * 4)
    data[colname] = data[mean_name] / data[std_name]

# Greedy selection works on a copy, so `selected_cols` itself is not modified.
added_cols = greedy_add_strategy(
    model, data, selected_cols.copy(), zscore_cols,
    valid_ratio, test_ratio, droprows,
)
validate_model_by_pentate(model, data, added_cols, droprows)

added yx_relation_time_zscore_6: r2: 0.42819
added yx_relation_time_zscore_360: r2: 0.43314
added yx_relation_time_zscore_1410: r2: 0.4475
added yx_relation_time_zscore_720: r2: 0.49077


Unnamed: 0,train_50_percent,train_60_percent,train_70_percent,train_80_percent,train_90_percent,min_stats,max_stats,avg
mse,0.02166,0.020545,0.016258,0.0216,0.01499,0.014992,0.021667,0.018814
r2,0.157567,0.586214,-0.361222,1.814647,0.780375,-0.361328,1.814453,0.632812


In [101]:
# Hand-picked yx_relation z-score columns to evaluate on top of selected_cols.
added_cols = [
    'yx_relation_time_zscore_60',
    'yx_relation_time_zscore_1410',
]
# Candidates switched off for this run:
#     'yx_relation_time_zscore_6',
#     'yx_relation_time_zscore_360',
#     'yx_relation_time_zscore_720',
#     'yx_relation_time_std_6',
#     'yx_relation_time_std_60',
#     'yx_relation_time_std_360',
#     'yx_relation_time_std_720',
#     'yx_relation_time_std_1410'
validate_model_by_pentate(model, data, selected_cols + added_cols, droprows)

Unnamed: 0,train_50_percent,train_60_percent,train_70_percent,train_80_percent,train_90_percent,min_stats,max_stats,avg
mse,0.021652,0.020573,0.01629,0.021606,0.014971,0.014969,0.021652,0.018814
r2,0.193919,0.450812,-0.562675,1.789125,0.903669,-0.5625,1.789062,0.571777


In [85]:
# Column-wise standard deviation of the candidate features (scale sanity check).
# NOTE(review): the saved output lists five columns, so it was presumably
# produced with a different `added_cols` than the cell above defines.
data[added_cols].std()

yx_relation_time_zscore_6       0.588646
yx_relation_time_zscore_60      1.005944
yx_relation_time_zscore_360     1.147955
yx_relation_time_zscore_720     1.154843
yx_relation_time_zscore_1410    1.134246
dtype: float64

## ema for yprice

## zscore for xprice

In [13]:
# agg_col = 'xprice'
# new_cols = add_time_depended_rolling(data, agg_col, oneday_agg_periods, np.std, 'std')
# std_reg_const = 1

# for col in new_cols:
#     data[col] = data[col].fillna(0) + std_reg_const

['xprice_time_std_6', 'xprice_time_std_60', 'xprice_time_std_360', 'xprice_time_std_720', 'xprice_time_std_1410']


In [12]:
# zscore_cols = []
# for lag in oneday_agg_periods:
#     current_mean = data['{}_time_mean_{}'.format(agg_col, lag)]
#     current_std = data['{}_time_std_{}'.format(agg_col, lag)]
#     colname = '{}_time_zscore_{}'.format(agg_col, lag)
#     data[colname] = current_mean / current_std
#     zscore_cols.append(colname)

# selected_cols = greedy_add_strategy(model, data, selected_cols, zscore_cols, 
#                                  valid_ratio, test_ratio, droprows)
# validate_model_by_pentate(model, data, selected_cols, droprows)

added xprice_time_zscore_60: r2: 0.41918


Unnamed: 0,train_50_percent,train_60_percent,train_70_percent,train_80_percent,train_90_percent,min_stats,max_stats,avg
mse,0.021537,0.020677,0.0163,0.021582,0.014994,0.014992,0.021576,0.018814
r2,0.728314,-0.04988,-0.623453,1.89547,0.749149,-0.623535,1.895508,0.567383


In [18]:
# from ts_features import add_rsi

# rsi_spread_cols = add_rsi(data, 'yx_spread', twoweeks_agg_periods)
# usecols.extend(rsi_spread_cols)

# rsi_yxrel_cols = add_rsi(data, 'yx_relation', twoweeks_agg_periods)
# usecols.extend(rsi_yxrel_cols)

# rsi_xyrel_cols = add_rsi(data, 'xy_relation', twoweeks_agg_periods)
# usecols.extend(rsi_xyrel_cols)

# rsi_geom_cols = add_rsi(data, 'xy_geom', twoweeks_agg_periods)
# usecols.extend(rsi_geom_cols)

# rsi_garmonic_cols = add_rsi(data, 'xy_garmonic', twoweeks_agg_periods)
# usecols.extend(rsi_garmonic_cols)

In [16]:
# agg_col = 'yprice_time_mean_360'
# new_cols = add_diffs(data, agg_col, [360, 720, 1080, 1410])
# # usecols.extend(new_cols)

# added_cols = [
# #     'yprice_time_mean_360_diff_360',
# #     'yprice_time_mean_360_diff_720',
# #     'yprice_time_mean_360_diff_1080',
# #     'yprice_time_mean_360_diff_1410'
# ]

# # selected_cols = greedy_add_strategy(model, data, usecols, new_cols,
# #                                     valid_ratio, test_ratio, droprows)

# validate_model_by_pentate(model, data, usecols + added_cols, droprows)

In [68]:
# Difference features of the 360-window xprice rolling feature over the
# multi-day horizons in month_agg_periods (columns are named `<agg_col>_diff_<period>`).
agg_col = 'xprice_time_mean_360'
new_cols = add_diffs(data, agg_col, month_agg_periods)
# Register the candidates only once: a plain extend() appends duplicates every
# time this cell is re-run, and duplicated names end up as duplicated features.
usecols.extend([col for col in new_cols if col not in usecols])

# Greedily add the new columns to the current selection, then validate it.
selected_cols = greedy_add_strategy(model, data, selected_cols, new_cols,
                                    valid_ratio, test_ratio, droprows)

validate_sklearn_model(model, data, selected_cols, valid_ratio, test_ratio, droprows);

['xprice_time_mean_360_diff_1410', 'xprice_time_mean_360_diff_2820', 'xprice_time_mean_360_diff_4230', 'xprice_time_mean_360_diff_5640', 'xprice_time_mean_360_diff_7050', 'xprice_time_mean_360_diff_14100', 'xprice_time_mean_360_diff_21150', 'xprice_time_mean_360_diff_28200']
added xprice_time_mean_360_diff_1410: r2: -0.58474
added xprice_time_mean_360_diff_7050: r2: -0.58439
added xprice_time_mean_360_diff_28200: r2: -0.5632
added xprice_time_mean_360_diff_14100: r2: -0.54401
Data shapes:  (143116, 28) (85657, 28) (85657, 28)

Valid MSE: 		 0.020145
Valid R2 (x100): 	 -0.54401

Test MSE: 		 0.018163
Test R2 (x100): 	 1.4395


In [69]:
# Rolling mean of yx_relation expressed as a ratio to the current value.
# NOTE(review): this writes the same `yx_relation_time_mean_*` columns that an
# earlier cell fills with "value minus mean"; the content therefore depends on
# which cell ran last -- confirm the intended variant.
agg_col = 'yx_relation'
new_cols = add_time_depended_rolling(data, agg_col, oneday_agg_periods, np.mean, 'mean')
for col in new_cols:
    # 1e-3 keeps the denominator away from zero.
    data[col] = data[col] / (data[agg_col] + 1e-3)
# Register the candidates only once: a plain extend() appends duplicates every
# time this cell is re-run.
usecols.extend([col for col in new_cols if col not in usecols])

# Greedily add the new columns to the current selection, then validate it.
selected_cols = greedy_add_strategy(model, data, selected_cols, new_cols,
                                    valid_ratio, test_ratio, droprows)
validate_sklearn_model(model, data, selected_cols, valid_ratio, test_ratio, droprows);

['yx_relation_time_mean_6', 'yx_relation_time_mean_60', 'yx_relation_time_mean_360', 'yx_relation_time_mean_720', 'yx_relation_time_mean_1410']
added yx_relation_time_mean_6: r2: -0.54401
added yx_relation_time_mean_360: r2: -0.544
added yx_relation_time_mean_60: r2: -0.544
added yx_relation_time_mean_720: r2: -0.544
Data shapes:  (143116, 32) (85657, 32) (85657, 32)

Valid MSE: 		 0.020145
Valid R2 (x100): 	 -0.544

Test MSE: 		 0.018163
Test R2 (x100): 	 1.4394


In [19]:
# Roll back the last experiment: remove every column listed in `new_cols`
# from the current selection, keeping the original order.
selected_cols = [
    name for name in selected_cols
    if name not in new_cols
]

In [62]:
# Difference features of yx_relation over the two-week set of periods
# (columns are named `yx_relation_diff_<period>`).
agg_col = 'yx_relation'
new_cols = add_diffs(data, agg_col, twoweeks_agg_periods)

# Register the candidates only once: a plain extend() appends duplicates every
# time this cell is re-run.
usecols.extend([col for col in new_cols if col not in usecols])

# Greedily add the new columns to the current selection, then validate it.
selected_cols = greedy_add_strategy(model, data, selected_cols, new_cols,
                                    valid_ratio, test_ratio, droprows)
validate_sklearn_model(model, data, selected_cols, valid_ratio, test_ratio, droprows);

['yx_relation_diff_6', 'yx_relation_diff_60', 'yx_relation_diff_360', 'yx_relation_diff_720', 'yx_relation_diff_1410', 'yx_relation_diff_2820', 'yx_relation_diff_7050', 'yx_relation_diff_14100']
added yx_relation_diff_60: r2: 1.0848
added yx_relation_diff_360: r2: 1.0922
Data shapes:  (194510, 32) (68526, 32) (51394, 32)

Valid MSE: 		 0.019377
Valid R2 (x100): 	 1.0922

Test MSE: 		 0.015762
Test R2 (x100): 	 0.97178


In [13]:
# Rolling standard deviation of yx_relation over the one-day windows.
agg_col = 'yx_relation'
# Constant added to every std value so a later division never hits zero.
std_reg_const = 0.1
new_cols = add_time_depended_rolling(data, agg_col, oneday_agg_periods, np.std, 'std')
for col in new_cols:
    # Missing std values are treated as 0 before the regulariser is added.
    data[col] = data[col].fillna(0) + std_reg_const
# Register the candidates only once: a plain extend() appends duplicates every
# time this cell is re-run.
usecols.extend([col for col in new_cols if col not in usecols])

# Greedily add the new columns to the current selection, then validate it.
selected_cols = greedy_add_strategy(model, data, selected_cols, new_cols,
                                    valid_ratio, test_ratio, droprows)
validate_sklearn_model(model, data, selected_cols, valid_ratio, test_ratio, droprows);

['yx_relation_time_std_6', 'yx_relation_time_std_60', 'yx_relation_time_std_360', 'yx_relation_time_std_720', 'yx_relation_time_std_1410']
Data shapes:  (143115, 35) (102789, 35) (68526, 35)

Valid MSE: 		 0.022652
Valid R2 (x100): 	 -15.558

Test MSE: 		 0.018436
Test R2 (x100): 	 1.0791


In [15]:
# z-score-like columns for yx_relation: mean feature divided by the std feature
# of the same window.
# NOTE(review): the content of `yx_relation_time_mean_*` depends on which of the
# earlier cells (difference vs. ratio variant) ran last -- confirm.
agg_col = 'yx_relation'
zscore_cols = ['{}_time_zscore_{}'.format(agg_col, lag) for lag in oneday_agg_periods]
for lag, colname in zip(oneday_agg_periods, zscore_cols):
    mean_name = '{}_time_mean_{}'.format(agg_col, lag)
    std_name = '{}_time_std_{}'.format(agg_col, lag)
    data[colname] = data[mean_name] / data[std_name]

# Greedily add the z-score columns, then validate the resulting feature set.
added_cols = greedy_add_strategy(
    model, data, selected_cols, zscore_cols,
    valid_ratio, test_ratio, droprows,
)
validate_sklearn_model(model, data, added_cols, valid_ratio, test_ratio, droprows);

added yx_relation_time_zscore_6: r2: -15.558
added yx_relation_time_zscore_360: r2: -15.556
added yx_relation_time_zscore_720: r2: -15.556
Data shapes:  (143115, 38) (102789, 38) (68526, 38)

Valid MSE: 		 0.022651
Valid R2 (x100): 	 -15.556

Test MSE: 		 0.018437
Test R2 (x100): 	 1.0761


In [11]:
# agg_col = 'xy_geom'
# new_cols = add_time_depended_rolling(data, agg_col, oneday_agg_periods, np.mean, 'mean')
# for col in new_cols:
#     data[col] = data[col] - data[agg_col]
# usecols.extend(new_cols)

# selected_cols = greedy_add_strategy(model, data, selected_cols, new_cols,
#                                     valid_ratio, test_ratio, droprows)
# validate_sklearn_model(model, data, selected_cols, valid_ratio, test_ratio, droprows);

In [12]:
# agg_col = 'yx_relation_time_mean_360'
# new_cols = add_diffs(data, agg_col, month_agg_periods)
# usecols.extend(new_cols)

# selected_cols = greedy_add_strategy(model, data, selected_cols, new_cols,
#                                     valid_ratio, test_ratio, droprows)
# validate_sklearn_model(model, data, selected_cols, valid_ratio, test_ratio, droprows);

In [13]:
# agg_col = 'xy_geom'
# new_cols = add_time_depended_rolling(data, agg_col, [60, 360, 1410], np.std, 'std')
# for col in new_cols:
#     data[col] = data[col].fillna(0)
# usecols.extend(new_cols)

# selected_cols = greedy_add_strategy(model, data, selected_cols, new_cols,
#                                     valid_ratio, test_ratio, droprows)
# validate_sklearn_model(model, data, selected_cols, valid_ratio, test_ratio, droprows);

In [14]:
# agg_col = 'xy_relation_time_mean_60'
# new_cols = add_diffs(data, agg_col, month_agg_periods)

# usecols.extend(new_cols)
# validate_sklearn_model(model, data, usecols, valid_ratio, test_ratio, droprows);

In [15]:
# agg_col = 'yprice'
# # new_cols = add_time_depended_rolling(data, agg_col, [60, 360, 1410], np.std, 'std')
# data['yprice_time_normbot_60'] = data.yprice_time_mean_60 - 2 * data.yprice_time_std_60.fillna(0)
# data['yprice_time_normtop_60'] = data.yprice_time_mean_60 + 2 * data.yprice_time_std_60.fillna(0)
# data['yprice_time_normbot_360'] = data.yprice_time_mean_360 - 2 * data.yprice_time_std_360.fillna(0)
# data['yprice_time_normtop_360'] = data.yprice_time_mean_360 + 2 * data.yprice_time_std_360.fillna(0)
# data['yprice_time_normbot_1410'] = data.yprice_time_mean_1410 - 2 * data.yprice_time_std_1410.fillna(0)
# data['yprice_time_normtop_1410'] = data.yprice_time_mean_1410 + 2 * data.yprice_time_std_1410.fillna(0)
# norm_cols = [
#     'yprice_time_normbot_60', 
#     'yprice_time_normtop_60',
#     'yprice_time_normbot_360', 
#     'yprice_time_normtop_360',
#     'yprice_time_normbot_1410',
#     'yprice_time_normtop_1410'
# ]
# for col in new_cols:
#     data[col] = data[col].fillna(0)
# validate_sklearn_model(model, data, selected_cols + norm_cols, valid_ratio, test_ratio, droprows);

In [16]:
# Rolling maximum of yprice, stored as "rolling max minus current price".
agg_col = 'yprice'
max_periods = [6, 60, 360, 1410]
new_cols = add_time_depended_rolling(data, agg_col, max_periods, np.max, 'max')

for max_col in new_cols:
    rolling_max = data[max_col]
    data[max_col] = rolling_max - data[agg_col]

# validate_sklearn_model(model, data, selected_cols + new_cols, valid_ratio, test_ratio, droprows);
# Greedily add the new columns, then validate the resulting feature set.
added_cols = greedy_add_strategy(
    model, data, selected_cols, new_cols,
    valid_ratio, test_ratio, droprows,
)

validate_sklearn_model(model, data, added_cols, valid_ratio, test_ratio, droprows);

['yprice_time_max_6', 'yprice_time_max_60', 'yprice_time_max_360', 'yprice_time_max_1410']
added yprice_time_max_6: r2: 1.0393
Data shapes:  (194510, 34) (68526, 34) (51394, 34)

Valid MSE: 		 0.019387
Valid R2 (x100): 	 1.0393

Test MSE: 		 0.015695
Test R2 (x100): 	 1.3925


In [18]:
# Lagged copies of the 60-window rolling-max feature
# (columns are named `yprice_time_max_60_lag_<shift>`).
agg_col = 'yprice_time_max_60'
new_cols = add_shifts(data, agg_col, [60, 120, 180, 240])
# Register the candidates only once: a plain extend() appends duplicates every
# time this cell is re-run.
usecols.extend([col for col in new_cols if col not in usecols])

# Continue the greedy search from the previous result, then validate it.
added_cols = greedy_add_strategy(model, data, added_cols, new_cols,
                                valid_ratio, test_ratio, droprows)

validate_sklearn_model(model, data, added_cols, valid_ratio, test_ratio, droprows);

['yprice_time_max_60_lag_60', 'yprice_time_max_60_lag_120', 'yprice_time_max_60_lag_180', 'yprice_time_max_60_lag_240']
added yprice_time_max_60_lag_180: r2: 1.059
Data shapes:  (194510, 35) (68526, 35) (51394, 35)

Valid MSE: 		 0.019383
Valid R2 (x100): 	 1.059

Test MSE: 		 0.015691
Test R2 (x100): 	 1.4162


In [19]:
# Accept the experiment: the augmented list becomes the new working selection
# (copied so the two names do not share one list).
selected_cols = added_cols.copy()

In [20]:
# agg_col = 'yprice'
# new_cols = add_time_depended_rolling(data, agg_col, [6, 60, 360, 1410], np.min, 'min')

# for col in new_cols:
#     data[col] = data[col] - data[agg_col
#                                 ]
# # validate_sklearn_model(model, data, selected_cols + new_cols, valid_ratio, test_ratio, droprows);
# added_cols = greedy_add_strategy(model, data, selected_cols, new_cols,
#                                     valid_ratio, test_ratio, droprows)

# validate_sklearn_model(model, data, added_cols, valid_ratio, test_ratio, droprows);


['yprice_time_min_6', 'yprice_time_min_60', 'yprice_time_min_360', 'yprice_time_min_1410']
Data shapes:  (194510, 35) (68526, 35) (51394, 35)

Valid MSE: 		 0.019383
Valid R2 (x100): 	 1.059

Test MSE: 		 0.015691
Test R2 (x100): 	 1.4162


In [78]:
# Single z-score-like feature for yprice on the 360 window.
agg_col = 'yprice'
# new_cols = add_time_depended_rolling(data, agg_col, oneday_agg_periods, np.std, 'std')
# Missing std values become 1 and 0.1 is added before dividing.
# NOTE(review): an earlier cell already applies fillna(0) + 0.1 to this std
# column; if it ran first the regulariser is applied twice -- confirm.
regularised_std_360 = data['yprice_time_std_360'].fillna(1) + 0.1
data['yprice_time_zscore_360'] = data['yprice_time_mean_360'] / regularised_std_360
# Variants tried for other windows (disabled):
# data['yprice_time_zscore_60'] = data.yprice_time_mean_60 / (data.yprice_time_std_60.fillna(1) + 1e-5)
# data['yprice_time_zscore_1410'] = data.yprice_time_mean_1410 / (data.yprice_time_std_1410.fillna(1) + 1e-5)

zscore_cols = ['yprice_time_zscore_360']

# Greedily add the z-score column, then validate the resulting feature set.
added_cols = greedy_add_strategy(
    model, data, selected_cols, zscore_cols,
    valid_ratio, test_ratio, droprows,
)
validate_sklearn_model(model, data, added_cols, valid_ratio, test_ratio, droprows);

added yprice_time_zscore_360: r2: 1.9878
Data shapes:  (194510, 36) (68526, 36) (51394, 36)

Valid MSE: 		 0.019201
Valid R2 (x100): 	 1.9878

Test MSE: 		 0.015652
Test R2 (x100): 	 1.6602


In [80]:
# Display the split fractions currently held by the kernel.
# NOTE(review): the saved output is (0.2, 0.15) while the configuration cell
# sets 0.25 / 0.25, so the values were presumably changed interactively.
valid_ratio, test_ratio

(0.2, 0.15)

In [24]:
# Validate the current selection plus the 360-window yprice z-score column;
# the trailing semicolon suppresses the cell's return-value display.
validate_sklearn_model(model, data, selected_cols + ['yprice_time_zscore_360'], valid_ratio, test_ratio, droprows);

Data shapes:  (194510, 36) (68526, 36) (51394, 36)

Valid MSE: 		 0.019221
Valid R2 (x100): 	 1.8884

Test MSE: 		 0.01569
Test R2 (x100): 	 1.4207


In [28]:
# Accept the experiment: the augmented list becomes the new working selection
# (copied so the two names do not share one list).
selected_cols = added_cols.copy()

In [30]:
# Disabled experiment: z-score features for xprice.
# NOTE(review): the three assignments below fill `xprice_time_zscore_*` from
# `yprice_time_mean_*` / `yprice_time_std_*` columns -- this looks like a
# copy-paste slip; switch to the xprice columns before re-enabling.
# agg_col = 'xprice'
# new_cols = add_time_depended_rolling(data, agg_col, oneday_agg_periods, np.std, 'std')
# data['xprice_time_zscore_60'] = data.yprice_time_mean_60 / (data.yprice_time_std_60.fillna(1) + 1e-5)
# data['xprice_time_zscore_360'] = data.yprice_time_mean_360 / (data.yprice_time_std_360.fillna(1) + 1e-5)
# data['xprice_time_zscore_1410'] = data.yprice_time_mean_1410 / (data.yprice_time_std_1410.fillna(1) + 1e-5)

# zscore_cols = [
#     'xprice_time_zscore_60', 
#     'xprice_time_zscore_360',
#     'xprice_time_zscore_1410', 
# ]

# added_cols = greedy_add_strategy(model, data, selected_cols, zscore_cols, 
#                                  valid_ratio, test_ratio, droprows)
# validate_sklearn_model(model, data, selected_cols + ['xprice_time_zscore_360'], valid_ratio, test_ratio, droprows);

['xprice_time_std_6', 'xprice_time_std_60', 'xprice_time_std_360', 'xprice_time_std_720', 'xprice_time_std_1410']
Data shapes:  (194510, 37) (68526, 37) (51394, 37)

Valid MSE: 		 0.019221
Valid R2 (x100): 	 1.8884

Test MSE: 		 0.01569
Test R2 (x100): 	 1.4207


In [34]:
# agg_col = 'yprice_time_zscore_360'
# new_cols = add_shifts(data, agg_col, month_agg_periods)
# usecols.extend(new_cols)

# added_cols = greedy_add_strategy(model, data, selected_cols, new_cols,
#                                 valid_ratio, test_ratio, droprows)

# validate_sklearn_model(model, data, added_cols, valid_ratio, test_ratio, droprows);

['yprice_time_zscore_360_lag_1410', 'yprice_time_zscore_360_lag_2820', 'yprice_time_zscore_360_lag_4230', 'yprice_time_zscore_360_lag_5640', 'yprice_time_zscore_360_lag_7050', 'yprice_time_zscore_360_lag_14100', 'yprice_time_zscore_360_lag_21150', 'yprice_time_zscore_360_lag_28200']
added yprice_time_zscore_360_lag_28200: r2: 1.9145
Data shapes:  (194510, 37) (68526, 37) (51394, 37)

Valid MSE: 		 0.019216
Valid R2 (x100): 	 1.9145

Test MSE: 		 0.015709
Test R2 (x100): 	 1.3032


In [14]:
# Report per-feature importance for `model`; judging by the saved output this
# prints feature name, percentage share and coefficient.
# NOTE(review): the saved output lists columns (e.g. *_full_history_diff) that
# the cells above do not select, so it presumably comes from an older run.
print_importances(model, selected_cols)

xy_geom_full_history_diff                13.39%           -0.28
xprice_time_mean_6                       11.31%            0.24
yprice_time_mean_360                     7.22%           -0.15
xprice_full_history_diff                 7.10%            0.15
xprice_time_mean_720                     7.09%           -0.15
yprice_time_mean_720                     6.43%            0.14
yprice_full_history_diff                 6.17%            0.13
xprice_time_mean_1410                    6.07%            0.13
xprice_time_mean_360                     5.08%            0.11
yprice_time_mean_1410                    4.02%          -0.085
yrel_from_opening                        3.89%           0.082
xprice_time_mean_60                      3.13%           0.066
yprice_time_mean_6                       2.64%          -0.056
yprice_time_mean_360_diff_1410           1.32%          -0.028
yprice_time_mean_60                      1.31%          -0.028
xrel_from_opening                        1.25%       

In [28]:
# Run the add/delete greedy selection over every registered candidate in
# `usecols`; the saved output logs each removed column with the resulting r2.
selected_cols = greedy_add_del_strategy(model, data, usecols, valid_ratio, test_ratio, droprows)

removed yprice_time_mean_360_diff_2820: r2: 0.95017
removed xprice_time_mean_360_diff_4230: r2: 0.99257
removed yprice_time_mean_60: r2: 1.0282
removed xy_geom_time_mean_60: r2: 1.2378
removed yprice_time_mean_6: r2: 1.2725
removed xprice_time_mean_6: r2: 1.3136
removed xprice_time_mean_360_diff_5640: r2: 1.3487
removed xprice_time_mean_360_diff_14100: r2: 1.3803
removed yprice_time_mean_360_diff_21150: r2: 1.4096
removed yprice_time_mean_360_diff_4230: r2: 1.4122
removed xy_relation_time_mean_60_diff_7050: r2: 1.4135
removed xrel_from_opening: r2: 1.4145
removed xy_relation_time_mean_60_diff_2820: r2: 1.4153
removed xy_relation_time_mean_360_diff_2820: r2: 1.4155
removed xy_relation_time_mean_60: r2: 1.416
removed xy_relation_time_mean_60_diff_2820: r2: 1.4163
removed xy_relation_time_mean_60_diff_21150: r2: 1.4167
removed xy_relation_time_mean_60_diff_14100: r2: 1.417
removed yx_relation_full_history_diff: r2: 1.4173
removed xy_relation_time_mean_360_diff_4230: r2: 1.4175
removed xy_

In [30]:
# Display the final feature list.
# NOTE(review): the saved output contains repeated names (e.g.
# 'ydiff_from_closing', 'yprice_time_mean_360_diff_14100'), i.e. the candidate
# list held duplicates when this was run.
selected_cols

['ydiff_from_closing',
 'xdiff_from_closing',
 'xrel_from_closing',
 'ydiff_from_closing',
 'xdiff_from_closing',
 'xprice_time_mean_60',
 'xprice_time_mean_360',
 'xprice_time_mean_720',
 'xprice_time_mean_1410',
 'yprice_time_mean_360',
 'yprice_time_mean_720',
 'yprice_time_mean_1410',
 'xprice_full_history_diff',
 'yprice_full_history_diff',
 'xy_geom_full_history_diff',
 'yprice_time_mean_360_diff_1410',
 'yprice_time_mean_360_diff_5640',
 'yprice_time_mean_360_diff_14100',
 'yprice_time_mean_360_diff_28200',
 'yprice_time_mean_360_diff_14100',
 'xprice_time_mean_360_diff_1410',
 'xprice_time_mean_360_diff_2820',
 'xprice_time_mean_360_diff_21150',
 'xprice_time_mean_360_diff_28200',
 'xy_relation_time_mean_6',
 'xy_relation_time_mean_360',
 'xy_relation_time_mean_720',
 'xy_relation_time_mean_1410',
 'xy_geom_time_mean_6',
 'xy_geom_time_mean_360',
 'xy_geom_time_mean_720',
 'xy_geom_time_mean_1410',
 'xy_relation_time_mean_360_diff_1410',
 'xy_relation_time_mean_360_diff_7050',


In [29]:
# Final check of the selected feature set; the saved output reports data shapes
# plus MSE and R2 (x100) on the validation and test parts. The trailing
# semicolon suppresses the cell's return-value display.
validate_sklearn_model(model, data, selected_cols, valid_ratio, test_ratio, droprows);

Data shapes:  (194510, 36) (68526, 36) (51394, 36)

Valid MSE: 		 0.019312
Valid R2 (x100): 	 1.421

Test MSE: 		 0.015751
Test R2 (x100): 	 1.0392
