In [1]:
import pandas as pd
import numpy as np

import matplotlib.pylab as plt
%matplotlib inline

from tqdm import tqdm, tqdm_notebook
pd.set_option("display.max_rows", 100)
pd.set_option('display.max_columns', 200)

from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression, ElasticNet, Ridge
fname = 'data.csv'

from ts_features import init_data, add_hand_feats, add_diffs, add_shifts
from ts_features import add_ewma, add_intraday_ewma
from ts_features import add_time_depended_rolling, add_full_history_diff
from ts_validation import validate_sklearn_model, validate_model_by_pentate, validate_model_by_triplets
from ts_validation import greedy_add_del_strategy, greedy_add_strategy
from helper import print_importances

In [2]:
short_agg_periods = [6, 60, 360]
oneday_agg_periods = [6, 60, 360, 720, 1410]
twoweeks_agg_periods = [6, 60, 360, 720, 1410, 2820, 7050, 14100]

month_days_periods = [1,2,3,4,8,16]
month_agg_periods = list(map(lambda x: x * 1410, month_days_periods))
print('month_agg_periods: {}'.format(month_agg_periods))

merged_agg_periods = [6, 60, 360, 720, 1410, 2820, 4230, 5640, 7050, 14100, 21150]
oneday_agg_periods_4 = [4 * lag for lag in oneday_agg_periods]

valid_ratio = 0.25
test_ratio = 0.25
triplets = [
    (0.5, 0.25, 0.25), 
    (0.6, 0.2, 0.2), 
    (0.7,0.15,0.15),
#     (0.65, 0.2, 0.15)
]
# droprows = 7050
droprows = 22560

month_agg_periods: [1410, 2820, 4230, 5640, 11280, 22560]


In [3]:
model = Ridge(alpha=10)

In [4]:
# usecols = []

# data = init_data(fname)

# hand_crafted_cols = add_hand_feats(data)
# usecols.extend(hand_crafted_cols)

# xcols = add_time_depended_rolling(data, 'xprice', oneday_agg_periods, np.mean, 'mean')
# for col in xcols:
#     data[col] = data.xprice - data[col]
# usecols.extend(xcols)

# ycols = add_time_depended_rolling(data, 'yprice', oneday_agg_periods, np.mean, 'mean')
# for col in ycols:
#     data[col] = data.yprice - data[col]
# usecols.extend(ycols)

# usecols.append(add_full_history_diff(data, 'xprice'))
# usecols.append(add_full_history_diff(data, 'yprice'))
# usecols.append(add_full_history_diff(data, 'yx_relation'))
# usecols.append(add_full_history_diff(data, 'xy_geom'))

# validate_sklearn_model(model, data, usecols, valid_ratio, test_ratio, droprows);
# data.to_pickle('init_data.pkl')

data = pd.read_pickle('init_data.pkl')
usecols = [
    'ydiff_from_closing','xdiff_from_closing', 
    'yrel_from_closing','xrel_from_closing',
#     'ydiff_from_opening','xdiff_from_opening',
#     'yrel_from_opening','xrel_from_opening',
    'xprice_time_mean_6','yprice_time_mean_6',
    'xprice_time_mean_60','yprice_time_mean_60',
    'xprice_time_mean_360','yprice_time_mean_360',
    'xprice_time_mean_720','yprice_time_mean_720',
    'xprice_time_mean_1410','yprice_time_mean_1410',
#     'xprice_full_history_diff',
#     'yprice_full_history_diff',
#     'yx_relation_full_history_diff',
#     'xy_geom_full_history_diff'
]
# print(validate_model_by_triplets(model, data, usecols, triplets, droprows))
validate_model_by_pentate(model, data, usecols, droprows)

Unnamed: 0,train_50_percent,train_60_percent,train_70_percent,train_80_percent,train_90_percent,min_stats,max_stats,avg
mse,0.021598,0.020704,0.016417,0.021885,0.015022,0.015022,0.021881,0.018936
r2,0.445998,-0.180386,-1.343953,0.521177,0.564198,-1.34375,0.563965,-0.110413


## zscore for yprice

In [6]:
agg_col = 'yprice'
new_cols = add_time_depended_rolling(data, agg_col, oneday_agg_periods, np.std, 'std')
std_reg_const = 0.1

for col in new_cols:
    data[col] = data[col].fillna(0) + std_reg_const

['yprice_time_std_6', 'yprice_time_std_60', 'yprice_time_std_360', 'yprice_time_std_720', 'yprice_time_std_1410']


In [7]:
zscore_cols = []
for lag in oneday_agg_periods:
    current_mean = data['{}_time_mean_{}'.format(agg_col, lag)]
    current_std = data['{}_time_std_{}'.format(agg_col, lag)]
    colname = '{}_time_zscore_{}'.format(agg_col, lag)
    data[colname] = current_mean / current_std
    zscore_cols.append(colname)

selected_cols = greedy_add_strategy(model, data, usecols, zscore_cols, 
                                 valid_ratio, test_ratio, droprows)

validate_model_by_pentate(model, data, selected_cols, droprows)

added yprice_time_zscore_360: r2: 0.40996
added yprice_time_zscore_1410: r2: 0.46594
added yprice_time_zscore_720: r2: 0.47332


Unnamed: 0,train_50_percent,train_60_percent,train_70_percent,train_80_percent,train_90_percent,min_stats,max_stats,avg
mse,0.021629,0.020619,0.016214,0.021656,0.014956,0.014954,0.021652,0.018814
r2,0.29988,0.230784,-0.092851,1.559997,1.002801,-0.092834,1.55957,0.638184


In [7]:
print(selected_cols)

['ydiff_from_closing', 'xdiff_from_closing', 'yrel_from_closing', 'xrel_from_closing', 'xprice_time_mean_6', 'yprice_time_mean_6', 'xprice_time_mean_60', 'yprice_time_mean_60', 'xprice_time_mean_360', 'yprice_time_mean_360', 'xprice_time_mean_720', 'yprice_time_mean_720', 'xprice_time_mean_1410', 'yprice_time_mean_1410', 'yprice_time_zscore_360', 'yprice_time_zscore_1410']


In [8]:
print_importances(model, selected_cols)

xprice_time_mean_6                       18.55%            0.23
yprice_time_mean_360                     15.97%           -0.19
xprice_time_mean_720                     9.74%           -0.12
xprice_time_mean_1410                    9.06%            0.11
xprice_time_mean_360                     8.11%           0.099
xprice_time_mean_60                      7.31%           0.089
yprice_time_mean_60                      6.96%          -0.085
yprice_time_mean_720                     6.35%           0.077
yprice_time_mean_6                       5.50%          -0.067
yprice_time_mean_1410                    5.36%          -0.065
yprice_time_zscore_360                   2.10%           0.026
xdiff_from_closing                       1.77%          -0.022
yprice_time_zscore_720                   1.60%           0.019
xrel_from_closing                        0.63%          0.0077
ydiff_from_closing                       0.60%          0.0072
yrel_from_closing                        0.34%       

## Intraday EWMA for yprice

In [9]:
agg_col = 'yprice'
new_cols = add_intraday_ewma(data, agg_col, [3, 6,12,24,60,120,360,720])

for col in new_cols:
    data[col] = data[agg_col] - data[col]

['yprice_dayly_ewma_3', 'yprice_dayly_ewma_6', 'yprice_dayly_ewma_12', 'yprice_dayly_ewma_24', 'yprice_dayly_ewma_60', 'yprice_dayly_ewma_120', 'yprice_dayly_ewma_360', 'yprice_dayly_ewma_720']


In [10]:
data['ewma_dif_v1'] = data.yprice_dayly_ewma_12 - data.yprice_dayly_ewma_6
data['ewma_dif_v2'] = data.yprice_dayly_ewma_12 * data.yprice_dayly_ewma_6

data['ewma_dif_v3'] = data.yprice_dayly_ewma_60 - data.yprice_dayly_ewma_24
data['ewma_dif_v4'] = data.yprice_dayly_ewma_24 - data.yprice_dayly_ewma_3

data['ewma_dif_v5'] = data.yprice_dayly_ewma_720 * data.yprice_dayly_ewma_360

In [12]:
# selected_cols = greedy_add_strategy(model, data, selected_cols, new_cols, 
#                                  valid_ratio, test_ratio, droprows)

updated_cols = selected_cols + [
    'ewma_dif_v1',
    'ewma_dif_v2', 
    'ewma_dif_v3', 
    'ewma_dif_v4',
    'ewma_dif_v5',
#     'yprice_dayly_ewma_3',
#     'yprice_dayly_ewma_6',
    'yprice_dayly_ewma_12',
    'yprice_dayly_ewma_24',
    'yprice_dayly_ewma_60',
#     'yprice_dayly_ewma_360',
]

selected_cols = updated_cols.copy()
validate_model_by_pentate(model, data, updated_cols.copy(), droprows)

Unnamed: 0,train_50_percent,train_60_percent,train_70_percent,train_80_percent,train_90_percent,min_stats,max_stats,avg
mse,0.021698,0.020654,0.016175,0.021644,0.014966,0.014969,0.021698,0.018829
r2,-0.015656,0.057997,0.147929,1.616568,0.936548,-0.015656,1.616211,0.620605


## Lags of yprice and xprice

In [13]:
df = data.copy()

In [25]:
agg_col = 'yprice'
new_cols = add_shifts(data,'yprice',month_agg_periods)
for col in new_cols:
    data[col] = data[agg_col] - data[col]

['yprice_lag_1410', 'yprice_lag_2820', 'yprice_lag_4230', 'yprice_lag_5640', 'yprice_lag_11280', 'yprice_lag_22560']


In [34]:
updated_cols = selected_cols + [
    'yprice_lag_1410',
    'yprice_lag_2820', 
    'yprice_lag_4230', 
#     'yprice_lag_5640', 
#     'yprice_lag_11280', 
#     'yprice_lag_22560'
]
selected_cols = updated_cols.copy()
validate_model_by_pentate(model, data, updated_cols.copy(), droprows)

Unnamed: 0,train_50_percent,train_60_percent,train_70_percent,train_80_percent,train_90_percent,min_stats,max_stats,avg
mse,0.02175,0.020639,0.016207,0.021574,0.014876,0.014877,0.021744,0.018814
r2,-0.256366,0.130836,-0.05107,1.93304,1.53329,-0.256348,1.932617,0.709473


In [33]:
print_importances(model, selected_cols)

xprice_time_mean_6                       15.83%            0.22
yprice_dayly_ewma_60                     9.08%           -0.13
ewma_dif_v3                              8.44%           -0.12
xprice_time_mean_720                     8.24%           -0.12
xprice_time_mean_1410                    7.52%            0.11
xprice_time_mean_360                     7.22%             0.1
yprice_time_mean_360                     7.14%            -0.1
xprice_time_mean_60                      6.24%           0.089
yprice_time_mean_6                       4.79%          -0.068
yprice_time_mean_720                     4.44%           0.063
yprice_time_mean_1410                    4.20%           -0.06
yprice_dayly_ewma_12                     3.72%          -0.053
ewma_dif_v2                              2.52%          -0.036
yprice_time_zscore_360                   1.99%           0.028
ewma_dif_v1                              1.66%          -0.024
xdiff_from_closing                       1.42%        

In [37]:
agg_col = 'xprice'
new_cols = add_shifts(data,agg_col,month_agg_periods)
for col in new_cols:
    data[col] = data[agg_col] - data[col]

['xprice_lag_1410', 'xprice_lag_2820', 'xprice_lag_4230', 'xprice_lag_5640', 'xprice_lag_11280', 'xprice_lag_22560']


In [46]:
updated_cols = selected_cols + [
    'xprice_lag_1410',
    'xprice_lag_2820', 
    'xprice_lag_4230', 
#     'xprice_lag_5640', 
#     'xprice_lag_11280', 
#     'xprice_lag_22560'
]
validate_model_by_pentate(model, data, updated_cols.copy(), droprows)

Unnamed: 0,train_50_percent,train_60_percent,train_70_percent,train_80_percent,train_90_percent,min_stats,max_stats,avg
mse,0.021714,0.020657,0.016198,0.02159,0.014788,0.014786,0.021713,0.018784
r2,-0.090579,0.046328,0.006313,1.862652,2.118363,-0.090576,2.119141,0.853027


## zscore for yx_rel

In [60]:
agg_col = 'yx_relation'
new_cols = add_time_depended_rolling(data, agg_col, oneday_agg_periods, np.mean, 'mean')

for col in new_cols:
    data[col] = data[agg_col] - data[col]

['yx_relation_time_mean_6', 'yx_relation_time_mean_60', 'yx_relation_time_mean_360', 'yx_relation_time_mean_720', 'yx_relation_time_mean_1410']


In [61]:
agg_col = 'yx_relation'
new_cols = add_time_depended_rolling(data, agg_col, map(lambda x: x * 4, oneday_agg_periods), np.std, 'std')
std_reg_const = 0.001

for col in new_cols:
    data[col] = data[col].fillna(0) + std_reg_const

['yx_relation_time_std_24', 'yx_relation_time_std_240', 'yx_relation_time_std_1440', 'yx_relation_time_std_2880', 'yx_relation_time_std_5640']


In [65]:
zscore_cols = []
for lag in oneday_agg_periods:
    current_mean = data['{}_time_mean_{}'.format(agg_col, lag)]
    current_std = data['{}_time_std_{}'.format(agg_col, lag * 4)]
    colname = '{}_time_zscore_{}'.format(agg_col, lag)
    data[colname] = current_mean / current_std
    zscore_cols.append(colname)

added_cols = greedy_add_strategy(model, data, selected_cols.copy(), zscore_cols, 
                                 valid_ratio, test_ratio, droprows)
validate_model_by_pentate(model, data, added_cols, droprows)

added yx_relation_time_zscore_6: r2: 0.2817
added yx_relation_time_zscore_360: r2: 0.28635
added yx_relation_time_zscore_1410: r2: 0.2902
added yx_relation_time_zscore_60: r2: 0.29208
added yx_relation_time_zscore_720: r2: 0.3359


Unnamed: 0,train_50_percent,train_60_percent,train_70_percent,train_80_percent,train_90_percent,min_stats,max_stats,avg
mse,0.02172,0.020569,0.016219,0.021585,0.014995,0.014992,0.021713,0.018829
r2,-0.117854,0.473017,-0.124877,1.883428,0.743337,-0.124878,1.883789,0.65918


In [72]:
added_cols = [
#     'yx_relation_time_zscore_6',
#     'yx_relation_time_zscore_60',
#     'yx_relation_time_zscore_360',
#     'yx_relation_time_zscore_720',
#     'yx_relation_time_zscore_1410'
#     'yx_relation_time_std_6',
#     'yx_relation_time_std_60',
#     'yx_relation_time_std_360', 
#     'yx_relation_time_std_720',
#     'yx_relation_time_std_1410'
]
validate_model_by_pentate(model, data, selected_cols + added_cols, droprows)

Unnamed: 0,train_50_percent,train_60_percent,train_70_percent,train_80_percent,train_90_percent,min_stats,max_stats,avg
mse,0.0217,0.020596,0.016235,0.021567,0.014979,0.014977,0.021698,0.018829
r2,-0.026945,0.341965,-0.222462,1.965554,0.849643,-0.222412,1.96582,0.664551


In [85]:
data[added_cols].std()

yx_relation_time_zscore_6       0.588646
yx_relation_time_zscore_60      1.005944
yx_relation_time_zscore_360     1.147955
yx_relation_time_zscore_720     1.154843
yx_relation_time_zscore_1410    1.134246
dtype: float64

## ema for yprice

In [90]:
agg_col = 'yx_relation'
new_cols = add_intraday_ewma(data, agg_col, [3,6,12,24,60])

agg_mean = data.loc[data.day < 28, agg_col].mean()
agg_std = data.loc[data.day < 28, agg_col].std() + std_reg_const

for col in new_cols:
    data[col] = (data[col] - agg_mean) / agg_std


['yx_relation_dayly_ewma_3', 'yx_relation_dayly_ewma_6', 'yx_relation_dayly_ewma_12', 'yx_relation_dayly_ewma_24', 'yx_relation_dayly_ewma_60']


In [64]:
data.yprice.rolling(6).ewma

AttributeError: 'Rolling' object has no attribute 'ewma'

In [107]:
# selected_cols = greedy_add_strategy(model, data, selected_cols, new_cols, 
#                                  valid_ratio, test_ratio, droprows)

updated_cols = selected_cols + [
#     'yx_relation_dayly_ewma_3',
#     'yx_relation_dayly_ewma_6',
    'yx_relation_dayly_ewma_12',
#     'yx_relation_dayly_ewma_24',
#     'yx_relation_dayly_ewma_60',
]
# selected_cols = updated_cols.copy()
validate_model_by_pentate(model, data, updated_cols.copy(), droprows)

Unnamed: 0,train_50_percent,train_60_percent,train_70_percent,train_80_percent,train_90_percent,min_stats,max_stats,avg
mse,0.021871,0.02061,0.016866,0.021487,0.015082,0.015083,0.021866,0.018982
r2,-0.813138,0.274729,-4.115274,2.327433,0.170667,-4.117188,2.328125,-0.563477


In [50]:
def add_ewma(df, column, windows):
    new_columns = []
    for window_size in windows:
        colname = '{}_ewma_{}'.format(column, window_size)
        df.loc[:, colname] = pd.Series.ewm(df[column], halflife=window_size).mean()
        new_columns.append(colname)
    print(new_columns)
    return new_columns

from ts_features import add_ewma
add_ewma(data, 'yprice', [2])

['yprice_ewma_2']


['yprice_ewma_2']

In [85]:
updated_cols.remove('yx_relation_dayly_ewma_60')

In [98]:
selected_cols.remove('yx_relation_dayly_ewma_60')

## zscore for xprice

In [13]:
# agg_col = 'xprice'
# new_cols = add_time_depended_rolling(data, agg_col, oneday_agg_periods, np.std, 'std')
# std_reg_const = 1

# for col in new_cols:
#     data[col] = data[col].fillna(0) + std_reg_const

['xprice_time_std_6', 'xprice_time_std_60', 'xprice_time_std_360', 'xprice_time_std_720', 'xprice_time_std_1410']


In [12]:
# zscore_cols = []
# for lag in oneday_agg_periods:
#     current_mean = data['{}_time_mean_{}'.format(agg_col, lag)]
#     current_std = data['{}_time_std_{}'.format(agg_col, lag)]
#     colname = '{}_time_zscore_{}'.format(agg_col, lag)
#     data[colname] = current_mean / current_std
#     zscore_cols.append(colname)

# selected_cols = greedy_add_strategy(model, data, selected_cols, zscore_cols, 
#                                  valid_ratio, test_ratio, droprows)
# validate_model_by_pentate(model, data, selected_cols, droprows)

added xprice_time_zscore_60: r2: 0.41918


Unnamed: 0,train_50_percent,train_60_percent,train_70_percent,train_80_percent,train_90_percent,min_stats,max_stats,avg
mse,0.021537,0.020677,0.0163,0.021582,0.014994,0.014992,0.021576,0.018814
r2,0.728314,-0.04988,-0.623453,1.89547,0.749149,-0.623535,1.895508,0.567383


In [18]:
# from ts_features import add_rsi

# rsi_spread_cols = add_rsi(data, 'yx_spread', twoweeks_agg_periods)
# usecols.extend(rsi_spread_cols)

# rsi_yxrel_cols = add_rsi(data, 'yx_relation', twoweeks_agg_periods)
# usecols.extend(rsi_yxrel_cols)

# rsi_xyrel_cols = add_rsi(data, 'xy_relation', twoweeks_agg_periods)
# usecols.extend(rsi_xyrel_cols)

# rsi_geom_cols = add_rsi(data, 'xy_geom', twoweeks_agg_periods)
# usecols.extend(rsi_geom_cols)

# rsi_garmonic_cols = add_rsi(data, 'xy_garmonic', twoweeks_agg_periods)
# usecols.extend(rsi_garmonic_cols)

In [16]:
# agg_col = 'yprice_time_mean_360'
# new_cols = add_diffs(data, agg_col, [360, 720, 1080, 1410])
# # usecols.extend(new_cols)

# added_cols = [
# #     'yprice_time_mean_360_diff_360',
# #     'yprice_time_mean_360_diff_720',
# #     'yprice_time_mean_360_diff_1080',
# #     'yprice_time_mean_360_diff_1410'
# ]

# # selected_cols = greedy_add_strategy(model, data, usecols, new_cols,
# #                                     valid_ratio, test_ratio, droprows)

# validate_model_by_pentate(model, data, usecols + added_cols, droprows)

In [68]:
agg_col = 'xprice_time_mean_360'
new_cols = add_diffs(data, agg_col, month_agg_periods)
usecols.extend(new_cols)

selected_cols = greedy_add_strategy(model, data, selected_cols, new_cols,
                                    valid_ratio, test_ratio, droprows)

validate_sklearn_model(model, data, selected_cols, valid_ratio, test_ratio, droprows);

['xprice_time_mean_360_diff_1410', 'xprice_time_mean_360_diff_2820', 'xprice_time_mean_360_diff_4230', 'xprice_time_mean_360_diff_5640', 'xprice_time_mean_360_diff_7050', 'xprice_time_mean_360_diff_14100', 'xprice_time_mean_360_diff_21150', 'xprice_time_mean_360_diff_28200']
added xprice_time_mean_360_diff_1410: r2: -0.58474
added xprice_time_mean_360_diff_7050: r2: -0.58439
added xprice_time_mean_360_diff_28200: r2: -0.5632
added xprice_time_mean_360_diff_14100: r2: -0.54401
Data shapes:  (143116, 28) (85657, 28) (85657, 28)

Valid MSE: 		 0.020145
Valid R2 (x100): 	 -0.54401

Test MSE: 		 0.018163
Test R2 (x100): 	 1.4395


In [69]:
agg_col = 'yx_relation'
new_cols = add_time_depended_rolling(data, agg_col, oneday_agg_periods, np.mean, 'mean')
for col in new_cols:
    data[col] = data[col] / (data[agg_col] + 1e-3)
usecols.extend(new_cols)

selected_cols = greedy_add_strategy(model, data, selected_cols, new_cols,
                                    valid_ratio, test_ratio, droprows)
validate_sklearn_model(model, data, selected_cols, valid_ratio, test_ratio, droprows);

['yx_relation_time_mean_6', 'yx_relation_time_mean_60', 'yx_relation_time_mean_360', 'yx_relation_time_mean_720', 'yx_relation_time_mean_1410']
added yx_relation_time_mean_6: r2: -0.54401
added yx_relation_time_mean_360: r2: -0.544
added yx_relation_time_mean_60: r2: -0.544
added yx_relation_time_mean_720: r2: -0.544
Data shapes:  (143116, 32) (85657, 32) (85657, 32)

Valid MSE: 		 0.020145
Valid R2 (x100): 	 -0.544

Test MSE: 		 0.018163
Test R2 (x100): 	 1.4394


In [19]:
selected_cols=[elem for elem in selected_cols if elem not in new_cols]

In [62]:
agg_col = 'yx_relation'
new_cols = add_diffs(data, agg_col, twoweeks_agg_periods)

usecols.extend(new_cols)

selected_cols = greedy_add_strategy(model, data, selected_cols, new_cols,
                                    valid_ratio, test_ratio, droprows)
validate_sklearn_model(model, data, selected_cols, valid_ratio, test_ratio, droprows);

['yx_relation_diff_6', 'yx_relation_diff_60', 'yx_relation_diff_360', 'yx_relation_diff_720', 'yx_relation_diff_1410', 'yx_relation_diff_2820', 'yx_relation_diff_7050', 'yx_relation_diff_14100']
added yx_relation_diff_60: r2: 1.0848
added yx_relation_diff_360: r2: 1.0922
Data shapes:  (194510, 32) (68526, 32) (51394, 32)

Valid MSE: 		 0.019377
Valid R2 (x100): 	 1.0922

Test MSE: 		 0.015762
Test R2 (x100): 	 0.97178


In [13]:
agg_col = 'yx_relation'
std_reg_const = 0.1
new_cols = add_time_depended_rolling(data, agg_col, oneday_agg_periods, np.std, 'std')
for col in new_cols:
    data[col] = data[col].fillna(0) + std_reg_const
usecols.extend(new_cols)

selected_cols = greedy_add_strategy(model, data, selected_cols, new_cols,
                                    valid_ratio, test_ratio, droprows)
validate_sklearn_model(model, data, selected_cols, valid_ratio, test_ratio, droprows);

['yx_relation_time_std_6', 'yx_relation_time_std_60', 'yx_relation_time_std_360', 'yx_relation_time_std_720', 'yx_relation_time_std_1410']
Data shapes:  (143115, 35) (102789, 35) (68526, 35)

Valid MSE: 		 0.022652
Valid R2 (x100): 	 -15.558

Test MSE: 		 0.018436
Test R2 (x100): 	 1.0791


In [15]:
agg_col = 'yx_relation'
zscore_cols = []
for lag in oneday_agg_periods:
    current_mean = data['{}_time_mean_{}'.format(agg_col, lag)]
    current_std = data['{}_time_std_{}'.format(agg_col, lag)]
    colname = '{}_time_zscore_{}'.format(agg_col, lag)
    data[colname] = current_mean / current_std
    zscore_cols.append(colname)

added_cols = greedy_add_strategy(model, data, selected_cols, zscore_cols, 
                                 valid_ratio, test_ratio, droprows)
validate_sklearn_model(model, data, added_cols, valid_ratio, test_ratio, droprows);

added yx_relation_time_zscore_6: r2: -15.558
added yx_relation_time_zscore_360: r2: -15.556
added yx_relation_time_zscore_720: r2: -15.556
Data shapes:  (143115, 38) (102789, 38) (68526, 38)

Valid MSE: 		 0.022651
Valid R2 (x100): 	 -15.556

Test MSE: 		 0.018437
Test R2 (x100): 	 1.0761


In [11]:
# agg_col = 'xy_geom'
# new_cols = add_time_depended_rolling(data, agg_col, oneday_agg_periods, np.mean, 'mean')
# for col in new_cols:
#     data[col] = data[col] - data[agg_col]
# usecols.extend(new_cols)

# selected_cols = greedy_add_strategy(model, data, selected_cols, new_cols,
#                                     valid_ratio, test_ratio, droprows)
# validate_sklearn_model(model, data, selected_cols, valid_ratio, test_ratio, droprows);

In [12]:
# agg_col = 'yx_relation_time_mean_360'
# new_cols = add_diffs(data, agg_col, month_agg_periods)
# usecols.extend(new_cols)

# selected_cols = greedy_add_strategy(model, data, selected_cols, new_cols,
#                                     valid_ratio, test_ratio, droprows)
# validate_sklearn_model(model, data, selected_cols, valid_ratio, test_ratio, droprows);

In [13]:
# agg_col = 'xy_geom'
# new_cols = add_time_depended_rolling(data, agg_col, [60, 360, 1410], np.std, 'std')
# for col in new_cols:
#     data[col] = data[col].fillna(0)
# usecols.extend(new_cols)

# selected_cols = greedy_add_strategy(model, data, selected_cols, new_cols,
#                                     valid_ratio, test_ratio, droprows)
# validate_sklearn_model(model, data, selected_cols, valid_ratio, test_ratio, droprows);

In [14]:
# agg_col = 'xy_relation_time_mean_60'
# new_cols = add_diffs(data, agg_col, month_agg_periods)

# usecols.extend(new_cols)
# validate_sklearn_model(model, data, usecols, valid_ratio, test_ratio, droprows);

In [15]:
# agg_col = 'yprice'
# # new_cols = add_time_depended_rolling(data, agg_col, [60, 360, 1410], np.std, 'std')
# data['yprice_time_normbot_60'] = data.yprice_time_mean_60 - 2 * data.yprice_time_std_60.fillna(0)
# data['yprice_time_normtop_60'] = data.yprice_time_mean_60 + 2 * data.yprice_time_std_60.fillna(0)
# data['yprice_time_normbot_360'] = data.yprice_time_mean_360 - 2 * data.yprice_time_std_360.fillna(0)
# data['yprice_time_normtop_360'] = data.yprice_time_mean_360 + 2 * data.yprice_time_std_360.fillna(0)
# data['yprice_time_normbot_1410'] = data.yprice_time_mean_1410 - 2 * data.yprice_time_std_1410.fillna(0)
# data['yprice_time_normtop_1410'] = data.yprice_time_mean_1410 + 2 * data.yprice_time_std_1410.fillna(0)
# norm_cols = [
#     'yprice_time_normbot_60', 
#     'yprice_time_normtop_60',
#     'yprice_time_normbot_360', 
#     'yprice_time_normtop_360',
#     'yprice_time_normbot_1410',
#     'yprice_time_normtop_1410'
# ]
# for col in new_cols:
#     data[col] = data[col].fillna(0)
# validate_sklearn_model(model, data, selected_cols + norm_cols, valid_ratio, test_ratio, droprows);

In [16]:
agg_col = 'yprice'
new_cols = add_time_depended_rolling(data, agg_col, [6, 60, 360, 1410], np.max, 'max')

for col in new_cols:
    data[col] = data[col] - data[agg_col
                                ]
# validate_sklearn_model(model, data, selected_cols + new_cols, valid_ratio, test_ratio, droprows);
added_cols = greedy_add_strategy(model, data, selected_cols, new_cols, 
                                 valid_ratio, test_ratio, droprows)

validate_sklearn_model(model, data, added_cols, valid_ratio, test_ratio, droprows);

['yprice_time_max_6', 'yprice_time_max_60', 'yprice_time_max_360', 'yprice_time_max_1410']
added yprice_time_max_6: r2: 1.0393
Data shapes:  (194510, 34) (68526, 34) (51394, 34)

Valid MSE: 		 0.019387
Valid R2 (x100): 	 1.0393

Test MSE: 		 0.015695
Test R2 (x100): 	 1.3925


In [18]:
agg_col = 'yprice_time_max_60'
new_cols = add_shifts(data, agg_col, [60, 120, 180, 240])
usecols.extend(new_cols)

added_cols = greedy_add_strategy(model, data, added_cols, new_cols,
                                valid_ratio, test_ratio, droprows)

validate_sklearn_model(model, data, added_cols, valid_ratio, test_ratio, droprows);

['yprice_time_max_60_lag_60', 'yprice_time_max_60_lag_120', 'yprice_time_max_60_lag_180', 'yprice_time_max_60_lag_240']
added yprice_time_max_60_lag_180: r2: 1.059
Data shapes:  (194510, 35) (68526, 35) (51394, 35)

Valid MSE: 		 0.019383
Valid R2 (x100): 	 1.059

Test MSE: 		 0.015691
Test R2 (x100): 	 1.4162


In [19]:
selected_cols = added_cols.copy()

In [20]:
# agg_col = 'yprice'
# new_cols = add_time_depended_rolling(data, agg_col, [6, 60, 360, 1410], np.min, 'min')

# for col in new_cols:
#     data[col] = data[col] - data[agg_col
#                                 ]
# # validate_sklearn_model(model, data, selected_cols + new_cols, valid_ratio, test_ratio, droprows);
# added_cols = greedy_add_strategy(model, data, selected_cols, new_cols,
#                                     valid_ratio, test_ratio, droprows)

# validate_sklearn_model(model, data, added_cols, valid_ratio, test_ratio, droprows);


['yprice_time_min_6', 'yprice_time_min_60', 'yprice_time_min_360', 'yprice_time_min_1410']
Data shapes:  (194510, 35) (68526, 35) (51394, 35)

Valid MSE: 		 0.019383
Valid R2 (x100): 	 1.059

Test MSE: 		 0.015691
Test R2 (x100): 	 1.4162


In [78]:
agg_col = 'yprice'
# new_cols = add_time_depended_rolling(data, agg_col, oneday_agg_periods, np.std, 'std')
# data['yprice_time_zscore_60'] = data.yprice_time_mean_60 / (data.yprice_time_std_60.fillna(1) + 1e-5)
data['yprice_time_zscore_360'] = data.yprice_time_mean_360 / (data.yprice_time_std_360.fillna(1) + 0.1)
# data['yprice_time_zscore_1410'] = data.yprice_time_mean_1410 / (data.yprice_time_std_1410.fillna(1) + 1e-5)

zscore_cols = [
#     'yprice_time_zscore_60', 
    'yprice_time_zscore_360',
#     'yprice_time_zscore_1410', 
]

added_cols = greedy_add_strategy(model, data, selected_cols, zscore_cols, 
                                 valid_ratio, test_ratio, droprows)
validate_sklearn_model(model, data, added_cols, valid_ratio, test_ratio, droprows);

added yprice_time_zscore_360: r2: 1.9878
Data shapes:  (194510, 36) (68526, 36) (51394, 36)

Valid MSE: 		 0.019201
Valid R2 (x100): 	 1.9878

Test MSE: 		 0.015652
Test R2 (x100): 	 1.6602


In [80]:
valid_ratio, test_ratio

(0.2, 0.15)

In [24]:
validate_sklearn_model(model, data, selected_cols + ['yprice_time_zscore_360'], valid_ratio, test_ratio, droprows);

Data shapes:  (194510, 36) (68526, 36) (51394, 36)

Valid MSE: 		 0.019221
Valid R2 (x100): 	 1.8884

Test MSE: 		 0.01569
Test R2 (x100): 	 1.4207


In [28]:
selected_cols = added_cols.copy()

In [30]:
# agg_col = 'xprice'
# new_cols = add_time_depended_rolling(data, agg_col, oneday_agg_periods, np.std, 'std')
# data['xprice_time_zscore_60'] = data.yprice_time_mean_60 / (data.yprice_time_std_60.fillna(1) + 1e-5)
# data['xprice_time_zscore_360'] = data.yprice_time_mean_360 / (data.yprice_time_std_360.fillna(1) + 1e-5)
# data['xprice_time_zscore_1410'] = data.yprice_time_mean_1410 / (data.yprice_time_std_1410.fillna(1) + 1e-5)

# zscore_cols = [
#     'xprice_time_zscore_60', 
#     'xprice_time_zscore_360',
#     'xprice_time_zscore_1410', 
# ]

# added_cols = greedy_add_strategy(model, data, selected_cols, zscore_cols, 
#                                  valid_ratio, test_ratio, droprows)
# validate_sklearn_model(model, data, selected_cols + ['xprice_time_zscore_360'], valid_ratio, test_ratio, droprows);

['xprice_time_std_6', 'xprice_time_std_60', 'xprice_time_std_360', 'xprice_time_std_720', 'xprice_time_std_1410']
Data shapes:  (194510, 37) (68526, 37) (51394, 37)

Valid MSE: 		 0.019221
Valid R2 (x100): 	 1.8884

Test MSE: 		 0.01569
Test R2 (x100): 	 1.4207


In [34]:
# agg_col = 'yprice_time_zscore_360'
# new_cols = add_shifts(data, agg_col, month_agg_periods)
# usecols.extend(new_cols)

# added_cols = greedy_add_strategy(model, data, selected_cols, new_cols,
#                                 valid_ratio, test_ratio, droprows)

# validate_sklearn_model(model, data, added_cols, valid_ratio, test_ratio, droprows);

['yprice_time_zscore_360_lag_1410', 'yprice_time_zscore_360_lag_2820', 'yprice_time_zscore_360_lag_4230', 'yprice_time_zscore_360_lag_5640', 'yprice_time_zscore_360_lag_7050', 'yprice_time_zscore_360_lag_14100', 'yprice_time_zscore_360_lag_21150', 'yprice_time_zscore_360_lag_28200']
added yprice_time_zscore_360_lag_28200: r2: 1.9145
Data shapes:  (194510, 37) (68526, 37) (51394, 37)

Valid MSE: 		 0.019216
Valid R2 (x100): 	 1.9145

Test MSE: 		 0.015709
Test R2 (x100): 	 1.3032


In [14]:
print_importances(model, selected_cols)

xy_geom_full_history_diff                13.39%           -0.28
xprice_time_mean_6                       11.31%            0.24
yprice_time_mean_360                     7.22%           -0.15
xprice_full_history_diff                 7.10%            0.15
xprice_time_mean_720                     7.09%           -0.15
yprice_time_mean_720                     6.43%            0.14
yprice_full_history_diff                 6.17%            0.13
xprice_time_mean_1410                    6.07%            0.13
xprice_time_mean_360                     5.08%            0.11
yprice_time_mean_1410                    4.02%          -0.085
yrel_from_opening                        3.89%           0.082
xprice_time_mean_60                      3.13%           0.066
yprice_time_mean_6                       2.64%          -0.056
yprice_time_mean_360_diff_1410           1.32%          -0.028
yprice_time_mean_60                      1.31%          -0.028
xrel_from_opening                        1.25%       

In [28]:
selected_cols = greedy_add_del_strategy(model, data, usecols, valid_ratio, test_ratio, droprows)

removed yprice_time_mean_360_diff_2820: r2: 0.95017
removed xprice_time_mean_360_diff_4230: r2: 0.99257
removed yprice_time_mean_60: r2: 1.0282
removed xy_geom_time_mean_60: r2: 1.2378
removed yprice_time_mean_6: r2: 1.2725
removed xprice_time_mean_6: r2: 1.3136
removed xprice_time_mean_360_diff_5640: r2: 1.3487
removed xprice_time_mean_360_diff_14100: r2: 1.3803
removed yprice_time_mean_360_diff_21150: r2: 1.4096
removed yprice_time_mean_360_diff_4230: r2: 1.4122
removed xy_relation_time_mean_60_diff_7050: r2: 1.4135
removed xrel_from_opening: r2: 1.4145
removed xy_relation_time_mean_60_diff_2820: r2: 1.4153
removed xy_relation_time_mean_360_diff_2820: r2: 1.4155
removed xy_relation_time_mean_60: r2: 1.416
removed xy_relation_time_mean_60_diff_2820: r2: 1.4163
removed xy_relation_time_mean_60_diff_21150: r2: 1.4167
removed xy_relation_time_mean_60_diff_14100: r2: 1.417
removed yx_relation_full_history_diff: r2: 1.4173
removed xy_relation_time_mean_360_diff_4230: r2: 1.4175
removed xy_

In [30]:
selected_cols

['ydiff_from_closing',
 'xdiff_from_closing',
 'xrel_from_closing',
 'ydiff_from_closing',
 'xdiff_from_closing',
 'xprice_time_mean_60',
 'xprice_time_mean_360',
 'xprice_time_mean_720',
 'xprice_time_mean_1410',
 'yprice_time_mean_360',
 'yprice_time_mean_720',
 'yprice_time_mean_1410',
 'xprice_full_history_diff',
 'yprice_full_history_diff',
 'xy_geom_full_history_diff',
 'yprice_time_mean_360_diff_1410',
 'yprice_time_mean_360_diff_5640',
 'yprice_time_mean_360_diff_14100',
 'yprice_time_mean_360_diff_28200',
 'yprice_time_mean_360_diff_14100',
 'xprice_time_mean_360_diff_1410',
 'xprice_time_mean_360_diff_2820',
 'xprice_time_mean_360_diff_21150',
 'xprice_time_mean_360_diff_28200',
 'xy_relation_time_mean_6',
 'xy_relation_time_mean_360',
 'xy_relation_time_mean_720',
 'xy_relation_time_mean_1410',
 'xy_geom_time_mean_6',
 'xy_geom_time_mean_360',
 'xy_geom_time_mean_720',
 'xy_geom_time_mean_1410',
 'xy_relation_time_mean_360_diff_1410',
 'xy_relation_time_mean_360_diff_7050',


In [29]:
validate_sklearn_model(model, data, selected_cols, valid_ratio, test_ratio, droprows);

Data shapes:  (194510, 36) (68526, 36) (51394, 36)

Valid MSE: 		 0.019312
Valid R2 (x100): 	 1.421

Test MSE: 		 0.015751
Test R2 (x100): 	 1.0392
