In [1]:
import pandas as pd
import numpy as np

import matplotlib.pylab as plt
%matplotlib inline

from tqdm import tqdm, tqdm_notebook
pd.set_option("display.max_rows", 100)
pd.set_option('display.max_columns', 200)

from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression, ElasticNet, Ridge
fname = 'data.csv'

from ts_features import init_data, add_hand_feats, add_diffs, add_shifts
from ts_features import add_time_depended_rolling, add_full_history_diff
from ts_validation import time_split, validate_sklearn_model, greedy_add_del_strategy, greedy_add_strategy
from helper import print_importances

In [2]:
short_agg_periods = [6, 60, 360]
oneday_agg_periods = [6, 60, 360, 720, 1410]
twoweeks_agg_periods = [6, 60, 360, 720, 1410, 2820, 7050, 14100]

month_days_periods = [1,2,3,4,5,10,15,20]
month_agg_periods = list(map(lambda x: x * 1410, month_days_periods))
print('month_agg_periods: {}'.format(month_agg_periods))

merged_agg_periods = [6, 60, 360, 720, 1410, 2820, 4230, 5640, 7050, 14100, 21150]

month_agg_periods: [1410, 2820, 4230, 5640, 7050, 14100, 21150, 28200]


In [3]:
valid_ratio = 0.2
test_ratio = 0.15

droprows = 7050
# droprows = 28200

In [4]:
model = Ridge(alpha=10)

In [5]:
# usecols = []

# data = init_data(fname)

# hand_crafted_cols = add_hand_feats(data)
# usecols.extend(hand_crafted_cols)

# xcols = add_time_depended_rolling(data, 'xprice', oneday_agg_periods, np.mean, 'mean')
# for col in xcols:
#     data[col] = data.xprice - data[col]
# usecols.extend(xcols)

# ycols = add_time_depended_rolling(data, 'yprice', oneday_agg_periods, np.mean, 'mean')
# for col in ycols:
#     data[col] = data.yprice - data[col]
# usecols.extend(ycols)

# usecols.append(add_full_history_diff(data, 'xprice'))
# usecols.append(add_full_history_diff(data, 'yprice'))
# usecols.append(add_full_history_diff(data, 'yx_relation'))
# usecols.append(add_full_history_diff(data, 'xy_geom'))

# validate_sklearn_model(model, data, usecols, valid_ratio, test_ratio, droprows);
# data.to_pickle('init_data.pkl')

In [6]:
data = pd.read_pickle('init_data.pkl')
usecols = [
    'ydiff_from_closing','xdiff_from_closing', 'yrel_from_closing','xrel_from_closing',
    'ydiff_from_closing','xdiff_from_closing','yrel_from_opening','xrel_from_opening',
    'xprice_time_mean_6','xprice_time_mean_60','xprice_time_mean_360','xprice_time_mean_720',
    'xprice_time_mean_1410','yprice_time_mean_6','yprice_time_mean_60','yprice_time_mean_360',
    'yprice_time_mean_720','yprice_time_mean_1410','xprice_full_history_diff','yprice_full_history_diff',
    'yx_relation_full_history_diff','xy_geom_full_history_diff'
]
validate_sklearn_model(model, data, usecols, valid_ratio, test_ratio, droprows);

Data shapes:  (215660, 22) (68526, 22) (51394, 22)

Valid MSE: 		 0.019413
Valid R2 (x100): 	 0.90537

Test MSE: 		 0.015764
Test R2 (x100): 	 0.95689


In [7]:
# rsi_spread_cols = add_rsi(data, 'yx_spread', twoweeks_agg_periods)
# usecols.extend(new_cols)

# rsi_yxrel_cols = add_rsi(data, 'yx_relation', twoweeks_agg_periods)
# usecols.extend(new_cols)

# rsi_spread_cols = add_rsi(data, 'xy_relation', twoweeks_agg_periods)
# usecols.extend(new_cols)

# rsi_spread_cols = add_rsi(data, 'xy_geom', twoweeks_agg_periods)
# usecols.extend(new_cols)

# rsi_spread_cols = add_rsi(data, 'xy_garmonic', twoweeks_agg_periods)
# usecols.extend(new_cols)

In [8]:
agg_col = 'yprice_time_mean_360'
new_cols = add_diffs(data, agg_col, month_agg_periods)
usecols.extend(new_cols)

selected_cols = greedy_add_strategy(model, data, usecols, new_cols,
                                    valid_ratio, test_ratio, droprows)

validate_sklearn_model(model, data, selected_cols, valid_ratio, test_ratio, droprows);

['yprice_time_mean_360_diff_1410', 'yprice_time_mean_360_diff_2820', 'yprice_time_mean_360_diff_4230', 'yprice_time_mean_360_diff_5640', 'yprice_time_mean_360_diff_7050', 'yprice_time_mean_360_diff_14100', 'yprice_time_mean_360_diff_21150', 'yprice_time_mean_360_diff_28200']
added yprice_time_mean_360_diff_14100: r2: 0.88222
Data shapes:  (194510, 30) (68526, 30) (51394, 30)

Valid MSE: 		 0.019418
Valid R2 (x100): 	 0.88222

Test MSE: 		 0.015722
Test R2 (x100): 	 1.2195


In [9]:
agg_col = 'xprice_time_mean_360'
new_cols = add_diffs(data, agg_col, month_agg_periods)
usecols.extend(new_cols)

selected_cols = greedy_add_strategy(model, data, selected_cols, new_cols,
                                    valid_ratio, test_ratio, droprows)

validate_sklearn_model(model, data, selected_cols, valid_ratio, test_ratio, droprows);

['xprice_time_mean_360_diff_1410', 'xprice_time_mean_360_diff_2820', 'xprice_time_mean_360_diff_4230', 'xprice_time_mean_360_diff_5640', 'xprice_time_mean_360_diff_7050', 'xprice_time_mean_360_diff_14100', 'xprice_time_mean_360_diff_21150', 'xprice_time_mean_360_diff_28200']
added xprice_time_mean_360_diff_1410: r2: 0.98928
added xprice_time_mean_360_diff_7050: r2: 0.99212
added xprice_time_mean_360_diff_28200: r2: 1.0374
Data shapes:  (194510, 33) (68526, 33) (51394, 33)

Valid MSE: 		 0.019388
Valid R2 (x100): 	 1.0374

Test MSE: 		 0.015696
Test R2 (x100): 	 1.383


In [10]:
# agg_col = 'yx_relation'
# new_cols = add_time_depended_rolling(data, agg_col, oneday_agg_periods, np.mean, 'mean')
# for col in new_cols:
#     data[col] = data[col] - data[agg_col]
# usecols.extend(new_cols)

# selected_cols = greedy_add_strategy(model, data, selected_cols, new_cols,
#                                     valid_ratio, test_ratio, droprows)
# validate_sklearn_model(model, data, selected_cols, valid_ratio, test_ratio, droprows);

In [11]:
# agg_col = 'xy_geom'
# new_cols = add_time_depended_rolling(data, agg_col, oneday_agg_periods, np.mean, 'mean')
# for col in new_cols:
#     data[col] = data[col] - data[agg_col]
# usecols.extend(new_cols)

# selected_cols = greedy_add_strategy(model, data, selected_cols, new_cols,
#                                     valid_ratio, test_ratio, droprows)
# validate_sklearn_model(model, data, selected_cols, valid_ratio, test_ratio, droprows);

In [12]:
# agg_col = 'yx_relation_time_mean_360'
# new_cols = add_diffs(data, agg_col, month_agg_periods)
# usecols.extend(new_cols)

# selected_cols = greedy_add_strategy(model, data, selected_cols, new_cols,
#                                     valid_ratio, test_ratio, droprows)
# validate_sklearn_model(model, data, selected_cols, valid_ratio, test_ratio, droprows);

In [13]:
# agg_col = 'xy_geom'
# new_cols = add_time_depended_rolling(data, agg_col, [60, 360, 1410], np.std, 'std')
# for col in new_cols:
#     data[col] = data[col].fillna(0)
# usecols.extend(new_cols)

# selected_cols = greedy_add_strategy(model, data, selected_cols, new_cols,
#                                     valid_ratio, test_ratio, droprows)
# validate_sklearn_model(model, data, selected_cols, valid_ratio, test_ratio, droprows);

In [14]:
# agg_col = 'xy_relation_time_mean_60'
# new_cols = add_diffs(data, agg_col, month_agg_periods)

# usecols.extend(new_cols)
# validate_sklearn_model(model, data, usecols, valid_ratio, test_ratio, droprows);

In [15]:
# agg_col = 'yprice'
# # new_cols = add_time_depended_rolling(data, agg_col, [60, 360, 1410], np.std, 'std')
# data['yprice_time_normbot_60'] = data.yprice_time_mean_60 - 2 * data.yprice_time_std_60.fillna(0)
# data['yprice_time_normtop_60'] = data.yprice_time_mean_60 + 2 * data.yprice_time_std_60.fillna(0)
# data['yprice_time_normbot_360'] = data.yprice_time_mean_360 - 2 * data.yprice_time_std_360.fillna(0)
# data['yprice_time_normtop_360'] = data.yprice_time_mean_360 + 2 * data.yprice_time_std_360.fillna(0)
# data['yprice_time_normbot_1410'] = data.yprice_time_mean_1410 - 2 * data.yprice_time_std_1410.fillna(0)
# data['yprice_time_normtop_1410'] = data.yprice_time_mean_1410 + 2 * data.yprice_time_std_1410.fillna(0)
# norm_cols = [
#     'yprice_time_normbot_60', 
#     'yprice_time_normtop_60',
#     'yprice_time_normbot_360', 
#     'yprice_time_normtop_360',
#     'yprice_time_normbot_1410',
#     'yprice_time_normtop_1410'
# ]
# for col in new_cols:
#     data[col] = data[col].fillna(0)
# validate_sklearn_model(model, data, selected_cols + norm_cols, valid_ratio, test_ratio, droprows);

In [16]:
agg_col = 'yprice'
new_cols = add_time_depended_rolling(data, agg_col, [6, 60, 360, 1410], np.max, 'max')

for col in new_cols:
    data[col] = data[col] - data[agg_col
                                ]
# validate_sklearn_model(model, data, selected_cols + new_cols, valid_ratio, test_ratio, droprows);
added_cols = greedy_add_strategy(model, data, selected_cols, new_cols, 
                                 valid_ratio, test_ratio, droprows)

validate_sklearn_model(model, data, added_cols, valid_ratio, test_ratio, droprows);

['yprice_time_max_6', 'yprice_time_max_60', 'yprice_time_max_360', 'yprice_time_max_1410']
added yprice_time_max_6: r2: 1.0393
Data shapes:  (194510, 34) (68526, 34) (51394, 34)

Valid MSE: 		 0.019387
Valid R2 (x100): 	 1.0393

Test MSE: 		 0.015695
Test R2 (x100): 	 1.3925


In [17]:
agg_col = 'yprice_time_max_60'
new_cols = add_shifts(data, agg_col, [60, 120, 180, 240])
usecols.extend(new_cols)

added_cols = greedy_add_strategy(model, data, added_cols, new_cols,
                                valid_ratio, test_ratio, droprows)

validate_sklearn_model(model, data, added_cols, valid_ratio, test_ratio, droprows);

['yprice_time_max_60_lag_60', 'yprice_time_max_60_lag_120', 'yprice_time_max_60_lag_180', 'yprice_time_max_60_lag_240']
added yprice_time_max_60_lag_180: r2: 1.059
Data shapes:  (194510, 35) (68526, 35) (51394, 35)

Valid MSE: 		 0.019383
Valid R2 (x100): 	 1.059

Test MSE: 		 0.015691
Test R2 (x100): 	 1.4162


In [19]:
selected_cols = added_cols.copy()

In [20]:
agg_col = 'yprice'
new_cols = add_time_depended_rolling(data, agg_col, [6, 60, 360, 1410], np.min, 'min')

for col in new_cols:
    data[col] = data[col] - data[agg_col
                                ]
# validate_sklearn_model(model, data, selected_cols + new_cols, valid_ratio, test_ratio, droprows);
added_cols = greedy_add_strategy(model, data, selected_cols, new_cols,
                                    valid_ratio, test_ratio, droprows)

validate_sklearn_model(model, data, added_cols, valid_ratio, test_ratio, droprows);


['yprice_time_min_6', 'yprice_time_min_60', 'yprice_time_min_360', 'yprice_time_min_1410']
Data shapes:  (194510, 35) (68526, 35) (51394, 35)

Valid MSE: 		 0.019383
Valid R2 (x100): 	 1.059

Test MSE: 		 0.015691
Test R2 (x100): 	 1.4162


In [18]:
validate_sklearn_model(model, data, selected_cols, valid_ratio, test_ratio, droprows);


Data shapes:  (194510, 35) (68526, 35) (51394, 35)

Valid MSE: 		 0.019383
Valid R2 (x100): 	 1.059

Test MSE: 		 0.015691
Test R2 (x100): 	 1.4162


In [None]:
usecols.extend(new_cols)

# selected_cols = greedy_add_strategy(model, data, selected_cols, new_cols,
#                                     valid_ratio, test_ratio, droprows)


In [14]:
print_importances(model, selected_cols)

xy_geom_full_history_diff                13.39%           -0.28
xprice_time_mean_6                       11.31%            0.24
yprice_time_mean_360                     7.22%           -0.15
xprice_full_history_diff                 7.10%            0.15
xprice_time_mean_720                     7.09%           -0.15
yprice_time_mean_720                     6.43%            0.14
yprice_full_history_diff                 6.17%            0.13
xprice_time_mean_1410                    6.07%            0.13
xprice_time_mean_360                     5.08%            0.11
yprice_time_mean_1410                    4.02%          -0.085
yrel_from_opening                        3.89%           0.082
xprice_time_mean_60                      3.13%           0.066
yprice_time_mean_6                       2.64%          -0.056
yprice_time_mean_360_diff_1410           1.32%          -0.028
yprice_time_mean_60                      1.31%          -0.028
xrel_from_opening                        1.25%       

In [28]:
selected_cols = greedy_add_del_strategy(model, data, usecols, valid_ratio, test_ratio, droprows)

removed yprice_time_mean_360_diff_2820: r2: 0.95017
removed xprice_time_mean_360_diff_4230: r2: 0.99257
removed yprice_time_mean_60: r2: 1.0282
removed xy_geom_time_mean_60: r2: 1.2378
removed yprice_time_mean_6: r2: 1.2725
removed xprice_time_mean_6: r2: 1.3136
removed xprice_time_mean_360_diff_5640: r2: 1.3487
removed xprice_time_mean_360_diff_14100: r2: 1.3803
removed yprice_time_mean_360_diff_21150: r2: 1.4096
removed yprice_time_mean_360_diff_4230: r2: 1.4122
removed xy_relation_time_mean_60_diff_7050: r2: 1.4135
removed xrel_from_opening: r2: 1.4145
removed xy_relation_time_mean_60_diff_2820: r2: 1.4153
removed xy_relation_time_mean_360_diff_2820: r2: 1.4155
removed xy_relation_time_mean_60: r2: 1.416
removed xy_relation_time_mean_60_diff_2820: r2: 1.4163
removed xy_relation_time_mean_60_diff_21150: r2: 1.4167
removed xy_relation_time_mean_60_diff_14100: r2: 1.417
removed yx_relation_full_history_diff: r2: 1.4173
removed xy_relation_time_mean_360_diff_4230: r2: 1.4175
removed xy_

In [30]:
selected_cols

['ydiff_from_closing',
 'xdiff_from_closing',
 'xrel_from_closing',
 'ydiff_from_closing',
 'xdiff_from_closing',
 'xprice_time_mean_60',
 'xprice_time_mean_360',
 'xprice_time_mean_720',
 'xprice_time_mean_1410',
 'yprice_time_mean_360',
 'yprice_time_mean_720',
 'yprice_time_mean_1410',
 'xprice_full_history_diff',
 'yprice_full_history_diff',
 'xy_geom_full_history_diff',
 'yprice_time_mean_360_diff_1410',
 'yprice_time_mean_360_diff_5640',
 'yprice_time_mean_360_diff_14100',
 'yprice_time_mean_360_diff_28200',
 'yprice_time_mean_360_diff_14100',
 'xprice_time_mean_360_diff_1410',
 'xprice_time_mean_360_diff_2820',
 'xprice_time_mean_360_diff_21150',
 'xprice_time_mean_360_diff_28200',
 'xy_relation_time_mean_6',
 'xy_relation_time_mean_360',
 'xy_relation_time_mean_720',
 'xy_relation_time_mean_1410',
 'xy_geom_time_mean_6',
 'xy_geom_time_mean_360',
 'xy_geom_time_mean_720',
 'xy_geom_time_mean_1410',
 'xy_relation_time_mean_360_diff_1410',
 'xy_relation_time_mean_360_diff_7050',


In [29]:
validate_sklearn_model(model, data, selected_cols, valid_ratio, test_ratio, droprows);

Data shapes:  (194510, 36) (68526, 36) (51394, 36)

Valid MSE: 		 0.019312
Valid R2 (x100): 	 1.421

Test MSE: 		 0.015751
Test R2 (x100): 	 1.0392
