In [1]:
import pandas as pd
import numpy as np

import matplotlib.pylab as plt
%matplotlib inline

from tqdm import tqdm, tqdm_notebook
# Raise pandas display limits so wide/long frames are not truncated in cell outputs.
pd.set_option("display.max_rows", 100)
pd.set_option('display.max_columns', 200)

from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression, ElasticNet, Ridge
# Path of the input CSV; passed to init_data() when the frame is loaded.
fname = 'data.csv'

from ts_features import init_data, add_hand_feats, add_diffs, add_shifts
from ts_features import add_ewma, add_intraday_ewma
from ts_features import add_time_depended_rolling, add_full_history_diff
from ts_validation import validate_sklearn_model, validate_model_by_pentate, validate_model_by_triplets
from ts_validation import greedy_add_del_strategy, greedy_add_strategy
from helper import print_importances

Period lengths are counted in 10-second ticks:
- 6 - 1 min
- 60 - 10 min
- 360 - 1 hour
- 1410 - 1 workday (~ 4 hours per day)
- 7050 - 1 workweek (5 days per week)
- 28200 - 1 workmonth (~ 4 weeks per month)


All features are built with a standardized set of helper functions and aggregation periods.

**7 basic features:**
- xprice
- yprice
- yx_spread = yprice - xprice
- xy_relation = xprice / yprice
- xy_square = (xprice ** 2 + yprice ** 2) ** 0.5 / 2
- xy_geom = (xprice * yprice) ** 0.5
- xy_garmonic = 2 / (1 / xprice + 1 / yprice) (the harmonic mean)

**basic periods:**
- intraday_agg_periods: [6, 12, 24, 60, 120, 360, 720]
- oneday_agg_periods: [6, 60, 360, 720, 1410]
- oneweek_agg_periods: [1410, 2820, 4230, 5640]
- short_agg_periods: [6, 60, 360]

In [2]:
# --- Aggregation windows (number of rows per window) -------------------------
intraday_agg_periods = [6, 12, 24, 60, 120, 360, 720]
oneday_agg_periods = [6, 60, 360, 720, 1410]
short_agg_periods = [6, 60, 360]
# One to four multiples of the 1410-row window.
oneweek_agg_periods = [1410 * multiple for multiple in range(1, 5)]

# --- Validation settings ------------------------------------------------------
valid_ratio = test_ratio = 0.25
# Each tuple sums to 1.0; presumably (train, valid, test) shares -- confirm in ts_validation.
triplets = [
    (0.5, 0.25, 0.25),
    (0.6, 0.2, 0.2),
    (0.7, 0.15, 0.15),
]
# Passed to the validation helper; presumably the number of leading rows to skip -- confirm.
droprows = 7050


month_agg_periods: [1410, 2820, 4230, 5640, 7050, 14100, 21150]


In [3]:
# Build the working frame from the CSV named by `fname`.
data = init_data(fname)
# add_hand_feats receives the frame; its return value is kept as hand_crafted_cols
# (presumably the names of the columns it added -- confirm in ts_features).
hand_crafted_cols = add_hand_feats(data)
# Last expression of the cell: show the first rows of the frame.
data.head()

['ydiff_from_closing', 'xdiff_from_closing', 'yrel_from_closing', 'xrel_from_closing', 'ydiff_from_closing', 'xdiff_from_closing', 'yrel_from_opening', 'xrel_from_opening']


Unnamed: 0_level_0,timestamp,xprice,yprice,returns,yx_spread,yx_relation,xy_relation,xy_square,xy_geom,xy_garmonic,weekday,day,periods_before_closing,periods_after_opening,ydiff_from_closing,yrel_from_closing,xdiff_from_closing,xrel_from_closing,ydiff_from_opening,yrel_from_opening,xdiff_from_opening,xrel_from_opening
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2013-01-03 20:05:00,2013-01-03 20:05:00,12.8375,23.25,0.3125,10.4125,1.8111,0.552151,13.279344,17.276339,16.541566,3,0,1409,0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.999999
2013-01-03 20:05:10,2013-01-03 20:05:10,12.8875,23.3,0.275,10.4125,1.807953,0.553112,13.313317,17.328553,16.595717,3,0,1408,1,0.0,1.0,0.0,1.0,0.05,1.00215,0.05,1.003894
2013-01-03 20:05:20,2013-01-03 20:05:20,12.8625,23.3375,0.25,10.475,1.814383,0.551152,13.32369,17.325663,16.584453,3,0,1407,2,0.0,1.0,0.0,1.0,0.0875,1.003763,0.025,1.001947
2013-01-03 20:05:30,2013-01-03 20:05:30,12.8375,23.3625,0.2375,10.525,1.819864,0.549492,13.328614,17.318086,16.56995,3,0,1406,3,0.0,1.0,0.0,1.0,0.1125,1.004838,0.0,0.999999
2013-01-03 20:05:40,2013-01-03 20:05:40,12.8375,23.3625,0.325,10.525,1.819864,0.549492,13.328614,17.318086,16.56995,3,0,1405,4,0.0,1.0,0.0,1.0,0.1125,1.004838,0.0,0.999999


In [4]:
# Deviation of every base series from its time-based rolling mean.
agg_cols = ['xprice', 'yprice', 'xy_relation', 'yx_spread', 'xy_geom', 'xy_square', 'xy_garmonic']

for base_col in agg_cols:
    # The helper is called with np.mean and the 'mean' tag over the one-day windows;
    # what it returns is iterated below as column names of `data`.
    mean_cols = add_time_depended_rolling(data, base_col, oneday_agg_periods, np.mean, 'mean')
    base_values = data[base_col]
    for mean_col in mean_cols:
        # Overwritten in place: from here on the *_time_mean_* columns hold
        # (value - rolling mean) rather than the mean itself.
        data[mean_col] = base_values - data[mean_col]

['xprice_time_mean_6', 'xprice_time_mean_60', 'xprice_time_mean_360', 'xprice_time_mean_720', 'xprice_time_mean_1410']
['yprice_time_mean_6', 'yprice_time_mean_60', 'yprice_time_mean_360', 'yprice_time_mean_720', 'yprice_time_mean_1410']
['xy_relation_time_mean_6', 'xy_relation_time_mean_60', 'xy_relation_time_mean_360', 'xy_relation_time_mean_720', 'xy_relation_time_mean_1410']
['yx_spread_time_mean_6', 'yx_spread_time_mean_60', 'yx_spread_time_mean_360', 'yx_spread_time_mean_720', 'yx_spread_time_mean_1410']
['xy_geom_time_mean_6', 'xy_geom_time_mean_60', 'xy_geom_time_mean_360', 'xy_geom_time_mean_720', 'xy_geom_time_mean_1410']
['xy_square_time_mean_6', 'xy_square_time_mean_60', 'xy_square_time_mean_360', 'xy_square_time_mean_720', 'xy_square_time_mean_1410']
['xy_garmonic_time_mean_6', 'xy_garmonic_time_mean_60', 'xy_garmonic_time_mean_360', 'xy_garmonic_time_mean_720', 'xy_garmonic_time_mean_1410']


In [5]:
# Rolling standard deviation of every base series over the one-day windows.
agg_cols = ['xprice', 'yprice', 'xy_relation', 'yx_spread', 'xy_geom', 'xy_square', 'xy_garmonic']
# Constant added to every std value; these columns are later used as the
# denominator of the z-score features, so this keeps the divisor away from zero.
std_reg_const = 0.1
# Names of all std columns created by this cell, across every base series.
std_cols = []

for agg_col in agg_cols:
    new_std_cols = add_time_depended_rolling(data, agg_col, oneday_agg_periods, np.std, 'std')
    for col in new_std_cols:
        # Missing std values are treated as 0 before the constant is added.
        data[col] = data[col].fillna(0) + std_reg_const
    # Accumulate instead of overwriting, so std_cols is not left holding only
    # the columns of the last base series.
    std_cols.extend(new_std_cols)

['xprice_time_std_6', 'xprice_time_std_60', 'xprice_time_std_360', 'xprice_time_std_720', 'xprice_time_std_1410']
['yprice_time_std_6', 'yprice_time_std_60', 'yprice_time_std_360', 'yprice_time_std_720', 'yprice_time_std_1410']
['xy_relation_time_std_6', 'xy_relation_time_std_60', 'xy_relation_time_std_360', 'xy_relation_time_std_720', 'xy_relation_time_std_1410']
['yx_spread_time_std_6', 'yx_spread_time_std_60', 'yx_spread_time_std_360', 'yx_spread_time_std_720', 'yx_spread_time_std_1410']
['xy_geom_time_std_6', 'xy_geom_time_std_60', 'xy_geom_time_std_360', 'xy_geom_time_std_720', 'xy_geom_time_std_1410']
['xy_square_time_std_6', 'xy_square_time_std_60', 'xy_square_time_std_360', 'xy_square_time_std_720', 'xy_square_time_std_1410']
['xy_garmonic_time_std_6', 'xy_garmonic_time_std_60', 'xy_garmonic_time_std_360', 'xy_garmonic_time_std_720', 'xy_garmonic_time_std_1410']


In [6]:
# Z-score style features: (value - rolling mean) / (rolling std + constant).
agg_cols = ['xprice', 'yprice', 'xy_relation', 'yx_spread', 'xy_geom', 'xy_square', 'xy_garmonic']
zscore_cols = []
for base_col in agg_cols:
    for window in oneday_agg_periods:
        zscore_name = '{}_time_zscore_{}'.format(base_col, window)
        # The *_time_mean_* columns were overwritten earlier with the deviation
        # from the rolling mean, so this is already the numerator.
        deviation = data['{}_time_mean_{}'.format(base_col, window)]
        # The *_time_std_* columns already include the additive std_reg_const.
        scale = data['{}_time_std_{}'.format(base_col, window)]
        data[zscore_name] = deviation / scale
        zscore_cols.append(zscore_name)

In [7]:
# Base series for which a full-history diff feature is requested.
agg_cols = ['xprice', 'yprice', 'xy_relation', 'yx_spread', 'xy_geom', 'xy_square', 'xy_garmonic']

for agg_col in agg_cols:
    # history_cols is reassigned on every iteration and not read again in this cell;
    # the helper is called for its effect on `data` (presumably adds the
    # <col>_full_history_diff columns shown in the output -- confirm in ts_features).
    history_cols = add_full_history_diff(data, agg_col)

xprice_full_history_diff
yprice_full_history_diff
xy_relation_full_history_diff
yx_spread_full_history_diff
xy_geom_full_history_diff
xy_square_full_history_diff
xy_garmonic_full_history_diff


In [8]:
# Deviation of every base series from its intraday EWMA.
agg_cols = ['xprice', 'yprice', 'xy_relation', 'yx_spread', 'xy_geom', 'xy_square', 'xy_garmonic']

for base_col in agg_cols:
    ewma_cols = add_intraday_ewma(data, base_col, intraday_agg_periods)
    base_values = data[base_col]
    for ewma_col in ewma_cols:
        # Overwritten in place: the *_dayly_ewma_* columns hold (value - EWMA) afterwards.
        data[ewma_col] = base_values - data[ewma_col]

['xprice_dayly_ewma_6', 'xprice_dayly_ewma_12', 'xprice_dayly_ewma_24', 'xprice_dayly_ewma_60', 'xprice_dayly_ewma_120', 'xprice_dayly_ewma_360', 'xprice_dayly_ewma_720']
['yprice_dayly_ewma_6', 'yprice_dayly_ewma_12', 'yprice_dayly_ewma_24', 'yprice_dayly_ewma_60', 'yprice_dayly_ewma_120', 'yprice_dayly_ewma_360', 'yprice_dayly_ewma_720']
['xy_relation_dayly_ewma_6', 'xy_relation_dayly_ewma_12', 'xy_relation_dayly_ewma_24', 'xy_relation_dayly_ewma_60', 'xy_relation_dayly_ewma_120', 'xy_relation_dayly_ewma_360', 'xy_relation_dayly_ewma_720']
['yx_spread_dayly_ewma_6', 'yx_spread_dayly_ewma_12', 'yx_spread_dayly_ewma_24', 'yx_spread_dayly_ewma_60', 'yx_spread_dayly_ewma_120', 'yx_spread_dayly_ewma_360', 'yx_spread_dayly_ewma_720']
['xy_geom_dayly_ewma_6', 'xy_geom_dayly_ewma_12', 'xy_geom_dayly_ewma_24', 'xy_geom_dayly_ewma_60', 'xy_geom_dayly_ewma_120', 'xy_geom_dayly_ewma_360', 'xy_geom_dayly_ewma_720']
['xy_square_dayly_ewma_6', 'xy_square_dayly_ewma_12', 'xy_square_dayly_ewma_24', '

In [9]:
# Pairwise features between two intraday-EWMA deviation columns of the same series.
agg_cols = ['xprice', 'yprice', 'xy_relation', 'yx_spread', 'xy_geom', 'xy_square', 'xy_garmonic']
# (longer window, shorter window) -- the first element is the larger one in every pair.
lags_pair = [(12, 6), (24, 6), (60, 6), (60, 24), (120, 60), (360, 60), (720, 360)]
lagpair_cols = []

for base_col in agg_cols:
    for long_lag, short_lag in lags_pair:
        long_values = data['{}_dayly_ewma_{}'.format(base_col, long_lag)]
        short_values = data['{}_dayly_ewma_{}'.format(base_col, short_lag)]

        dif_col = '{}_ewma_difpair_{}_{}'.format(base_col, long_lag, short_lag)
        prod_col = '{}_ewma_prodpair_{}_{}'.format(base_col, long_lag, short_lag)

        # Difference and element-wise product of the two columns.
        data[dif_col] = long_values - short_values
        data[prod_col] = long_values * short_values

        lagpair_cols += [dif_col, prod_col]

print(lagpair_cols)

['xprice_ewma_difpair_12_6', 'xprice_ewma_prodpair_12_6', 'xprice_ewma_difpair_24_6', 'xprice_ewma_prodpair_24_6', 'xprice_ewma_difpair_60_6', 'xprice_ewma_prodpair_60_6', 'xprice_ewma_difpair_60_24', 'xprice_ewma_prodpair_60_24', 'xprice_ewma_difpair_120_60', 'xprice_ewma_prodpair_120_60', 'xprice_ewma_difpair_360_60', 'xprice_ewma_prodpair_360_60', 'xprice_ewma_difpair_720_360', 'xprice_ewma_prodpair_720_360', 'yprice_ewma_difpair_12_6', 'yprice_ewma_prodpair_12_6', 'yprice_ewma_difpair_24_6', 'yprice_ewma_prodpair_24_6', 'yprice_ewma_difpair_60_6', 'yprice_ewma_prodpair_60_6', 'yprice_ewma_difpair_60_24', 'yprice_ewma_prodpair_60_24', 'yprice_ewma_difpair_120_60', 'yprice_ewma_prodpair_120_60', 'yprice_ewma_difpair_360_60', 'yprice_ewma_prodpair_360_60', 'yprice_ewma_difpair_720_360', 'yprice_ewma_prodpair_720_360', 'xy_relation_ewma_difpair_12_6', 'xy_relation_ewma_prodpair_12_6', 'xy_relation_ewma_difpair_24_6', 'xy_relation_ewma_prodpair_24_6', 'xy_relation_ewma_difpair_60_6', 'x

In [10]:
# Multi-day difference features over the one-week windows.
agg_cols = ['xprice', 'yprice', 'xy_relation', 'yx_spread', 'xy_geom', 'xy_square', 'xy_garmonic']

for base_col in agg_cols:
    overnight_dif_cols = add_diffs(data, base_col, oneweek_agg_periods)
    base_values = data[base_col]
    for dif_col in overnight_dif_cols:
        # NOTE(review): if add_diffs already stores (value - lagged value), this
        # subtraction turns the column back into the lagged value -- confirm
        # against ts_features.add_diffs that this is intended.
        data[dif_col] = base_values - data[dif_col]

['xprice_diff_1410', 'xprice_diff_2820', 'xprice_diff_4230', 'xprice_diff_5640']
['yprice_diff_1410', 'yprice_diff_2820', 'yprice_diff_4230', 'yprice_diff_5640']
['xy_relation_diff_1410', 'xy_relation_diff_2820', 'xy_relation_diff_4230', 'xy_relation_diff_5640']
['yx_spread_diff_1410', 'yx_spread_diff_2820', 'yx_spread_diff_4230', 'yx_spread_diff_5640']
['xy_geom_diff_1410', 'xy_geom_diff_2820', 'xy_geom_diff_4230', 'xy_geom_diff_5640']
['xy_square_diff_1410', 'xy_square_diff_2820', 'xy_square_diff_4230', 'xy_square_diff_5640']
['xy_garmonic_diff_1410', 'xy_garmonic_diff_2820', 'xy_garmonic_diff_4230', 'xy_garmonic_diff_5640']


In [11]:
# 0/1 indicator columns derived from the weekday number of `timestamp`
# (pandas numbers weekdays from Monday = 0).
weekday_no = data.timestamp.dt.weekday
weekday_flags = [
    ('is_monday', 0),
    ('is_tuesday', 1),
    ('is_wednesday', 2),
    ('is_thursday', 3),
    ('is_friday', 4),
]
for flag_col, day_no in weekday_flags:
    data[flag_col] = (weekday_no == day_no).astype(int)

# Set for weekday numbers 2 and above, i.e. from Wednesday onward.
data['is_end_of_week'] = (weekday_no >= 2).astype(int)

In [12]:
# Columns to shift by the one-week windows; at this point they hold the
# deviation from the 360-period rolling mean, not the mean itself.
agg_cols = ['yprice_time_mean_360', 'xprice_time_mean_360']
for agg_col in agg_cols:
    # new_cols is reassigned on each iteration and not read again in this cell
    # (presumably the names of the *_lag_<n> columns shown in the output -- confirm).
    new_cols = add_shifts(data,agg_col,oneweek_agg_periods)

['yprice_time_mean_360_lag_1410', 'yprice_time_mean_360_lag_2820', 'yprice_time_mean_360_lag_4230', 'yprice_time_mean_360_lag_5640']
['xprice_time_mean_360_lag_1410', 'xprice_time_mean_360_lag_2820', 'xprice_time_mean_360_lag_4230', 'xprice_time_mean_360_lag_5640']


In [13]:
# log1p(periods_before_closing) / 7, capped at 0.6, then scaled by 10.
# The cap is reached once roughly 66 or more periods remain (log1p(p) / 7 > 0.6),
# so the feature only varies close to the close and is 6.0 elsewhere.
scaled_log_distance = np.log1p(data.periods_before_closing) / 7
data['closing_indicator'] = scaled_log_distance.clip(upper=0.6) * 10

In [14]:
# Feature subset fed to the model; the order is kept exactly as originally listed.
selected_cols = [
    'closing_indicator',
    'xdiff_from_closing',
    'xprice_diff_1410',
    'xprice_time_mean_1410',
    'xprice_time_mean_360',
    'xprice_time_mean_360_lag_1410',
    'xprice_time_mean_6',
    'xprice_time_mean_60',
    'xprice_time_mean_720',
    'ydiff_from_closing',
    'yprice_time_mean_1410',
    'yprice_time_mean_360',
    'yprice_time_mean_60',
    'yprice_time_mean_360_lag_2820',
    'yprice_time_mean_720',
    'yprice_time_zscore_720',
    'yprice_time_mean_360_lag_1410',
    'yprice_time_zscore_360',
    'xprice',
]
print(len(selected_cols))

# Ridge regression with regularisation strength alpha = 1.
model = Ridge(alpha=1)
# Last expression of the cell: the validation helper's result is displayed as the
# cell output. NOTE(review): the recorded output shows r2 values above 1 --
# confirm how the helper scales r2.
validate_model_by_pentate(model, data, selected_cols, droprows)

19


Unnamed: 0,train_50_percent,train_60_percent,train_70_percent,train_80_percent,train_90_percent,min_stats,max_stats,avg
mse,0.021624,0.020502,0.016182,0.021429,0.014772,0.014771,0.021622,0.018707
r2,0.322848,0.795997,0.105496,2.593555,2.218181,0.105469,2.59375,1.248047


In [None]:
# Log-transformed prices; np.log1p is applied to each price column as a whole.
data['ylog'] = np.log1p(data.yprice)
data['xlog'] = np.log1p(data.xprice)

# add_intraday_ewma and intraday_agg_periods come from the import and config
# cells at the top of the notebook; they are deliberately not re-imported or
# re-declared here, so there is a single place to change them.
agg_cols = ['ylog', 'xlog']

for agg_col in agg_cols:
    ewma_cols = add_intraday_ewma(data, agg_col, intraday_agg_periods)
    for col in ewma_cols:
        # Keep the deviation from the intraday EWMA instead of the EWMA itself.
        data[col] = data[agg_col] - data[col]

In [None]:
# Shifts and intraday EWMAs of the 6- and 60-period mean-deviation columns.
# add_shifts, add_intraday_ewma and intraday_agg_periods come from the import
# and config cells at the top of the notebook, so they are not repeated here.
agg_cols = [
    'yprice_time_mean_6', 'yprice_time_mean_60',
    'xprice_time_mean_6', 'xprice_time_mean_60',
    'xy_geom_time_mean_6', 'xy_geom_time_mean_60',
    'yx_spread_time_mean_6', 'yx_spread_time_mean_60',
]
for agg_col in agg_cols:
    # Return values are not needed; both helpers are called for their effect on `data`.
    add_shifts(data, agg_col, intraday_agg_periods)
    add_intraday_ewma(data, agg_col, intraday_agg_periods)

In [None]:
# add_rsi is not part of the top import cell, so it is imported where it is used.
from ts_features import add_rsi

# intraday_agg_periods comes from the config cell at the top of the notebook;
# the unused oneweek_agg_periods re-declaration was dropped.
agg_cols = [
    'yprice_time_mean_6', 'yprice_time_mean_60',
    'xprice_time_mean_6', 'xprice_time_mean_60',
    'xy_geom_time_mean_6', 'xy_geom_time_mean_60',
    'yx_spread_time_mean_6', 'yx_spread_time_mean_60',
]
for agg_col in agg_cols:
    # Return value is not needed; the helper is called for its effect on `data`.
    add_rsi(data, agg_col, intraday_agg_periods)

In [None]:
# Shifts and intraday EWMAs of the 360- and 720-period mean-deviation columns,
# restricted to the longer intraday windows.
# add_shifts and add_intraday_ewma come from the import cell at the top of the
# notebook; the period lists previously re-declared here were never used.
long_intraday_periods = [120, 360, 720]

agg_cols = [
    'yprice_time_mean_360', 'yprice_time_mean_720',
    'xprice_time_mean_360', 'xprice_time_mean_720',
    'xy_geom_time_mean_360', 'xy_geom_time_mean_720',
    'yx_spread_time_mean_360', 'yx_spread_time_mean_720',
]
for agg_col in agg_cols:
    # Return values are not needed; both helpers are called for their effect on `data`.
    add_shifts(data, agg_col, long_intraday_periods)
    add_intraday_ewma(data, agg_col, long_intraday_periods)

In [None]:
# Shifts of the "diff from closing" columns by the one-week windows.
# add_shifts and oneweek_agg_periods come from the import and config cells at
# the top of the notebook, so the import is not repeated here.
agg_cols = [
    'xdiff_from_closing', 'ydiff_from_closing',
]
for agg_col in agg_cols:
    # Return value is not needed; the helper is called for its effect on `data`.
    add_shifts(data, agg_col, oneweek_agg_periods)

In [None]:
# Persist the complete feature frame as a pickle; the relative path resolves
# against the current working directory. Only load this file back from a
# trusted location, since unpickling can execute arbitrary code.
data.to_pickle('final_heap.pkl')