In [1]:
import pandas as pd
import numpy as np

from tqdm import tqdm, tqdm_notebook
import random
pd.set_option("display.max_rows", 100)
pd.set_option('display.max_columns', 200)

from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression, ElasticNet, Ridge

from ts_validation import validate_sklearn_model, validate_model_by_pentate, validate_model_by_triplets
from ts_validation import greedy_add_del_strategy, greedy_add_strategy
from helper import print_importances

In [3]:
data = pd.read_pickle('final_heap.pkl')

valid_ratio = 0.25
test_ratio = 0.25
triplets = [
    (0.5, 0.25, 0.25), 
    (0.6, 0.2, 0.2), 
    (0.7,0.15,0.15),
#     (0.65, 0.2, 0.15)
]
droprows = 7050
data.shape

(342630, 564)

In [4]:
selected_cols = [
    'is_end_of_week',
    'weekday',
    'xdiff_from_closing',
    'xdiff_from_opening',
    'xlog',
    'xlog_dayly_ewma_60',
    'xprice_dayly_ewma_24',
    'xprice_diff_1410',
    'xprice_time_mean_1410',
    'xprice_time_mean_360_lag_1410',
    'xprice_time_mean_6',
    'xprice_time_mean_60_dayly_ewma_60',
    'xprice_time_mean_6_rsi_6',
    'xy_garmonic_ewma_prodpair_720_360',
    'xy_garmonic_time_std_1410',
    'xy_geom_time_mean_360_lag_120',
    'xy_geom_time_mean_6',
    'xy_geom_time_mean_60_dayly_ewma_6',
    'xy_geom_time_mean_720_dayly_ewma_120',
    'xy_relation_time_std_360',
    'xy_relation_time_std_720',
    'xy_square_time_zscore_60',
    'ydiff_from_closing',
    'ylog_dayly_ewma_360',
    'yprice_dayly_ewma_60',
    'yprice_diff_1410',
    'yprice_ewma_difpair_60_24',
    'yprice_full_history_diff',
    'yprice_time_mean_360',
    'yprice_time_mean_360_lag_1410',
    'yprice_time_mean_360_lag_2820',
    'yprice_time_mean_60',
    'yprice_time_mean_60_dayly_ewma_24',
    'yprice_time_mean_60_lag_60',
    'yprice_time_mean_720',
    'yprice_time_zscore_360',
    'yprice_time_zscore_720',
    'yx_spread_ewma_prodpair_360_60',
    'yx_spread_time_mean_60_lag_360',
    'yx_spread_time_mean_720_lag_120',
    'yx_spread_time_zscore_1410',
    'yprice_time_mean_60_rsi_360'
] 

In [5]:
model = Ridge(alpha=1)
validate_model_by_pentate(model, data, selected_cols, droprows)

Unnamed: 0,train_50_percent,train_60_percent,train_70_percent,train_80_percent,train_90_percent,min_stats,max_stats,avg
mse,0.021796,0.020175,0.016409,0.020972,0.014756,0.014756,0.021796,0.018666
r2,2.433431,2.38568,2.378185,5.98592,2.407721,2.378185,5.985919,3.422149


In [215]:
check_cols = selected_cols# + added_cols
print(len(check_cols))
best_score = validate_model_by_pentate(model, data, check_cols, droprows).avg['r2']
for col in check_cols[:30]:
    other = check_cols.copy()
    other.remove(col)
    dashboard = validate_model_by_pentate(model, data, other, droprows)
    min_score = dashboard.min_stats['r2']
    mean_score = dashboard.avg['r2']
    print(col, min_score, mean_score)
    if min_score > 2.32 or mean_score > 3.39:
        print(dashboard.loc[['r2']])
        print('--' * 50)

41
xdiff_from_closing 1.3064246 2.975
xprice_diff_1410 1.3541011 3.031
xprice_time_mean_360_lag_1410 2.130821 3.256
xprice_time_mean_6 2.3183453 3.389
xprice_time_mean_1410 2.069077 3.184
ydiff_from_closing 2.1009536 3.332
yprice_time_mean_360 2.310204 3.408
    train_50_percent  train_60_percent  train_70_percent  train_80_percent  \
r2          2.404391          2.475934          2.369868           5.99314   

    train_90_percent  min_stats  max_stats       avg  
r2          2.310204   2.310204    5.99314  3.408203  
----------------------------------------------------------------------------------------------------
yprice_time_mean_60 2.2552626 3.36
yprice_time_mean_360_lag_2820 2.1056728 3.336
yprice_time_mean_720 2.3064146 3.285
yprice_time_zscore_720 1.9825295 3.236
yprice_time_mean_360_lag_1410 2.1700382 3.406
    train_50_percent  train_60_percent  train_70_percent  train_80_percent  \
r2          2.304564           2.35618          2.170038          6.204768   

    train_90_

In [20]:
current_dashboard = validate_model_by_pentate(model, data, selected_cols, droprows)
best_score = current_dashboard.avg['r2']
best_min_score = current_dashboard.min_stats['r2']

bad_threshold = 3. #best_score / 2
useless_cols = []
added_cols = []

eps = 0.01

structural_cols = [
    'timestamp', 'returns', 'day', 'periods_before_closing', 'periods_after_opening',
    'yprice', 'yx_spread', 'yx_relation', 'xy_relation', 'xy_square', 'xy_geom', 'xy_garmonic',
    'is_monday', 'is_tuesday', 'is_wednesday', 'is_thursday', 'is_friday', 'is_end_of_week'
]

print('Current score: ', best_score)
print('Bad threshold:', bad_threshold)

while True:
    other_cols = [
        col for col in data.columns if col not in (selected_cols + \
            structural_cols + \
            useless_cols + \
            added_cols + ['closing_indicator'])
    ]

    random.shuffle(other_cols)
    progress_bar = tqdm_notebook(other_cols)
    for col in progress_bar:
        metrics = validate_model_by_pentate(model, data, selected_cols + added_cols + [col], droprows)
        mean_score = metrics.avg['r2']
        min_score = metrics.min_stats['r2']
        progress_bar.set_description('{}: {:.5}'.format(col, score))
        if mean_score > best_score + eps and min_score > best_min_score:
            added_cols.append(col)
            best_score = mean_score
            best_min_score = min_score
            print('usefull: ', col, score)
            print(metrics)
            break
        elif mean_score < bad_threshold:
            print('useless: ', col, score)
            useless_cols.append(col)

Current score:  3.4221487
Bad threshold: 3.0


HBox(children=(IntProgress(value=0, max=504), HTML(value='')))

useless:  xprice_time_zscore_720 3.3352077
usefull:  yx_spread_time_mean_6_rsi_720 3.3352077
     train_50_percent  train_60_percent  train_70_percent  train_80_percent  \
mse          0.021796          0.020174          0.016408          0.020973   
r2           2.434384          2.389267          2.392501          6.020598   

     train_90_percent  min_stats  max_stats       avg  
mse          0.014757   0.014757   0.021796  0.018666  
r2           2.396649   2.389267   6.020598  3.434752  


HBox(children=(IntProgress(value=0, max=502), HTML(value='')))

useless:  xy_garmonic_time_zscore_720 3.3352077
useless:  xprice_time_zscore_1410 3.3352077


HBox(children=(IntProgress(value=0, max=500), HTML(value='')))

KeyboardInterrupt: 