In [67]:
import pandas as pd
from datetime import datetime

import numpy as np

import matplotlib.pylab as plt
%matplotlib inline

from tqdm import tqdm, tqdm_notebook
pd.set_option("display.max_rows", 100)
pd.set_option('display.max_columns', 200)

from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression, ElasticNet, Ridge
fname = 'data.csv'

def init_data(fname):
    data = pd.read_csv('data.csv')
    data['yx_spread'] = data.yprice - data.xprice
    data['yx_relation'] = data.yprice / data.xprice
    data['xy_relation'] = data.xprice / data.yprice
    data['xy_geom'] = np.sqrt(data.xprice * data.yprice)
    data['xy_garmonic'] = 2 / (1 / data.xprice + 1 / data.yprice)
    
#     data.xprice = (data.xprice - data.xprice.min())# / data.xprice.std() 
#     data.yprice = (data.yprice - data.yprice.min())# / data.yprice.std() 
    data['timestamp'] = data['timestamp'] // 1000
    data['timestamp'] = data['timestamp'].apply(lambda stamp: datetime.fromtimestamp(stamp))
    data['timestamp'] = data['timestamp'] - pd.Timedelta(hours=1) # for flexibility
    data.index = data['timestamp']
    
    data['weekday'] = data.timestamp.dt.weekday
    data['day'] = (data.timestamp.dt.date - data.timestamp.dt.date.min()).apply(lambda x: int(x.days))
    day_close_time = data.day.map(data.groupby('day').timestamp.max())
    data['periods_before_closing'] = (day_close_time - data.timestamp).apply(lambda x: x.seconds // 10)
    day_open_time = data.day.map(data.groupby('day').timestamp.min())
    data['periods_after_opening'] = (data.timestamp - day_open_time).apply(lambda x: x.seconds // 10)
#     data.drop('timestamp', 1, inplace=True)
    return data
    
def time_split(data, valid_ratio, test_ratio):
    n_valid = max(1, int(data.shape[0] * valid_ratio))
    n_test = max(1, int(data.shape[0] * test_ratio))
    n_train = data.shape[0] - n_valid - n_test
    
    train = data.iloc[:n_train].reset_index(drop=True).copy()
    valid = data.iloc[n_train:-n_test].reset_index(drop=True).copy()
    test = data.iloc[-n_test:].reset_index(drop=True).copy()
    merged_test = valid.append(test).reset_index(drop=True)
    return train, valid, test

In [3]:
def add_diffs(df, column, uselags):
    new_columns = []
    for lag in uselags:
        colname = '{}_diff_{}'.format(column, lag)
        df.loc[:, colname] = df[column].diff(lag)
        new_columns.append(colname)
    print(new_columns)
    return new_columns

def add_shifts(df, column, uselags):
    new_columns = []
    for lag in uselags:
        colname = '{}_lag_{}'.format(column, lag)
        df.loc[:, colname] = df[column].shift(lag)
        new_columns.append(colname)
    print(new_columns)
    return new_columns

def add_rolling_mean(df, column, windows):
    new_columns = []
    for window_size in windows:
        colname = '{}_ma_{}'.format(column, window_size)
        df.loc[:, colname] = df[column].rolling(window=window_size).mean()
        new_columns.append(colname)
    print(new_columns)
    return new_columns

def add_curstom_rolling_operation(df, column, agg_function, function_name, windows):
    new_columns = []
    for window_size in windows:
        colname = '{}_{}_{}'.format(column, function_name, window_size)
        df.loc[:, colname] = df[column].rolling(window=window_size).agg(agg_function)
        new_columns.append(colname)
    print(new_columns)
    return new_columns  

def rsiFunc(prices, n=14):
    deltas = np.diff(prices)
    seed = deltas[:n+1]
    up = seed[seed>=0].sum()/n
    down = -seed[seed<0].sum()/n
    rs = up/down
    rsi = np.zeros_like(prices)
    rsi[:n] = 100. - 100./(1.+rs)

    for i in range(n, len(prices)):
        delta = deltas[i-1] # cause the diff is 1 shorter

        if delta>0:
            upval = delta
            downval = 0.
        else:
            upval = 0.
            downval = -delta

        up = (up*(n-1) + upval)/n
        down = (down*(n-1) + downval)/n

        rs = up/down
        rsi[i] = 100. - 100./(1.+rs)

    return rsi

def add_rsi(df, column, windows):
    new_columns = []
    for window_size in windows:
        colname = '{}_rsi_{}'.format(column, window_size)
        df.loc[:, colname] = rsiFunc(df[column].values, window_size)
        new_columns.append(colname)
    print(new_columns)
    return new_columns  

def add_ewma(df, column, windows):
    new_columns = []
    for window_size in windows:
        colname = '{}_ewma_{}'.format(column, window_size)
        df.loc[:, colname] = pd.Series.ewm(df[column], span=window_size).mean()
        new_columns.append(colname)
    print(new_columns)
    return new_columns 

def add_time_depended_rolling(df, source_column, agg_periods_per_seconds, agg_fun, agg_repr):
    '''
        df: source dataframe
        source_column: column for building feature
        agg_periods_per_seconds: list with periods in seconds
        agg_fun: aggregation function
        agg_repr: name of agg function
    '''
    is_allowed_arguments = sum(map(lambda x: x % 10, agg_periods_per_seconds)) == 0
    assert is_allowed_arguments, 'agg_periods_per_seconds divided by 10'
    
    new_cols = []
    for agg_period in agg_periods_per_seconds:
        agg_shifts = range(10, agg_period, 10)
        period_repr = '{}s'.format(agg_period)
        
        agg_helper_df = df[source_column].resample(
            period_repr, label='right', closed='right').agg(agg_fun)
                                             
        for shift in agg_shifts:
            agg_helper_df = agg_helper_df.append(df[source_column].resample(
                period_repr, label='right', closed='right', base=shift).agg(agg_fun))
        colname = '{}_time_{}_{}'.format(source_column, agg_repr, agg_period)
        df.loc[:, colname] = agg_helper_df
        new_cols.append(colname)
    print(new_cols)
    return new_cols

In [4]:
def add_hand_feats(df):
    close_price_per_day = df.groupby('day').timestamp.max().shift(1).map(
        df[['timestamp', 'yprice']].set_index('timestamp').yprice)
    df.loc[:, 'ydiff_from_closing'] = (df.day.map(close_price_per_day) - df.yprice).fillna(0)
    close_price_per_day = df.groupby('day').timestamp.max().shift(1).map(
        df[['timestamp', 'xprice']].set_index('timestamp').xprice)
    df.loc[:, 'xdiff_from_closing'] = (df.day.map(close_price_per_day) - df.yprice).fillna(0)
    
    open_price_per_day = df.groupby('day').timestamp.min().map(
        df[['timestamp', 'yprice']].set_index('timestamp').yprice)
    df.loc[:, 'ydiff_from_opening'] = (df.day.map(open_price_per_day) - df.yprice)
    
    open_price_per_day = df.groupby('day').timestamp.min().map(
        df[['timestamp', 'xprice']].set_index('timestamp').xprice)
    df.loc[:, 'xdiff_from_opening'] = (df.day.map(open_price_per_day) - df.xprice)
    new_columns = ['ydiff_from_closing', 'xdiff_from_closing', 'ydiff_from_opening', 'xdiff_from_opening']
    print(new_columns)
    return new_columns

def add_full_history_diff(df, col):
    mean = df[col].cumsum() / np.arange(1, df.shape[0] + 1)
    new_col = '{}_full_history_diff'.format(col)
    df.loc[:, new_col] = df[col] - mean
    print(new_col)
    return new_col

In [106]:
def validate_sklearn_model(model, data, selected_cols, valid_rate, test_rate, droprows=0, verbose=True):
    train, valid, test = time_split(data, valid_rate, test_rate)
    train.drop(np.arange(droprows), inplace=True)
    train.dropna(inplace=True)
    if verbose:
        print('Data shapes: ', train.shape, valid.shape, test.shape)

    metrics_dict = {}
    
    if valid_rate!=0:
        model.fit(train[selected_cols], train.returns)
        y_valid_predicted = model.predict(valid[selected_cols])
        y_valid_predicted[valid.periods_before_closing == 0] = 0

        metrics_dict['valid_mse'] = mean_squared_error(y_valid_predicted, valid.returns)
        metrics_dict['valid_r2'] = r2_score(valid.returns, y_valid_predicted) * 100
        if verbose:
            print('\nValid MSE: \t\t {:.5}'.format(metrics_dict['valid_mse']))
            print('Valid R2 (x100): \t {:.5}'.format(metrics_dict['valid_r2']))
    
    if test_rate!=0:
        model.fit(train.append(valid)[selected_cols], train.append(valid).returns)
        y_test_predicted = model.predict(test[selected_cols])
        y_test_predicted[test.periods_before_closing == 0] = 0

        metrics_dict['test_mse'] = mean_squared_error(y_test_predicted, test.returns)
        metrics_dict['test_r2'] = r2_score(test.returns, y_test_predicted) * 100
        if verbose:
            print('\nTest MSE: \t\t {:.5}'.format(metrics_dict['test_mse']))
            print('Test R2 (x100): \t {:.5}'.format(metrics_dict['test_r2']))
    
    return metrics_dict


def greedy_add_del_strategy(model, data, cols, test_rate, droprows=600, add_frequency=3):
    selected_cols = cols.copy()
    removed_cols = []
    current_step = 0
    
    current_score = 0
    
    while selected_cols:
        current_step += 1
        if current_step % add_frequency == 0:
            for col in removed_cols:
                current_cols = selected_cols + [col]
                current_metrics = validate_sklearn_model(
                    model, data, current_cols,
                    valid_rate=0, test_rate=test_rate, droprows=600,
                    verbose=False
                )
                if current_metrics['test_r2'] > current_score:
                    current_score = current_metrics['test_r2']
                    selected_cols.append(col)
                    print('added {}: r2: {:.5}'.format(col, current_score))

        best_score_by_iter = 0
        worst_col = ''
        for col in selected_cols:
            current_cols = [c for c in selected_cols if c!=col]
            current_metrics = validate_sklearn_model(
                model, data, current_cols, 
                valid_rate=0, test_rate=test_rate, droprows=600,
                verbose=False
            )
            if current_metrics['test_r2'] > best_score_by_iter:
                best_score_by_iter = current_metrics['test_r2']
                worst_col = col
        if best_score_by_iter > current_score:
            current_score = best_score_by_iter
            print('removed {}: r2: {:.5}'.format(worst_col, best_score_by_iter))
            selected_cols.remove(worst_col)
            removed_cols.append(worst_col)
        else:
            return selected_cols

In [10]:
short_agg_periods = [60, 600, 3600]
oneday_agg_periods = [60, 600, 3600, 7200, 14100]

standart_calendar_lags = [6, 60, 360, 1410, 7050, 14100, 28200, 42300]
qazy_calendar_lags = [6, 60, 360, 720, 1410, 2820, 7050, 14100, 28200, 42300]
day_lags = 1410 * np.arange(1, 75)
valid_ratio = 0
test_ratio = 0.2

## Trivial solutuin

In [33]:
train, valid, test = time_split(data, valid_ratio, test_ratio)
train.dropna(inplace=True)

trivial_solution = np.ones_like(test.returns.values) * test.returns.mean()
print('Zero Prediction MSE: \t {:.5}'.format(np.mean(np.square(test.returns.values))))
print('Mean Prediction MSE: \t {:.5}'.format(mean_squared_error(test.returns, trivial_solution)))
print('Mean Prediction R2: \t {:.5}'.format(r2_score(test.returns, trivial_solution)))

Data shapes:  (274103, 17) (1, 17) (68526, 17)
Zero Prediction MSE: 	 0.018675
Mean Prediction MSE: 	 0.018637
Mean Prediction R2: 	 0.0


In [38]:
data = init_data(fname)
selected_cols = ['xprice', 'yprice']

model = Ridge()
model.fit(train[usecols], train.returns)
validate_sklearn_model(model, data, selected_cols, valid_rate=0.2, test_rate=0.2, droprows=600);

Data shapes:  (205578, 13) (68526, 13) (68526, 13)

Valid MSE: 		 0.01894
Valid R2 (x100): 	 -2.073

Test MSE: 		 0.018595
Test R2 (x100): 	 0.22516


## Simple features

In [25]:
usecols = [
    'xprice', 'yprice',
    'yx_relation', 'xy_relation',
    'yx_spread', 'xy_geom', 'xy_garmonic',
    'periods_before_closing'
]

data = init_data(fname)
hand_crafted_cols = add_hand_feats(data)
usecols.extend(hand_crafted_cols)

['ydiff_from_closing', 'xdiff_from_closing', 'ydiff_from_opening', 'xdiff_from_opening']
Data shapes:  (274103, 17) (1, 17) (68526, 17)


In [27]:
removing_cols = [
    'periods_before_closing',
    'xy_relation',
    'yx_relation',
    'xy_geom',
    'xy_garmonic'
]
selected_cols = [col for col in usecols if col not in removing_cols]

model = Ridge(alpha=1)
validate_sklearn_model(model, data, selected_cols, valid_rate=0.2, test_rate=0.2, droprows=600);

Data shapes:  (205578, 17) (68526, 17) (68526, 17)

Valid MSE: 		 0.018874
Valid R2 (x100): 	 -1.7135

Test MSE: 		 0.018532
Test R2 (x100): 	 0.56464


In [31]:
removing_cols = [
    'periods_before_closing',
#     'xy_relation',
    'yx_relation',
#     'xy_geom',
#     'xy_garmonic'
]
selected_cols = [col for col in usecols if col not in removing_cols]


model = ElasticNet(alpha=0.0001, l1_ratio=0.1, max_iter=1000)
validate_sklearn_model(model, data, selected_cols, valid_rate=0.2, test_rate=0.2, droprows=600);

Data shapes:  (205578, 17) (68526, 17) (68526, 17)


  positive)



Valid MSE: 		 0.018872
Valid R2 (x100): 	 -1.7057

Test MSE: 		 0.018534
Test R2 (x100): 	 0.55649


  positive)


## Agg v1

- 6 - 1min
- 60 - 10min
- 360 - 1hour
- 1410 - 1workday (~ 4 hours per day)
- 7050 - 1workweek (5 days per week)
- 28200 - 1 workmonth (~ 4 weeks per month)

In [39]:
usecols = [
    'xprice', 'yprice',
    'yx_relation', 'xy_relation',
    'yx_spread', 'xy_geom', 'xy_garmonic',
    'periods_before_closing'
]

data = init_data(fname)

hand_crafted_cols = add_hand_feats(data)
usecols.extend(hand_crafted_cols)

xcols = add_time_depended_rolling(data, 'xprice', short_agg_periods, np.mean, 'mean')
usecols.extend(xcols)

ycols = add_time_depended_rolling(data, 'yprice', short_agg_periods, np.mean, 'mean')
usecols.extend(ycols)

['ydiff_from_closing', 'xdiff_from_closing', 'ydiff_from_opening', 'xdiff_from_opening']
['xprice_time_mean_60', 'xprice_time_mean_600', 'xprice_time_mean_3600']
['yprice_time_mean_60', 'yprice_time_mean_600', 'yprice_time_mean_3600']


In [40]:
removing_cols = [
    'periods_before_closing',
    'xy_relation',
    'yx_relation',
    'xy_geom',
    'xy_garmonic'
]
selected_cols = [col for col in usecols if col not in removing_cols]

model = Ridge(alpha=10)
validate_sklearn_model(model, data, selected_cols, valid_rate=0.2, test_rate=0.2, droprows=600);

Data shapes:  (205578, 23) (68526, 23) (68526, 23)

Valid MSE: 		 0.018847
Valid R2 (x100): 	 -1.5725

Test MSE: 		 0.018352
Test R2 (x100): 	 1.5328


In [41]:
removing_cols = [
    'periods_before_closing',
    'xy_relation',
#     'yx_relation',
#     'xy_geom',
#     'xy_garmonic'
]
selected_cols = [col for col in usecols if col not in removing_cols]


model = ElasticNet(alpha=0.0001, l1_ratio=0.2, max_iter=1000)
validate_sklearn_model(model, data, selected_cols, valid_rate=0.2, test_rate=0.2, droprows=600);

Data shapes:  (205578, 23) (68526, 23) (68526, 23)


  positive)



Valid MSE: 		 0.018818
Valid R2 (x100): 	 -1.4125

Test MSE: 		 0.018436
Test R2 (x100): 	 1.081


  positive)


## Full History Diff

In [92]:
usecols = [
    'xprice', 'yprice',
    'yx_relation', 'xy_relation',
    'yx_spread', 'xy_geom', 'xy_garmonic',
    'periods_before_closing'
]

data = init_data(fname)

hand_crafted_cols = add_hand_feats(data)
usecols.extend(hand_crafted_cols)

xcols = add_time_depended_rolling(data, 'xprice', short_agg_periods, np.mean, 'mean')
for col in xcols:
    data[col] = data.xprice - data[col]
usecols.extend(xcols)

ycols = add_time_depended_rolling(data, 'yprice', short_agg_periods, np.mean, 'mean')
for col in ycols:
    data[col] = data.yprice - data[col]
usecols.extend(ycols)

usecols.append(add_full_history_diff(data, 'xprice'))
usecols.append(add_full_history_diff(data, 'yprice'))
usecols.append(add_full_history_diff(data, 'yx_relation'))
usecols.append(add_full_history_diff(data, 'xy_geom'))

['ydiff_from_closing', 'xdiff_from_closing', 'ydiff_from_opening', 'xdiff_from_opening']
['xprice_time_mean_60', 'xprice_time_mean_600', 'xprice_time_mean_3600']
['yprice_time_mean_60', 'yprice_time_mean_600', 'yprice_time_mean_3600']
xprice_full_history_diff
yprice_full_history_diff
yx_relation_full_history_diff
xy_geom_full_history_diff


In [94]:
removing_cols = [
    'xprice',
#     'yprice',
    'periods_before_closing',
#     'yx_spread',
    'xy_relation',
    'yx_relation',
    'xy_geom',
    'xy_garmonic',
    'yx_relation_full_history_diff',
    'xy_geom_full_history_diff',
    'xprice_full_history_diff',
#     'yprice_full_history_diff',
]

selected_cols = [col for col in usecols if col not in removing_cols]
model = Ridge(alpha=10)
validate_sklearn_model(model, data, selected_cols, valid_rate=0.2, test_rate=0.2, droprows=600);

Data shapes:  (204978, 27) (68526, 27) (68526, 27)

Valid MSE: 		 0.018884
Valid R2 (x100): 	 -1.7677

Test MSE: 		 0.018332
Test R2 (x100): 	 1.6395


In [109]:
model = Ridge(alpha=10)
filtered_cols = greedy_add_del_strategy(model, data, usecols, test_rate=0.2, droprows=600)
validate_sklearn_model(model, data, filtered_cols, valid_rate=0.2, test_rate=0.2, droprows=600);

removed xdiff_from_opening: r2: 1.0394
removed xdiff_from_closing: r2: 1.2817
removed xy_garmonic: r2: 1.4445
removed xy_geom: r2: 1.5929
removed xprice_full_history_diff: r2: 1.8184
removed xy_geom_full_history_diff: r2: 1.8573
removed yprice_time_mean_60: r2: 1.875
removed periods_before_closing: r2: 1.8903
removed yx_relation_full_history_diff: r2: 1.8976
removed xy_relation: r2: 1.8997
removed yx_relation: r2: 1.9
removed yprice: r2: 1.9
Data shapes:  (204978, 27) (68526, 27) (68526, 27)

Valid MSE: 		 0.018874
Valid R2 (x100): 	 -1.7182

Test MSE: 		 0.018283
Test R2 (x100): 	 1.9


In [50]:
removing_cols = [
    'xprice',
#     'yprice'
    'periods_before_closing',
    'yx_spread',
#     'xy_relation',
#     'yx_relation',
    'xy_geom',
    'xy_garmonic',
#     'yx_relation_full_history_diff',
#     'xy_geom_full_history_diff',
#     'xprice_full_history_diff',
#     'yprice_full_history_diff',
]

selected_cols = [col for col in usecols if col not in removing_cols]


model = ElasticNet(alpha=0.001, l1_ratio=0.1, max_iter=2000)
validate_sklearn_model(model, data, selected_cols, valid_rate=0.2, test_rate=0.2, droprows=600);

Data shapes:  (205578, 27) (68526, 27) (68526, 27)

Valid MSE: 		 0.018882
Valid R2 (x100): 	 -1.7585

Test MSE: 		 0.018379
Test R2 (x100): 	 1.3854


## Heap of features