In [19]:
import pandas as pd
from datetime import datetime

import numpy as np

import matplotlib.pylab as plt
%matplotlib inline

from tqdm import tqdm, tqdm_notebook
pd.set_option("display.max_rows", 100)
pd.set_option('display.max_columns', 200)

from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression, ElasticNet, Ridge
fname = 'data.csv'

def init_data(fname):
    data = pd.read_csv('data.csv')
    data['yx_spread'] = data.yprice - data.xprice
    data['yx_relation'] = data.yprice / data.xprice
    data['xy_relation'] = data.xprice / data.yprice
    data['xy_geom'] = np.sqrt(data.xprice * data.yprice)
    data['xy_garmonic'] = 2 / (1 / data.xprice + 1 / data.yprice)
    
#     data.xprice = (data.xprice - data.xprice.min())# / data.xprice.std() 
#     data.yprice = (data.yprice - data.yprice.min())# / data.yprice.std() 
    data['timestamp'] = data['timestamp'] // 1000
    data['timestamp'] = data['timestamp'].apply(lambda stamp: datetime.fromtimestamp(stamp))
    data['timestamp'] = data['timestamp'] - pd.Timedelta(hours=1) # for flexibility
    data.index = data['timestamp']
    
    data['weekday'] = data.timestamp.dt.weekday
    data['day'] = (data.timestamp.dt.date - data.timestamp.dt.date.min()).apply(lambda x: int(x.days))
    day_close_time = data.day.map(data.groupby('day').timestamp.max())
    data['periods_before_closing'] = (day_close_time - data.timestamp).apply(lambda x: x.seconds // 10)
    day_open_time = data.day.map(data.groupby('day').timestamp.min())
    data['periods_after_opening'] = (data.timestamp - day_open_time).apply(lambda x: x.seconds // 10)
#     data.drop('timestamp', 1, inplace=True)
    return data
    
def time_split(data, valid_ratio, test_ratio):
    n_valid = max(1, int(data.shape[0] * valid_ratio))
    n_test = max(1, int(data.shape[0] * test_ratio))
    n_train = data.shape[0] - n_valid - n_test
    
    train = data.iloc[:n_train].reset_index(drop=True).copy()
    valid = data.iloc[n_train:-n_test].reset_index(drop=True).copy()
    test = data.iloc[-n_test:].reset_index(drop=True).copy()
    merged_test = valid.append(test).reset_index(drop=True)
    print('Data shapes: ', train.shape, valid.shape, test.shape)
    return train, valid, test

In [72]:
def add_diffs(df, column, uselags):
    new_columns = []
    for lag in uselags:
        colname = '{}_diff_{}'.format(column, lag)
        df.loc[:, colname] = df[column].diff(lag)
        new_columns.append(colname)
    print(new_columns)
    return new_columns

def add_shifts(df, column, uselags):
    new_columns = []
    for lag in uselags:
        colname = '{}_lag_{}'.format(column, lag)
        df.loc[:, colname] = df[column].shift(lag)
        new_columns.append(colname)
    print(new_columns)
    return new_columns

def add_rolling_mean(df, column, windows):
    new_columns = []
    for window_size in windows:
        colname = '{}_ma_{}'.format(column, window_size)
        df.loc[:, colname] = df[column].rolling(window=window_size).mean()
        new_columns.append(colname)
    print(new_columns)
    return new_columns

def add_curstom_rolling_operation(df, column, agg_function, function_name, windows):
    new_columns = []
    for window_size in windows:
        colname = '{}_{}_{}'.format(column, function_name, window_size)
        df.loc[:, colname] = df[column].rolling(window=window_size).agg(agg_function)
        new_columns.append(colname)
    print(new_columns)
    return new_columns  

def rsiFunc(prices, n=14):
    deltas = np.diff(prices)
    seed = deltas[:n+1]
    up = seed[seed>=0].sum()/n
    down = -seed[seed<0].sum()/n
    rs = up/down
    rsi = np.zeros_like(prices)
    rsi[:n] = 100. - 100./(1.+rs)

    for i in range(n, len(prices)):
        delta = deltas[i-1] # cause the diff is 1 shorter

        if delta>0:
            upval = delta
            downval = 0.
        else:
            upval = 0.
            downval = -delta

        up = (up*(n-1) + upval)/n
        down = (down*(n-1) + downval)/n

        rs = up/down
        rsi[i] = 100. - 100./(1.+rs)

    return rsi

def add_rsi(df, column, windows):
    new_columns = []
    for window_size in windows:
        colname = '{}_rsi_{}'.format(column, window_size)
        df.loc[:, colname] = rsiFunc(df[column].values, window_size)
        new_columns.append(colname)
    print(new_columns)
    return new_columns  

def add_ewma(df, column, windows):
    new_columns = []
    for window_size in windows:
        colname = '{}_ewma_{}'.format(column, window_size)
        df.loc[:, colname] = pd.Series.ewm(df[column], span=window_size).mean()
        new_columns.append(colname)
    print(new_columns)
    return new_columns 

def add_time_depended_rolling(df, source_column, agg_periods_per_seconds, agg_fun, agg_repr):
    '''
        df: source dataframe
        source_column: column for building feature
        agg_periods_per_seconds: list with periods in seconds
        agg_fun: aggregation function
        agg_repr: name of agg function
    '''
    is_allowed_arguments = sum(map(lambda x: x % 10, agg_periods_per_seconds)) == 0
    assert is_allowed_arguments, 'agg_periods_per_seconds divided by 10'
    
    new_cols = []
    for agg_period in agg_periods_per_seconds:
        agg_shifts = range(10, agg_period, 10)
        period_repr = '{}s'.format(agg_period)
        
        agg_helper_df = df[source_column].resample(
            period_repr, label='right', closed='right').agg(agg_fun)
                                             
        for shift in agg_shifts:
            agg_helper_df = agg_helper_df.append(df[source_column].resample(
                period_repr, label='right', closed='right', base=shift).agg(agg_fun))
        colname = '{}_time_{}_{}'.format(source_column, agg_repr, agg_period)
        df.loc[:, colname] = agg_helper_df
        new_cols.append(colname)
    print(new_cols)
    return new_cols

In [138]:
def add_hand_feats(df):
    close_price_per_day = df.groupby('day').timestamp.max().shift(1).map(
        df[['timestamp', 'yprice']].set_index('timestamp').yprice)
    df.loc[:, 'ydiff_from_closing'] = (df.day.map(close_price_per_day) - df.yprice).fillna(0)
    close_price_per_day = df.groupby('day').timestamp.max().shift(1).map(
        df[['timestamp', 'xprice']].set_index('timestamp').xprice)
    df.loc[:, 'xdiff_from_closing'] = (df.day.map(close_price_per_day) - df.yprice).fillna(0)
    
    open_price_per_day = df.groupby('day').timestamp.min().map(
        df[['timestamp', 'yprice']].set_index('timestamp').yprice)
    df.loc[:, 'ydiff_from_opening'] = (df.day.map(open_price_per_day) - df.yprice)
    
    open_price_per_day = df.groupby('day').timestamp.min().map(
        df[['timestamp', 'xprice']].set_index('timestamp').xprice)
    df.loc[:, 'xdiff_from_opening'] = (df.day.map(open_price_per_day) - df.xprice)
    new_columns = ['ydiff_from_closing', 'xdiff_from_closing', 'ydiff_from_opening', 'xdiff_from_opening']
    print(new_columns)
    return new_columns

def add_full_history_diff(df, col):
    mean = df[col].cumsum() / np.arange(1, df.shape[0] + 1)
    new_col = '{}_full_history_diff'.format(col)
    df.loc[:, new_col] = df[col] - mean
    print(new_col)
    return new_col

In [4]:
data = init_data(fname)
data.head()

Unnamed: 0_level_0,xprice,yprice,returns,yx_spread,yx_relation,xy_relation,xy_geom,xy_garmonic,weekday,day,periods_before_closing,periods_after_opening
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2013-01-03 20:05:00,139.8375,169.25,0.3125,29.4125,1.210333,0.826219,153.842442,153.144316,3,0,1409,0
2013-01-03 20:05:10,139.8875,169.3,0.275,29.4125,1.210258,0.82627,153.89267,153.194769,3,0,1408,1
2013-01-03 20:05:20,139.8625,169.3375,0.25,29.475,1.210743,0.825939,153.895959,153.195124,3,0,1407,2
2013-01-03 20:05:30,139.8375,169.3625,0.2375,29.525,1.211138,0.82567,153.893563,153.190353,3,0,1406,3
2013-01-03 20:05:40,139.8375,169.3625,0.325,29.525,1.211138,0.82567,153.893563,153.190353,3,0,1405,4


In [5]:
standart_calendar_lags = [6, 60, 360, 1410, 7050, 14100, 28200, 42300]
qazy_calendar_lags = [6, 60, 360, 720, 1410, 2820, 7050, 14100, 28200, 42300]
day_lags = 1410 * np.arange(1, 75)
valid_ratio = 0
test_ratio = 0.2

## Trivial solutuin

In [7]:
train, valid, test = time_split(data, valid_ratio, test_ratio)
train.dropna(inplace=True)

trivial_solution = np.ones_like(test.returns.values) * test.returns.mean()
print('Zero Prediction MSE: \t {:.5}'.format(np.mean(np.square(test.returns.values))))
print('Mean Prediction MSE: \t {:.5}'.format(mean_squared_error(test.returns, trivial_solution)))
print('Mean Prediction R2: \t {:.5}'.format(r2_score(test.returns, trivial_solution)))

Data shapes:  (274103, 12) (1, 12) (68526, 12)
Zero Prediction MSE: 	 0.018675
Mean Prediction MSE: 	 0.018637
Mean Prediction R2: 	 0.0


In [8]:
usecols = ['xprice', 'yprice']

model = Ridge()
model.fit(train[usecols], train.returns)

y_test_predicted = model.predict(test[usecols])
y_test_predicted[test.periods_before_closing == 0] = 0
print('\nTest MSE: \t\t {:.5}'.format(mean_squared_error(y_test_predicted, test.returns)))
print('Test R2 (x100): \t {:.5}'.format(r2_score(test.returns, y_test_predicted) * 100))


Test MSE: 		 0.018603
Test R2 (x100): 	 0.18287


## Simple features

In [43]:
usecols = [
    'xprice', 'yprice',
    'yx_relation', 'xy_relation',
    'yx_spread', 'xy_geom', 'xy_garmonic',
    'periods_before_closing'
]

data = init_data(fname)
hand_crafted_cols = add_hand_feats(data)
usecols.extend(hand_crafted_cols)

train, valid, test = time_split(data, valid_ratio, test_ratio)
train.dropna(inplace=True)

['ydiff_from_closing', 'xdiff_from_closing', 'ydiff_from_opening', 'xdiff_from_opening']
Data shapes:  (274103, 17) (1, 17) (68526, 17)


In [50]:
removing_cols = [
    'periods_before_closing',
    'xy_relation',
    'yx_relation',
    'xy_geom',
    'xy_garmonic'
]
selected_cols = [col for col in usecols if col not in removing_cols]

model = Ridge(alpha=1)
model.fit(train[selected_cols], train.returns)

y_test_predicted = model.predict(test[selected_cols])
y_test_predicted[test.periods_before_closing == 0] = 0
print('\nTest MSE: \t\t {:.5}'.format(mean_squared_error(y_test_predicted, test.returns)))
print('Test R2 (x100): \t {:.5}'.format(r2_score(test.returns, y_test_predicted) * 100))


Test MSE: 		 0.018535
Test R2 (x100): 	 0.5485


In [68]:
removing_cols = [
    'periods_before_closing',
#     'xy_relation',
    'yx_relation',
#     'xy_geom',
#     'xy_garmonic'
]
selected_cols = [col for col in usecols if col not in removing_cols]


model = ElasticNet(alpha=0.01, l1_ratio=0.001, max_iter=1000)
model.fit(train[selected_cols], train.returns)

y_test_predicted = model.predict(test[selected_cols])
y_test_predicted[test.periods_before_closing == 0] = 0
print('\nTest MSE: \t\t {:.5}'.format(mean_squared_error(y_test_predicted, test.returns)))
print('Test R2 (x100): \t {:.5}'.format(r2_score(test.returns, y_test_predicted) * 100))


Test MSE: 		 0.018541
Test R2 (x100): 	 0.51472


  positive)


## Agg v1

- 6 - 1min
- 60 - 10min
- 360 - 1hour
- 1410 - 1workday (~ 4 hours per day)
- 7050 - 1workweek (5 days per week)
- 28200 - 1 workmonth (~ 4 weeks per month)

In [222]:
short_agg_periods = [60, 600, 3600]
oneday_agg_periods = [60, 600, 3600, 7200, 14100]

In [71]:
usecols = [
    'xprice', 'yprice',
    'yx_relation', 'xy_relation',
    'yx_spread', 'xy_geom', 'xy_garmonic',
    'periods_before_closing'
]

data = init_data(fname)

hand_crafted_cols = add_hand_feats(data)
usecols.extend(hand_crafted_cols)

xcols = add_time_depended_rolling(data, 'xprice', short_agg_periods, np.mean, 'mean')
usecols.extend(xcols)

ycols = add_time_depended_rolling(data, 'yprice', short_agg_periods, np.mean, 'mean')
usecols.extend(ycols)

train, valid, test = time_split(data, valid_ratio, test_ratio)
train.drop(np.arange(600), inplace=True)
train.dropna(inplace=True)

['ydiff_from_closing', 'xdiff_from_closing', 'ydiff_from_opening', 'xdiff_from_opening']
Data shapes:  (274103, 23) (1, 23) (68526, 23)


In [101]:
removing_cols = [
    'periods_before_closing',
    'xy_relation',
    'yx_relation',
    'xy_geom',
    'xy_garmonic'
]
selected_cols = [col for col in usecols if col not in removing_cols]

model = Ridge(alpha=10)
model.fit(train[selected_cols], train.returns)

y_test_predicted = model.predict(test[selected_cols])
y_test_predicted[test.periods_before_closing == 0] = 0
print('\nTest MSE: \t\t {:.5}'.format(mean_squared_error(y_test_predicted, test.returns)))
print('Test R2 (x100): \t {:.5}'.format(r2_score(test.returns, y_test_predicted) * 100))


Test MSE: 		 0.018354
Test R2 (x100): 	 1.5199


In [95]:
removing_cols = [
    'periods_before_closing',
    'xy_relation',
#     'yx_relation',
#     'xy_geom',
#     'xy_garmonic'
]
selected_cols = [col for col in usecols if col not in removing_cols]


model = ElasticNet(alpha=0.01, l1_ratio=0., max_iter=1000)
model.fit(train[selected_cols], train.returns)

y_test_predicted = model.predict(test[selected_cols])
y_test_predicted[test.periods_before_closing == 0] = 0
print('\nTest MSE: \t\t {:.5}'.format(mean_squared_error(y_test_predicted, test.returns)))
print('Test R2 (x100): \t {:.5}'.format(r2_score(test.returns, y_test_predicted) * 100))


Test MSE: 		 0.018461
Test R2 (x100): 	 0.9449


  positive)


## Full History Diff

In [234]:
usecols = [
    'xprice', 'yprice',
    'yx_relation', 'xy_relation',
    'yx_spread', 'xy_geom', 'xy_garmonic',
    'periods_before_closing'
]

data = init_data(fname)

hand_crafted_cols = add_hand_feats(data)
usecols.extend(hand_crafted_cols)

xcols = add_time_depended_rolling(data, 'xprice', short_agg_periods, np.mean, 'mean')
usecols.extend(xcols)

ycols = add_time_depended_rolling(data, 'yprice', short_agg_periods, np.mean, 'mean')
usecols.extend(ycols)

usecols.append(add_full_history_diff(data, 'xprice'))
usecols.append(add_full_history_diff(data, 'yprice'))
usecols.append(add_full_history_diff(data, 'yx_relation'))
usecols.append(add_full_history_diff(data, 'xy_geom'))


train, valid, test = time_split(data, valid_ratio, test_ratio)
train.drop(np.arange(600), inplace=True)
train.dropna(inplace=True)

['ydiff_from_closing', 'xdiff_from_closing', 'ydiff_from_opening', 'xdiff_from_opening']
['xprice_time_mean_60', 'xprice_time_mean_600', 'xprice_time_mean_3600']
['yprice_time_mean_60', 'yprice_time_mean_600', 'yprice_time_mean_3600']
xprice_full_history_diff
yprice_full_history_diff
yx_relation_full_history_diff
xy_geom_full_history_diff
Data shapes:  (274103, 27) (1, 27) (68526, 27)


In [235]:
train, valid, test = time_split(data, 0.2, 0.2)
train.drop(np.arange(600), inplace=True)
train.dropna(inplace=True)

Data shapes:  (205578, 27) (68526, 27) (68526, 27)


In [236]:
removing_cols = [
    'xprice',
    'periods_before_closing',
#     'yx_spread',
    'xy_relation',
    'yx_relation',
    'xy_geom',
    'xy_garmonic',
    'yx_relation_full_history_diff',
    'xy_geom_full_history_diff',
    'xprice_full_history_diff',
#     'yprice_full_history_diff',
]
selected_cols = [col for col in usecols if col not in removing_cols]

model = Ridge(alpha=10)
model.fit(train[selected_cols], train.returns)
y_valid_predicted = model.predict(valid[selected_cols])
y_valid_predicted[valid.periods_before_closing == 0] = 0
print('\nTest MSE: \t\t {:.5}'.format(mean_squared_error(y_valid_predicted, valid.returns)))
print('Test R2 (x100): \t {:.5}'.format(r2_score(valid.returns, y_valid_predicted) * 100))


model = Ridge(alpha=10)
model.fit(train.append(valid)[selected_cols], train.append(valid).returns)
y_test_predicted = model.predict(test[selected_cols])
y_test_predicted[test.periods_before_closing == 0] = 0
print('\nTest MSE: \t\t {:.5}'.format(mean_squared_error(y_test_predicted, test.returns)))
print('Test R2 (x100): \t {:.5}'.format(r2_score(test.returns, y_test_predicted) * 100))


Test MSE: 		 0.01888
Test R2 (x100): 	 -1.7503

Test MSE: 		 0.018333
Test R2 (x100): 	 1.6314


In [237]:
removing_cols = [
    'xprice',
#     'yprice'
    'periods_before_closing',
    'yx_spread',
#     'xy_relation',
#     'yx_relation',
    'xy_geom',
    'xy_garmonic',
#     'yx_relation_full_history_diff',
#     'xy_geom_full_history_diff',
#     'xprice_full_history_diff',
#     'yprice_full_history_diff',
]

selected_cols = [col for col in usecols if col not in removing_cols]


model = ElasticNet(alpha=0.01, l1_ratio=0.01, max_iter=2000)
model.fit(train[selected_cols], train.returns)


y_valid_predicted = model.predict(valid[selected_cols])
y_valid_predicted[valid.periods_before_closing == 0] = 0
print('\Valid MSE: \t\t {:.5}'.format(mean_squared_error(y_valid_predicted, valid.returns)))
print('Valid R2 (x100): \t {:.5}'.format(r2_score(valid.returns, y_valid_predicted) * 100))

y_test_predicted = model.predict(test[selected_cols])
y_test_predicted[test.periods_before_closing == 0] = 0
print('\nTest MSE: \t\t {:.5}'.format(mean_squared_error(y_test_predicted, test.returns)))
print('Test R2 (x100): \t {:.5}'.format(r2_score(test.returns, y_test_predicted) * 100))

\Valid MSE: 		 0.018741
Valid R2 (x100): 	 -1.0013

Test MSE: 		 0.0185
Test R2 (x100): 	 0.73883


  positive)
