In [1]:
import pandas as pd
from datetime import datetime

import numpy as np

import matplotlib.pylab as plt
%matplotlib inline

from tqdm import tqdm, tqdm_notebook
pd.set_option("display.max_rows", 100)
pd.set_option('display.max_columns', 200)

from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression, ElasticNet, Ridge
fname = 'data.csv'

def init_data(fname):
    data = pd.read_csv('data.csv')
    data['yx_spread'] = data.yprice - data.xprice
    data['yx_relation'] = data.yprice / data.xprice
    data['xy_relation'] = data.xprice / data.yprice
    data['xy_geom'] = np.sqrt(data.xprice * data.yprice)
    data['xy_garmonic'] = 2 / (1 / data.xprice + 1 / data.yprice)
    
#     data.xprice = (data.xprice - data.xprice.min())# / data.xprice.std() 
#     data.yprice = (data.yprice - data.yprice.min())# / data.yprice.std() 
    data['timestamp'] = data['timestamp'] // 1000
    data['timestamp'] = data['timestamp'].apply(lambda stamp: datetime.fromtimestamp(stamp))
    data['timestamp'] = data['timestamp'] - pd.Timedelta(hours=1) # for flexibility
    data.index = data['timestamp']
    
    data['weekday'] = data.timestamp.dt.weekday
    data['day'] = (data.timestamp.dt.date - data.timestamp.dt.date.min()).apply(lambda x: int(x.days))
    day_close_time = data.day.map(data.groupby('day').timestamp.max())
    data['periods_before_closing'] = (day_close_time - data.timestamp).apply(lambda x: x.seconds // 10)
    day_open_time = data.day.map(data.groupby('day').timestamp.min())
    data['periods_after_opening'] = (data.timestamp - day_open_time).apply(lambda x: x.seconds // 10)
#     data.drop('timestamp', 1, inplace=True)
    return data
    
def time_split(data, valid_ratio, test_ratio):
    n_valid = max(1, int(data.shape[0] * valid_ratio))
    n_test = max(1, int(data.shape[0] * test_ratio))
    n_train = data.shape[0] - n_valid - n_test
    
    train = data.iloc[:n_train].reset_index(drop=True).copy()
    valid = data.iloc[n_train:-n_test].reset_index(drop=True).copy()
    test = data.iloc[-n_test:].reset_index(drop=True).copy()
    merged_test = valid.append(test).reset_index(drop=True)
    return train, valid, test

In [2]:
def add_diffs(df, column, uselags):
    new_columns = []
    for lag in uselags:
        colname = '{}_diff_{}'.format(column, lag)
        df.loc[:, colname] = df[column].diff(lag)
        new_columns.append(colname)
    print(new_columns)
    return new_columns

def add_shifts(df, column, uselags):
    new_columns = []
    for lag in uselags:
        colname = '{}_lag_{}'.format(column, lag)
        df.loc[:, colname] = df[column].shift(lag)
        new_columns.append(colname)
    print(new_columns)
    return new_columns

def add_rolling_mean(df, column, windows):
    new_columns = []
    for window_size in windows:
        colname = '{}_ma_{}'.format(column, window_size)
        df.loc[:, colname] = df[column].rolling(window=window_size).mean()
        new_columns.append(colname)
    print(new_columns)
    return new_columns

def add_curstom_rolling_operation(df, column, agg_function, function_name, windows):
    new_columns = []
    for window_size in windows:
        colname = '{}_{}_{}'.format(column, function_name, window_size)
        df.loc[:, colname] = df[column].rolling(window=window_size).agg(agg_function)
        new_columns.append(colname)
    print(new_columns)
    return new_columns  

def rsiFunc(prices, n=14):
    deltas = np.diff(prices)
    seed = deltas[:n+1]
    up = seed[seed>=0].sum()/n
    down = -seed[seed<0].sum()/n
    rs = up/down
    rsi = np.zeros_like(prices)
    rsi[:n] = 100. - 100./(1.+rs)

    for i in range(n, len(prices)):
        delta = deltas[i-1] # cause the diff is 1 shorter

        if delta>0:
            upval = delta
            downval = 0.
        else:
            upval = 0.
            downval = -delta

        up = (up*(n-1) + upval)/n
        down = (down*(n-1) + downval)/n

        rs = up/down
        rsi[i] = 100. - 100./(1.+rs)

    return rsi

def add_rsi(df, column, windows):
    new_columns = []
    for window_size in windows:
        colname = '{}_rsi_{}'.format(column, window_size)
        df.loc[:, colname] = rsiFunc(df[column].values, window_size)
        new_columns.append(colname)
    print(new_columns)
    return new_columns  

def add_ewma(df, column, windows):
    new_columns = []
    for window_size in windows:
        colname = '{}_ewma_{}'.format(column, window_size)
        df.loc[:, colname] = pd.Series.ewm(df[column], span=window_size).mean()
        new_columns.append(colname)
    print(new_columns)
    return new_columns 

def add_time_depended_rolling(df, source_column, windows, agg_fun, agg_repr):
    '''
        df: source dataframe
        source_column: column for building feature
        windows: list with periods (1 period = 10 sec)
        agg_fun: aggregation function
        agg_repr: name of agg function
    '''    
    new_cols = []
    for agg_period in windows:
        agg_shifts = range(10, agg_period * 10, 10)
        period_repr = '{}s'.format(agg_period * 10)
        
        agg_helper_df = df[source_column].resample(
            period_repr, label='right', closed='right').agg(agg_fun)
                                             
        for shift in agg_shifts:
            agg_helper_df = agg_helper_df.append(df[source_column].resample(
                period_repr, label='right', closed='right', base=shift).agg(agg_fun))
        colname = '{}_time_{}_{}'.format(source_column, agg_repr, agg_period)
        df.loc[:, colname] = agg_helper_df
        new_cols.append(colname)
    print(new_cols)
    return new_cols

In [3]:
def add_hand_feats(df):
    close_price_per_day = df.groupby('day').timestamp.max().shift(1).map(
        df[['timestamp', 'yprice']].set_index('timestamp').yprice)
    df.loc[:, 'ydiff_from_closing'] = (df.day.map(close_price_per_day) - df.yprice).fillna(0)
    close_price_per_day = df.groupby('day').timestamp.max().shift(1).map(
        df[['timestamp', 'xprice']].set_index('timestamp').xprice)
    df.loc[:, 'xdiff_from_closing'] = (df.day.map(close_price_per_day) - df.yprice).fillna(0)
    
    open_price_per_day = df.groupby('day').timestamp.min().map(
        df[['timestamp', 'yprice']].set_index('timestamp').yprice)
    df.loc[:, 'ydiff_from_opening'] = (df.day.map(open_price_per_day) - df.yprice)
    
    open_price_per_day = df.groupby('day').timestamp.min().map(
        df[['timestamp', 'xprice']].set_index('timestamp').xprice)
    df.loc[:, 'xdiff_from_opening'] = (df.day.map(open_price_per_day) - df.xprice)
    new_columns = ['ydiff_from_closing', 'xdiff_from_closing', 'ydiff_from_opening', 'xdiff_from_opening']
    print(new_columns)
    return new_columns

def add_full_history_diff(df, col):
    mean = df[col].cumsum() / np.arange(1, df.shape[0] + 1)
    new_col = '{}_full_history_diff'.format(col)
    df.loc[:, new_col] = df[col] - mean
    print(new_col)
    return new_col

In [66]:
def validate_sklearn_model(model, data, selected_cols, valid_ratio, test_ratio, droprows=0, 
                           verbose=True, only_valid=False):
    helper_cols = list(set(selected_cols + ['periods_before_closing', 'returns']))
    train, valid, test = time_split(data[helper_cols], valid_ratio, test_ratio)
    train.drop(np.arange(droprows), inplace=True)
    train.dropna(inplace=True)
    
    if verbose:
        print('Data shapes: ', train.shape, valid.shape, test.shape)

    metrics_dict = {}
    
    if valid_ratio!=0:
        model.fit(train[selected_cols], train.returns)
        y_valid_predicted = model.predict(valid[selected_cols])
        
        y_valid_predicted[valid.periods_before_closing == 0] = 0
        
        metrics_dict['valid_mse'] = mean_squared_error(y_valid_predicted, valid.returns)
        metrics_dict['valid_r2'] = r2_score(valid.returns, y_valid_predicted) * 100
        if verbose:
            print('\nValid MSE: \t\t {:.5}'.format(metrics_dict['valid_mse']))
            print('Valid R2 (x100): \t {:.5}'.format(metrics_dict['valid_r2']))
    
    if not only_valid:
        model.fit(train.append(valid)[selected_cols], train.append(valid).returns)
        y_test_predicted = model.predict(test[selected_cols])
        y_test_predicted[test.periods_before_closing == 0] = 0

        metrics_dict['test_mse'] = mean_squared_error(y_test_predicted, test.returns)
        metrics_dict['test_r2'] = r2_score(test.returns, y_test_predicted) * 100
        if verbose:
            print('\nTest MSE: \t\t {:.5}'.format(metrics_dict['test_mse']))
            print('Test R2 (x100): \t {:.5}'.format(metrics_dict['test_r2']))
    
#     metrics_dict['model'] = model
    return metrics_dict


def greedy_add_del_strategy(model, data, cols, valid_ratio, test_ratio, droprows=0, add_frequency=1):
    selected_cols = cols.copy()
    removed_cols = []
    current_step = 0
    
    current_score = -float('inf')
    
    while selected_cols:
        current_step += 1
        if current_step % add_frequency == 0:
            for col in removed_cols:
                current_cols = selected_cols + [col]
                current_metrics = validate_sklearn_model(
                    model, data, current_cols,
                    valid_ratio=valid_ratio, test_ratio=test_ratio, droprows=droprows,
                    verbose=False, only_valid=True
                )
                if current_metrics['valid_r2'] > current_score:
                    current_score = current_metrics['valid_r2']
                    selected_cols.append(col)
                    print('added {}: r2: {:.5}'.format(col, current_score))

        best_score_by_iter = -float('inf')
        worst_col = ''
        for col in selected_cols:
            current_cols = [c for c in selected_cols if c!=col]
            current_metrics = validate_sklearn_model(
                model, data, current_cols, 
                valid_ratio, test_ratio, droprows,
                verbose=False, only_valid=True
            )

            if current_metrics['valid_r2'] > best_score_by_iter:
                best_score_by_iter = current_metrics['valid_r2']
                worst_col = col
        if best_score_by_iter > current_score:
            current_score = best_score_by_iter
            print('removed {}: r2: {:.5}'.format(worst_col, best_score_by_iter))
            selected_cols.remove(worst_col)
            removed_cols.append(worst_col)
        else:
            return selected_cols
        
def greedy_add_strategy(model, data, base_cols, additional_cols, valid_ratio, test_ratio, droprows=0):
    current_score = validate_sklearn_model(
        model, data, base_cols,
        valid_ratio, test_ratio, droprows,
        verbose=False, only_valid=True
    )['valid_r2']
    is_continue_search = True
    while is_continue_search:
        is_continue_search = False
        for col in additional_cols:
            current_cols = base_cols + [col]
            current_metrics = validate_sklearn_model(
                model, data, current_cols,
                valid_ratio, test_ratio, droprows,
                verbose=False, only_valid=True
            )
            if current_metrics['valid_r2'] > current_score:
                current_score = current_metrics['valid_r2']
                base_cols.append(col)
                additional_cols.remove(col)
                is_continue_search = True
                print('added {}: r2: {:.5}'.format(col, current_score))
        
    return base_cols

In [5]:
def add_time_dif(df, column, windows):
    pass

In [6]:
def print_importances(model, selected_cols):
    weigts_sum = sum(map(abs, model.coef_))
    for name, weight in sorted(zip(selected_cols, model.coef_), key=lambda x: -abs(x[1])):
        percent_weight = abs(weight) / weigts_sum
        print('{:40} {:.2%} {:15.2}'.format(name, percent_weight, weight))

In [7]:
short_agg_periods = [6, 60, 360]
oneday_agg_periods = [6, 60, 360, 720, 1410]
twoweeks_agg_periods = [6, 60, 360, 720, 1410, 2820, 7050, 14100]

month_days_periods = [1,2,3,4,5,10,15,20]
month_agg_periods = list(map(lambda x: x * 1410, month_days_periods))
print('month_agg_periods: {}'.format(month_agg_periods))

merged_agg_periods = [6, 60, 360, 720, 1410, 2820, 4230, 5640, 7050, 14100, 21150]

month_agg_periods: [1410, 2820, 4230, 5640, 7050, 14100, 21150, 28200]


In [8]:
valid_ratio = 0.2
test_ratio = 0.15

droprows = 7050
# droprows = 28200

- 6 - 1min
- 60 - 10min
- 360 - 1hour
- 1410 - 1workday (~ 4 hours per day)
- 7050 - 1workweek (5 days per week)
- 28200 - 1 workmonth (~ 4 weeks per month)

## Heap of features

In [9]:
model = Ridge(alpha=1000)

In [10]:
usecols = [
    'xprice', 'yprice',
    'yx_relation', 'xy_relation',
    'yx_spread', 'xy_geom',
    'periods_before_closing'
]

data = init_data(fname)

hand_crafted_cols = add_hand_feats(data)
usecols.extend(hand_crafted_cols)

xcols = add_time_depended_rolling(data, 'xprice', oneday_agg_periods, np.mean, 'mean')
for col in xcols:
    data[col] = data.xprice - data[col]
usecols.extend(xcols)

ycols = add_time_depended_rolling(data, 'yprice', oneday_agg_periods, np.mean, 'mean')
for col in ycols:
    data[col] = data.yprice - data[col]
usecols.extend(ycols)

usecols.append(add_full_history_diff(data, 'xprice'))
usecols.append(add_full_history_diff(data, 'yprice'))
usecols.append(add_full_history_diff(data, 'yx_relation'))
usecols.append(add_full_history_diff(data, 'xy_geom'))

['ydiff_from_closing', 'xdiff_from_closing', 'ydiff_from_opening', 'xdiff_from_opening']
['xprice_time_mean_6', 'xprice_time_mean_60', 'xprice_time_mean_360', 'xprice_time_mean_720', 'xprice_time_mean_1410']
['yprice_time_mean_6', 'yprice_time_mean_60', 'yprice_time_mean_360', 'yprice_time_mean_720', 'yprice_time_mean_1410']
xprice_full_history_diff
yprice_full_history_diff
yx_relation_full_history_diff
xy_geom_full_history_diff


In [11]:
filtered_cols = greedy_add_del_strategy(model, data, usecols, valid_ratio, test_ratio,
                                        droprows, add_frequency=4)
validate_sklearn_model(model, data, filtered_cols, valid_ratio, test_ratio, droprows);

removed xdiff_from_opening: r2: 0.69289
removed yprice_time_mean_60: r2: 0.76083
removed xprice_time_mean_720: r2: 0.79424
removed yprice_time_mean_6: r2: 0.79568
removed yprice_time_mean_720: r2: 0.79609
removed yx_relation_full_history_diff: r2: 0.7961
Data shapes:  (215660, 20) (68526, 20) (51394, 20)

Valid MSE: 		 0.019435
Valid R2 (x100): 	 0.7961

Test MSE: 		 0.015817
Test R2 (x100): 	 0.62798


In [12]:
new_cols = add_rsi(data, 'yx_spread', twoweeks_agg_periods)
usecols.extend(new_cols)

selected_cols = greedy_add_strategy(model, data, filtered_cols, new_cols,
                                    valid_ratio, test_ratio, droprows=7050)
validate_sklearn_model(model, data, selected_cols, valid_ratio, test_ratio, droprows);

['yx_spread_rsi_6', 'yx_spread_rsi_60', 'yx_spread_rsi_360', 'yx_spread_rsi_720', 'yx_spread_rsi_1410', 'yx_spread_rsi_2820', 'yx_spread_rsi_7050', 'yx_spread_rsi_14100']
added yx_spread_rsi_360: r2: 0.89474
Data shapes:  (215660, 21) (68526, 21) (51394, 21)

Valid MSE: 		 0.019416
Valid R2 (x100): 	 0.89474

Test MSE: 		 0.015838
Test R2 (x100): 	 0.49049


In [13]:
new_cols = add_rsi(data, 'yx_relation', twoweeks_agg_periods)
usecols.extend(new_cols)

selected_cols = greedy_add_strategy(model, data, selected_cols, new_cols,
                                    valid_ratio, test_ratio, droprows)
validate_sklearn_model(model, data, selected_cols, valid_ratio, test_ratio, droprows);

['yx_relation_rsi_6', 'yx_relation_rsi_60', 'yx_relation_rsi_360', 'yx_relation_rsi_720', 'yx_relation_rsi_1410', 'yx_relation_rsi_2820', 'yx_relation_rsi_7050', 'yx_relation_rsi_14100']
added yx_relation_rsi_360: r2: 0.90858
Data shapes:  (215660, 22) (68526, 22) (51394, 22)

Valid MSE: 		 0.019413
Valid R2 (x100): 	 0.90858

Test MSE: 		 0.015807
Test R2 (x100): 	 0.68864


In [14]:
new_cols = add_rsi(data, 'xy_relation', twoweeks_agg_periods)
usecols.extend(new_cols)

selected_cols = greedy_add_strategy(model, data, selected_cols, new_cols,
                                    valid_ratio, test_ratio, droprows)
validate_sklearn_model(model, data, selected_cols, valid_ratio, test_ratio, droprows);

['xy_relation_rsi_6', 'xy_relation_rsi_60', 'xy_relation_rsi_360', 'xy_relation_rsi_720', 'xy_relation_rsi_1410', 'xy_relation_rsi_2820', 'xy_relation_rsi_7050', 'xy_relation_rsi_14100']
Data shapes:  (215660, 22) (68526, 22) (51394, 22)

Valid MSE: 		 0.019413
Valid R2 (x100): 	 0.90858

Test MSE: 		 0.015807
Test R2 (x100): 	 0.68864


In [15]:
new_cols = add_rsi(data, 'xy_geom', twoweeks_agg_periods)
usecols.extend(new_cols)

selected_cols = greedy_add_strategy(model, data, selected_cols, new_cols,
                                    valid_ratio, test_ratio, droprows)
validate_sklearn_model(model, data, selected_cols, valid_ratio, test_ratio, droprows);

['xy_geom_rsi_6', 'xy_geom_rsi_60', 'xy_geom_rsi_360', 'xy_geom_rsi_720', 'xy_geom_rsi_1410', 'xy_geom_rsi_2820', 'xy_geom_rsi_7050', 'xy_geom_rsi_14100']
added xy_geom_rsi_6: r2: 0.93002
added xy_geom_rsi_720: r2: 0.9551
added xy_geom_rsi_60: r2: 1.0241
Data shapes:  (215660, 25) (68526, 25) (51394, 25)

Valid MSE: 		 0.01939
Valid R2 (x100): 	 1.0241

Test MSE: 		 0.015831
Test R2 (x100): 	 0.53594


In [16]:
new_cols = add_rsi(data, 'xy_garmonic', twoweeks_agg_periods)
usecols.extend(new_cols)

selected_cols = greedy_add_strategy(model, data, selected_cols, new_cols,
                                    valid_ratio, test_ratio, droprows)
validate_sklearn_model(model, data, selected_cols, valid_ratio, test_ratio, droprows);

['xy_garmonic_rsi_6', 'xy_garmonic_rsi_60', 'xy_garmonic_rsi_360', 'xy_garmonic_rsi_720', 'xy_garmonic_rsi_1410', 'xy_garmonic_rsi_2820', 'xy_garmonic_rsi_7050', 'xy_garmonic_rsi_14100']
added xy_garmonic_rsi_60: r2: 1.0605
Data shapes:  (215660, 26) (68526, 26) (51394, 26)

Valid MSE: 		 0.019383
Valid R2 (x100): 	 1.0605

Test MSE: 		 0.015841
Test R2 (x100): 	 0.47766


In [17]:
new_cols = add_rsi(data, 'xprice_time_mean_60', twoweeks_agg_periods)
usecols.extend(new_cols)

selected_cols = greedy_add_strategy(model, data, selected_cols, new_cols,
                                    valid_ratio, test_ratio, droprows)
validate_sklearn_model(model, data, selected_cols, valid_ratio, test_ratio, droprows);

['xprice_time_mean_60_rsi_6', 'xprice_time_mean_60_rsi_60', 'xprice_time_mean_60_rsi_360', 'xprice_time_mean_60_rsi_720', 'xprice_time_mean_60_rsi_1410', 'xprice_time_mean_60_rsi_2820', 'xprice_time_mean_60_rsi_7050', 'xprice_time_mean_60_rsi_14100']
added xprice_time_mean_60_rsi_6: r2: 1.09
added xprice_time_mean_60_rsi_360: r2: 1.1126
added xprice_time_mean_60_rsi_1410: r2: 1.1127
added xprice_time_mean_60_rsi_60: r2: 1.1127
added xprice_time_mean_60_rsi_2820: r2: 1.119
added xprice_time_mean_60_rsi_720: r2: 1.1226
Data shapes:  (215660, 32) (68526, 32) (51394, 32)

Valid MSE: 		 0.019371
Valid R2 (x100): 	 1.1226

Test MSE: 		 0.015832
Test R2 (x100): 	 0.53224


In [18]:
new_cols = add_rsi(data, 'xprice_time_mean_360', twoweeks_agg_periods)
usecols.extend(new_cols)

selected_cols = greedy_add_strategy(model, data, selected_cols, new_cols,
                                    valid_ratio, test_ratio, droprows)
validate_sklearn_model(model, data, selected_cols, valid_ratio, test_ratio, droprows);

['xprice_time_mean_360_rsi_6', 'xprice_time_mean_360_rsi_60', 'xprice_time_mean_360_rsi_360', 'xprice_time_mean_360_rsi_720', 'xprice_time_mean_360_rsi_1410', 'xprice_time_mean_360_rsi_2820', 'xprice_time_mean_360_rsi_7050', 'xprice_time_mean_360_rsi_14100']
added xprice_time_mean_360_rsi_6: r2: 1.1253
added xprice_time_mean_360_rsi_14100: r2: 1.1299
Data shapes:  (215660, 34) (68526, 34) (51394, 34)

Valid MSE: 		 0.019369
Valid R2 (x100): 	 1.1299

Test MSE: 		 0.015831
Test R2 (x100): 	 0.54014


In [19]:
new_cols = add_rsi(data, 'yprice_time_mean_60', twoweeks_agg_periods)
usecols.extend(new_cols)

selected_cols = greedy_add_strategy(model, data, selected_cols, new_cols,
                                    valid_ratio, test_ratio, droprows)
validate_sklearn_model(model, data, selected_cols, valid_ratio, test_ratio, droprows);

['yprice_time_mean_60_rsi_6', 'yprice_time_mean_60_rsi_60', 'yprice_time_mean_60_rsi_360', 'yprice_time_mean_60_rsi_720', 'yprice_time_mean_60_rsi_1410', 'yprice_time_mean_60_rsi_2820', 'yprice_time_mean_60_rsi_7050', 'yprice_time_mean_60_rsi_14100']
added yprice_time_mean_60_rsi_14100: r2: 1.1309
Data shapes:  (215660, 35) (68526, 35) (51394, 35)

Valid MSE: 		 0.019369
Valid R2 (x100): 	 1.1309

Test MSE: 		 0.015821
Test R2 (x100): 	 0.59732


In [20]:
new_cols = add_rsi(data, 'yprice_time_mean_360', twoweeks_agg_periods)
usecols.extend(new_cols)

selected_cols = greedy_add_strategy(model, data, selected_cols, new_cols,
                                    valid_ratio, test_ratio, droprows)
validate_sklearn_model(model, data, selected_cols, valid_ratio, test_ratio, droprows);

['yprice_time_mean_360_rsi_6', 'yprice_time_mean_360_rsi_60', 'yprice_time_mean_360_rsi_360', 'yprice_time_mean_360_rsi_720', 'yprice_time_mean_360_rsi_1410', 'yprice_time_mean_360_rsi_2820', 'yprice_time_mean_360_rsi_7050', 'yprice_time_mean_360_rsi_14100']
added yprice_time_mean_360_rsi_6: r2: 1.3413
added yprice_time_mean_360_rsi_360: r2: 1.3533
added yprice_time_mean_360_rsi_14100: r2: 1.3546
added yprice_time_mean_360_rsi_60: r2: 1.4948
Data shapes:  (215660, 39) (68526, 39) (51394, 39)

Valid MSE: 		 0.019298
Valid R2 (x100): 	 1.4948

Test MSE: 		 0.015761
Test R2 (x100): 	 0.97795


In [24]:
# agg_col = 'yx_spread'
# new_cols = add_time_depended_rolling(data, agg_col, oneday_agg_periods, np.mean, 'mean')
# for col in new_cols:
#     data[col] = data[agg_col] - data[col]
# usecols.extend(new_cols)

# selected_cols = greedy_add_strategy(model, data, selected_cols, new_cols,
#                                     valid_ratio, test_ratio, droprows)
# validate_sklearn_model(model, data, selected_cols, valid_ratio, test_ratio, droprows);

In [25]:
# agg_col = 'yx_relation'
# new_cols = add_time_depended_rolling(data, agg_col, oneday_agg_periods, np.mean, 'mean')
# for col in new_cols:
#     data[col] = data[agg_col] - data[col]
# usecols.extend(new_cols)

# selected_cols = greedy_add_strategy(model, data, selected_cols, new_cols,
#                                     valid_ratio, test_ratio, droprows)
# validate_sklearn_model(model, data, selected_cols, valid_ratio, test_ratio, droprows);

In [26]:
# agg_col = 'xy_relation'
# new_cols = add_time_depended_rolling(data, agg_col, oneday_agg_periods, np.mean, 'mean')
# for col in new_cols:
#     data[col] = data[agg_col] - data[col]
# usecols.extend(new_cols)

# selected_cols = greedy_add_strategy(model, data, selected_cols, new_cols,
#                                     valid_ratio, test_ratio, droprows)
# validate_sklearn_model(model, data, selected_cols, valid_ratio, test_ratio, droprows);

In [27]:
# agg_col = 'xy_geom'
# new_cols = add_time_depended_rolling(data, agg_col, oneday_agg_periods, np.mean, 'mean')
# for col in new_cols:
#     data[col] = data[agg_col] - data[col]
# usecols.extend(new_cols)

# selected_cols = greedy_add_strategy(model, data, selected_cols, new_cols,
#                                     valid_ratio, test_ratio, droprows)
# validate_sklearn_model(model, data, selected_cols, valid_ratio, test_ratio, droprows);

In [26]:
print_importances(model, selected_cols)

yprice_time_mean_360                     12.38%          -0.057
xprice_time_mean_360                     10.31%           0.047
xprice_time_mean_1410                    8.79%            0.04
yprice_time_mean_1410                    6.22%          -0.029
yx_spread_rsi_360                        6.06%          -0.028
yx_relation_rsi_360                      5.87%           0.027
xy_geom_rsi_60                           5.84%           0.027
xprice_time_mean_360_rsi_14100           5.57%           0.026
xy_garmonic_rsi_60                       5.52%          -0.025
yx_spread                                4.00%           0.018
xdiff_from_closing                       3.78%           0.017
xprice_time_mean_60_rsi_720              2.68%           0.012
yprice_time_mean_60_rsi_14100            2.61%          -0.012
xprice_time_mean_60_rsi_2820             2.23%            0.01
xprice_full_history_diff                 2.22%            0.01
xprice                                   2.18%       

In [29]:
data.head()

Unnamed: 0_level_0,timestamp,xprice,yprice,returns,yx_spread,yx_relation,xy_relation,xy_geom,xy_garmonic,weekday,day,periods_before_closing,periods_after_opening,ydiff_from_closing,xdiff_from_closing,ydiff_from_opening,xdiff_from_opening,xprice_time_mean_6,xprice_time_mean_60,xprice_time_mean_360,xprice_time_mean_720,xprice_time_mean_1410,yprice_time_mean_6,yprice_time_mean_60,yprice_time_mean_360,yprice_time_mean_720,yprice_time_mean_1410,xprice_full_history_diff,yprice_full_history_diff,yx_relation_full_history_diff,xy_geom_full_history_diff,yx_spread_rsi_6,yx_spread_rsi_60,yx_spread_rsi_360,yx_spread_rsi_720,yx_spread_rsi_1410,yx_spread_rsi_2820,yx_spread_rsi_7050,yx_spread_rsi_14100,yx_relation_rsi_6,yx_relation_rsi_60,yx_relation_rsi_360,yx_relation_rsi_720,yx_relation_rsi_1410,yx_relation_rsi_2820,yx_relation_rsi_7050,yx_relation_rsi_14100,xy_relation_rsi_6,xy_relation_rsi_60,xy_relation_rsi_360,xy_relation_rsi_720,xy_relation_rsi_1410,xy_relation_rsi_2820,xy_relation_rsi_7050,xy_relation_rsi_14100,xy_geom_rsi_6,xy_geom_rsi_60,xy_geom_rsi_360,xy_geom_rsi_720,xy_geom_rsi_1410,xy_geom_rsi_2820,xy_geom_rsi_7050,xy_geom_rsi_14100,xy_garmonic_rsi_6,xy_garmonic_rsi_60,xy_garmonic_rsi_360,xy_garmonic_rsi_720,xy_garmonic_rsi_1410,xy_garmonic_rsi_2820,xy_garmonic_rsi_7050,xy_garmonic_rsi_14100,xprice_time_mean_60_rsi_6,xprice_time_mean_60_rsi_60,xprice_time_mean_60_rsi_360,xprice_time_mean_60_rsi_720,xprice_time_mean_60_rsi_1410,xprice_time_mean_60_rsi_2820,xprice_time_mean_60_rsi_7050,xprice_time_mean_60_rsi_14100,xprice_time_mean_360_rsi_6,xprice_time_mean_360_rsi_60,xprice_time_mean_360_rsi_360,xprice_time_mean_360_rsi_720,xprice_time_mean_360_rsi_1410,xprice_time_mean_360_rsi_2820,xprice_time_mean_360_rsi_7050,xprice_time_mean_360_rsi_14100,yprice_time_mean_60_rsi_6,yprice_time_mean_60_rsi_60,yprice_time_mean_60_rsi_360,yprice_time_mean_60_rsi_720,yprice_time_mean_60_rsi_1410,yprice_time_mean_60_rsi_2820,yprice_time_mean_60_rsi_7050,yprice_time_mean_60_rsi_14100,yprice_time_mean_360_rsi_6,yprice_time_mean_360_rsi_60,yprice_time_mean_360_rsi_360,yprice_time_mean_360_rsi_720,yprice_time_mean_360_rsi_1410,yprice_time_mean_360_rsi_2820,yprice_time_mean_360_rsi_7050,yprice_time_mean_360_rsi_14100
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1
2013-01-03 20:05:00,2013-01-03 20:05:00,139.8375,169.25,0.3125,29.4125,1.210333,0.826219,153.842442,153.144316,3,0,1409,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,92.307692,53.947368,50.440529,50.695322,51.837959,52.958364,50.41713,49.480854,90.475901,51.076559,49.251652,49.604623,51.05667,52.583742,50.473075,49.688294,9.526089,48.92224,50.74845,50.395223,48.946046,47.430787,49.523254,50.315173,49.856181,72.501587,58.967364,57.750591,55.497764,52.703849,49.56721,48.259862,46.860517,72.904259,58.972409,57.675955,55.36095,52.504893,49.533416,48.271873,25.626741,69.679324,50.517284,50.3873,49.821003,50.0,49.950653,49.984612,25.626741,69.749214,51.773008,50.659557,49.809904,50.0,49.947973,49.983703,48.497854,59.226556,50.977506,50.402428,49.880448,50.131736,49.955492,49.992585,48.497854,59.490724,51.538851,51.061202,49.875562,50.137536,49.953548,49.992284
2013-01-03 20:05:10,2013-01-03 20:05:10,139.8875,169.3,0.275,29.4125,1.210258,0.82627,153.89267,153.194769,3,0,1408,1,0.0,0.0,-0.05,-0.05,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.025,-3.8e-05,0.025114,92.307692,53.947368,50.440529,50.695322,51.837959,52.958364,50.41713,49.480854,90.475901,51.076559,49.251652,49.604623,51.05667,52.583742,50.473075,49.688294,9.526089,48.92224,50.74845,50.395223,48.946046,47.430787,49.523254,50.315173,49.856181,72.501587,58.967364,57.750591,55.497764,52.703849,49.56721,48.259862,46.860517,72.904259,58.972409,57.675955,55.36095,52.504893,49.533416,48.271873,25.626741,69.679324,50.517284,50.3873,49.821003,50.0,49.950653,49.984612,25.626741,69.749214,51.773008,50.659557,49.809904,50.0,49.947973,49.983703,48.497854,59.226556,50.977506,50.402428,49.880448,50.131736,49.955492,49.992585,48.497854,59.490724,51.538851,51.061202,49.875562,50.137536,49.953548,49.992284
2013-01-03 20:05:20,2013-01-03 20:05:20,139.8625,169.3375,0.25,29.475,1.210743,0.825939,153.895959,153.195124,3,0,1407,2,0.0,0.0,-0.0875,-0.025,0.0,0.0,0.0,0.0,0.0,0.041667,0.041667,0.041667,0.041667,0.041667,0.0,0.041667,0.000298,0.018935,92.307692,53.947368,50.440529,50.695322,51.837959,52.958364,50.41713,49.480854,90.475901,51.076559,49.251652,49.604623,51.05667,52.583742,50.473075,49.688294,9.526089,48.92224,50.74845,50.395223,48.946046,47.430787,49.523254,50.315173,49.856181,72.501587,58.967364,57.750591,55.497764,52.703849,49.56721,48.259862,46.860517,72.904259,58.972409,57.675955,55.36095,52.504893,49.533416,48.271873,25.626741,69.679324,50.517284,50.3873,49.821003,50.0,49.950653,49.984612,25.626741,69.749214,51.773008,50.659557,49.809904,50.0,49.947973,49.983703,48.497854,59.226556,50.977506,50.402428,49.880448,50.131736,49.955492,49.992585,48.497854,59.490724,51.538851,51.061202,49.875562,50.137536,49.953548,49.992284
2013-01-03 20:05:30,2013-01-03 20:05:30,139.8375,169.3625,0.2375,29.525,1.211138,0.82567,153.893563,153.190353,3,0,1406,3,0.0,0.0,-0.1125,0.0,-0.01875,-0.01875,-0.01875,-0.01875,-0.01875,0.05,0.05,0.05,0.05,0.05,-0.01875,0.05,0.00052,0.012404,92.307692,53.947368,50.440529,50.695322,51.837959,52.958364,50.41713,49.480854,90.475901,51.076559,49.251652,49.604623,51.05667,52.583742,50.473075,49.688294,9.526089,48.92224,50.74845,50.395223,48.946046,47.430787,49.523254,50.315173,49.856181,72.501587,58.967364,57.750591,55.497764,52.703849,49.56721,48.259862,46.860517,72.904259,58.972409,57.675955,55.36095,52.504893,49.533416,48.271873,25.626741,69.679324,50.517284,50.3873,49.821003,50.0,49.950653,49.984612,25.626741,69.749214,51.773008,50.659557,49.809904,50.0,49.947973,49.983703,48.497854,59.226556,50.977506,50.402428,49.880448,50.131736,49.955492,49.992585,48.497854,59.490724,51.538851,51.061202,49.875562,50.137536,49.953548,49.992284
2013-01-03 20:05:40,2013-01-03 20:05:40,139.8375,169.3625,0.325,29.525,1.211138,0.82567,153.893563,153.190353,3,0,1405,4,0.0,0.0,-0.1125,0.0,-0.015,-0.015,-0.015,-0.015,-0.015,0.04,0.04,0.04,0.04,0.04,-0.015,0.04,0.000416,0.009924,92.307692,53.947368,50.440529,50.695322,51.837959,52.958364,50.41713,49.480854,90.475901,51.076559,49.251652,49.604623,51.05667,52.583742,50.473075,49.688294,9.526089,48.92224,50.74845,50.395223,48.946046,47.430787,49.523254,50.315173,49.856181,72.501587,58.967364,57.750591,55.497764,52.703849,49.56721,48.259862,46.860517,72.904259,58.972409,57.675955,55.36095,52.504893,49.533416,48.271873,25.626741,69.679324,50.517284,50.3873,49.821003,50.0,49.950653,49.984612,25.626741,69.749214,51.773008,50.659557,49.809904,50.0,49.947973,49.983703,48.497854,59.226556,50.977506,50.402428,49.880448,50.131736,49.955492,49.992585,48.497854,59.490724,51.538851,51.061202,49.875562,50.137536,49.953548,49.992284


In [40]:
agg_col = 'yprice_time_mean_360'
new_cols = add_diffs(data, agg_col, month_agg_periods)

usecols.extend(new_cols)

selected_cols = greedy_add_strategy(model, data, selected_cols, new_cols,
                                    valid_ratio, test_ratio, droprows)
validate_sklearn_model(model, data, selected_cols, valid_ratio, test_ratio, droprows);

['yprice_time_mean_360_diff_1410', 'yprice_time_mean_360_diff_2820', 'yprice_time_mean_360_diff_4230', 'yprice_time_mean_360_diff_5640', 'yprice_time_mean_360_diff_7050', 'yprice_time_mean_360_diff_14100', 'yprice_time_mean_360_diff_21150', 'yprice_time_mean_360_diff_28200']
added yprice_time_mean_360_diff_1410: r2: 1.728
added yprice_time_mean_360_diff_14100: r2: 1.7293
Data shapes:  (208610, 44) (68526, 44) (51394, 44)

Valid MSE: 		 0.019252
Valid R2 (x100): 	 1.7293

Test MSE: 		 0.01571
Test R2 (x100): 	 1.2965


In [41]:
agg_col = 'xprice_time_mean_360'
new_cols = add_diffs(data, agg_col, month_agg_periods)

usecols.extend(new_cols)

selected_cols = greedy_add_strategy(model, data, selected_cols, new_cols,
                                    valid_ratio, test_ratio, droprows)
validate_sklearn_model(model, data, selected_cols, valid_ratio, test_ratio, droprows);

['xprice_time_mean_360_diff_1410', 'xprice_time_mean_360_diff_2820', 'xprice_time_mean_360_diff_4230', 'xprice_time_mean_360_diff_5640', 'xprice_time_mean_360_diff_7050', 'xprice_time_mean_360_diff_14100', 'xprice_time_mean_360_diff_21150', 'xprice_time_mean_360_diff_28200']
added xprice_time_mean_360_diff_1410: r2: 1.6835
Data shapes:  (208610, 44) (68526, 44) (51394, 44)

Valid MSE: 		 0.019261
Valid R2 (x100): 	 1.6835

Test MSE: 		 0.015721
Test R2 (x100): 	 1.2275


In [67]:
agg_col = 'yprice_time_mean_60'
new_cols = add_diffs(data, agg_col, [60, 120, 360, 720])

usecols.extend(new_cols)

selected_cols = greedy_add_strategy(model, data, selected_cols, new_cols,
                                    valid_ratio, test_ratio, droprows)
validate_sklearn_model(model, data, selected_cols, valid_ratio, test_ratio, droprows);

['yprice_time_mean_60_diff_60', 'yprice_time_mean_60_diff_120', 'yprice_time_mean_60_diff_360', 'yprice_time_mean_60_diff_720']
added yprice_time_mean_60_diff_360: r2: 1.682
Data shapes:  (208610, 45) (68526, 45) (51394, 45)

Valid MSE: 		 0.019261
Valid R2 (x100): 	 1.682

Test MSE: 		 0.015716
Test R2 (x100): 	 1.2614


In [69]:
new_cols = add_time_depended_rolling(data, 'xprice', oneday_agg_periods, np.max, 'max')
usecols.extend(new_cols)

selected_cols = greedy_add_strategy(model, data, selected_cols, new_cols,
                                    valid_ratio, test_ratio, droprows)
validate_sklearn_model(model, data, selected_cols, valid_ratio, test_ratio, droprows);

['xprice_time_max_6', 'xprice_time_max_60', 'xprice_time_max_360', 'xprice_time_max_720', 'xprice_time_max_1410']
added xprice_time_max_6: r2: 1.7089
added xprice_time_max_60: r2: 1.7187
Data shapes:  (208610, 48) (68526, 48) (51394, 48)

Valid MSE: 		 0.019254
Valid R2 (x100): 	 1.7187

Test MSE: 		 0.015718
Test R2 (x100): 	 1.2495


In [70]:
agg_col = 'xprice_time_max_360'
new_cols = add_diffs(data, agg_col, month_agg_periods)

usecols.extend(new_cols)

selected_cols = greedy_add_strategy(model, data, selected_cols, new_cols,
                                    valid_ratio, test_ratio, droprows)
validate_sklearn_model(model, data, selected_cols, valid_ratio, test_ratio, droprows);

['xprice_time_max_360_diff_1410', 'xprice_time_max_360_diff_2820', 'xprice_time_max_360_diff_4230', 'xprice_time_max_360_diff_5640', 'xprice_time_max_360_diff_7050', 'xprice_time_max_360_diff_14100', 'xprice_time_max_360_diff_21150', 'xprice_time_max_360_diff_28200']
added xprice_time_max_360_diff_2820: r2: 1.721
Data shapes:  (208610, 49) (68526, 49) (51394, 49)

Valid MSE: 		 0.019254
Valid R2 (x100): 	 1.721

Test MSE: 		 0.015717
Test R2 (x100): 	 1.2559


In [71]:
new_cols = add_time_depended_rolling(data, 'yprice', oneday_agg_periods, np.max, 'max')
usecols.extend(new_cols)

selected_cols = greedy_add_strategy(model, data, selected_cols, new_cols,
                                    valid_ratio, test_ratio, droprows)
validate_sklearn_model(model, data, selected_cols, valid_ratio, test_ratio, droprows);

['yprice_time_max_6', 'yprice_time_max_60', 'yprice_time_max_360', 'yprice_time_max_720', 'yprice_time_max_1410']
added yprice_time_max_6: r2: 1.7465
added yprice_time_max_60: r2: 1.7527
Data shapes:  (208610, 51) (68526, 51) (51394, 51)

Valid MSE: 		 0.019247
Valid R2 (x100): 	 1.7527

Test MSE: 		 0.015717
Test R2 (x100): 	 1.2534


In [74]:
super_selected_cols = greedy_add_del_strategy(model, data, usecols, valid_ratio, test_ratio,
                                        droprows, add_frequency=4)
validate_sklearn_model(model, data, super_selected_cols, valid_ratio, test_ratio, droprows);

removed xprice_time_max_360_diff_28200: r2: -5.1616
removed xprice_time_max_1410: r2: -4.6849
removed xprice_time_max_360_diff_1410: r2: -4.1426
removed yprice_time_max_360_diff_1410: r2: -3.7471
removed yprice_full_history_diff: r2: -3.4968
removed xy_geom_full_history_diff: r2: -3.1429
removed yprice_time_max_720_diff_1410: r2: -2.9145
removed xprice_time_max_360_diff_2820: r2: -2.7344
removed yprice_time_max_360_diff_2820: r2: -2.4681
removed xprice_time_max_360: r2: -2.2695
removed ydiff_from_opening: r2: -2.0747
removed yprice_time_max_360_diff_28200: r2: -1.8892
removed xprice_time_mean_360_rsi_60: r2: -1.7253
removed yx_spread_rsi_360: r2: -1.5614
removed yx_spread_rsi_1410: r2: -1.4278
removed xprice_time_max_720: r2: -1.3009
removed yprice_time_max_360_diff_21150: r2: -1.1645
removed yprice_time_max_720_diff_21150: r2: -0.9972
removed yx_spread_rsi_720: r2: -0.88213
removed yx_spread_rsi_60: r2: -0.74746
removed yprice_time_max_60: r2: -0.63882
removed yprice_time_mean_360_dif

In [73]:
agg_col = 'yprice_time_max_720'
new_cols = add_diffs(data, agg_col, month_agg_periods)

usecols.extend(new_cols)

selected_cols = greedy_add_strategy(model, data, selected_cols, new_cols,
                                    valid_ratio, test_ratio, droprows)
validate_sklearn_model(model, data, selected_cols, valid_ratio, test_ratio, droprows);

['yprice_time_max_720_diff_1410', 'yprice_time_max_720_diff_2820', 'yprice_time_max_720_diff_4230', 'yprice_time_max_720_diff_5640', 'yprice_time_max_720_diff_7050', 'yprice_time_max_720_diff_14100', 'yprice_time_max_720_diff_21150', 'yprice_time_max_720_diff_28200']
Data shapes:  (208610, 53) (68526, 53) (51394, 53)

Valid MSE: 		 0.019235
Valid R2 (x100): 	 1.8142

Test MSE: 		 0.015739
Test R2 (x100): 	 1.1185


In [42]:
# l1_model = ElasticNet(alpha=1, l1_ratio=0.03)
# selected_cols = list(set(selected_cols))
# validate_sklearn_model(l1_model, data, selected_cols, valid_ratio, test_ratio, droprows);
# reg_cols = np.array(selected_cols)[l1_model.coef_ == 0]
# validate_sklearn_model(model, data, reg_cols.tolist(), valid_ratio, test_ratio, droprows);

Data shapes:  (208610, 44) (68526, 44) (51394, 44)

Valid MSE: 		 0.019513
Valid R2 (x100): 	 0.39846

Test MSE: 		 0.015889
Test R2 (x100): 	 0.17463
Data shapes:  (208610, 39) (68526, 39) (51394, 39)

Valid MSE: 		 0.019288
Valid R2 (x100): 	 1.5475

Test MSE: 		 0.015702
Test R2 (x100): 	 1.3493


In [None]:
new_cols = add_ewma(data, 'xy_garmonic', twoweeks_agg_periods)
usecols.extend(new_cols)

model = Ridge(alpha=10)

selected_cols = greedy_add_strategy(model, data, selected_cols, new_cols,
                                    valid_ratio, test_ratio, droprows=7050)
validate_sklearn_model(model, data, selected_cols, valid_ratio, test_ratio, droprows=7050);