In [1]:
from eli5.sklearn import PermutationImportance 
import eli5
from sklearn.model_selection import KFold, GroupKFold
import lightgbm as lgb
from optiver_features import *

In [237]:
# Load the pre-computed feature tables (feather files in the working directory).
# NOTE(review): `pd` is not imported explicitly in this notebook; presumably it
# arrives through `from optiver_features import *` — confirm.
train_df = pd.read_feather(path='train_182cols.feather')
test_df = pd.read_feather(path='test_182cols.feather')

In [238]:
def rmspe_np(y_true, y_pred):
    """Root mean squared percentage error between targets and predictions.

    Each error is taken relative to its own true value, so any zero in
    ``y_true`` produces inf/nan in the result.
    """
    relative_error = (y_true - y_pred) / y_true
    mean_squared_relative_error = np.mean(np.square(relative_error))
    return np.sqrt(mean_squared_relative_error)
def feval_rmspe(y_true, y_pred):
    """Custom evaluation metric wrapping rmspe_np for LightGBM's sklearn API.

    Returns the ``(name, value, is_higher_better)`` triple; the value is
    rounded to 5 decimals and lower is better.
    """
    metric_name = 'RMSPE'
    metric_value = round(rmspe_np(y_true = y_true, y_pred = y_pred), 5)
    higher_is_better = False
    return metric_name, metric_value, higher_is_better

# Shared LightGBM hyper-parameters, unpacked into every LGBMRegressor built by
# train_models.
params_lgbm = dict(
    task='train',
    boosting_type='gbdt',
    learning_rate=0.01,
    objective='regression',
    # Built-in metrics are switched off; evaluation relies on the custom
    # metric passed to fit().
    metric='None',
    max_depth=-1,
    n_jobs=-1,
    feature_fraction=0.7,
    # NOTE(review): no bagging_freq is set alongside bagging_fraction; LightGBM
    # documents bagging as active only when bagging_freq > 0, so this value is
    # presumably inert — confirm.
    bagging_fraction=0.7,
    lambda_l2=1,
    verbose=-1,
    early_stopping_rounds=500,
    # Previously tried and left disabled: 'bagging_freq': 5, 'device_type': 'gpu'
)

def train_models(train, to_keep = None, with_importance=False):
    """Fit one LightGBM regressor per GroupKFold split and print the out-of-fold RMSPE.

    Folds are grouped on ``train.time_id`` so rows sharing a time_id never sit
    on both sides of a train/validation split.  Samples are weighted by
    ``1 / target**2`` so the squared-error objective is minimised in
    percentage terms, in line with the RMSPE metric.

    Parameters
    ----------
    train : DataFrame
        Must contain ``target`` and ``time_id``.  When ``to_keep`` is not
        given it must also contain ``row_id``, because all three are dropped
        to form the feature matrix.
    to_keep : list of str, optional
        Feature columns to train on.  When falsy, every column except
        ``row_id``, ``target`` and ``time_id`` is used.  ``stock_id`` has to
        be among the features since it is declared categorical.
    with_importance : bool
        When True, an eli5 ``PermutationImportance`` is fitted on every
        fold's validation split.

    Returns
    -------
    (models, importances)
        ``models`` holds the fitted regressor of each fold; ``importances``
        holds one ``feature_importances_`` array per fold and stays empty
        when ``with_importance`` is False.
    """
    # Split features and target
    if to_keep:
        x = train[to_keep]
    else:
        x = train.drop(['row_id', 'target', 'time_id'], axis = 1)
    y = train['target']

    models = []
    importances = []
    # Out-of-fold predictions: each row is filled in by the one model that
    # did not see it during training.
    oof_predictions = np.zeros(x.shape[0])
    kfold = GroupKFold()
    for fold, (trn_ind, val_ind) in enumerate(kfold.split(x, groups = train.time_id)):
        print(f'Training fold {fold + 1}')
        x_train, x_val = x.iloc[trn_ind], x.iloc[val_ind]
        y_train, y_val = y.iloc[trn_ind], y.iloc[val_ind]
        # Root mean squared percentage error weights
        train_weights = 1 / np.square(y_train)
        val_weights = 1 / np.square(y_val)

        model = lgb.LGBMRegressor(**params_lgbm,
                                  random_state = 1976,
                                  device_type = 'gpu',
                                  n_estimators = 5000)
        model.fit(x_train, y_train,
                  eval_set = [(x_val, y_val)],
                  eval_metric = feval_rmspe,
                  sample_weight = train_weights,
                  eval_sample_weight = [val_weights],
                  verbose = 500,
                  categorical_feature = ['stock_id'])
        models.append(model)
        oof_predictions[val_ind] = model.predict(x_val)

        if with_importance:
            # NOTE(review): no `scoring` argument is passed, so the importances
            # are measured with eli5's default scorer rather than RMSPE —
            # confirm that is intended.
            perm = PermutationImportance(model, random_state=42)
            perm.fit(x_val, y_val)
            importances.append(perm.feature_importances_)

    rmspe_score = rmspe_np(y, oof_predictions)
    print(f'Our out of folds RMSPE is {rmspe_score}')
    # Fitted fold models plus the (possibly empty) per-fold importances
    return models, importances

In [239]:
# Baseline run on every available feature column.
# with_importance keeps its default (False), so feat_imps comes back as an empty list.
models, feat_imps = train_models(train_df)

Training fold 1


New categorical_feature is ['stock_id']






Training until validation scores don't improve for 500 rounds
[500]	valid_0's RMSPE: 0.22068
[1000]	valid_0's RMSPE: 0.22
Early stopping, best iteration is:
[802]	valid_0's RMSPE: 0.21975
Training fold 2
Training until validation scores don't improve for 500 rounds
[500]	valid_0's RMSPE: 0.22242
Early stopping, best iteration is:
[484]	valid_0's RMSPE: 0.22235
Training fold 3
Training until validation scores don't improve for 500 rounds
[500]	valid_0's RMSPE: 0.2229
[1000]	valid_0's RMSPE: 0.22069
[1500]	valid_0's RMSPE: 0.22014
[2000]	valid_0's RMSPE: 0.22008
[2500]	valid_0's RMSPE: 0.21995
Early stopping, best iteration is:
[2469]	valid_0's RMSPE: 0.21995
Training fold 4
Training until validation scores don't improve for 500 rounds
[500]	valid_0's RMSPE: 0.22359
[1000]	valid_0's RMSPE: 0.22264
Early stopping, best iteration is:
[999]	valid_0's RMSPE: 0.22254
Training fold 5
Training until validation scores don't improve for 500 rounds
[500]	valid_0's RMSPE: 0.22169
[1000]	valid_0's R

In [246]:
def test_score(models, columns):
    test_pred = np.zeros(len(test_df))
    for model in models:
        pred = model.predict(test_df[columns]) 
        test_pred += pred /5

    return  rmspe_np(test_df.target, test_pred)


In [247]:
# Every column except the identifier/target columns, i.e. the same set
# train_models uses when no to_keep list is given.
starting_cols = [
    col for col in train_df.columns
    if col not in ('row_id', 'target', 'time_id')
]

In [248]:
# RMSPE on test_df of the averaged fold models trained on all feature columns.
test_score(models, starting_cols)

0.22334512233363665

In [249]:
# (start, end) pairs used to build feature names; every window ends at 600 and
# the starts step by 100 from 0 to 500.
windows = [(start, 600) for start in range(0, 600, 100)]
windows

[(0, 600), (100, 600), (200, 600), (300, 600), (400, 600), (500, 600)]

In [303]:
# Hand-picked feature subset, listed in the order the groups were added.
# The windowed groups expand over every (start, end) pair in `windows`.
minimal_cols = [
    'log_return2_std_0_600', 'log_return_price_std_0_600', '5m_pred', 'stock_id',
    *[f'time_emb{i}' for i in range(10)],
    'order_count_sum_0_600', 'seconds_in_bucket_size_0_600', 'size_sum_0_600',
    *[f'stock_emb{i}' for i in range(9)],
    'log_return1_std_0_600_min_time', 'log_return1_std_0_600_mean_time',
    'log_return1_std_0_600_min_stock', 'log_return1_std_0_600_mean_stock',
    *[f'log_return1_std_{start}_{end}' for start, end in windows],
    'price_spread_mean_0_600',
    *[f'log_return_price_std_{start}_{end}_mean_time' for start, end in windows],
    *[f'log_return_price_std_{start}_{end}_min_time' for start, end in windows],
    'total_volume_mean_0_600',
]
len(minimal_cols)

50

In [302]:
# Show the selected feature names.
# NOTE(review): this cell's execution count (302) is lower than that of the cell
# that builds the list (303), so the saved output may predate the current
# definition — re-run after rebuilding minimal_cols.
minimal_cols

['log_return2_std_0_600',
 'log_return_price_std_0_600',
 '5m_pred',
 'stock_id',
 'time_emb0',
 'time_emb1',
 'time_emb2',
 'time_emb3',
 'time_emb4',
 'time_emb5',
 'time_emb6',
 'time_emb7',
 'time_emb8',
 'time_emb9',
 'order_count_sum_0_600',
 'seconds_in_bucket_size_0_600',
 'size_sum_0_600',
 'log_return1_std_0_600_min_time',
 'log_return1_std_0_600_mean_time',
 'log_return1_std_0_600_min_stock',
 'log_return1_std_0_600_mean_stock',
 'log_return1_std_0_600',
 'log_return1_std_100_600',
 'log_return1_std_200_600',
 'log_return1_std_300_600',
 'log_return1_std_400_600',
 'log_return1_std_500_600',
 'price_spread_mean_0_600',
 'log_return_price_std_0_600_mean_time',
 'log_return_price_std_100_600_mean_time',
 'log_return_price_std_200_600_mean_time',
 'log_return_price_std_300_600_mean_time',
 'log_return_price_std_400_600_mean_time',
 'log_return_price_std_500_600_mean_time',
 'log_return_price_std_0_600_min_time',
 'log_return_price_std_100_600_min_time',
 'log_return_price_std_2

In [259]:
# for x in feat_imp_df.max(axis=0).sort_values(ascending=False).index.to_list():
#     if x not in minimal_cols: print(x)

In [305]:
# Retrain on the reduced feature set; this rebinds `models` from the baseline run.
# with_importance keeps its default (False), so imps comes back as an empty list.
models, imps = train_models(train_df, minimal_cols)

Training fold 1


New categorical_feature is ['stock_id']


Training until validation scores don't improve for 500 rounds
[500]	valid_0's RMSPE: 0.22049
[1000]	valid_0's RMSPE: 0.22023
Early stopping, best iteration is:
[746]	valid_0's RMSPE: 0.21959
Training fold 2
Training until validation scores don't improve for 500 rounds
[500]	valid_0's RMSPE: 0.22391
[1000]	valid_0's RMSPE: 0.22685
Early stopping, best iteration is:
[514]	valid_0's RMSPE: 0.22385
Training fold 3
Training until validation scores don't improve for 500 rounds
[500]	valid_0's RMSPE: 0.22282
[1000]	valid_0's RMSPE: 0.21987
[1500]	valid_0's RMSPE: 0.21905
[2000]	valid_0's RMSPE: 0.21905
Early stopping, best iteration is:
[1583]	valid_0's RMSPE: 0.219
Training fold 4
Training until validation scores don't improve for 500 rounds
[500]	valid_0's RMSPE: 0.22361
[1000]	valid_0's RMSPE: 0.22285
Early stopping, best iteration is:
[870]	valid_0's RMSPE: 0.22275
Training fold 5
Training until validation scores don't improve for 500 rounds
[500]	valid_0's RMSPE: 0.22264
[1000]	valid_0's

In [306]:
# RMSPE on test_df of the averaged fold models trained on minimal_cols.
test_score(models, minimal_cols)

0.2231407111076792

In [304]:
# NOTE(review): scratch cell — these two literals are not referenced by any
# other visible cell; give them names and a comment, or delete the cell.
2216, 2232

(2216, 2232)

In [283]:
# Pick the single feature whose best per-fold permutation importance is lowest.
# NOTE(review): imps only holds data when train_models was called with
# with_importance=True; with an empty imps the frame has no rows and the
# selection is not meaningful.
fold_importances = pd.DataFrame(imps, columns = minimal_cols)
peak_importance = fold_importances.max(axis=0)
to_drop = peak_importance.sort_values().head(1).index.to_list()

In [296]:
# Print every pair of selected features whose correlation exceeds 0.95.
# Only the upper triangle is scanned, so each pair is reported once; strongly
# negative correlations are not reported because the raw value is compared.
C = train_df[minimal_cols].corr()
corr_values = C.to_numpy()
feature_names = C.columns
for i, first in enumerate(feature_names):
    for j in range(i + 1, len(feature_names)):
        pair_corr = corr_values[i, j]
        if pair_corr > .95:
            print(first, feature_names[j], pair_corr)

log_return2_std_0_600 log_return1_std_0_600 0.970282057943801
log_return2_std_0_600 log_return1_std_100_600 0.9641458966476303
log_return2_std_0_600 log_return1_std_200_600 0.9540069652041591
5m_pred log_return1_std_100_600 0.9534987560166015
5m_pred log_return1_std_200_600 0.9631501229739752
5m_pred log_return1_std_300_600 0.9711535326423157
5m_pred log_return1_std_400_600 0.9667020406285889
log_return1_std_0_600_mean_time log_return_price_std_0_600_mean_time 0.9838662929929951
log_return1_std_0_600_mean_time log_return_price_std_100_600_mean_time 0.9840205362498697
log_return1_std_0_600_mean_time log_return_price_std_200_600_mean_time 0.9834728804881593
log_return1_std_0_600_mean_time log_return_price_std_300_600_mean_time 0.9821428551607181
log_return1_std_0_600_mean_time log_return_price_std_400_600_mean_time 0.9805363755189537
log_return1_std_0_600_mean_time log_return_price_std_500_600_mean_time 0.9793468802257981
log_return1_std_0_600 log_return1_std_100_600 0.9928527275972774
l

In [295]:
# Spot check: correlation between the first and second columns of C.
# NOTE(review): this cell's execution count (295) is lower than that of the cell
# computing C (296), so the saved value may come from an earlier C — re-run to confirm.
C.iloc[0,1]

0.8839656981232888