Simple LightGBM baseline with GroupKFold CV, where the folds are grouped by `time_id`. Only the last 1.5 million rows are used for training due to memory restrictions.

Dataset is taken from: https://www.kaggle.com/columbia2131/speed-up-reading-csv-to-pickle

In [None]:
import numpy as np
import pandas as pd
import os
from sklearn.model_selection import KFold, GroupKFold
from sklearn.linear_model import Ridge
from scipy.stats import pearsonr as p
import lightgbm as lgb

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Location of the pickled training frame (see the dataset link above).
TRAIN_PICKLE = '../input/ump-train-picklefile/train.pkl'

# Name of the label column.
target = 'target'

# Keep only the trailing 1,500,000 rows of the pickled frame.
train = pd.read_pickle(TRAIN_PICKLE)[-1500000:]

# Every column except the identifiers and the label is used as a feature.
feats = [col for col in train.columns if col not in ('time_id', 'row_id', target)]

# 5-fold splitter; the grouping column is supplied when .split() is called.
kfold = GroupKFold(n_splits=5)

In [None]:
# Preview of the feature columns (rendered as the cell output in a notebook).
train.loc[:, feats]

In [None]:
# Cross-validated training: one LightGBM regressor per GroupKFold split.
# `fold_scores` collects the per-fold Pearson correlation on the held-out rows,
# `models` collects the fitted regressors for later ensembling.
fold_scores = []
models = []

# Hyper-parameters are identical for every fold, so build them once.
# NOTE(review): per the LightGBM parameter docs, `subsample` (bagging_fraction)
# only takes effect when `subsample_freq` (bagging_freq) is non-zero; it is
# left at its default here, so row bagging looks inactive - confirm intent.
lgb_params = dict(
    objective='regression',
    metric='rmse',
    n_estimators=1000,
    num_leaves=32,
    learning_rate=0.1,
    subsample=0.8,
    feature_fraction=0.6,
)

# Rows sharing a `time_id` are kept together in the same fold.
splits = kfold.split(train[feats], train[target], groups=train['time_id'])

for fold, (trn_idx, val_idx) in enumerate(splits):
    # Take the rows of each partition once, then split into features / label.
    trn_rows = train.iloc[trn_idx]
    val_rows = train.iloc[val_idx]
    xtrain, ytrain = trn_rows[feats], trn_rows[target]
    xval, yval = val_rows[feats], val_rows[target]
    del trn_rows, val_rows

    model = lgb.LGBMRegressor(**lgb_params)
    # `verbose=` / `early_stopping_rounds=` were deprecated as fit() keywords in
    # LightGBM 3.3 and removed in 4.0; the callbacks below are the supported
    # equivalent (log every 100 rounds, stop after 100 rounds w/o improvement).
    # Fresh callback objects are created per fold so no state is shared.
    model.fit(
        xtrain,
        ytrain,
        eval_metric='rmse',
        eval_set=[(xtrain, ytrain), (xval, yval)],
        callbacks=[
            lgb.early_stopping(stopping_rounds=100),
            lgb.log_evaluation(period=100),
        ],
    )
    val_preds = model.predict(xval)

    # Pearson correlation between predictions and the held-out target.
    score = p(val_preds, yval)[0]
    print(f"Fold {fold + 1}: {score}")
    fold_scores.append(score)
    models.append(model)

In [None]:
# Unweighted mean of the per-fold Pearson correlations.
overall_score = np.mean(fold_scores)
print(f"Overall score: {overall_score}")

In [None]:
# Set up the competition inference environment and obtain the test iterator.
# NOTE(review): presumably make_env() may only be called once per session, as
# with other Kaggle time-series APIs - confirm before re-running this cell.
import ubiquant
env = ubiquant.make_env()  
# Iterator consumed by the prediction loop below as (test_df, sample_prediction_df) pairs.
iter_test = env.iter_test()

In [None]:
# Inference: for every batch served by the competition API, predict with each
# fold model and submit the unweighted average of their predictions.
if not models:
    # Fail early with a clear message instead of submitting garbage.
    raise RuntimeError("No trained models available for inference.")

for (test_df, sample_prediction_df) in iter_test:
    # Restrict the batch to the columns the models were trained on.
    test_df = test_df[feats]

    # One prediction vector per fold model; averaging over axis 0 works for
    # any number of models instead of a hard-coded five.
    fold_preds = [model.predict(test_df) for model in models]
    pred = np.mean(fold_preds, axis=0)

    sample_prediction_df['target'] = pred
    # Hand the filled-in frame back to the API.
    env.predict(sample_prediction_df)