In [None]:
# import libs
import pandas as pd
import numpy as np
import lightgbm as lgb
import gc
import optuna

from sklearn.metrics import mean_squared_error
from lightgbm import LGBMRegressor

This notebook is a modified version of the following one:
https://www.kaggle.com/yosukeyama/ubiquant-simple-lgbm-train-infer

In [None]:
# reduce cols for use to save memory capacity
num_feat = 5
basic_cols = ['row_id', 'time_id', 'investment_id', 'target']
features = ['f_' + str(i) for i in range(num_feat)]
cols = [*basic_cols, *features]

# load data (only the selected columns are parsed from the CSV)
train_df = pd.read_csv(
    '../input/ubiquant-market-prediction/train.csv',
    usecols=cols,
)
display(train_df)

gc.collect()

I only use 5 features here as an example. If you want to achieve a better score, you should probably use more.

In [None]:
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line to the output.
train_df.info()
print('')
# Summary statistics are shown with display(), like the other frames in this notebook.
display(train_df.describe())

In [None]:
# split train data
# The first `tr_rate` share of the unique investment ids goes to training,
# the remaining ids go to validation.
investment_ids = train_df['investment_id'].unique()
num_ids = len(investment_ids)
tr_rate = 0.8

split_at = int(num_ids * tr_rate)
tr_ids, val_ids = investment_ids[:split_at], investment_ids[split_at:]
print('train: ', len(tr_ids))
print('val: ', len(val_ids))

# Select the rows whose investment id belongs to each group.
id_column = train_df['investment_id']
train = train_df[id_column.isin(tr_ids)]
valid = train_df[id_column.isin(val_ids)]

display(train)
display(valid)

In [None]:
# prepare for training
# Pull plain NumPy arrays out of the two splits: feature columns as x, target as y.
tr_x, tr_y = train[features].to_numpy(), train['target'].to_numpy()
val_x, val_y = valid[features].to_numpy(), valid['target'].to_numpy()
lgb_train = lgb.Dataset(data=tr_x, label=tr_y)
lgb_eval = lgb.Dataset(data=val_x, label=val_y)

# The cells below work on the arrays only, so the DataFrames are dropped to free memory.
del train_df
del train
del valid
gc.collect()

You can add more parameters; the ones below are given only as an example.

In [None]:
def objective(trial):
    """Optuna objective: fit a LightGBM regressor with sampled hyperparameters and score it.

    Reads the module-level arrays ``tr_x``/``tr_y`` (training data) and
    ``val_x``/``val_y`` (validation data).

    Args:
        trial: Optuna trial used to sample ``reg_alpha``, ``reg_lambda``,
            ``max_depth``, ``num_leaves`` and ``min_child_samples``.

    Returns:
        RMSE of the fitted model's predictions on the validation arrays
        (the study minimizes this value).
    """
    train_x, test_x, train_y, test_y = tr_x, val_x, tr_y, val_y
    param = {
        'metric': 'rmse',
        'random_state': 2022,
        'n_estimators': 500,
        # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
        'max_depth': trial.suggest_int('max_depth', 2, 30),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1000),
        'min_child_samples': trial.suggest_int('min_child_samples', 1, 500),
    }
    model = LGBMRegressor(**param)

    # Early stopping and logging are configured through callbacks: the fit()
    # keyword arguments `early_stopping_rounds` and `verbose` were removed in
    # LightGBM 4.0, while the callbacks work on both older and newer releases.
    model.fit(
        train_x,
        train_y,
        eval_set=[(test_x, test_y)],
        callbacks=[
            lgb.early_stopping(stopping_rounds=10),
            lgb.log_evaluation(period=0),  # no per-iteration metric output
        ],
    )

    preds = model.predict(test_x)

    # Take the square root of the MSE explicitly instead of passing
    # `squared=False`, which recent scikit-learn versions no longer accept.
    rmse = float(np.sqrt(mean_squared_error(test_y, preds)))

    return rmse

Increase the number of trials (***n_trials***) if you have some time *to waste*.

In [None]:
# TPE is Optuna's default sampler; it is created explicitly here only to fix its
# seed, so the search does not draw different hyperparameters on every run.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=2022),
)
study.optimize(objective, n_trials=3)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

In [None]:
# Run a garbage-collection pass between the hyperparameter search and the final training.
gc.collect()

In [None]:
# training with lgbm
# Start from the best hyperparameters found by the study and add the fixed settings.
params = {
    **study.best_params,
    'random_state': 2022,
    'n_estimators': 1000,
    'metric': 'rmse',
}

model = LGBMRegressor(**params)

train_x, test_x, train_y, test_y = tr_x, val_x, tr_y, val_y

# Early stopping and logging are passed as callbacks: the fit() keyword arguments
# `early_stopping_rounds` and `verbose` were removed in LightGBM 4.0, while the
# callbacks work on both older and newer releases.
model.fit(
    train_x,
    train_y,
    eval_set=[(test_x, test_y)],
    callbacks=[
        lgb.early_stopping(stopping_rounds=100),
        lgb.log_evaluation(period=0),  # no per-iteration metric output
    ],
)

In [None]:
# inference
import ubiquant

# Set up the competition environment and get its test iterator, which yields
# (test batch, submission template) pairs.
env = ubiquant.make_env()
iter_test = env.iter_test()

for test_df, sample_prediction_df in iter_test:
    batch_x = test_df[features].to_numpy()
    preds = model.predict(batch_x)
    # Write the predictions into the template and register them with the environment.
    sample_prediction_df['target'] = preds
    env.predict(sample_prediction_df)