# Train LightGBM ranking models for clicks, carts and orders #

In [None]:
# Choose how Google Drive is mounted, then change into the project's data directory.
# NOTE(review): despite its name, `local = True` selects the google.colab mount —
# confirm the intended meaning of this flag.
local = False
if local:
  # Mount Drive through the Colab helper under /content/drive.
  from google.colab import drive
  drive.mount('/content/drive')
  %cd /content/drive/MyDrive/'Kaggle Otto Reccommender'/data
  path_to_module = '/content/drive/MyDrive/Kaggle Otto Reccommender/'
else:
  # Mount Drive through google-drive-ocamlfuse under /my_mnt_dir.
  !mkdir /my_mnt_dir
  !google-drive-ocamlfuse /my_mnt_dir
  %cd /my_mnt_dir/'Kaggle Otto Reccommender'/data
  path_to_module = '/my_mnt_dir/Kaggle Otto Reccommender/'

# Add the project root to the import path (presumably where otto_utils lives — confirm).
import sys    
sys.path.append(path_to_module)

In [None]:
# Install the extra packages this notebook needs: fastparquet, polars and LightGBM.
# NOTE(review): `--install-option` has been removed from recent pip releases, so the
# LightGBM line may fail on an up-to-date environment — confirm against the pip in use.
!pip install fastparquet
!pip install polars
!pip install lightgbm --install-option=--gpu

In [None]:
import glob
import numpy as np
import pandas as pd
import gc
import polars as pl
import lightgbm as lgbm
from otto_utils import convert_columns, calculate_recall, make_directory
from sklearn.model_selection import GroupKFold

In [None]:
# Labels passed to calculate_recall when scoring each fold (read as a global by evaluate_model).
labels = pd.read_parquet('./validation/test_labels.parquet')

In [None]:
path_to_training_data = './train_training_data'

# Read with polars, drop the unused feature, then hand over to pandas.
training_data = (
  pl.read_parquet(f'{path_to_training_data}/training_data_2.parquet')
  .drop('train_aid_penetration')
  .to_pandas()
)

# Cast every column whose dtype matches one of the listed names to float32.
# `dtypes` is a snapshot taken before the loop; each column is only rewritten
# after its own dtype has been inspected.
for column, column_dtype in training_data.dtypes.items():
  if column_dtype in ('Int32', 'float64', 'Int16'):
    training_data[column] = training_data[column].astype('float32')

# Turn +/-inf into NaN in place.
training_data.replace([np.inf, -np.inf], np.nan, inplace=True)

## Functions to train lightgbm models ##

In [None]:
def evaluate_model(model, dvalid, valid_idx, response, recall_type):
  """Score a fitted model on one validation fold.

  Reads the module-level `training_data` and `labels`.

  Args:
    model: fitted estimator exposing `predict`.
    dvalid: validation features passed to `model.predict`.
    valid_idx: index labels of the validation rows in `training_data`;
      must be in the same row order as `dvalid`.
    response: name of the response column carried along with the predictions.
    recall_type: forwarded unchanged to `calculate_recall`.

  Returns:
    Whatever `calculate_recall` returns for the 20 highest-scored rows of
    each session.
  """
  scores = model.predict(dvalid)

  kept_columns = ['session', 'aid', response]
  ranked = training_data.loc[valid_idx, kept_columns]
  ranked['preds'] = scores

  # Within every session, best prediction first.
  ranked.sort_values(by=['session', 'preds'], ascending=[True, False], inplace=True)

  # 1-based rank of each candidate inside its session.
  ranked['n'] = ranked.groupby('session').cumcount().add(1)
  top_candidates = ranked.loc[ranked['n'].le(20)]

  return calculate_recall(labels, top_candidates, recall_type)

In [None]:
def train_and_score_model(params, folds, response, recall_type, down_sample_prop=0.10, save=False, eval_at=1):
  """Train one LGBMRanker per GroupKFold split and return the mean fold score.

  Reads the module-level `training_data`; splits are grouped by `session` so a
  session never appears in both the training and the validation part of a fold.

  Args:
    params: keyword arguments forwarded to `lgbm.LGBMRanker`.
    folds: number of GroupKFold splits.
    response: name of the binary (0/1) target column in `training_data`.
    recall_type: forwarded to `evaluate_model`; also used in saved file names.
    down_sample_prop: fraction of each fold's negative rows (response == 0)
      kept for training. Values above 1 keep all negatives.
    save: when true, each fold's booster is written to
      `../models/lgbm_models/{recall_type}_{fold}.lgbm`.
    eval_at: forwarded to `LGBMRanker.fit` as `eval_at`.

  Returns:
    The mean over folds of the score returned by `evaluate_model`.
  """
  splitter = GroupKFold(n_splits=folds)
  non_features = ('session', 'aid', 'click_response', 'cart_response', 'order_response')
  features = [column for column in training_data.columns if column not in non_features]
  avg_recall = 0

  fold_splits = splitter.split(training_data, training_data[response], groups=training_data['session'])
  for fold, (train_idx, valid_idx) in enumerate(fold_splits):
    train_df = training_data.loc[train_idx].copy()
    valid_df = training_data.loc[valid_idx].copy()

    # The group sizes below are produced in ascending session order, so the
    # validation rows must be in that order too. Only sort when needed, and keep
    # valid_idx aligned with the reordered rows so evaluate_model pairs each
    # prediction with the right row.
    if not valid_df['session'].is_monotonic_increasing:
      valid_df = valid_df.sort_values(by='session', kind='mergesort')
      valid_idx = valid_df.index.to_numpy()

    X_valid = valid_df[features]
    y_valid = valid_df[response]
    # Rows per session; size() counts every row, so the sizes always sum to len(X_valid).
    valid_groups = valid_df.groupby('session').size().tolist()

    pos_samples = train_df.loc[train_df[response] == 1]
    negatives = train_df.loc[train_df[response] == 0]
    # Sampling is without replacement, so never ask for more rows than exist.
    n_negatives = min(int(negatives.shape[0] * down_sample_prop), negatives.shape[0])
    neg_samples = negatives.sample(n_negatives, random_state=42, replace=False)
    del train_df, valid_df, negatives

    # Re-sort after concatenation so each session's rows are contiguous again.
    train_df_downsampled = (
      pd.concat([pos_samples, neg_samples], ignore_index=True)
      .sort_values(by='session')
      .reset_index(drop=True)
    )
    del pos_samples, neg_samples
    train_groups = train_df_downsampled.groupby('session').size().tolist()
    X_train = train_df_downsampled[features]
    y_train = train_df_downsampled[response]
    del train_df_downsampled

    print('beginning model training')
    ranker = lgbm.LGBMRanker(**params)

    # NOTE(review): `verbose` and `early_stopping_rounds` were removed from fit()
    # in LightGBM 4.0 in favour of callbacks — confirm against the installed version.
    model = ranker.fit(
      X=X_train,
      y=y_train,
      group=train_groups,
      eval_set=[(X_valid, y_valid)],
      eval_group=[valid_groups],
      verbose=10,
      early_stopping_rounds=200,
      eval_at=eval_at
    )

    fold_recall = evaluate_model(model, X_valid, valid_idx, response, recall_type)
    print(fold_recall)
    avg_recall += fold_recall / folds
    del X_train, y_train, X_valid, y_valid

    if save:
      make_directory('../models/lgbm_models')
      model.booster_.save_model(f'../models/lgbm_models/{recall_type}_{fold}.lgbm')

  return avg_recall

## Train clicks boosters ##

In [None]:
# LGBMRanker hyper-parameters for the clicks model.
params = dict(
  boosting_type='gbdt',
  num_leaves=28,
  min_data_in_leaf=2231,
  max_depth=13,
  learning_rate=0.05,
  feature_fraction=0.8,
  bagging_fraction=0.8,
  bagging_freq=8,
  n_estimators=5000,
)

response = 'click_response'
recall_type = 'clicks'
folds = 6
eval_at = 5
save = True

# Number of positive rows divided by (downsample_factor * total rows); used by
# train_and_score_model as the fraction of negatives to keep.
downsample_factor = 0.1
down_sample_prop = training_data[response].sum() / (training_data.shape[0] * downsample_factor)

avg_score = train_and_score_model(
  params,
  folds=folds,
  response=response,
  recall_type=recall_type,
  save=save,
  down_sample_prop=down_sample_prop,
  eval_at=eval_at,
)

beginning model training
Training until validation scores don't improve for 200 rounds.
[10]	valid_0's ndcg@5: 0.679872
[20]	valid_0's ndcg@5: 0.680311
[30]	valid_0's ndcg@5: 0.680812
[40]	valid_0's ndcg@5: 0.68098
[50]	valid_0's ndcg@5: 0.681339
[60]	valid_0's ndcg@5: 0.681502
[70]	valid_0's ndcg@5: 0.68174
[80]	valid_0's ndcg@5: 0.6836
[90]	valid_0's ndcg@5: 0.683753
[100]	valid_0's ndcg@5: 0.683936
[110]	valid_0's ndcg@5: 0.68413
[120]	valid_0's ndcg@5: 0.68421
[130]	valid_0's ndcg@5: 0.684534
[140]	valid_0's ndcg@5: 0.684647
[150]	valid_0's ndcg@5: 0.684802
[160]	valid_0's ndcg@5: 0.685003
[170]	valid_0's ndcg@5: 0.68514
[180]	valid_0's ndcg@5: 0.685308
[190]	valid_0's ndcg@5: 0.685363
[200]	valid_0's ndcg@5: 0.685478
[210]	valid_0's ndcg@5: 0.685555
[220]	valid_0's ndcg@5: 0.685627
[230]	valid_0's ndcg@5: 0.68571
[240]	valid_0's ndcg@5: 0.6857
[250]	valid_0's ndcg@5: 0.685781
[260]	valid_0's ndcg@5: 0.685865
[270]	valid_0's ndcg@5: 0.685868
[280]	valid_0's ndcg@5: 0.68598
[290]	va

In [None]:
# Mean fold score returned by train_and_score_model for the clicks run.
avg_score

0.539648767848666

## Train Carts Booster ##

In [None]:
# LGBMRanker hyper-parameters for the carts model.
params = dict(
  boosting_type='gbdt',
  num_leaves=43,
  min_data_in_leaf=861,
  max_depth=9,
  learning_rate=0.05,
  feature_fraction=0.8,
  bagging_fraction=0.8,
  bagging_freq=8,
  n_estimators=5000,
)

response = 'cart_response'
recall_type = 'carts'
folds = 5
eval_at = 20
save = True

# Number of positive rows divided by (downsample_factor * total rows); used by
# train_and_score_model as the fraction of negatives to keep.
downsample_factor = 0.1
down_sample_prop = training_data[response].sum() / (training_data.shape[0] * downsample_factor)

avg_score = train_and_score_model(
  params,
  folds=folds,
  response=response,
  recall_type=recall_type,
  save=save,
  down_sample_prop=down_sample_prop,
  eval_at=eval_at,
)

beginning model training
Training until validation scores don't improve for 200 rounds.
[10]	valid_0's ndcg@20: 0.956519
[20]	valid_0's ndcg@20: 0.956594
[30]	valid_0's ndcg@20: 0.956698
[40]	valid_0's ndcg@20: 0.956772
[50]	valid_0's ndcg@20: 0.956826
[60]	valid_0's ndcg@20: 0.956879
[70]	valid_0's ndcg@20: 0.956946
[80]	valid_0's ndcg@20: 0.956975
[90]	valid_0's ndcg@20: 0.957003
[100]	valid_0's ndcg@20: 0.957023
[110]	valid_0's ndcg@20: 0.957066
[120]	valid_0's ndcg@20: 0.957097
[130]	valid_0's ndcg@20: 0.957105
[140]	valid_0's ndcg@20: 0.957125
[150]	valid_0's ndcg@20: 0.957139
[160]	valid_0's ndcg@20: 0.957162
[170]	valid_0's ndcg@20: 0.957185
[180]	valid_0's ndcg@20: 0.957205
[190]	valid_0's ndcg@20: 0.957224
[200]	valid_0's ndcg@20: 0.957231
[210]	valid_0's ndcg@20: 0.957231
[220]	valid_0's ndcg@20: 0.957242
[230]	valid_0's ndcg@20: 0.957243
[240]	valid_0's ndcg@20: 0.957245
[250]	valid_0's ndcg@20: 0.957275
[260]	valid_0's ndcg@20: 0.957277
[270]	valid_0's ndcg@20: 0.957286
[28

In [None]:
# Mean fold score returned by train_and_score_model for the carts run.
avg_score

0.4268612757220287

## Train orders booster ##

In [None]:
# LGBMRanker hyper-parameters for the orders model.
params = {
    'boosting_type': 'gbdt',
    'num_leaves': 95,
    'min_data_in_leaf': 345,
    'max_depth': 5,
    'learning_rate': 0.05,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 8,
    'n_estimators' : 5000
}

eval_at = 20
downsample_factor = 0.1
folds = 6
# `response` must be assigned before down_sample_prop is derived from it;
# otherwise the ratio is computed from whichever response column an earlier
# cell left in the variable.
response = 'order_response'
recall_type = 'orders'
# Number of positive rows divided by (downsample_factor * total rows); used by
# train_and_score_model as the fraction of negatives to keep.
down_sample_prop = training_data[response].sum() / (training_data.shape[0] * downsample_factor)
save=True

avg_score = train_and_score_model(params, folds=folds, response=response, recall_type=recall_type, save=save, down_sample_prop=down_sample_prop, eval_at=eval_at)

beginning model training
Training until validation scores don't improve for 200 rounds.
[10]	valid_0's ndcg@20: 0.985236
[20]	valid_0's ndcg@20: 0.985464
[30]	valid_0's ndcg@20: 0.985581
[40]	valid_0's ndcg@20: 0.98564
[50]	valid_0's ndcg@20: 0.985804
[60]	valid_0's ndcg@20: 0.985849
[70]	valid_0's ndcg@20: 0.985923
[80]	valid_0's ndcg@20: 0.98599
[90]	valid_0's ndcg@20: 0.986024
[100]	valid_0's ndcg@20: 0.986066
[110]	valid_0's ndcg@20: 0.986107
[120]	valid_0's ndcg@20: 0.986133
[130]	valid_0's ndcg@20: 0.986166
[140]	valid_0's ndcg@20: 0.986184
[150]	valid_0's ndcg@20: 0.986195
[160]	valid_0's ndcg@20: 0.986207
[170]	valid_0's ndcg@20: 0.986234
[180]	valid_0's ndcg@20: 0.986238
[190]	valid_0's ndcg@20: 0.986251
[200]	valid_0's ndcg@20: 0.986244
[210]	valid_0's ndcg@20: 0.986233
[220]	valid_0's ndcg@20: 0.98625
[230]	valid_0's ndcg@20: 0.986257
[240]	valid_0's ndcg@20: 0.986276
[250]	valid_0's ndcg@20: 0.986288
[260]	valid_0's ndcg@20: 0.986294
[270]	valid_0's ndcg@20: 0.986283
[280]	

In [None]:
# Mean fold score returned by train_and_score_model for the orders run.
avg_score

0.6601109761094428