# Train xgboost models to rank clicks, carts and orders #

In [None]:
# Make Google Drive available, change into the project's data directory and
# expose the project folder on sys.path.
# NOTE(review): the flag name reads inverted -- `local = True` takes the
# google.colab branch, `local = False` takes the ocamlfuse branch. Confirm intent.
local = False
if local:
  # Mount Drive with the Colab helper under /content/drive.
  from google.colab import drive
  drive.mount('/content/drive')
  %cd /content/drive/MyDrive/'Kaggle Otto Reccommender'/data
  path_to_module = '/content/drive/MyDrive/Kaggle Otto Reccommender/'
else:
  # Mount Drive with google-drive-ocamlfuse under /my_mnt_dir.
  !mkdir /my_mnt_dir
  !google-drive-ocamlfuse /my_mnt_dir
  %cd /my_mnt_dir/'Kaggle Otto Reccommender'/data
  path_to_module = '/my_mnt_dir/Kaggle Otto Reccommender/'

import sys    
# Add the project folder to the import path (presumably where otto_utils lives -- verify).
sys.path.append(path_to_module)

In [None]:
!pip install optuna
!pip install fastparquet
!pip install polars
!pip install xgboost --upgrade

In [None]:
import glob
import numpy as np
import pandas as pd
import gc
import polars as pl
import xgboost as xgb
from otto_utils import convert_columns, calculate_recall, make_directory
from sklearn.model_selection import GroupKFold
from optuna import trial
import optuna

In [None]:
# Run configuration.
gpu = False
# 'gpu_hist' is selected when `gpu` is True, otherwise the 'hist' tree method is used.
tree_method = 'gpu_hist' if gpu else 'hist'

# `validation` is read by the data-loading cell but was never assigned in the
# notebook, so a fresh "Restart & Run All" stopped with a NameError.
# True selects './train_training_data'.
# NOTE(review): True is inferred from the fact that the models are scored
# against './validation/test_labels.parquet' -- confirm this is the intended mode.
validation = True

# Ground-truth labels handed to calculate_recall when scoring each fold.
labels = pd.read_parquet('./validation/test_labels.parquet')

In [None]:
# Choose the feature directory from the `validation` flag, which must already
# be in scope when this cell runs.
if validation:
  path_to_training_data = './train_training_data'
else:
  path_to_training_data = './test_training_data'

# Read the candidate/feature table with polars, drop the two columns the
# original author flagged as wrong, and hand the result over to pandas.
training_data = (
    pl.read_parquet(f'{path_to_training_data}/training_data_2.parquet')
    .drop(['train_aid_penetration', 'percent_of_test_weeks_interacted'])
    .to_pandas()
)

# Cast every column whose dtype matches one of these names to float32.
wide_dtypes = ('Int32', 'float64', 'Int16')
columns_to_downcast = [name for name in training_data.columns if training_data[name].dtype in wide_dtypes]
for name in columns_to_downcast:
  training_data[name] = training_data[name].astype('float32')

# Turn +/- infinity into NaN (done in place on the frame).
training_data.replace([np.inf, -np.inf], np.nan, inplace=True)

## XGBoost training functions ##

In [None]:
def evaluate_model(model, dvalid, valid_idx, response, recall_type, top_k=20):
  """Score a trained booster on one validation fold.

  Reads the notebook-level globals ``training_data`` and ``labels``.

  Args:
    model: trained booster; must provide ``predict``.
    dvalid: DMatrix built from ``training_data.loc[valid_idx]`` in that row order.
    valid_idx: index labels of the validation rows in ``training_data``.
    response: name of the response column carried along with the predictions.
    recall_type: passed through unchanged to ``calculate_recall``.
    top_k: number of highest-scored candidates kept per session (default 20).

  Returns:
    Whatever ``calculate_recall(labels, top_k_predictions, recall_type)`` returns.
  """
  # Training continues for `early_stopping_rounds` rounds past the best one, so
  # restrict prediction to the best iteration when the booster recorded it.
  # Boosters trained without early stopping fall back to all trees.
  best_iteration = getattr(model, 'best_iteration', None)
  if best_iteration is None:
    preds = model.predict(dvalid)
  else:
    preds = model.predict(dvalid, iteration_range=(0, best_iteration + 1))

  # Predictions come back in the row order of dvalid, i.e. of valid_idx.
  predictions = training_data.loc[valid_idx, ['session', 'aid', response]]
  predictions['preds'] = preds
  predictions = predictions.sort_values(by=['session', 'preds'], ascending=[True, False])

  # Rank candidates within each session (1 = highest score) and keep the top_k.
  predictions['n'] = predictions.groupby('session').cumcount() + 1
  submitted_predictions = predictions.loc[predictions['n'] <= top_k]

  return calculate_recall(labels, submitted_predictions, recall_type)

def _session_group_sizes(frame):
  """Return the count of non-null ``aid`` values per session, in ascending session order."""
  return frame.groupby('session', as_index=False).agg({'aid' : 'count'}).aid.values.tolist()


def train_and_score_model(params, folds, response, recall_type, down_sample_prop=0.10,save=False):
  """Cross-validate an XGBoost ranker on the global ``training_data``.

  For every GroupKFold split (grouped by ``session``) the negatives of the
  training part are down-sampled, a booster is trained with early stopping on
  the validation part, and the fold is scored with ``evaluate_model``.

  Args:
    params: parameter dict forwarded to ``xgb.train``.
    folds: number of GroupKFold splits.
    response: name of the 0/1 response column to learn.
    recall_type: forwarded to ``evaluate_model`` and used in the saved file name.
    down_sample_prop: fraction of the training negatives to keep. Values that
      would request more negatives than exist keep all of them.
    save: when True, each fold's booster is written to
      ``../models/xgb_models/{recall_type}_{fold}.xgb``.

  Returns:
    The mean of the per-fold scores.
  """
  group_kfold = GroupKFold(n_splits=folds)
  non_feature_columns = ['session', 'aid', 'click_response', 'cart_response', 'order_response']
  features = [feature for feature in training_data.columns if feature not in non_feature_columns]
  avg_recall = 0

  # GroupKFold documents `groups` as a 1-D array-like, so pass the Series
  # rather than a one-column DataFrame.
  splits = group_kfold.split(training_data, training_data[response], groups=training_data['session'])

  for fold, (train_idx, valid_idx) in enumerate(splits):

    # Build the training set: all positives plus a sample of the negatives.
    train_df = training_data.loc[train_idx]
    is_negative = train_df[response] == 0
    pos_samples = train_df.loc[train_df[response] == 1]
    n_available = int(is_negative.sum())
    # Sampling is without replacement, so never ask for more rows than exist.
    n_negatives = min(int(n_available * down_sample_prop), n_available)
    neg_samples = train_df.loc[is_negative].sample(n_negatives, random_state=2023, replace=False)
    del train_df, is_negative
    # Rows must be contiguous per session for set_group; drop the old index
    # instead of keeping it as an extra column.
    train_df_downsampled = (
        pd.concat([pos_samples, neg_samples], ignore_index=True)
        .sort_values(by='session')
        .reset_index(drop=True)
    )
    print('built the downsampled frame')
    del neg_samples, pos_samples
    train_groups = _session_group_sizes(train_df_downsampled)
    X_train = train_df_downsampled[features]
    y_train = train_df_downsampled[response]
    del train_df_downsampled
    dtrain = xgb.DMatrix(X_train, y_train)
    dtrain.set_group(train_groups)
    del X_train, y_train
    print('done with x values')

    # Build the validation set. Row order is left untouched because
    # evaluate_model aligns predictions with training_data.loc[valid_idx].
    # NOTE(review): the group sizes are in ascending session order, so this
    # assumes training_data is already ordered by session -- confirm.
    valid_df = training_data.loc[valid_idx]
    X_valid = valid_df[features]
    y_valid = valid_df[response]
    valid_groups = _session_group_sizes(valid_df)
    dvalid = xgb.DMatrix(X_valid, y_valid)
    dvalid.set_group(valid_groups)
    del X_valid, y_valid, valid_df

    print('beginning model training')
    model = xgb.train(
        params,
        dtrain=dtrain,
        evals=[(dtrain,'train'), (dvalid, 'valid')],
        num_boost_round=10000,
        early_stopping_rounds=200,
        verbose_eval = 20
    )

    fold_recall = evaluate_model(model, dvalid, valid_idx, response, recall_type)
    print(fold_recall)
    avg_recall += fold_recall / folds
    del dtrain, dvalid

    if save:
      make_directory('../models/xgb_models')
      model.save_model(f'../models/xgb_models/{recall_type}_{fold}.xgb')

  return avg_recall

## Train the clicks model ##

In [None]:
# Fraction of negatives to keep: number of positive click rows divided by 10%
# of all rows.
down_sample_prop = training_data['click_response'].sum() / (training_data.shape[0] * 0.1)

# tree_method comes from the config cell ('gpu_hist' when gpu is True, else 'hist').
params = dict(
    tree_method=tree_method,
    objective='rank:ndcg',
    subsample=0.8,
    colsample_bytree=0.6,
    max_depth=12,
    learning_rate=0.08,
)

# 5-fold cross-validation; each fold's booster is saved.
avg_score = train_and_score_model(
    params,
    5,
    'click_response',
    'clicks',
    down_sample_prop=down_sample_prop,
    save=True,
)

built the downsampled frame
done with x values
beginning model training
[0]	train-map:0.91495	valid-map:0.67065
[20]	train-map:0.92148	valid-map:0.67609
[40]	train-map:0.92314	valid-map:0.67662
[60]	train-map:0.92461	valid-map:0.67709
[80]	train-map:0.92578	valid-map:0.67742
[100]	train-map:0.92692	valid-map:0.67783
[120]	train-map:0.92797	valid-map:0.67786
[140]	train-map:0.92910	valid-map:0.67802
[160]	train-map:0.93021	valid-map:0.67805
[180]	train-map:0.93125	valid-map:0.67816
[200]	train-map:0.93227	valid-map:0.67807
[220]	train-map:0.93329	valid-map:0.67815
[240]	train-map:0.93421	valid-map:0.67813
[260]	train-map:0.93500	valid-map:0.67813
[280]	train-map:0.93574	valid-map:0.67819
[300]	train-map:0.93635	valid-map:0.67816
[320]	train-map:0.93720	valid-map:0.67811
[340]	train-map:0.93797	valid-map:0.67816
[360]	train-map:0.93880	valid-map:0.67814
[380]	train-map:0.93956	valid-map:0.67806
[400]	train-map:0.94030	valid-map:0.67801
[420]	train-map:0.94102	valid-map:0.67799
[440]	trai

In [None]:
# Mean of the per-fold scores returned by train_and_score_model for the clicks model.
avg_score

0.5427067245595409

## Train carts booster ##

In [None]:
# Fraction of negatives to keep: number of positive cart rows divided by 10%
# of all rows.
down_sample_prop = training_data['cart_response'].sum() / (training_data.shape[0] * 0.1)

# tree_method comes from the config cell ('gpu_hist' when gpu is True, else 'hist').
params = dict(
    tree_method=tree_method,
    objective='rank:ndcg',
    subsample=0.8,
    colsample_bytree=0.6,
    max_depth=9,
    learning_rate=0.08,
)

# 5-fold cross-validation; each fold's booster is saved.
avg_score = train_and_score_model(
    params,
    5,
    'cart_response',
    'carts',
    down_sample_prop=down_sample_prop,
    save=True,
)

built the downsampled frame
done with x values
beginning model training
[0]	train-map:0.99389	valid-map:0.94822
[20]	train-map:0.99456	valid-map:0.94887
[40]	train-map:0.99479	valid-map:0.94904
[60]	train-map:0.99497	valid-map:0.94914
[80]	train-map:0.99514	valid-map:0.94921
[100]	train-map:0.99529	valid-map:0.94924
[120]	train-map:0.99544	valid-map:0.94927
[140]	train-map:0.99560	valid-map:0.94926
[160]	train-map:0.99575	valid-map:0.94928
[180]	train-map:0.99590	valid-map:0.94929
[200]	train-map:0.99603	valid-map:0.94928
[220]	train-map:0.99618	valid-map:0.94926
[240]	train-map:0.99632	valid-map:0.94923
[260]	train-map:0.99643	valid-map:0.94924
[280]	train-map:0.99654	valid-map:0.94921
[300]	train-map:0.99665	valid-map:0.94918
[320]	train-map:0.99678	valid-map:0.94916
[340]	train-map:0.99689	valid-map:0.94913
[360]	train-map:0.99703	valid-map:0.94916
[380]	train-map:0.99712	valid-map:0.94917
[391]	train-map:0.99719	valid-map:0.94916
0.42812323469343416
built the downsampled frame
done

In [None]:
# Mean of the per-fold scores returned by train_and_score_model for the carts model.
print(avg_score)

0.4268840074866794


## Train orders booster ##

In [None]:
# Fraction of negatives to keep: number of positive order rows divided by 10%
# of all rows.
down_sample_prop = training_data['order_response'].sum() / (training_data.shape[0] * 0.1)

# tree_method comes from the config cell ('gpu_hist' when gpu is True, else 'hist').
params = dict(
    tree_method=tree_method,
    objective='rank:ndcg',
    subsample=0.8,
    colsample_bytree=0.6,
    max_depth=9,
    learning_rate=0.08,
)

# 5-fold cross-validation; each fold's booster is saved.
avg_score = train_and_score_model(
    params,
    5,
    'order_response',
    'orders',
    down_sample_prop=down_sample_prop,
    save=True,
)

built the downsampled frame
done with x values
beginning model training
[0]	train-map:0.99858	valid-map:0.97863
[20]	train-map:0.99890	valid-map:0.98114
[40]	train-map:0.99902	valid-map:0.98156
[60]	train-map:0.99911	valid-map:0.98196
[80]	train-map:0.99922	valid-map:0.98212
[100]	train-map:0.99932	valid-map:0.98224
[120]	train-map:0.99939	valid-map:0.98228
[140]	train-map:0.99945	valid-map:0.98230
[160]	train-map:0.99951	valid-map:0.98230
[180]	train-map:0.99957	valid-map:0.98225
[200]	train-map:0.99961	valid-map:0.98225
[220]	train-map:0.99965	valid-map:0.98227
[240]	train-map:0.99969	valid-map:0.98226
[260]	train-map:0.99972	valid-map:0.98225
[280]	train-map:0.99974	valid-map:0.98226
[300]	train-map:0.99976	valid-map:0.98227
[320]	train-map:0.99977	valid-map:0.98227
[340]	train-map:0.99979	valid-map:0.98228
[354]	train-map:0.99979	valid-map:0.98229
0.6606786745584786
built the downsampled frame
done with x values
beginning model training
[0]	train-map:0.99857	valid-map:0.97859
[20]	

In [None]:
# Mean of the per-fold scores returned by train_and_score_model for the orders model.
avg_score

0.6600346588926345

## Estimated score (clicks weighted 0.1, carts 0.3 and orders 0.6) ##

In [None]:
# Weighted sum of the three scores (clicks 0.1, carts 0.3, orders 0.6).
# The literals are copied by hand from the avg_score outputs of the cells
# above, so they must be updated manually after any re-training.
0.1*0.5427067245595409 + 0.3*0.4268840074866794 + 0.6*0.6600346588926345

0.5777253079292023