# Use the XGBoost models to score each of the 200 candidates #

In [None]:
# Make Google Drive reachable, change into the project's data directory and
# put the project folder on the import path.
# `local` selects which mounting branch runs; it is hard-coded to False here.
local = False
if local:
  # Mount Drive through the google.colab helper.
  from google.colab import drive
  drive.mount('/content/drive')
  %cd /content/drive/MyDrive/'Kaggle Otto Reccommender'/data
  path_to_module = '/content/drive/MyDrive/Kaggle Otto Reccommender/'
else:
  # Mount Drive with google-drive-ocamlfuse at /my_mnt_dir.
  !mkdir /my_mnt_dir
  !google-drive-ocamlfuse /my_mnt_dir
  %cd /my_mnt_dir/'Kaggle Otto Reccommender'/data
  path_to_module = '/my_mnt_dir/Kaggle Otto Reccommender/'

import sys    
# Let Python import modules stored in the project folder
# (presumably where otto_utils lives -- confirm).
sys.path.append(path_to_module)

In [None]:
# Install the third-party packages for this notebook: the newest xgboost,
# plus fastparquet and polars.
!pip install xgboost --upgrade
!pip install fastparquet
!pip install polars

In [None]:
import glob
import numpy as np
import pandas as pd
import gc
from tqdm import tqdm
from copy import deepcopy
import polars as pl

import xgboost as xgb
from otto_utils import create_sub

In [None]:
# Load the candidate/feature table with polars, then hand it to pandas,
# which the rest of the notebook works with.
path_to_training_data = './test_training_data'
training_data = pl.read_parquet(f'{path_to_training_data}/training_data.parquet')
training_data = training_data.to_pandas()

# Cast every column whose dtype matches one of the listed names to float32.
for column in training_data.columns:
  if training_data[column].dtype in ('Int32', 'float64', 'Int16'):
    training_data[column] = training_data[column].astype('float32')
# Turn +/-inf into NaN. np.nan is used because the np.NaN alias was removed
# in NumPy 2.0 and raises AttributeError there.
training_data.replace([np.inf, -np.inf], np.nan, inplace=True)

## XGBoost inference functions ##

In [None]:
def inference(dtrain_list, recall_type, folds, features):
  """Average the predictions of the per-fold models for one recall type.

  Args:
    dtrain_list: chunks of the candidate table, passed through to
      collect_preds unchanged.
    recall_type: model family name used to locate the saved models
      (the notebook calls this with 'clicks', 'carts' and 'orders').
    folds: number of fold models to average; fold indices 0..folds-1 are used.
    features: feature column names, passed through to collect_preds.

  Returns:
    The element-wise mean of the per-fold prediction arrays.

  Raises:
    ValueError: if folds is smaller than 1 (there would be nothing to average).
  """
  if folds < 1:
    raise ValueError(f'folds must be a positive integer, got {folds!r}')

  preds = None
  for fold in range(folds):
    fold_preds = collect_preds(dtrain_list, recall_type, fold, features)
    if preds is None:
      preds = fold_preds
    else:
      # Accumulate in place so only one running-sum array is kept alive.
      preds += fold_preds

  return preds / folds

def collect_preds(dtrain_list, recall_type, fold, features):
  """Score every chunk with one saved fold model and join the results.

  Args:
    dtrain_list: iterable of pandas DataFrames holding 'session', 'aid' and
      the feature columns.
    recall_type: model family name; together with fold it selects the file
      '../models/xgb_models/{recall_type}_{fold}.xgb'.
    fold: index of the fold model to load.
    features: names of the columns handed to the model.

  Returns:
    One array with the predictions of all chunks, in chunk order
    (an empty float32 array when dtrain_list has no chunks).
  """
  # The model does not depend on the chunk, so load it once, not per chunk.
  model = xgb.Booster()
  model.load_model(f'../models/xgb_models/{recall_type}_{fold}.xgb')

  chunk_preds = []
  for dtrain_item in tqdm(dtrain_list):
    # Number of rows per session, in ascending session order.
    # NOTE(review): this only lines up with the DMatrix rows if each chunk is
    # ordered by session -- confirm against how the table is built.
    groups = dtrain_item.groupby('session')['aid'].count().tolist()
    dtrain = xgb.DMatrix(dtrain_item[features])
    dtrain.set_group(groups)
    chunk_preds.append(model.predict(dtrain))

  if not chunk_preds:
    return np.empty(0, dtype=np.float32)
  # A single concatenate avoids recopying the growing result on every chunk.
  return np.concatenate(chunk_preds)

def get_top_20_df(preds, training_data, return_all=True, top_k=20):
  """Rank the candidates of each session by predicted score.

  Args:
    preds: one score per row of training_data, in the same row order.
    training_data: DataFrame with at least 'session' and 'aid' columns;
      it is not modified.
    return_all: when True, return every ranked row; when False, keep only
      rows whose rank is at most top_k.
    top_k: rank cut-off applied when return_all is False (default 20).

  Returns:
    DataFrame with columns 'session', 'aid', 'preds' and 'n', sorted by
    session ascending and score descending. 'n' is the 1-based rank of the
    row within its session.
  """
  predictions = training_data[['session', 'aid']].copy()
  predictions['preds'] = preds
  predictions.sort_values(by=['session', 'preds'], ascending=[True, False], inplace=True)
  # After sorting, the running count within a session is the rank.
  predictions['n'] = predictions.groupby('session').cumcount() + 1
  if return_all:
    return predictions
  return predictions.loc[predictions['n'] <= top_k]

In [None]:
# Every column except the identifiers and 'percent_of_test_weeks_interacted'
# is used as a model feature.
features = [feature for feature in training_data.columns if feature not in ['session', 'aid', 'percent_of_test_weeks_interacted']]
dtrain_list = []
sessions = training_data['session'].unique()
# Split the sessions into 10 groups so the table can be scored chunk by chunk.
session_lists = [np_array.tolist() for np_array in np.array_split(np.array(sessions), 10 ) ]

for session_list in tqdm(session_lists):
  # One membership mask serves both the chunk and the remainder. This avoids
  # formatting a huge Python list into a query string and testing twice.
  in_chunk = training_data['session'].isin(session_list)
  dtrain_list.append(training_data.loc[in_chunk].copy())
  # Shrink the remaining table so later chunks scan fewer rows.
  training_data = training_data.loc[~in_chunk]

# Predictions come back in chunk order, so take the (session, aid) pairs from
# the chunks as well. This keeps scores and rows aligned even if the sessions
# were not stored contiguously in the original table.
aids_and_sessions = pd.concat([chunk[['session', 'aid']] for chunk in dtrain_list])

folds=5
del training_data

## Clicks

In [None]:
# Score all candidates with the fold-averaged 'clicks' models and rank them
# within each session.
click_preds = inference(dtrain_list, 'clicks', folds=folds, features=features)
clicks_df = get_top_20_df(click_preds, aids_and_sessions, return_all=True)
# Store the rank compactly (int16 holds ranks up to 32767).
clicks_df['n'] = clicks_df['n'].astype('int16')

# Save the complete ranking, with a fresh default index, for later use.
subset = clicks_df[['session', 'aid', 'n']].reset_index(drop=True)
subset.to_feather(f'{path_to_training_data}/xgb_clicks_df.feather')

# Keep only the 20 best-ranked candidates per session in memory.
clicks_df = clicks_df[clicks_df['n'] <= 20]

## Carts ##

In [None]:
# Score all candidates with the fold-averaged 'carts' models and rank them
# within each session.
cart_preds = inference(dtrain_list, 'carts', folds=folds, features=features)
carts_df = get_top_20_df(cart_preds, aids_and_sessions, return_all=True)
# Store the rank compactly (int16 holds ranks up to 32767).
carts_df['n'] = carts_df['n'].astype('int16')

# Save the complete ranking, with a fresh default index, for later use.
subset = carts_df[['session', 'aid', 'n']].reset_index(drop=True)
subset.to_feather(f'{path_to_training_data}/xgb_carts_df.feather')

# Keep only the 20 best-ranked candidates per session in memory.
carts_df = carts_df[carts_df['n'] <= 20]

## Orders

In [None]:
# Score all candidates with the fold-averaged 'orders' models and rank them
# within each session.
order_preds = inference(dtrain_list, 'orders', folds=folds, features=features)
orders_df = get_top_20_df(order_preds, aids_and_sessions, return_all=True)
# Store the rank compactly (int16 holds ranks up to 32767).
orders_df['n'] = orders_df['n'].astype('int16')

# Save the complete ranking, with a fresh default index, for later use.
subset = orders_df[['session', 'aid', 'n']].reset_index(drop=True)
subset.to_feather(f'{path_to_training_data}/xgb_orders_df.feather')

# Keep only the 20 best-ranked candidates per session in memory.
orders_df = orders_df[orders_df['n'] <= 20]