# Versions

v1 pairwise

v2 ndcg

# validation

In [None]:
VER = 2
SVER = 1
IVER = 1
UVER = 1
FEATURES = [
        'user', 'item_item_count', 'item_user_count', 
        'item_buy_ratio', 'user_user_count', 'user_item_count',
        'user_buy_ratio']

In [None]:
!nvidia-smi

Tue Jan 24 00:47:53 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  A100-SXM4-40GB      Off  | 00000000:00:04.0 Off |                    0 |
| N/A   34C    P0    53W / 400W |      0MiB / 40536MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd, numpy as np
import pickle, glob, gc

from collections import Counter
import itertools
# multiprocessing 
import psutil
from multiprocessing import Pool
from sklearn.model_selection import GroupKFold
import psutil
N_CORES = psutil.cpu_count()     # Available CPU cores
print(f"N Cores : {N_CORES}")
from multiprocessing import Pool

N Cores : 12


In [None]:
def merge_candidate(SVER,IVER,UVER,TYPE,MODE):
    candidates = pd.read_parquet(f'/content/drive/MyDrive/Colab Notebooks/kaggle/OTTO/dataset/candidate/suggest/{TYPE}/{MODE}_{TYPE}{SVER}.pqt')
    candidates['session'] = candidates.index
    candidates = candidates.set_index('session')
    item_features = pd.read_parquet(f'/content/drive/MyDrive/Colab Notebooks/kaggle/OTTO/dataset/candidate/item/{MODE}_item{IVER}.pqt')
    candidates = candidates.merge(item_features, left_on='item', right_index=True, how='left').fillna(-1)
    user_features = pd.read_parquet(f'/content/drive/MyDrive/Colab Notebooks/kaggle/OTTO/dataset/candidate/user/{MODE}_user{UVER}.pqt')
    candidates = candidates.merge(user_features, left_on='session', right_index=True, how='left').fillna(-1)
    candidates['user'] = candidates.index
    candidates = candidates.set_index('user')
    return candidates

In [None]:
def merge_target(TYPE,candidates):
    tar = pd.read_parquet('/content/drive/MyDrive/Colab Notebooks/kaggle/OTTO/dataset/otto-validation/test_labels.parquet')
    tar = tar.loc[ tar['type']==TYPE ]
    aids = tar.ground_truth.explode().astype('int32').rename('item')
    tar = tar[['session']].astype('int32').rename({'session':'user'},axis=1)
    tar = tar.merge(aids, left_index=True, right_index=True, how='left')
    tar[TYPE] = 1
    candidates = candidates.merge(tar,on=['user','item'],how='left').fillna(0)
    return candidates

In [None]:
!pip install -q xgboost==1.6.2
import xgboost as xgb
from sklearn.model_selection import GroupKFold
from sklearn.metrics import recall_score

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m255.9/255.9 MB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
def train_xgb(candidates,TARGET):
    preds = np.zeros(len(candidates))
    skf = GroupKFold(n_splits=5)
    for fold,(train_idx, valid_idx) in enumerate(skf.split(candidates, candidates[TARGET], groups=candidates['user'] )):

        X_train = candidates.loc[train_idx, FEATURES]
        y_train = candidates.loc[train_idx, TARGET]
        X_valid = candidates.loc[valid_idx, FEATURES]
        y_valid = candidates.loc[valid_idx, TARGET]

        X_train = X_train.sort_values("user").reset_index(drop=True)
        X_valid = X_valid.sort_values("user").reset_index(drop=True)

        train_group = X_train.groupby('user').user.agg('count').values
        valid_group = X_valid.groupby('user').user.agg('count').values

        X_train = X_train.drop(["user"], axis=1)
        X_valid = X_valid.drop(["user"], axis=1)

        dtrain = xgb.DMatrix(X_train, y_train,group=train_group)
        dvalid = xgb.DMatrix(X_valid, y_valid,group=valid_group)

        xgb_parms = {
            'objective':'rank:ndcg', 
            'tree_method':'gpu_hist',
            'random_state': 42, 
            'learning_rate': 0.1,
            "colsample_bytree": 0.8, 
            'max_depth': 6,
        }
        model = xgb.train(xgb_parms, 
            dtrain=dtrain,
            evals=[(dtrain,'train'),(dvalid,'valid')],
            num_boost_round=1000,
            verbose_eval=500)
        preds[valid_idx] = model.predict(dvalid)
        model.save_model(f'XGB_fold{fold}_{TARGET}.xgb')
    predictions = candidates[['user','item']].copy()
    predictions['pred'] = preds
    predictions = predictions.sort_values(['user','pred'], ascending=[True,False]).reset_index(drop=True)
    predictions['n'] = predictions.groupby('user').item.cumcount().astype('int8')
    predictions = predictions.loc[predictions.n<20]
    sub = predictions.groupby('user').item.apply(list)
    sub = sub.to_frame().reset_index()
    sub.item = sub.item.apply(lambda x: " ".join(map(str,x)))
    sub.columns = ['session','labels']
    sub.labels = sub.labels.apply(lambda x: [int(i) for i in x.split(' ')[:20]])
    test_labels = pd.read_parquet('/content/drive/MyDrive/Colab Notebooks/kaggle/OTTO/dataset/otto-validation/test_labels.parquet')
    test_labels = test_labels.loc[test_labels['type']==TARGET]
    test_labels = test_labels.merge(sub, how='left', on=['session'])
    test_labels['hits'] = test_labels.apply(lambda df: len(set(df.ground_truth).intersection(set(df.labels))), axis=1)
    test_labels['gt_count'] = test_labels.ground_truth.str.len().clip(0,20)
    recall = test_labels['hits'].sum() / test_labels['gt_count'].sum()
    print('{} Recall = {:.5f}'.format(TARGET,recall))

## clicks

In [None]:
%%time
candidates = merge_candidate(SVER,IVER,UVER,'clicks','val')
candidates = merge_target('clicks',candidates)
train_xgb(candidates,'clicks')
del candidates
_ = gc.collect()

[0]	train-map:0.65086	valid-map:0.65085
[500]	train-map:0.59623	valid-map:0.59160
[999]	train-map:0.59947	valid-map:0.59256
[0]	train-map:0.65777	valid-map:0.65854
[500]	train-map:0.59622	valid-map:0.59172
[999]	train-map:0.59933	valid-map:0.59226
[0]	train-map:0.65931	valid-map:0.65823
[500]	train-map:0.59676	valid-map:0.59117
[999]	train-map:0.59967	valid-map:0.59196
[0]	train-map:0.66069	valid-map:0.66040
[500]	train-map:0.59618	valid-map:0.59178
[999]	train-map:0.59924	valid-map:0.59268
[0]	train-map:0.65634	valid-map:0.65594
[500]	train-map:0.59616	valid-map:0.59154
[999]	train-map:0.59936	valid-map:0.59265
clicks Recall = 0.52558
CPU times: user 12min 4s, sys: 11.3 s, total: 12min 16s
Wall time: 10min 28s


## carts

In [None]:
%%time
candidates = merge_candidate(SVER,IVER,UVER,'carts','val')
candidates = merge_target('carts',candidates)
train_xgb(candidates,'carts')
del candidates
_ = gc.collect()

[0]	train-map:0.91977	valid-map:0.91940
[500]	train-map:0.91603	valid-map:0.91390
[999]	train-map:0.91699	valid-map:0.91389
[0]	train-map:0.91888	valid-map:0.91832
[500]	train-map:0.91608	valid-map:0.91355
[999]	train-map:0.91697	valid-map:0.91357
[0]	train-map:0.91981	valid-map:0.91898
[500]	train-map:0.91619	valid-map:0.91298
[999]	train-map:0.91718	valid-map:0.91298
[0]	train-map:0.92306	valid-map:0.92277
[500]	train-map:0.91617	valid-map:0.91389
[999]	train-map:0.91707	valid-map:0.91396
[0]	train-map:0.92162	valid-map:0.92205
[500]	train-map:0.91591	valid-map:0.91469
[999]	train-map:0.91682	valid-map:0.91462
carts Recall = 0.40965
CPU times: user 10min 36s, sys: 4.16 s, total: 10min 41s
Wall time: 9min 38s


## orders

In [None]:
%%time
candidates = merge_candidate(SVER,IVER,UVER,'orders','val')
candidates = merge_target('orders',candidates)
train_xgb(candidates,'orders')
del candidates
_ = gc.collect()

[0]	train-map:0.95309	valid-map:0.95296
[500]	train-map:0.94801	valid-map:0.94630
[999]	train-map:0.94884	valid-map:0.94633
[0]	train-map:0.95466	valid-map:0.95372
[500]	train-map:0.94825	valid-map:0.94529
[999]	train-map:0.94906	valid-map:0.94533
[0]	train-map:0.95513	valid-map:0.95466
[500]	train-map:0.94815	valid-map:0.94585
[999]	train-map:0.94894	valid-map:0.94592
[0]	train-map:0.95464	valid-map:0.95452
[500]	train-map:0.94814	valid-map:0.94621
[999]	train-map:0.94894	valid-map:0.94619
[0]	train-map:0.95572	valid-map:0.95638
[500]	train-map:0.94781	valid-map:0.94680
[999]	train-map:0.94871	valid-map:0.94681
orders Recall = 0.64924
CPU times: user 10min 14s, sys: 5.19 s, total: 10min 19s
Wall time: 9min 29s


# inference

In [None]:
def predict(test_candidates,TYPE):
    preds = np.zeros(len(test_candidates))
    test_candidates.reset_index(inplace=True)
    for fold in range(5):
        model = xgb.Booster()
        model.load_model(f'XGB_fold{fold}_{TYPE}.xgb')
        model.set_param({'predictor': 'gpu_predictor'})
        dtest = xgb.DMatrix(data=test_candidates[FEATURES].drop(["user"], axis=1))
        preds += model.predict(dtest)/5
    predictions = test_candidates[['user','item']].copy()
    predictions['pred'] = preds
    predictions = predictions.sort_values(['user','pred'], ascending=[True,False]).reset_index(drop=True)
    predictions['n'] = predictions.groupby('user').item.cumcount().astype('int8')
    predictions = predictions.loc[predictions.n<20]
    sub = predictions.groupby('user').item.apply(list)
    sub = sub.to_frame().reset_index()
    sub.item = sub.item.apply(lambda x: " ".join(map(str,x)))
    sub.columns = ['session_type','labels']
    sub.session_type = sub.session_type.astype('str')+ f'_{TYPE}'
    return sub

## clicks

In [None]:
%%time
test_candidates = merge_candidate(SVER,IVER,UVER,'clicks','test')
clicks_pred_df = predict(test_candidates,'clicks')
del test_candidates
_ = gc.collect()

CPU times: user 2min 2s, sys: 1.89 s, total: 2min 4s
Wall time: 1min 30s


## carts

In [None]:
%%time
test_candidates = merge_candidate(SVER,IVER,UVER,'carts','test')
carts_pred_df = predict(test_candidates,'carts')
del test_candidates
_ = gc.collect()

CPU times: user 2min 11s, sys: 1.95 s, total: 2min 13s
Wall time: 1min 31s


## orders

In [None]:
%%time
test_candidates = merge_candidate(SVER,IVER,UVER,'orders','test')
orders_pred_df = predict(test_candidates,'orders')
del test_candidates
_ = gc.collect()

CPU times: user 2min 2s, sys: 1.86 s, total: 2min 4s
Wall time: 1min 32s


# submission

In [16]:
pred_df = pd.concat([clicks_pred_df, orders_pred_df, carts_pred_df])
pred_df.columns = ["session_type", "labels"]
pred_df.to_csv(f"xgb{VER}.csv", index=False)
pred_df.to_csv(f'/content/drive/MyDrive/Colab Notebooks/kaggle/OTTO/submission/xgb{VER}.csv', index=False)

In [17]:
!pip install kaggle -q
import os
import json
f = open("/content/drive/MyDrive/Colab Notebooks/kaggle/kaggle.json", 'r')
json_data = json.load(f)
os.environ['KAGGLE_USERNAME'] = json_data['username']
os.environ['KAGGLE_KEY'] = json_data['key']

In [18]:
!kaggle competitions submit -c otto-recommender-system -f xgb2.csv -m ""

100% 780M/780M [00:16<00:00, 50.5MB/s]
Successfully submitted to OTTO – Multi-Objective Recommender System