# Versions

v1 pairwise

v2 ndcg

v3 map

# validation

In [1]:
VER = 3
SVER = 1
IVER = 1
UVER = 1
FEATURES = [
        'user', 'item_item_count', 'item_user_count', 
        'item_buy_ratio', 'user_user_count', 'user_item_count',
        'user_buy_ratio']

In [2]:
!nvidia-smi

Tue Jan 24 11:44:15 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  A100-SXM4-40GB      Off  | 00000000:00:04.0 Off |                    0 |
| N/A   29C    P0    50W / 400W |      0MiB / 40536MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import pandas as pd, numpy as np
import pickle, glob, gc

from collections import Counter
import itertools
# multiprocessing 
import psutil
from multiprocessing import Pool
from sklearn.model_selection import GroupKFold
import psutil
N_CORES = psutil.cpu_count()     # Available CPU cores
print(f"N Cores : {N_CORES}")
from multiprocessing import Pool

N Cores : 12


In [5]:
def merge_candidate(SVER,IVER,UVER,TYPE,MODE):
    candidates = pd.read_parquet(f'/content/drive/MyDrive/Colab Notebooks/kaggle/OTTO/dataset/candidate/suggest/{TYPE}/{MODE}_{TYPE}{SVER}.pqt')
    candidates['session'] = candidates.index
    candidates = candidates.set_index('session')
    item_features = pd.read_parquet(f'/content/drive/MyDrive/Colab Notebooks/kaggle/OTTO/dataset/candidate/item/{MODE}_item{IVER}.pqt')
    candidates = candidates.merge(item_features, left_on='item', right_index=True, how='left').fillna(-1)
    user_features = pd.read_parquet(f'/content/drive/MyDrive/Colab Notebooks/kaggle/OTTO/dataset/candidate/user/{MODE}_user{UVER}.pqt')
    candidates = candidates.merge(user_features, left_on='session', right_index=True, how='left').fillna(-1)
    candidates['user'] = candidates.index
    candidates = candidates.set_index('user')
    return candidates

In [6]:
candidates = merge_candidate(SVER,IVER,UVER,'clicks','val')

In [7]:
candidates.head()

Unnamed: 0_level_0,item,item_item_count,item_user_count,item_buy_ratio,user_user_count,user_item_count,user_buy_ratio
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
11098528,11830,33776,19211,0.17397,1,1,0.0
11098528,588923,24588,14741,0.125346,1,1,0.0
11098528,1732105,10452,5770,0.280233,1,1,0.0
11098528,571762,18822,12405,0.124588,1,1,0.0
11098528,884502,31522,18163,0.135398,1,1,0.0


In [None]:
def merge_target(TYPE,candidates):
    tar = pd.read_parquet('/content/drive/MyDrive/Colab Notebooks/kaggle/OTTO/dataset/otto-validation/test_labels.parquet')
    tar = tar.loc[ tar['type']==TYPE ]
    aids = tar.ground_truth.explode().astype('int32').rename('item')
    tar = tar[['session']].astype('int32').rename({'session':'user'},axis=1)
    tar = tar.merge(aids, left_index=True, right_index=True, how='left')
    tar[TYPE] = 1
    candidates = candidates.merge(tar,on=['user','item'],how='left').fillna(0)
    return candidates

In [None]:
!pip install -q xgboost==1.6.2
import xgboost as xgb
from sklearn.model_selection import GroupKFold
from sklearn.metrics import recall_score

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m255.9/255.9 MB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
def train_xgb(candidates,TARGET):
    preds = np.zeros(len(candidates))
    skf = GroupKFold(n_splits=5)
    for fold,(train_idx, valid_idx) in enumerate(skf.split(candidates, candidates[TARGET], groups=candidates['user'] )):

        X_train = candidates.loc[train_idx, FEATURES]
        y_train = candidates.loc[train_idx, TARGET]
        X_valid = candidates.loc[valid_idx, FEATURES]
        y_valid = candidates.loc[valid_idx, TARGET]

        X_train = X_train.sort_values("user").reset_index(drop=True)
        X_valid = X_valid.sort_values("user").reset_index(drop=True)

        train_group = X_train.groupby('user').user.agg('count').values
        valid_group = X_valid.groupby('user').user.agg('count').values

        X_train = X_train.drop(["user"], axis=1)
        X_valid = X_valid.drop(["user"], axis=1)

        dtrain = xgb.DMatrix(X_train, y_train,group=train_group)
        dvalid = xgb.DMatrix(X_valid, y_valid,group=valid_group)

        xgb_parms = {
            'objective':'rank:map', 
            'tree_method':'gpu_hist',
            'random_state': 42, 
            'learning_rate': 0.1,
            "colsample_bytree": 0.8, 
            'max_depth': 6,
        }
        model = xgb.train(xgb_parms, 
            dtrain=dtrain,
            evals=[(dtrain,'train'),(dvalid,'valid')],
            num_boost_round=1000,
            verbose_eval=500)
        preds[valid_idx] = model.predict(dvalid)
        model.save_model(f'XGB_fold{fold}_{TARGET}.xgb')
    predictions = candidates[['user','item']].copy()
    predictions['pred'] = preds
    predictions = predictions.sort_values(['user','pred'], ascending=[True,False]).reset_index(drop=True)
    predictions['n'] = predictions.groupby('user').item.cumcount().astype('int8')
    predictions = predictions.loc[predictions.n<20]
    sub = predictions.groupby('user').item.apply(list)
    sub = sub.to_frame().reset_index()
    sub.item = sub.item.apply(lambda x: " ".join(map(str,x)))
    sub.columns = ['session','labels']
    sub.labels = sub.labels.apply(lambda x: [int(i) for i in x.split(' ')[:20]])
    test_labels = pd.read_parquet('/content/drive/MyDrive/Colab Notebooks/kaggle/OTTO/dataset/otto-validation/test_labels.parquet')
    test_labels = test_labels.loc[test_labels['type']==TARGET]
    test_labels = test_labels.merge(sub, how='left', on=['session'])
    test_labels['hits'] = test_labels.apply(lambda df: len(set(df.ground_truth).intersection(set(df.labels))), axis=1)
    test_labels['gt_count'] = test_labels.ground_truth.str.len().clip(0,20)
    recall = test_labels['hits'].sum() / test_labels['gt_count'].sum()
    print('{} Recall = {:.5f}'.format(TARGET,recall))

## clicks

In [None]:
%%time
candidates = merge_candidate(SVER,IVER,UVER,'clicks','val')
candidates = merge_target('clicks',candidates)
train_xgb(candidates,'clicks')
del candidates
_ = gc.collect()

[0]	train-map:0.65085	valid-map:0.65083
[500]	train-map:0.59624	valid-map:0.59132
[999]	train-map:0.59925	valid-map:0.59229
[0]	train-map:0.65722	valid-map:0.65799
[500]	train-map:0.59602	valid-map:0.59156
[999]	train-map:0.59925	valid-map:0.59222
[0]	train-map:0.63652	valid-map:0.63530
[500]	train-map:0.59667	valid-map:0.59077
[999]	train-map:0.59961	valid-map:0.59185
[0]	train-map:0.66051	valid-map:0.66028
[500]	train-map:0.59577	valid-map:0.59142
[999]	train-map:0.59907	valid-map:0.59265
[0]	train-map:0.65609	valid-map:0.65569
[500]	train-map:0.59624	valid-map:0.59163
[999]	train-map:0.59923	valid-map:0.59237
clicks Recall = 0.52558
CPU times: user 12min 2s, sys: 9.84 s, total: 12min 12s
Wall time: 10min 28s


## carts

In [None]:
%%time
candidates = merge_candidate(SVER,IVER,UVER,'carts','val')
candidates = merge_target('carts',candidates)
train_xgb(candidates,'carts')
del candidates
_ = gc.collect()

[0]	train-map:0.91974	valid-map:0.91936
[500]	train-map:0.91603	valid-map:0.91380
[999]	train-map:0.91695	valid-map:0.91378
[0]	train-map:0.91948	valid-map:0.91905
[500]	train-map:0.91598	valid-map:0.91363
[999]	train-map:0.91693	valid-map:0.91358
[0]	train-map:0.91995	valid-map:0.91911
[500]	train-map:0.91624	valid-map:0.91296
[999]	train-map:0.91718	valid-map:0.91301
[0]	train-map:0.92303	valid-map:0.92275
[500]	train-map:0.91615	valid-map:0.91394
[999]	train-map:0.91702	valid-map:0.91399
[0]	train-map:0.92152	valid-map:0.92196
[500]	train-map:0.91591	valid-map:0.91472
[999]	train-map:0.91682	valid-map:0.91474


## orders

In [None]:
%%time
candidates = merge_candidate(SVER,IVER,UVER,'orders','val')
candidates = merge_target('orders',candidates)
train_xgb(candidates,'orders')
del candidates
_ = gc.collect()

[0]	train-map:0.95292	valid-map:0.95276
[500]	train-map:0.94798	valid-map:0.94624
[999]	train-map:0.94884	valid-map:0.94626
[0]	train-map:0.95470	valid-map:0.95373
[500]	train-map:0.94826	valid-map:0.94531
[999]	train-map:0.94910	valid-map:0.94526
[0]	train-map:0.95509	valid-map:0.95477
[500]	train-map:0.94810	valid-map:0.94590
[999]	train-map:0.94891	valid-map:0.94599
[0]	train-map:0.95644	valid-map:0.95634
[500]	train-map:0.94811	valid-map:0.94619


# inference

In [None]:
def predict(test_candidates,TYPE):
    preds = np.zeros(len(test_candidates))
    test_candidates.reset_index(inplace=True)
    for fold in range(5):
        model = xgb.Booster()
        model.load_model(f'XGB_fold{fold}_{TYPE}.xgb')
        model.set_param({'predictor': 'gpu_predictor'})
        dtest = xgb.DMatrix(data=test_candidates[FEATURES].drop(["user"], axis=1))
        preds += model.predict(dtest)/5
    predictions = test_candidates[['user','item']].copy()
    predictions['pred'] = preds
    predictions = predictions.sort_values(['user','pred'], ascending=[True,False]).reset_index(drop=True)
    predictions['n'] = predictions.groupby('user').item.cumcount().astype('int8')
    predictions = predictions.loc[predictions.n<20]
    sub = predictions.groupby('user').item.apply(list)
    sub = sub.to_frame().reset_index()
    sub.item = sub.item.apply(lambda x: " ".join(map(str,x)))
    sub.columns = ['session_type','labels']
    sub.session_type = sub.session_type.astype('str')+ f'_{TYPE}'
    return sub

## clicks

In [None]:
%%time
test_candidates = merge_candidate(SVER,IVER,UVER,'clicks','test')
clicks_pred_df = predict(test_candidates,'clicks')
del test_candidates
_ = gc.collect()

CPU times: user 2min 1s, sys: 1.8 s, total: 2min 3s
Wall time: 1min 28s


## carts

In [None]:
%%time
test_candidates = merge_candidate(SVER,IVER,UVER,'carts','test')
carts_pred_df = predict(test_candidates,'carts')
del test_candidates
_ = gc.collect()

CPU times: user 1min 58s, sys: 1.69 s, total: 2min
Wall time: 1min 26s


## orders

In [None]:
%%time
test_candidates = merge_candidate(SVER,IVER,UVER,'orders','test')
orders_pred_df = predict(test_candidates,'orders')
del test_candidates
_ = gc.collect()

CPU times: user 1min 59s, sys: 1.82 s, total: 2min 1s
Wall time: 1min 26s


# submission

In [None]:
pred_df = pd.concat([clicks_pred_df, orders_pred_df, carts_pred_df])
pred_df.columns = ["session_type", "labels"]
pred_df.to_csv(f"xgb{VER}.csv", index=False)
pred_df.to_csv(f'/content/drive/MyDrive/Colab Notebooks/kaggle/OTTO/submission/xgb{VER}.csv', index=False)

In [None]:
!pip install kaggle -q
import os
import json
f = open("/content/drive/MyDrive/Colab Notebooks/kaggle/kaggle.json", 'r')
json_data = json.load(f)
os.environ['KAGGLE_USERNAME'] = json_data['username']
os.environ['KAGGLE_KEY'] = json_data['key']

In [None]:
!kaggle competitions submit -c otto-recommender-system -f xgb3.csv -m ""

100% 780M/780M [00:13<00:00, 60.9MB/s]
Successfully submitted to OTTO – Multi-Objective Recommender System