# Versions

v1 pairwise

v2 ndcg

v3 map

v4 pairwise,w_rank

v5 pairwise,SVER2

v6 ndcg,SVER2

v7 map,SVER2

v8 pairwise,SVER3

v9 pairwise,SVER3,IVER2,UVER2

v10 pairwise,SVER3,IVER2,UVER2,FRAC=0.1

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
VER = 10
SVER = 3
IVER = 2
UVER = 2
WVER = 1
FRAC = 0.1

In [3]:
!nvidia-smi

Sun Jan 29 03:40:49 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.47.03    Driver Version: 510.47.03    CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   29C    P0    47W / 400W |      0MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [4]:
import pandas as pd, numpy as np
import pickle, glob, gc
from collections import Counter
import itertools
# multiprocessing 
import psutil
from multiprocessing import Pool
from sklearn.model_selection import GroupKFold
import psutil
import random
import os
N_CORES = psutil.cpu_count()     # Available CPU cores
print(f"N Cores : {N_CORES}")
from multiprocessing import Pool
def seed_everything(seed: int):
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)

seed_everything(42)

N Cores : 12


In [5]:
import numpy as np
import pandas as pd


def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [6]:
def merge_candidate(SVER,IVER,UVER,WVER,TYPE,MODE):
    candidates = pd.read_parquet(f'/content/drive/MyDrive/Colab Notebooks/kaggle/OTTO/dataset/candidate/suggest/{TYPE}/{MODE}_{TYPE}{SVER}.pqt')
    candidates['session'] = candidates.index
    candidates = candidates.set_index('session')
    item_features = pd.read_parquet(f'/content/drive/MyDrive/Colab Notebooks/kaggle/OTTO/dataset/candidate/item/{MODE}_item{IVER}.pqt')
    candidates = candidates.merge(item_features, left_on='item', right_index=True, how='left').fillna(-1)
    user_features = pd.read_parquet(f'/content/drive/MyDrive/Colab Notebooks/kaggle/OTTO/dataset/candidate/user/{MODE}_user{UVER}.pqt')
    candidates = candidates.merge(user_features, left_on='session', right_index=True, how='left').fillna(-1)
    candidates['user'] = candidates.index
    candidates = candidates.set_index('user')
    candidates = reduce_mem_usage(candidates)
    _ = gc.collect()
    return candidates

In [7]:
def merge_target(TYPE,candidates):
    tar = pd.read_parquet('/content/drive/MyDrive/Colab Notebooks/kaggle/OTTO/dataset/otto-validation/test_labels.parquet')
    tar = tar.loc[ tar['type']==TYPE ]
    aids = tar.ground_truth.explode().astype('int32').rename('item')
    tar = tar[['session']].astype('int32').rename({'session':'user'},axis=1)
    tar = tar.merge(aids, left_index=True, right_index=True, how='left')
    tar[TYPE] = 1
    candidates = candidates.merge(tar,on=['user','item'],how='left').fillna(0)
    _ = gc.collect()
    return candidates

In [8]:
!pip install -q xgboost==1.6.2
import xgboost as xgb
from sklearn.model_selection import GroupKFold
def train_xgb(candidates,TARGET,FEATURES):
    preds = np.zeros(len(candidates))
    skf = GroupKFold(n_splits=5)
    for fold,(train_idx, valid_idx) in enumerate(skf.split(candidates, candidates[TARGET], groups=candidates['user'] )):

        X_train = candidates.loc[train_idx, FEATURES]
        y_train = candidates.loc[train_idx, TARGET]
        X_valid = candidates.loc[valid_idx, FEATURES]
        y_valid = candidates.loc[valid_idx, TARGET]

        X_train = X_train.sort_values("user").reset_index(drop=True)
        X_valid = X_valid.sort_values("user").reset_index(drop=True)

        train_group = X_train.groupby('user').user.agg('count').values
        valid_group = X_valid.groupby('user').user.agg('count').values

        X_train = X_train.drop(["user"], axis=1)
        X_valid = X_valid.drop(["user"], axis=1)

        dtrain = xgb.DMatrix(X_train, y_train,group=train_group)
        dvalid = xgb.DMatrix(X_valid, y_valid,group=valid_group)

        xgb_parms = {
            'objective':'rank:pairwise', 
            'tree_method':'gpu_hist',
            'random_state': 42, 
            'learning_rate': 0.1,
            "colsample_bytree": 0.8, 
            'max_depth': 6,
            'eta':0.1,
            'eval_metric':'ndcg@20'
        }
        model = xgb.train(xgb_parms, 
            dtrain=dtrain,
            evals=[(dtrain,'train'),(dvalid,'valid')],
            num_boost_round=200,
            verbose_eval=50)
        preds[valid_idx] = model.predict(dvalid)
        model.save_model(f'/content/drive/MyDrive/Colab Notebooks/kaggle/OTTO/weights/XGB_fold{fold}_{TARGET}_{VER}.xgb')
        
        del X_train,y_train,X_valid,y_valid,train_group,valid_group
        _ = gc.collect()
    predictions = candidates[['user','item']].copy()
    predictions['pred'] = preds
    predictions = predictions.sort_values(['user','pred'], ascending=[True,False]).reset_index(drop=True)
    predictions['n'] = predictions.groupby('user').item.cumcount().astype('int8')
    predictions = predictions.loc[predictions.n<20]
    sub = predictions.groupby('user').item.apply(list)
    sub = sub.to_frame().reset_index()
    sub.item = sub.item.apply(lambda x: " ".join(map(str,x)))
    oof = sub.copy()
    oof.columns = ['session_type','labels']
    oof.session_type = oof.session_type.astype('str')+ f'_{TARGET}'
    sub.columns = ['session','labels']
    sub.labels = sub.labels.apply(lambda x: [int(i) for i in x.split(' ')[:20]])
    test_labels = pd.read_parquet('/content/drive/MyDrive/Colab Notebooks/kaggle/OTTO/dataset/otto-validation/test_labels.parquet')
    test_labels = test_labels.loc[test_labels['type']==TARGET]
    test_labels = test_labels.merge(sub, how='left', on=['session'])
    test_labels['hits'] = test_labels.apply(lambda df: len(set(df.ground_truth).intersection(set(df.labels))), axis=1)
    test_labels['gt_count'] = test_labels.ground_truth.str.len().clip(0,20)
    recall = test_labels['hits'].sum() / test_labels['gt_count'].sum()
    print('{} Recall = {:.5f}'.format(TARGET,recall))
    del predictions,sub,test_labels,recall
    _ = gc.collect()
    return oof

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m255.9/255.9 MB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25h

# validation

## carts

In [9]:
%%time
candidates = merge_candidate(SVER,IVER,UVER,WVER,'carts','val')
candidates = merge_target('carts',candidates)
positives = candidates.loc[candidates['carts']==1]
negatives = candidates.loc[candidates['carts']==0].sample(frac=FRAC)
candidates = pd.concat([positives,negatives],axis=0,ignore_index=True)
FEATURES = list(candidates.columns)
FEATURES.remove('item')
FEATURES.remove('carts')
carts_oof_df = train_xgb(candidates,'carts',FEATURES)
carts_oof_df.to_parquet(f'/content/drive/MyDrive/Colab Notebooks/kaggle/OTTO/oof/xgb_carts{VER}.pqt', index=False)
del candidates
_ = gc.collect()

Memory usage of dataframe is 12368.21 MB
Memory usage after optimization is: 5909.26 MB
Decreased by 52.2%
[0]	train-ndcg@20:1.00000	valid-ndcg@20:1.00000
[50]	train-ndcg@20:1.00000	valid-ndcg@20:1.00000
[100]	train-ndcg@20:1.00000	valid-ndcg@20:1.00000
[150]	train-ndcg@20:1.00000	valid-ndcg@20:1.00000
[199]	train-ndcg@20:1.00000	valid-ndcg@20:1.00000
[0]	train-ndcg@20:1.00000	valid-ndcg@20:1.00000
[50]	train-ndcg@20:1.00000	valid-ndcg@20:1.00000
[100]	train-ndcg@20:1.00000	valid-ndcg@20:1.00000
[150]	train-ndcg@20:1.00000	valid-ndcg@20:1.00000
[199]	train-ndcg@20:1.00000	valid-ndcg@20:1.00000
[0]	train-ndcg@20:1.00000	valid-ndcg@20:1.00000
[50]	train-ndcg@20:1.00000	valid-ndcg@20:1.00000
[100]	train-ndcg@20:1.00000	valid-ndcg@20:1.00000
[150]	train-ndcg@20:1.00000	valid-ndcg@20:1.00000
[199]	train-ndcg@20:1.00000	valid-ndcg@20:1.00000
[0]	train-ndcg@20:1.00000	valid-ndcg@20:1.00000
[50]	train-ndcg@20:1.00000	valid-ndcg@20:1.00000
[100]	train-ndcg@20:1.00000	valid-ndcg@20:1.00000
[150]

TypeError: ignored