# Versions

v1 pairwise

v2 ndcg

v3 map

v4 pairwise,w_rank

v5 pairwise,SVER2

v6 ndcg,SVER2

v7 map,SVER2

v8 pairwise,SVER3

v9 pairwise,SVER3,IVER2,UVER2

In [1]:
# Mount Google Drive at /content/drive. Later cells read the candidate/feature
# parquet files, the model weights and kaggle.json from
# '/content/drive/MyDrive/...' and write the submission CSV there.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Run configuration. The *VER values are interpolated into file names by later cells.
VER = 9  # model/submission version: weights 'XGB_fold{fold}_{TYPE}_{VER}.xgb', output 'xgb{VER}.csv'
SVER = 3  # version suffix of the candidate ("suggest") parquet read by merge_candidate
IVER = 2  # version suffix of the item-feature parquet read by merge_candidate
UVER = 2  # version suffix of the user-feature parquet read by merge_candidate
WVER = 1  # passed to merge_candidate, which does not read it
FRAC = 0.5  # not referenced by any visible cell; presumably a training-time sampling fraction (TODO confirm)

In [3]:
# Show the GPU attached to this runtime; predict() below sets XGBoost's 'predictor' to 'gpu_predictor'.
!nvidia-smi

Sun Jan 29 02:34:44 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.47.03    Driver Version: 510.47.03    CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   30C    P0    50W / 400W |      0MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [4]:
import pandas as pd, numpy as np
import pickle, glob, gc
from collections import Counter
import itertools
# multiprocessing 
import psutil
from multiprocessing import Pool
from sklearn.model_selection import GroupKFold
import psutil
import random
import os
# Report the CPU count of the runtime. N_CORES is not used by any later cell
# visible in this notebook.
N_CORES = psutil.cpu_count()     # Available CPU cores
print(f"N Cores : {N_CORES}")
from multiprocessing import Pool
def seed_everything(seed: int):
    """Seed the stdlib and NumPy global random generators and set PYTHONHASHSEED.

    Args:
        seed: value handed to ``random.seed`` and ``np.random.seed`` and stored,
            as a string, in the ``PYTHONHASHSEED`` environment variable.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    # Both seeders take the same integer, so apply them in one pass.
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(seed)

seed_everything(42)

N Cores : 12


In [5]:
import numpy as np
import pandas as pd


def reduce_mem_usage(df):
    """Downcast the columns of ``df`` in place to smaller dtypes and print the saving.

    Columns are handled by dtype:

    * signed integers (int8/int16/int32/int64) -> the narrowest of those four
      types whose range strictly contains the column's min and max;
    * floats -> float16 or float32 when min and max lie strictly inside that
      type's finite range, otherwise float64;
    * ``object`` -> ``category``;
    * every other dtype (bool, unsigned integer, categorical, datetime,
      nullable extension dtypes, ...) is left untouched.

    Leaving other dtypes alone makes the function safe to call on its own
    output: taking ``min()`` of the unordered ``category`` columns it produces
    raises ``TypeError``, and pushing bool/unsigned columns through the float
    branch would turn them into float16.

    NOTE: float16 has an 11-bit significand, so float columns that fit its
    range are still rounded (integers above 2048 are not all representable).
    That downcast is deliberately kept, because changing it would change the
    feature values handed to models that were fitted on the downcast data.

    Args:
        df: DataFrame to shrink. It is modified in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_int_names = ('int8', 'int16', 'int32', 'int64')
    int_targets = (np.int8, np.int16, np.int32, np.int64)
    small_float_targets = (np.float16, np.float32)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        type_name = str(col_type)
        is_signed_int = type_name in signed_int_names
        is_float = type_name.startswith('float')
        if not (is_signed_int or is_float):
            # bool / unsigned / category / datetime / extension dtypes: min/max
            # may be undefined or a numeric cast lossy, so do not touch them.
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if is_signed_int:
            # First (narrowest) type that strictly contains the value range wins;
            # if none does, the column keeps its current dtype.
            for target in int_targets:
                bounds = np.iinfo(target)
                if c_min > bounds.min and c_max < bounds.max:
                    df[col] = df[col].astype(target)
                    break
        else:
            for target in small_float_targets:
                bounds = np.finfo(target)
                if c_min > bounds.min and c_max < bounds.max:
                    df[col] = df[col].astype(target)
                    break
            else:
                # Also reached when min/max are NaN (empty or all-NaN column).
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [6]:
def merge_candidate(SVER,IVER,UVER,WVER,TYPE,MODE):
    """Load the candidate rows for one event type and left-join item and user features.

    Three parquet files are read from Drive: the candidate ("suggest") file for
    ``TYPE``, the item-feature file and the user-feature file. Item features are
    joined on the ``item`` column, user features on the candidate frame's index
    (named ``session`` for the join). After each join every missing value is
    replaced with -1. The result is re-indexed under the name ``user`` and
    passed through ``reduce_mem_usage``.

    Args:
        SVER: version suffix in the candidate file name.
        IVER: version suffix in the item-feature file name.
        UVER: version suffix in the user-feature file name.
        WVER: accepted for interface compatibility; not read by this function.
        TYPE: event type; selects the candidate sub-directory and file name.
        MODE: file-name prefix shared by all three files.

    Returns:
        DataFrame indexed by ``user`` holding the candidate columns plus the
        joined feature columns.
    """
    merged = pd.read_parquet(f'/content/drive/MyDrive/Colab Notebooks/kaggle/OTTO/dataset/candidate/suggest/{TYPE}/{MODE}_{TYPE}{SVER}.pqt')
    # Re-create the index under the name 'session' so it can serve as a join key.
    merged['session'] = merged.index
    merged = merged.set_index('session')

    # Item features: keyed by the feature frame's index, matched on 'item'.
    item_features = pd.read_parquet(f'/content/drive/MyDrive/Colab Notebooks/kaggle/OTTO/dataset/candidate/item/{MODE}_item{IVER}.pqt')
    merged = merged.merge(item_features, left_on='item', right_index=True, how='left')
    merged = merged.fillna(-1)

    # User features: keyed by the feature frame's index, matched on 'session'.
    user_features = pd.read_parquet(f'/content/drive/MyDrive/Colab Notebooks/kaggle/OTTO/dataset/candidate/user/{MODE}_user{UVER}.pqt')
    merged = merged.merge(user_features, left_on='session', right_index=True, how='left')
    merged = merged.fillna(-1)

    # Same index values, now exposed under the name 'user'.
    merged['user'] = merged.index
    merged = merged.set_index('user')

    merged = reduce_mem_usage(merged)
    _ = gc.collect()
    return merged

In [7]:
!pip install -q xgboost==1.6.2
import xgboost as xgb
from sklearn.model_selection import GroupKFold

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m255.9/255.9 MB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25h

# inference

In [8]:
def predict(test_candidates,TYPE,FEATURES,n_folds=5,top_k=20):
    """Score candidates with the per-fold XGBoost models and build submission rows.

    The saved booster of every fold is loaded from Drive (the file name uses
    ``TYPE`` and the notebook-level ``VER``), the fold predictions are averaged,
    and for each user the ``top_k`` highest-scoring items are joined into one
    space-separated string.

    Args:
        test_candidates: candidate frame with an ``item`` column and the feature
            columns. User ids are taken from a ``user`` column when present,
            otherwise from the index. The frame is not modified.
        TYPE: event type; used in the weight file name and as the suffix of
            ``session_type`` (``"<user>_<TYPE>"``).
        FEATURES: column names fed to the model, in order. A ``'user'`` entry is
            ignored, so lists that include it keep working.
        n_folds: number of fold models to load and average (default 5).
        top_k: maximum number of items kept per user (default 20).

    Returns:
        DataFrame with columns ``session_type`` and ``labels``, one row per user.
    """
    # Same columns, same order, as selecting FEATURES and then dropping 'user'.
    feature_cols = [col for col in FEATURES if col != 'user']

    # The feature matrix is identical for every fold, so build it once instead
    # of re-copying the frame and re-creating the DMatrix on each iteration.
    dtest = xgb.DMatrix(data=test_candidates[feature_cols])

    preds = np.zeros(len(test_candidates))
    for fold in range(n_folds):
        model = xgb.Booster()
        model.load_model(f'/content/drive/MyDrive/Colab Notebooks/kaggle/OTTO/weights/XGB_fold{fold}_{TYPE}_{VER}.xgb')
        model.set_param({'predictor': 'gpu_predictor'})
        preds += model.predict(dtest) / n_folds
        del model
        _ = gc.collect()
    del dtest
    _ = gc.collect()

    # Read the ids without reset_index(inplace=True), which would alter the
    # caller's DataFrame as a side effect.
    if 'user' in test_candidates.columns:
        users = test_candidates['user'].to_numpy()
    else:
        users = test_candidates.index.to_numpy()

    predictions = pd.DataFrame({
        'user': users,
        'item': test_candidates['item'].to_numpy(),
        'pred': preds,
    })
    predictions = predictions.sort_values(['user','pred'], ascending=[True,False]).reset_index(drop=True)

    # head() keeps the first top_k rows of each user in the sorted order. A rank
    # column stored as int8 would wrap to negative values for users with more
    # than 128 candidates and let those rows slip through a `rank < top_k` filter.
    top = predictions.groupby('user').head(top_k)

    labels = top.groupby('user')['item'].apply(lambda items: " ".join(map(str, items)))
    sub = labels.reset_index()
    sub.columns = ['session_type','labels']
    sub['session_type'] = sub['session_type'].astype('str') + f'_{TYPE}'

    del predictions, top, labels
    _ = gc.collect()
    return sub

## clicks

In [9]:
%%time
# Score the 'clicks' candidates: load candidates + features, derive the feature list, predict.
test_candidates = merge_candidate(SVER,IVER,UVER,WVER,'clicks','test')
# Every column except 'item', plus 'user' (predict() removes 'user' again
# before building the feature matrix).
FEATURES = [*test_candidates.columns, 'user']
FEATURES.remove('item')
clicks_pred_df = predict(test_candidates,'clicks',FEATURES)
# Release the candidate frame before the next event type is loaded.
del test_candidates
_ = gc.collect()

Memory usage of dataframe is 11479.36 MB
Memory usage after optimization is: 5484.58 MB
Decreased by 52.2%
CPU times: user 13min 43s, sys: 29.2 s, total: 14min 13s
Wall time: 4min 49s


## carts

In [10]:
%%time
# Score the 'carts' candidates: load candidates + features, derive the feature list, predict.
test_candidates = merge_candidate(SVER,IVER,UVER,WVER,'carts','test')
# Every column except 'item', plus 'user' (predict() removes 'user' again
# before building the feature matrix).
FEATURES = [*test_candidates.columns, 'user']
FEATURES.remove('item')
carts_pred_df = predict(test_candidates,'carts',FEATURES)
# Release the candidate frame before the next event type is loaded.
del test_candidates
_ = gc.collect()

Memory usage of dataframe is 11479.36 MB
Memory usage after optimization is: 5484.58 MB
Decreased by 52.2%
CPU times: user 11min 16s, sys: 11.1 s, total: 11min 27s
Wall time: 4min 9s


## orders

In [11]:
%%time
# Score the 'orders' candidates: load candidates + features, derive the feature list, predict.
test_candidates = merge_candidate(SVER,IVER,UVER,WVER,'orders','test')
# Every column except 'item', plus 'user' (predict() removes 'user' again
# before building the feature matrix).
FEATURES = [*test_candidates.columns, 'user']
FEATURES.remove('item')
orders_pred_df = predict(test_candidates,'orders',FEATURES)
# Release the candidate frame; nothing after this cell reads it.
del test_candidates
_ = gc.collect()

Memory usage of dataframe is 11479.36 MB
Memory usage after optimization is: 5484.58 MB
Decreased by 52.2%
CPU times: user 10min 7s, sys: 4.01 s, total: 10min 11s
Wall time: 3min 51s


# submission

In [12]:
# Stack the three per-type prediction frames (clicks, then orders, then carts)
# into one submission table.
pred_df = pd.concat(objs=[clicks_pred_df, orders_pred_df, carts_pred_df])
pred_df.columns = ["session_type", "labels"]
# Write the CSV twice: once to the working directory (the file the kaggle CLI
# cell uploads) and once to Drive.
pred_df.to_csv(path_or_buf=f"xgb{VER}.csv", index=False)
pred_df.to_csv(
    path_or_buf=f'/content/drive/MyDrive/Colab Notebooks/kaggle/OTTO/submission/xgb{VER}.csv',
    index=False,
)

In [13]:
!pip install kaggle -q
import os
import json
# Load the Kaggle API credentials from the JSON file on Drive and export them as
# environment variables for the `kaggle` command run in the next cell.
# The `with` block closes the file handle as soon as the JSON has been parsed.
with open("/content/drive/MyDrive/Colab Notebooks/kaggle/kaggle.json", 'r') as f:
    json_data = json.load(f)
os.environ['KAGGLE_USERNAME'] = json_data['username']
os.environ['KAGGLE_KEY'] = json_data['key']

In [14]:
# Upload the CSV for the configured version. IPython expands {VER} from the
# notebook namespace, so the file name follows the VER constant instead of
# being fixed to one version number.
!kaggle competitions submit -c otto-recommender-system -f xgb{VER}.csv -m ""

100% 778M/778M [00:15<00:00, 54.3MB/s]
Successfully submitted to OTTO – Multi-Objective Recommender System