In [1]:
!git clone https://github.com/sn09/ranking.git

Cloning into 'ranking'...
remote: Enumerating objects: 342, done.[K
remote: Counting objects: 100% (342/342), done.[K
remote: Compressing objects: 100% (198/198), done.[K
remote: Total 342 (delta 166), reused 286 (delta 119), pack-reused 0 (from 0)[K
Receiving objects: 100% (342/342), 169.45 KiB | 3.20 MiB/s, done.
Resolving deltas: 100% (166/166), done.


In [None]:
import sys

import lightgbm as lgb
import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier, LGBMRanker
from sklearn.metrics import log_loss, roc_auc_score
from torch import nn

sys.path.append("./ranking/models/")

from rankfx.dcnv2.model import DCNv2
from rankfx.finalnet.model import FinalNet

In [72]:
base_path = "/kaggle/input/kkbox-x1"

df_train = pd.read_csv(f"{base_path}/train.csv")
df_test = pd.read_csv(f"{base_path}/test.csv")
df_val = pd.read_csv(f"{base_path}/valid.csv")

df_train.head()

Unnamed: 0,label,msno,song_id,source_system_tab,source_screen_name,source_type,city,bd,gender,registered_via,registration_init_time,expiration_date,song_length,genre_ids,artist_name,composer,lyricist,language,name,isrc
0,1,2972,298350,my library,Local playlist more,local-library,5,0,,9,20110221,20170922,240624.0,465,444553,149542,206321 364176,31.0,975794,
1,1,31472,684399,my library,Local playlist more,local-library,9,27,female,9,20130912,20170912,247911.0,465,506764,391787,497352,3.0,1082967,TWA470326002
2,1,1238,552192,my library,Local playlist more,local-playlist,1,0,,7,20130925,20171001,261224.0,465,507689,507689,507689,3.0,1111889,CNA651500229
3,1,1238,994686,my library,Local playlist more,local-playlist,1,0,,7,20130925,20171001,227343.0,465,508873,346049,346049,3.0,1141245,TWA531657203
4,1,6871,1631708,my library,Local playlist more,local-library,13,28,male,3,20140626,20170913,121858.0,921,444553,225688 331823 77958 336148 209890 450162 25980...,,52.0,967217,USUM71504649


In [73]:
from sklearn.preprocessing import StandardScaler


cols_to_delete = [
    "isrc",
    "name",
    "lyricist",
    "composer",
    "expiration_date",
    "registration_init_time",
]
feature_columns = df_train.columns.difference([*cols_to_delete, "label"])


num_features = ["song_length"]
cat_features = feature_columns.difference(num_features).tolist()
for df in [df_train, df_val, df_test]:
    df.drop(columns=cols_to_delete, inplace=True)
    df[cat_features] = df[cat_features].fillna("__NONE__").astype("category")
    df[num_features] = df[num_features].fillna(-1)


scaler = StandardScaler()
df_train[num_features] = scaler.fit_transform(df_train[num_features])
df_val[num_features] = scaler.transform(df_val[num_features])
df_test[num_features] = scaler.transform(df_test[num_features])

# LightGBM

## LGBMClassifier

In [76]:
booster_clf = LGBMClassifier(
    objective="binary",
    max_depth=5,
    learning_rate=1e-1,
    n_estimators=1000,
)

In [77]:
feature_names = df_train.columns.difference(["label"]).tolist()

booster_clf = booster_clf.fit(
    X=df_train.drop(columns="label"),
    y=df_train["label"],
    eval_set=(df_val.drop(columns="label"), df_val["label"]),
    feature_name=df_train.drop(columns="label").columns.tolist(),
    categorical_feature=cat_features,
    callbacks=[lgb.early_stopping(stopping_rounds=10), lgb.log_evaluation()],
)

[LightGBM] [Info] Number of positive: 2971724, number of negative: 2930208
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.124494 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 34133
[LightGBM] [Info] Number of data points in the train set: 5901932, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503517 -> initscore=0.014069
[LightGBM] [Info] Start training from score 0.014069
[1]	valid_0's binary_logloss: 0.68428
Training until validation scores don't improve for 10 rounds
[2]	valid_0's binary_logloss: 0.677072
[3]	valid_0's binary_logloss: 0.671092
[4]	valid_0's binary_logloss: 0.666161
[5]	valid_0's binary_logloss: 0.661932
[6]	valid_0's binary_logloss: 0.658358
[7]	valid_0's binary_logloss: 0.655377
[8]	valid_0's binary_logloss: 0.652704
[9]	valid_0's binary_logloss: 0.650466
[10]	valid_0's binary_logl

In [78]:
booster_clf_preds = booster_clf.predict(
    df_test.drop(columns="label"),
    raw_score=True,
)

In [79]:
roc_auc = roc_auc_score(df_test["label"], booster_clf_preds)
logloss = log_loss(df_test["label"], booster_clf_preds)

print("LightGBM Classifier metrics")
print(f"ROC AUC: {roc_auc}, logloss: {logloss}")

LightGBM Classifier metrics
ROC AUC: 0.7726528528950893, logloss: 6.170192000842237


# DCNv2

In [84]:
dcnv2_model = DCNv2(
    model_structure="stacked_parallel",
    use_low_rank_mixture=True,
    cross_low_rank_dim=32,
    num_cross_layers=5,
    num_cross_experts=4,
    parallel_hidden_dims=[256, 512, 1024],
    parallel_dropout=0.2,
    parallel_use_batch_norm=True,
    parallel_activation=nn.ReLU,
    stacked_hidden_dims=[256, 512, 1024],
    stacked_dropout=0.2,
    stacked_use_batch_norm=True,
    stacked_activation=nn.ReLU,
    output_dim=1,
    proj_output_embeddings=True,
)

In [85]:
train_metrics_dcnv2, val_metrics_dcnv2 = dcnv2_model.fit(
    features=df_train.drop(columns="label"),
    target=df_train["label"],
    val_features=df_val.drop(columns="label"),
    val_target=df_val["label"],
    optimizer_cls="torch.optim.Adam",
    optimizer_params=dict(lr=1e-2),
    scheduler_cls="torch.optim.lr_scheduler.ReduceLROnPlateau",
    scheduler_params=dict(mode="max", factor=0.1, patience=1, min_lr=1e-6),
    grad_clip_threshold=10.,
    num_epochs=3,
    seed=42,
    artifacts_path="./dcnv2_artifacts",
    device="cuda:0",
    batch_size=4096,
    num_workers=2,
    eval_metric_name="AUC",
    eval_mode="max",
    default_embedding_size=20,
    oov_masking_proba=0.05,
    l2_net_reg=0,
    l2_embedding_reg=0,
)

[2025-05-04 00:24:34,049]{model.py:660} - INFO - Used features config: FeaturesConfig(features=[Feature(name='msno', feature_type=<FeatureType.CATEGORICAL: 'categorical'>, feature_size=1, needs_embed=True, embedding_size=20, embedding_vocab_size=30534, embedding_padding_idx=None), Feature(name='song_id', feature_type=<FeatureType.CATEGORICAL: 'categorical'>, feature_size=1, needs_embed=True, embedding_size=20, embedding_vocab_size=324210, embedding_padding_idx=None), Feature(name='source_system_tab', feature_type=<FeatureType.CATEGORICAL: 'categorical'>, feature_size=1, needs_embed=True, embedding_size=20, embedding_vocab_size=10, embedding_padding_idx=None), Feature(name='source_screen_name', feature_type=<FeatureType.CATEGORICAL: 'categorical'>, feature_size=1, needs_embed=True, embedding_size=20, embedding_vocab_size=22, embedding_padding_idx=None), Feature(name='source_type', feature_type=<FeatureType.CATEGORICAL: 'categorical'>, feature_size=1, needs_embed=True, embedding_size=20,

Train epoch #0:   0%|          | 0/1441 [00:00<?, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7b02afb44900>
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1604, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1587, in _shutdown_workers
    if w.is_alive():
       ^^^^^^^^^^^^
  File "/usr/lib/python3.11/multiprocessing/process.py", line 160, in is_alive
    assert self._parent_pid == os.getpid(), 'can only test a child process'
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
AssertionError: can only test a child process
Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7b02afb44900>
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1604, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 15

Validation epoch #0:   0%|          | 0/181 [00:00<?, ?it/s]

[2025-05-04 00:40:30,554]{model.py:567} - INFO - Finished Validation Epoch #0, average metrics - [AUC: 0.77446, log_loss: 0.55855]
[2025-05-04 00:40:41,447]{model.py:747} - INFO - Best model with AUC = 0.7744638398748251 was saved to dcnv2_artifacts/best_model.pt


Train epoch #1:   0%|          | 0/1441 [00:00<?, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7b02afb44900>
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1604, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1587, in _shutdown_workers
    if w.is_alive():
       ^^^^^^^^^^^^
  File "/usr/lib/python3.11/multiprocessing/process.py", line 160, in is_alive
    assert self._parent_pid == os.getpid(), 'can only test a child process'
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
AssertionError: can only test a child process
Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7b02afb44900>
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1604, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 15

Validation epoch #1:   0%|          | 0/181 [00:00<?, ?it/s]

[2025-05-04 00:56:39,704]{model.py:567} - INFO - Finished Validation Epoch #1, average metrics - [AUC: 0.78357, log_loss: 0.55085]
[2025-05-04 00:56:50,605]{model.py:747} - INFO - Best model with AUC = 0.7835682800415887 was saved to dcnv2_artifacts/best_model.pt


Train epoch #2:   0%|          | 0/1441 [00:00<?, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7b02afb44900>
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1604, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1587, in _shutdown_workers
    if w.is_alive():
       ^^^^^^^^^^^^
  File "/usr/lib/python3.11/multiprocessing/process.py", line 160, in is_alive
    assert self._parent_pid == os.getpid(), 'can only test a child process'
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
AssertionError: can only test a child process
Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7b02afb44900>
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1604, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 15

Validation epoch #2:   0%|          | 0/181 [00:00<?, ?it/s]

[2025-05-04 01:12:47,071]{model.py:567} - INFO - Finished Validation Epoch #2, average metrics - [AUC: 0.78628, log_loss: 0.54646]
[2025-05-04 01:12:57,989]{model.py:747} - INFO - Best model with AUC = 0.786282640119618 was saved to dcnv2_artifacts/best_model.pt
[2025-05-04 01:12:57,990]{model.py:763} - INFO - Loading best model from dcnv2_artifacts/best_model.pt


In [86]:
test_metrics_dcnv2 = dcnv2_model.test(
    features=df_test.drop(columns="label"),
    target=df_test["label"],
    device="cuda:0",
    batch_size=4096,
    num_workers=2,
)
test_metrics_dcnv2

[2025-05-04 01:12:58,129]{model.py:789} - INFO - Building test dataloader
[2025-05-04 01:12:58,157]{model.py:297} - INFO - Encoding feature msno
[2025-05-04 01:12:58,228]{model.py:297} - INFO - Encoding feature song_id
[2025-05-04 01:12:58,475]{model.py:297} - INFO - Encoding feature source_system_tab
[2025-05-04 01:12:58,486]{model.py:297} - INFO - Encoding feature source_screen_name
[2025-05-04 01:12:58,510]{model.py:297} - INFO - Encoding feature source_type
[2025-05-04 01:12:58,520]{model.py:297} - INFO - Encoding feature city
[2025-05-04 01:12:58,544]{model.py:297} - INFO - Encoding feature bd
[2025-05-04 01:12:58,554]{model.py:297} - INFO - Encoding feature gender
[2025-05-04 01:12:58,571]{model.py:297} - INFO - Encoding feature registered_via
[2025-05-04 01:12:58,588]{model.py:297} - INFO - Encoding feature genre_ids
[2025-05-04 01:12:58,611]{model.py:297} - INFO - Encoding feature artist_name
[2025-05-04 01:12:58,713]{model.py:297} - INFO - Encoding feature language


Test epoch #-1:   0%|          | 0/181 [00:00<?, ?it/s]

[2025-05-04 01:13:26,249]{model.py:567} - INFO - Finished Test Epoch #-1, average metrics - [AUC: 0.78646, log_loss: 0.54609]


{'AUC': 0.7864580833023873, 'log_loss': 0.5460870245117619}

# FinalNet

In [94]:
finalnet_model = FinalNet(
    block_type="2B",
    use_field_gate=True,
    use_batch_norm=True,
    add_bias=True,
    block1_hidden_dims=[256, 512, 1024],
    block1_hidden_activations=nn.ReLU,
    block1_dropout_rates=0.2,
    block2_hidden_dims=[256, 512, 1024],
    block2_hidden_activations=nn.ReLU,
    block2_dropout_rates=0.2,
    residual_type="concat",
    proj_output_embeddings=True,
)

In [95]:
train_metrics_final, val_metrics_final = finalnet_model.fit(
    features=df_train.drop(columns="label"),
    target=df_train["label"],
    val_features=df_val.drop(columns="label"),
    val_target=df_val["label"],
    optimizer_cls="torch.optim.Adam",
    optimizer_params=dict(lr=1e-2),
    scheduler_cls="torch.optim.lr_scheduler.ReduceLROnPlateau",
    scheduler_params=dict(mode="max", factor=0.1, patience=1, min_lr=1e-6),
    grad_clip_threshold=10.,
    num_epochs=3,
    seed=42,
    artifacts_path="./finalnet_artifacts",
    device="cuda:0",
    batch_size=4096,
    num_workers=4,
    eval_metric_name="AUC",
    eval_mode="max",
    oov_masking_proba=0.05,
    embedded_features=num_features,
    default_embedding_size=20, # should be equal for all features if using field gate
    l2_net_reg=0.,
    l2_embedding_reg=0,
)

[2025-05-04 01:15:20,476]{model.py:660} - INFO - Used features config: FeaturesConfig(features=[Feature(name='msno', feature_type=<FeatureType.CATEGORICAL: 'categorical'>, feature_size=1, needs_embed=True, embedding_size=20, embedding_vocab_size=30534, embedding_padding_idx=None), Feature(name='song_id', feature_type=<FeatureType.CATEGORICAL: 'categorical'>, feature_size=1, needs_embed=True, embedding_size=20, embedding_vocab_size=324210, embedding_padding_idx=None), Feature(name='source_system_tab', feature_type=<FeatureType.CATEGORICAL: 'categorical'>, feature_size=1, needs_embed=True, embedding_size=20, embedding_vocab_size=10, embedding_padding_idx=None), Feature(name='source_screen_name', feature_type=<FeatureType.CATEGORICAL: 'categorical'>, feature_size=1, needs_embed=True, embedding_size=20, embedding_vocab_size=22, embedding_padding_idx=None), Feature(name='source_type', feature_type=<FeatureType.CATEGORICAL: 'categorical'>, feature_size=1, needs_embed=True, embedding_size=20,

Train epoch #0:   0%|          | 0/1441 [00:00<?, ?it/s]

[2025-05-04 01:26:38,658]{model.py:567} - INFO - Finished Train Epoch #0, average metrics - [loss: 0.58906]


Validation epoch #0:   0%|          | 0/181 [00:00<?, ?it/s]

[2025-05-04 01:27:05,240]{model.py:567} - INFO - Finished Validation Epoch #0, average metrics - [AUC: 0.77933, log_loss: 0.55412]
[2025-05-04 01:27:26,183]{model.py:747} - INFO - Best model with AUC = 0.7793291453155989 was saved to finalnet_artifacts/best_model.pt


Train epoch #1:   0%|          | 0/1441 [00:01<?, ?it/s]

[2025-05-04 01:38:29,387]{model.py:567} - INFO - Finished Train Epoch #1, average metrics - [loss: 0.54628]


Validation epoch #1:   0%|          | 0/181 [00:00<?, ?it/s]

[2025-05-04 01:39:15,072]{model.py:567} - INFO - Finished Validation Epoch #1, average metrics - [AUC: 0.79569, log_loss: 0.53652]
[2025-05-04 01:39:36,021]{model.py:747} - INFO - Best model with AUC = 0.7956929521233412 was saved to finalnet_artifacts/best_model.pt


Train epoch #2:   0%|          | 0/1441 [00:01<?, ?it/s]

[2025-05-04 01:50:39,839]{model.py:567} - INFO - Finished Train Epoch #2, average metrics - [loss: 0.51929]


Validation epoch #2:   0%|          | 0/181 [00:00<?, ?it/s]

[2025-05-04 01:51:25,788]{model.py:567} - INFO - Finished Validation Epoch #2, average metrics - [AUC: 0.80288, log_loss: 0.53150]
[2025-05-04 01:51:46,796]{model.py:747} - INFO - Best model with AUC = 0.8028754642694195 was saved to finalnet_artifacts/best_model.pt
[2025-05-04 01:51:46,797]{model.py:763} - INFO - Loading best model from finalnet_artifacts/best_model.pt


In [96]:
test_metrics_final = finalnet_model.test(
    features=df_test.drop(columns="label"),
    target=df_test["label"],
    device="cuda:0",
    batch_size=4096,
    num_workers=2,
)
test_metrics_final

[2025-05-04 01:51:46,909]{model.py:789} - INFO - Building test dataloader
[2025-05-04 01:51:46,933]{model.py:297} - INFO - Encoding feature msno
[2025-05-04 01:51:46,990]{model.py:297} - INFO - Encoding feature song_id
[2025-05-04 01:51:47,230]{model.py:297} - INFO - Encoding feature source_system_tab
[2025-05-04 01:51:47,241]{model.py:297} - INFO - Encoding feature source_screen_name
[2025-05-04 01:51:47,265]{model.py:297} - INFO - Encoding feature source_type
[2025-05-04 01:51:47,275]{model.py:297} - INFO - Encoding feature city
[2025-05-04 01:51:47,301]{model.py:297} - INFO - Encoding feature bd
[2025-05-04 01:51:47,312]{model.py:297} - INFO - Encoding feature gender
[2025-05-04 01:51:47,338]{model.py:297} - INFO - Encoding feature registered_via
[2025-05-04 01:51:47,347]{model.py:297} - INFO - Encoding feature genre_ids
[2025-05-04 01:51:47,377]{model.py:297} - INFO - Encoding feature artist_name
[2025-05-04 01:51:47,465]{model.py:297} - INFO - Encoding feature language


Test epoch #-1:   0%|          | 0/181 [00:00<?, ?it/s]

[2025-05-04 01:52:14,811]{model.py:567} - INFO - Finished Test Epoch #-1, average metrics - [AUC: 0.80282, log_loss: 0.53141]


{'AUC': 0.8028208149455214, 'log_loss': 0.5314084704530784}