In [3]:
!git clone https://github.com/sn09/ranking.git

Cloning into 'ranking'...
remote: Enumerating objects: 354, done.[K
remote: Counting objects: 100% (354/354), done.[K
remote: Compressing objects: 100% (209/209), done.[K
remote: Total 354 (delta 170), reused 294 (delta 119), pack-reused 0 (from 0)[K
Receiving objects: 100% (354/354), 230.55 KiB | 2.99 MiB/s, done.
Resolving deltas: 100% (170/170), done.


In [1]:
import sys

import lightgbm as lgb
import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier, LGBMRanker
from sklearn.metrics import log_loss, roc_auc_score
from torch import nn

sys.path.append("./ranking/models/")

from dcnv2.model import DCNv2
from finalnet.model import FinalNet

In [2]:
base_path = "/kaggle/input/avazu-x1"

df_train = pd.read_csv(f"{base_path}/train.csv")
df_test = pd.read_csv(f"{base_path}/test.csv")
df_val = pd.read_csv(f"{base_path}/valid.csv")

df_train.head()

Unnamed: 0,label,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8,feat_9,...,feat_13,feat_14,feat_15,feat_16,feat_17,feat_18,feat_19,feat_20,feat_21,feat_22
0,0,1,9,18,3582,7908,7931,12997,13304,13335,...,1541008,1541030,1543495,1543504,1543514,1543945,1543950,1544018,1544189,1544270
1,0,1,9,18,3582,7908,7931,12997,13304,13335,...,1541007,1541026,1543495,1543504,1543514,1543945,1543950,1544019,1544189,1544270
2,0,1,9,18,3582,7908,7931,12997,13304,13335,...,1541007,1541026,1543495,1543504,1543514,1543945,1543950,1544019,1544189,1544270
3,0,1,9,18,3582,7908,7931,12997,13304,13335,...,1541007,1541030,1543495,1543504,1543514,1543945,1543950,1544019,1544189,1544270
4,0,1,10,177,3712,7918,7931,12997,13304,13335,...,1541007,1541250,1543495,1543504,1543628,1543945,1543950,1544018,1544193,1544270


In [3]:
feature_columns = df_train.columns.difference(["label"]).tolist()
for df in [df_train, df_val, df_test]:
    df[feature_columns] = df[feature_columns].astype("category")

# LightGBM

## LGBMClassifier

In [101]:
booster_clf = LGBMClassifier(
    objective="binary",
    max_depth=8,
    learning_rate=1e-1,
    n_estimators=2000,
)

In [102]:
feature_names = df_train.columns.difference(["label"]).tolist()

booster_clf = booster_clf.fit(
    X=df_train.drop(columns="label"),
    y=df_train["label"],
    eval_set=(df_val.drop(columns="label"), df_val["label"]),
    feature_name=df_train.drop(columns="label").columns.tolist(),
    categorical_feature=feature_columns,
    callbacks=[lgb.early_stopping(stopping_rounds=10), lgb.log_evaluation()],
)

[LightGBM] [Info] Number of positive: 4953382, number of negative: 23346894
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 2.553294 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 14346
[LightGBM] [Info] Number of data points in the train set: 28300276, number of used features: 22
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.175029 -> initscore=-1.550393
[LightGBM] [Info] Start training from score -1.550393
[1]	valid_0's binary_logloss: 0.445751
Training until validation scores don't improve for 10 rounds
[2]	valid_0's binary_logloss: 0.440344
[3]	valid_0's binary_logloss: 0.436194
[4]	valid_0's binary_logloss: 0.432942
[5]	valid_0's binary_logloss: 0.43024
[6]	valid_0's binary_logloss: 0.428063
[7]	valid_0's binary_logloss: 0.426096
[8]	valid_0's binary_logloss: 0.424563
[9]	valid_0's binary_logloss: 0.423314
[10]	valid_0's binary_

In [103]:
booster_clf_preds = booster_clf.predict(
    df_test.drop(columns="label"),
    raw_score=True,
)

In [104]:
roc_auc = roc_auc_score(df_test["label"], booster_clf_preds)
logloss = log_loss(df_test["label"], booster_clf_preds)

print("LightGBM Classifier metrics")
print(f"ROC AUC: {roc_auc}, logloss: {logloss}")

LightGBM Classifier metrics
ROC AUC: 0.7558925197214148, logloss: 5.135342602797571


# DCNv2

In [4]:
dcnv2_model = DCNv2(
    model_structure="stacked_parallel",
    use_low_rank_mixture=True,
    cross_low_rank_dim=32,
    num_cross_layers=5,
    num_cross_experts=4,
    parallel_hidden_dims=[256, 512, 1024],
    parallel_dropout=0.2,
    parallel_use_batch_norm=True,
    parallel_activation=nn.ReLU,
    stacked_hidden_dims=[256, 512, 1024],
    stacked_dropout=0.2,
    stacked_use_batch_norm=True,
    stacked_activation=nn.ReLU,
    output_dim=1,
    proj_output_embeddings=True,
)

In [5]:
train_metrics_dcnv2, val_metrics_dcnv2 = dcnv2_model.fit(
    features=df_train.drop(columns="label"),
    target=df_train["label"],
    val_features=df_val.drop(columns="label"),
    val_target=df_val["label"],
    optimizer_cls="torch.optim.Adam",
    optimizer_params=dict(lr=1e-2),
    scheduler_cls="torch.optim.lr_scheduler.ReduceLROnPlateau",
    scheduler_params=dict(mode="max", factor=0.1, patience=1, min_lr=1e-6),
    grad_clip_threshold=10.,
    num_epochs=1,
    seed=42,
    artifacts_path="./dcnv2_artifacts",
    device="cuda:0",
    batch_size=4096,
    num_workers=4,
    eval_metric_name="AUC",
    eval_mode="max",
    default_embedding_size=10,
    oov_masking_proba=0.05,
    l2_net_reg=0,
    l2_embedding_reg=0,
)

[2025-05-04 02:38:04,886]{model.py:660} - INFO - Used features config: FeaturesConfig(features=[Feature(name='feat_1', feature_type=<FeatureType.CATEGORICAL: 'categorical'>, feature_size=1, needs_embed=True, embedding_size=10, embedding_vocab_size=8, embedding_padding_idx=None), Feature(name='feat_2', feature_type=<FeatureType.CATEGORICAL: 'categorical'>, feature_size=1, needs_embed=True, embedding_size=10, embedding_vocab_size=8, embedding_padding_idx=None), Feature(name='feat_3', feature_type=<FeatureType.CATEGORICAL: 'categorical'>, feature_size=1, needs_embed=True, embedding_size=10, embedding_vocab_size=3479, embedding_padding_idx=None), Feature(name='feat_4', feature_type=<FeatureType.CATEGORICAL: 'categorical'>, feature_size=1, needs_embed=True, embedding_size=10, embedding_vocab_size=4270, embedding_padding_idx=None), Feature(name='feat_5', feature_type=<FeatureType.CATEGORICAL: 'categorical'>, feature_size=1, needs_embed=True, embedding_size=10, embedding_vocab_size=25, embedd

Train epoch #0:   0%|          | 0/6910 [00:03<?, ?it/s]

[2025-05-04 04:03:46,368]{model.py:567} - INFO - Finished Train Epoch #0, average metrics - [loss: 0.39880]


Validation epoch #0:   0%|          | 0/988 [00:00<?, ?it/s]

[2025-05-04 04:06:07,086]{model.py:567} - INFO - Finished Validation Epoch #0, average metrics - [AUC: 0.73463, log_loss: 0.40229]
[2025-05-04 04:06:29,024]{model.py:747} - INFO - Best model with AUC = 0.7346270845082805 was saved to dcnv2_artifacts/best_model.pt
[2025-05-04 04:06:29,025]{model.py:763} - INFO - Loading best model from dcnv2_artifacts/best_model.pt


In [6]:
test_metrics_dcnv2 = dcnv2_model.test(
    features=df_test.drop(columns="label"),
    target=df_test["label"],
    device="cuda:0",
    batch_size=4096,
    num_workers=2,
)
test_metrics_dcnv2

[2025-05-04 04:06:29,582]{model.py:789} - INFO - Building test dataloader
[2025-05-04 04:06:30,015]{model.py:297} - INFO - Encoding feature feat_1
[2025-05-04 04:06:30,392]{model.py:297} - INFO - Encoding feature feat_2
[2025-05-04 04:06:30,590]{model.py:297} - INFO - Encoding feature feat_3
[2025-05-04 04:06:30,837]{model.py:297} - INFO - Encoding feature feat_4
[2025-05-04 04:06:31,071]{model.py:297} - INFO - Encoding feature feat_5
[2025-05-04 04:06:31,304]{model.py:297} - INFO - Encoding feature feat_6
[2025-05-04 04:06:31,454]{model.py:297} - INFO - Encoding feature feat_7
[2025-05-04 04:06:31,685]{model.py:297} - INFO - Encoding feature feat_8
[2025-05-04 04:06:31,810]{model.py:297} - INFO - Encoding feature feat_9
[2025-05-04 04:06:32,177]{model.py:297} - INFO - Encoding feature feat_10
[2025-05-04 04:06:32,994]{model.py:297} - INFO - Encoding feature feat_11
[2025-05-04 04:06:33,199]{model.py:297} - INFO - Encoding feature feat_12
[2025-05-04 04:06:33,396]{model.py:297} - INFO 

Test epoch #-1:   0%|          | 0/1975 [00:00<?, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7c3514d632e0>
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1604, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1587, in _shutdown_workers
    if w.is_alive():
       ^^^^^^^^^^^^
  File "/usr/lib/python3.11/multiprocessing/process.py", line 160, in is_alive
    assert self._parent_pid == os.getpid(), 'can only test a child process'
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
AssertionError: can only test a child process
Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7c3514d632e0>
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1604, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 15

{'AUC': 0.7540177975669551, 'log_loss': 0.37445330387777553}

# FinalNet

In [7]:
finalnet_model = FinalNet(
    block_type="2B",
    use_field_gate=True,
    use_batch_norm=True,
    add_bias=True,
    block1_hidden_dims=[256, 512, 1024],
    block1_hidden_activations=nn.ReLU,
    block1_dropout_rates=0.2,
    block2_hidden_dims=[256, 512, 1024],
    block2_hidden_activations=nn.ReLU,
    block2_dropout_rates=0.2,
    residual_type="concat",
    proj_output_embeddings=True,
)

In [8]:
train_metrics_final, val_metrics_final = finalnet_model.fit(
    features=df_train.drop(columns="label"),
    target=df_train["label"],
    val_features=df_val.drop(columns="label"),
    val_target=df_val["label"],
    optimizer_cls="torch.optim.Adam",
    optimizer_params=dict(lr=1e-2),
    scheduler_cls="torch.optim.lr_scheduler.ReduceLROnPlateau",
    scheduler_params=dict(mode="max", factor=0.1, patience=1, min_lr=1e-6),
    grad_clip_threshold=10.,
    num_epochs=1,
    seed=42,
    artifacts_path="./finalnet_artifacts",
    device="cuda:0",
    batch_size=4096,
    num_workers=4,
    eval_metric_name="AUC",
    eval_mode="max",
    oov_masking_proba=0.05,
    default_embedding_size=10, # should be equal for all features if using field gate
    l2_net_reg=0.,
    l2_embedding_reg=0,
)

[2025-05-04 04:11:46,001]{model.py:660} - INFO - Used features config: FeaturesConfig(features=[Feature(name='feat_1', feature_type=<FeatureType.CATEGORICAL: 'categorical'>, feature_size=1, needs_embed=True, embedding_size=10, embedding_vocab_size=8, embedding_padding_idx=None), Feature(name='feat_2', feature_type=<FeatureType.CATEGORICAL: 'categorical'>, feature_size=1, needs_embed=True, embedding_size=10, embedding_vocab_size=8, embedding_padding_idx=None), Feature(name='feat_3', feature_type=<FeatureType.CATEGORICAL: 'categorical'>, feature_size=1, needs_embed=True, embedding_size=10, embedding_vocab_size=3479, embedding_padding_idx=None), Feature(name='feat_4', feature_type=<FeatureType.CATEGORICAL: 'categorical'>, feature_size=1, needs_embed=True, embedding_size=10, embedding_vocab_size=4270, embedding_padding_idx=None), Feature(name='feat_5', feature_type=<FeatureType.CATEGORICAL: 'categorical'>, feature_size=1, needs_embed=True, embedding_size=10, embedding_vocab_size=25, embedd

Train epoch #0:   0%|          | 0/6910 [00:04<?, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7c3514d632e0>
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1604, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1587, in _shutdown_workers
    if w.is_alive():
       ^^^^^^^^^^^^
  File "/usr/lib/python3.11/multiprocessing/process.py", line 160, in is_alive
    assert self._parent_pid == os.getpid(), 'can only test a child process'
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
AssertionError: can only test a child process
Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7c3514d632e0>
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1604, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 15

Validation epoch #0:   0%|          | 0/988 [00:00<?, ?it/s]

[2025-05-04 05:38:48,478]{model.py:567} - INFO - Finished Validation Epoch #0, average metrics - [AUC: 0.73775, log_loss: 0.39812]
[2025-05-04 05:39:10,485]{model.py:747} - INFO - Best model with AUC = 0.7377471633309797 was saved to finalnet_artifacts/best_model.pt
[2025-05-04 05:39:10,486]{model.py:763} - INFO - Loading best model from finalnet_artifacts/best_model.pt


In [9]:
test_metrics_final = finalnet_model.test(
    features=df_test.drop(columns="label"),
    target=df_test["label"],
    device="cuda:0",
    batch_size=4096,
    num_workers=2,
)
test_metrics_final

[2025-05-04 05:39:11,010]{model.py:789} - INFO - Building test dataloader
[2025-05-04 05:39:11,384]{model.py:297} - INFO - Encoding feature feat_1
[2025-05-04 05:39:11,744]{model.py:297} - INFO - Encoding feature feat_2
[2025-05-04 05:39:11,937]{model.py:297} - INFO - Encoding feature feat_3
[2025-05-04 05:39:12,192]{model.py:297} - INFO - Encoding feature feat_4
[2025-05-04 05:39:12,425]{model.py:297} - INFO - Encoding feature feat_5
[2025-05-04 05:39:12,653]{model.py:297} - INFO - Encoding feature feat_6
[2025-05-04 05:39:12,807]{model.py:297} - INFO - Encoding feature feat_7
[2025-05-04 05:39:13,039]{model.py:297} - INFO - Encoding feature feat_8
[2025-05-04 05:39:13,231]{model.py:297} - INFO - Encoding feature feat_9
[2025-05-04 05:39:13,627]{model.py:297} - INFO - Encoding feature feat_10
[2025-05-04 05:39:14,509]{model.py:297} - INFO - Encoding feature feat_11
[2025-05-04 05:39:14,678]{model.py:297} - INFO - Encoding feature feat_12
[2025-05-04 05:39:14,914]{model.py:297} - INFO 

Test epoch #-1:   0%|          | 0/1975 [00:00<?, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7c3514d632e0>
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1604, in __del__
Traceback (most recent call last):
    self._shutdown_workers()
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1587, in _shutdown_workers
    if w.is_alive():
      ^ ^^^^^^^^^^^
  File "/usr/lib/python3.11/multiprocessing/process.py", line 160, in is_alive
    assert self._parent_pid == os.getpid(), 'can only test a child process'
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
AssertionError: can only test a child process
Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7c3514d632e0>
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1604, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 15

{'AUC': 0.7584461618508356, 'log_loss': 0.36968690108798863}