In [5]:
!unzip /content/MovielensLatest_x1.zip

Archive:  /content/MovielensLatest_x1.zip
  inflating: valid.csv               
  inflating: test.csv                
  inflating: train.csv               


In [2]:
!git clone https://github.com/sn09/ranking.git

Cloning into 'ranking'...
remote: Enumerating objects: 283, done.[K
remote: Counting objects: 100% (283/283), done.[K
remote: Compressing objects: 100% (164/164), done.[K
remote: Total 283 (delta 131), reused 241 (delta 97), pack-reused 0 (from 0)[K
Receiving objects: 100% (283/283), 136.00 KiB | 769.00 KiB/s, done.
Resolving deltas: 100% (131/131), done.


In [3]:
import sys

import pandas as pd

# Make the cloned repository's model packages importable (e.g. `dcnv2`).
# The membership check keeps sys.path free of duplicate entries when this
# cell is executed more than once in the same kernel.
RANKING_MODELS_DIR = "./ranking/models/"
if RANKING_MODELS_DIR not in sys.path:
    sys.path.append(RANKING_MODELS_DIR)

from dcnv2.model.model import DCNv2

In [4]:
# Load the three dataset splits extracted from the archive, in the order
# train -> test -> valid, and bind each one to its own frame.
df_train, df_test, df_val = (
    pd.read_csv(csv_path)
    for csv_path in ("train.csv", "test.csv", "valid.csv")
)

# Preview the first rows of the training split.
df_train.head()

Unnamed: 0,label,user_id,item_id,tag_id
0,0,84982,58,39525
1,0,87756,8757,22786
2,0,80311,14912,45676
3,1,82036,84509,33556
4,0,66867,57349,15092


In [5]:
# Cast every id column of every split to pandas' categorical dtype.
# The frames are updated in place, one column at a time.
categorical_columns = ("user_id", "item_id", "tag_id")

for df in (df_train, df_val, df_test):
    for column in categorical_columns:
        df[column] = df[column].astype("category")

In [6]:
import torch
from torch import nn


# Cross-network settings.
# NOTE(review): with use_low_rank_mixture=False the low-rank dimension and the
# number of experts are presumably ignored - confirm against DCNv2.
cross_net_config = dict(
    use_low_rank_mixture=False,
    cross_low_rank_dim=32,
    num_cross_layers=5,
    num_cross_experts=4,
)

# Deep tower used by the "parallel" model structure.
parallel_tower_config = dict(
    parallel_hidden_dims=[400, 400, 400],
    parallel_dropout=0.2,
    parallel_use_batch_norm=True,
    parallel_activation=nn.ReLU,
)

# Deep tower settings for the "stacked" structure.
# NOTE(review): presumably unused while model_structure="parallel" - confirm.
stacked_tower_config = dict(
    stacked_hidden_dims=[500, 500, 500],
    stacked_dropout=0.2,
    stacked_use_batch_norm=True,
    stacked_activation=nn.ReLU,
)

# Keyword arguments are passed in the same order as a flat call would list them.
model = DCNv2(
    model_structure="parallel",
    **cross_net_config,
    **parallel_tower_config,
    **stacked_tower_config,
    output_dim=1,
    proj_output_embeddings=False,
)

In [7]:
# Display the module tree of the freshly constructed model. In the recorded
# run only the loss function is registered at this point, before fit().
model

DCNv2(
  (loss_fn): BCEWithLogitsLoss()
)

In [8]:
import os

# Train on the training split and validate on the validation split after
# every epoch. The checkpoint with the lowest validation log_loss
# (eval_mode="min") is written under artifacts_path.
#
# The device falls back to CPU when CUDA is unavailable, so the cell does not
# crash on a runtime without a GPU; on a GPU runtime it still uses "cuda:0".
# NOTE(review): assumes fit() accepts any torch device string - confirm.
# NOTE(review): oov_masking_proba presumably is the probability of replacing
# an id with the out-of-vocabulary token during training - confirm.
train_metrics, val_metrics = model.fit(
    features=df_train.drop(columns="label"),
    target=df_train["label"],
    val_features=df_val.drop(columns="label"),
    val_target=df_val["label"],
    optimizer_cls="torch.optim.Adam",
    optimizer_params=dict(lr=1e-3),
    num_epochs=8,
    seed=42,
    artifacts_path="./dcnv2_artifacts",
    device="cuda:0" if torch.cuda.is_available() else "cpu",
    batch_size=4096,
    num_workers=2,
    eval_metric_name="log_loss",
    eval_mode="min",
    embedded_features=["user_id", "item_id", "tag_id"],
    oov_masking_proba=0.05,
)

[2025-05-02 14:58:17,803]{model.py:614} - INFO - Used features config: FeaturesConfig(features=[Feature(name='user_id', feature_type=<FeatureType.CATEGORICAL: 'categorical'>, feature_size=1, needs_embed=True, embedding_size=10, embedding_vocab_size=16976, embedding_padding_idx=None), Feature(name='item_id', feature_type=<FeatureType.CATEGORICAL: 'categorical'>, feature_size=1, needs_embed=True, embedding_size=10, embedding_vocab_size=23605, embedding_padding_idx=None), Feature(name='tag_id', feature_type=<FeatureType.CATEGORICAL: 'categorical'>, feature_size=1, needs_embed=True, embedding_size=10, embedding_vocab_size=49658, embedding_padding_idx=None)])
[2025-05-02 14:58:18,785]{model.py:629} - INFO - Artifacts path is /content/dcnv2_artifacts
[2025-05-02 14:58:18,786]{model.py:635} - INFO - Best model path is dcnv2_artifacts/best_model.pt
[2025-05-02 14:58:18,930]{model.py:640} - INFO - Building features mappings
[2025-05-02 14:58:18,931]{model.py:267} - INFO - Building mapping for f

Train epoch #0:   0%|          | 0/343 [00:00<?, ?it/s]

[2025-05-02 15:00:22,496]{model.py:525} - INFO - Finished Train Epoch #0, average metrics - [loss: 0.58473]


Validation epoch #0:   0%|          | 0/98 [00:00<?, ?it/s]

[2025-05-02 15:00:38,758]{model.py:525} - INFO - Finished Validation Epoch #0, average metrics - [AUC: 0.79449, log_loss: 0.48807]
[2025-05-02 15:00:49,128]{model.py:699} - INFO - Best model with log_loss = 0.4880737599358855 was saved to dcnv2_artifacts/best_model.pt


Train epoch #1:   0%|          | 0/343 [00:00<?, ?it/s]

[2025-05-02 15:02:53,339]{model.py:525} - INFO - Finished Train Epoch #1, average metrics - [loss: 0.44448]


Validation epoch #1:   0%|          | 0/98 [00:00<?, ?it/s]

[2025-05-02 15:03:19,982]{model.py:525} - INFO - Finished Validation Epoch #1, average metrics - [AUC: 0.87708, log_loss: 0.37407]
[2025-05-02 15:03:30,307]{model.py:699} - INFO - Best model with log_loss = 0.3740705386587714 was saved to dcnv2_artifacts/best_model.pt


Train epoch #2:   0%|          | 0/343 [00:00<?, ?it/s]

[2025-05-02 15:05:31,047]{model.py:525} - INFO - Finished Train Epoch #2, average metrics - [loss: 0.36930]


Validation epoch #2:   0%|          | 0/98 [00:00<?, ?it/s]

[2025-05-02 15:05:57,133]{model.py:525} - INFO - Finished Validation Epoch #2, average metrics - [AUC: 0.90196, log_loss: 0.33358]
[2025-05-02 15:06:07,470]{model.py:699} - INFO - Best model with log_loss = 0.3335841177552086 was saved to dcnv2_artifacts/best_model.pt


Train epoch #3:   0%|          | 0/343 [00:00<?, ?it/s]

[2025-05-02 15:08:09,109]{model.py:525} - INFO - Finished Train Epoch #3, average metrics - [loss: 0.33780]


Validation epoch #3:   0%|          | 0/98 [00:00<?, ?it/s]

[2025-05-02 15:08:35,257]{model.py:525} - INFO - Finished Validation Epoch #3, average metrics - [AUC: 0.91342, log_loss: 0.31684]
[2025-05-02 15:08:45,592]{model.py:699} - INFO - Best model with log_loss = 0.3168417935052822 was saved to dcnv2_artifacts/best_model.pt


Train epoch #4:   0%|          | 0/343 [00:00<?, ?it/s]

[2025-05-02 15:10:44,832]{model.py:525} - INFO - Finished Train Epoch #4, average metrics - [loss: 0.32104]


Validation epoch #4:   0%|          | 0/98 [00:00<?, ?it/s]

[2025-05-02 15:11:10,737]{model.py:525} - INFO - Finished Validation Epoch #4, average metrics - [AUC: 0.91952, log_loss: 0.30996]
[2025-05-02 15:11:21,077]{model.py:699} - INFO - Best model with log_loss = 0.30995672137627245 was saved to dcnv2_artifacts/best_model.pt


Train epoch #5:   0%|          | 0/343 [00:00<?, ?it/s]

[2025-05-02 15:13:20,808]{model.py:525} - INFO - Finished Train Epoch #5, average metrics - [loss: 0.31016]


Validation epoch #5:   0%|          | 0/98 [00:00<?, ?it/s]

[2025-05-02 15:13:47,395]{model.py:525} - INFO - Finished Validation Epoch #5, average metrics - [AUC: 0.92341, log_loss: 0.30422]
[2025-05-02 15:13:57,810]{model.py:699} - INFO - Best model with log_loss = 0.30422494563473973 was saved to dcnv2_artifacts/best_model.pt


Train epoch #6:   0%|          | 0/343 [00:00<?, ?it/s]

[2025-05-02 15:15:58,197]{model.py:525} - INFO - Finished Train Epoch #6, average metrics - [loss: 0.30188]


Validation epoch #6:   0%|          | 0/98 [00:00<?, ?it/s]

[2025-05-02 15:16:25,569]{model.py:525} - INFO - Finished Validation Epoch #6, average metrics - [AUC: 0.92612, log_loss: 0.30093]
[2025-05-02 15:16:35,913]{model.py:699} - INFO - Best model with log_loss = 0.3009284389328689 was saved to dcnv2_artifacts/best_model.pt


Train epoch #7:   0%|          | 0/343 [00:00<?, ?it/s]

[2025-05-02 15:18:36,976]{model.py:525} - INFO - Finished Train Epoch #7, average metrics - [loss: 0.29520]


Validation epoch #7:   0%|          | 0/98 [00:00<?, ?it/s]

[2025-05-02 15:19:03,306]{model.py:525} - INFO - Finished Validation Epoch #7, average metrics - [AUC: 0.92818, log_loss: 0.29941]
[2025-05-02 15:19:13,626]{model.py:699} - INFO - Best model with log_loss = 0.29940591576735626 was saved to dcnv2_artifacts/best_model.pt


In [9]:
# Score the fitted model on the held-out test split (AUC and log_loss are
# reported in the log output).
# The device falls back to CPU when CUDA is unavailable, so the cell does not
# crash on a runtime without a GPU.
# NOTE(review): unclear from here whether test() uses the in-memory weights or
# reloads the saved best checkpoint - confirm.
test_metrics = model.test(
    features=df_test.drop(columns="label"),
    target=df_test["label"],
    device="cuda:0" if torch.cuda.is_available() else "cpu",
    batch_size=4096,
    num_workers=2,
)

[2025-05-02 15:19:13,643]{model.py:735} - INFO - Building test dataloader
[2025-05-02 15:19:13,647]{model.py:292} - INFO - Encoding feature user_id
[2025-05-02 15:19:13,666]{model.py:292} - INFO - Encoding feature item_id
[2025-05-02 15:19:13,684]{model.py:292} - INFO - Encoding feature tag_id


Test epoch #-1:   0%|          | 0/49 [00:00<?, ?it/s]

[2025-05-02 15:19:21,788]{model.py:525} - INFO - Finished Test Epoch #-1, average metrics - [AUC: 0.92778, log_loss: 0.29982]


In [10]:
# Re-score the training split with the fitted model (evaluation pass, no
# further fitting) so its metrics can be compared with the test split to
# gauge the generalization gap.
# The device falls back to CPU when CUDA is unavailable, so the cell does not
# crash on a runtime without a GPU.
train_metrics_v2 = model.test(
    features=df_train.drop(columns="label"),
    target=df_train["label"],
    device="cuda:0" if torch.cuda.is_available() else "cpu",
    batch_size=4096,
    num_workers=2,
)

[2025-05-02 15:19:32,109]{model.py:735} - INFO - Building test dataloader
[2025-05-02 15:19:32,125]{model.py:292} - INFO - Encoding feature user_id
[2025-05-02 15:19:32,184]{model.py:292} - INFO - Encoding feature item_id
[2025-05-02 15:19:32,222]{model.py:292} - INFO - Encoding feature tag_id


Test epoch #-1:   0%|          | 0/343 [00:00<?, ?it/s]

[2025-05-02 15:20:27,052]{model.py:525} - INFO - Finished Test Epoch #-1, average metrics - [AUC: 0.94947, log_loss: 0.26023]


In [11]:
# Display the module tree again after training. In the recorded run the repr
# now lists the embedding layer, the cross network and the parallel DNN.
model

DCNv2(
  (loss_fn): BCEWithLogitsLoss()
  (embedding_layer): EmbeddingLayer(
    (output_proj): Identity()
    (embedding_modules): ModuleDict(
      (user_id): Embedding(16976, 10)
      (item_id): Embedding(23605, 10)
      (tag_id): Embedding(49658, 10)
    )
    (dummy_fn): Identity()
  )
  (crossnet): CrossNetV2(
    (cross_layers): ModuleList(
      (0-4): 5 x Linear(in_features=30, out_features=30, bias=True)
    )
  )
  (parallel_dnn): MLPBlock(
    (mlp): Sequential(
      (0): Linear(in_features=30, out_features=400, bias=True)
      (1): BatchNorm1d(400, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU()
      (3): Dropout(p=0.2, inplace=False)
      (4): Linear(in_features=400, out_features=400, bias=True)
      (5): BatchNorm1d(400, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (6): ReLU()
      (7): Dropout(p=0.2, inplace=False)
      (8): Linear(in_features=400, out_features=400, bias=True)
      (9): BatchNorm1d(400