In [1]:
# Extract the MovielensLatest_x1.zip archive from /content into the working directory.
# NOTE(review): the recorded output of this cell shows unzip could not find the archive
# at this absolute path; presumably the CSV files read later were obtained another
# way — confirm the archive location before re-running.
!unzip /content/MovielensLatest_x1.zip

unzip:  cannot find or open /content/MovielensLatest_x1.zip, /content/MovielensLatest_x1.zip.zip or /content/MovielensLatest_x1.zip.ZIP.


In [1]:
!git clone https://github.com/sn09/ranking.git

Cloning into 'ranking'...
remote: Enumerating objects: 230, done.[K
remote: Counting objects: 100% (230/230), done.[K
remote: Compressing objects: 100% (133/133), done.[K
remote: Total 230 (delta 98), reused 202 (delta 75), pack-reused 0 (from 0)[K
Receiving objects: 100% (230/230), 116.36 KiB | 6.12 MiB/s, done.
Resolving deltas: 100% (98/98), done.


In [2]:
import sys

import pandas as pd

sys.path.append("./ranking/models/")

from dcnv2.model.model import DCNv2

In [3]:
def load_split(path: str) -> pd.DataFrame:
    """Read one dataset split and discard a row index that was saved as a column.

    The recorded ``head()`` output for train.csv shows an ``Unnamed: 0`` column,
    which is the name pandas gives to a header-less first column (typically an
    index written by ``to_csv``). It carries no feature information, so it is
    dropped here instead of travelling with the features passed to the model.

    Args:
        path: Location of the CSV file to read.

    Returns:
        The split as a DataFrame without the ``Unnamed: 0`` column. Files that
        do not contain that column are returned unchanged.
    """
    # errors="ignore" keeps this safe for splits that were saved without an index.
    return pd.read_csv(path).drop(columns="Unnamed: 0", errors="ignore")


df_train = load_split("train.csv")
df_test = load_split("test.csv")
df_val = load_split("valid.csv")

df_train.head()

Unnamed: 0,label,user_id,item_id,tag_id
0,0,84982,58,39525
1,0,87756,8757,22786
2,0,80311,14912,45676
3,1,82036,84509,33556
4,0,66867,57349,15092


In [4]:
import numpy as np


class Vocab:
    """Two-way mapping between raw feature values and contiguous integer ids.

    Ids are assigned in order of first appearance in ``data``, so the same
    input always produces the same mapping. When ``add_oov_idx`` is true, one
    extra id placed right after the known values is reserved for values that
    were not present in ``data``.

    Args:
        data: Iterable of hashable raw values (duplicates allowed).
        add_oov_idx: Whether to reserve an out-of-vocabulary id.
    """

    def __init__(self, data: np.ndarray, add_oov_idx: bool = True):
        self._data = data

        self.add_oov_idx = add_oov_idx
        # Set by create_mappings(); stays None when no OOV slot is requested.
        self.oov_idx = None

        self.create_mappings()

    def create_mappings(self):
        """Build ``data2id``, ``id2data``, ``oov_idx`` and ``vocab_size`` from the data."""
        # dict.fromkeys de-duplicates while keeping first-appearance order, which
        # makes the id assignment independent of set iteration order.
        unique_values = list(dict.fromkeys(self._data))

        # raw value -> id
        self.data2id = {value: idx for idx, value in enumerate(unique_values)}
        # id -> raw value (the true inverse of data2id)
        self.id2data = dict(enumerate(unique_values))

        # The OOV id, when requested, is the first id after the known values.
        self.oov_idx = len(self.data2id) if self.add_oov_idx else None
        # Number of distinct ids this vocabulary can emit: the OOV slot is only
        # counted when it actually exists.
        self.vocab_size = len(self.data2id) + (1 if self.add_oov_idx else 0)


# Build one vocabulary per id column, each from the training split's values.
vocab_user, vocab_item, vocab_tag = (
    Vocab(df_train[column_name].values)
    for column_name in ("user_id", "item_id", "tag_id")
)

In [5]:
def preprocess_feature(col: pd.Series, vocab: "Vocab") -> pd.Series:
    """Encode raw feature values as integer vocabulary ids.

    Each value is looked up in ``vocab.data2id``; values that are not in the
    vocabulary (including missing values) are replaced by ``vocab.oov_idx``.

    Args:
        col: Raw feature values.
        vocab: Object exposing ``data2id`` (raw value -> id) and ``oov_idx``.

    Returns:
        An int64 Series of ids with the same index as ``col``.

    Raises:
        ValueError: If ``col`` contains values unknown to the vocabulary and
            the vocabulary has no out-of-vocabulary id.
    """
    encoded = col.map(vocab.data2id)

    unseen = encoded.isna()
    if unseen.any():
        if vocab.oov_idx is None:
            raise ValueError(
                f"{int(unseen.sum())} value(s) are not in the vocabulary "
                "and no OOV index is available"
            )
        encoded = encoded.fillna(vocab.oov_idx)

    # map() yields float64 as soon as a single value is unseen; casting keeps
    # the id dtype identical across splits with and without unseen values.
    return encoded.astype("int64")

In [6]:
# Replace the raw ids in every split by their vocabulary ids and store each
# encoded column as a pandas categorical.
# NOTE(review): the frames are overwritten in place, so running this cell twice
# would look up already-encoded ids in the raw-value vocabularies — re-run the
# loading cell first.
feature_vocabs = {"user_id": vocab_user, "item_id": vocab_item, "tag_id": vocab_tag}

for df in (df_train, df_val, df_test):
    for column_name, column_vocab in feature_vocabs.items():
        encoded_column = preprocess_feature(df[column_name], column_vocab)
        df[column_name] = encoded_column.astype("category")

In [7]:
import torch
from torch import nn


# Constructor arguments, grouped by their name prefix to keep the call readable.
cross_options = {
    "use_low_rank_mixture": False,
    "cross_low_rank_dim": 32,
    "num_cross_layers": 5,
    "num_cross_experts": 4,
}
parallel_options = {
    "parallel_hidden_dims": [400, 400, 400],
    "parallel_dropout": 0.2,
    "parallel_use_batch_norm": True,
    "parallel_activation": nn.ReLU,
}
stacked_options = {
    "stacked_hidden_dims": [500, 500, 500],
    "stacked_dropout": 0.2,
    "stacked_use_batch_norm": True,
    "stacked_activation": nn.ReLU,
}

model = DCNv2(
    model_structure="parallel",
    **cross_options,
    **parallel_options,
    **stacked_options,
    output_dim=1,
    proj_output_embeddings=False,
)

In [8]:
# Display the module repr right after construction, before fit() has been called.
model

DCNv2(
  (loss_fn): BCEWithLogitsLoss()
)

In [9]:
import os

# NOTE(review): CUDA_LAUNCH_BLOCKING=1 looks like a debugging aid left in place —
# confirm it is still wanted for regular training runs.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

# Training and validation inputs: every column except the label is passed as a feature.
fit_data = {
    "features": df_train.drop(columns="label"),
    "target": df_train["label"],
    "val_features": df_val.drop(columns="label"),
    "val_target": df_val["label"],
}

# Optimisation, hardware and bookkeeping settings for the run.
fit_settings = {
    "optimizer_cls": "torch.optim.Adam",
    "optimizer_params": {"lr": 1e-3},
    "num_epochs": 8,
    "seed": 42,
    "artifacts_path": "./dcnv2_artifacts",
    "device": "cuda:0",
    "batch_size": 4096,
    "num_workers": 2,
    "eval_metric_name": "log_loss",
    "eval_mode": "min",
    "embedded_features": ["user_id", "item_id", "tag_id"],
}

train_metrics, val_metrics = model.fit(**fit_data, **fit_settings)

[2025-04-16 21:57:23,287]{model.py:540} - INFO - Used features config: FeaturesConfig(features=[Feature(name='user_id', feature_type=<FeatureType.CATEGORICAL: 'categorical'>, feature_size=1, needs_embed=True, embedding_size=10, embedding_vocab_size=16976, embedding_padding_idx=None), Feature(name='item_id', feature_type=<FeatureType.CATEGORICAL: 'categorical'>, feature_size=1, needs_embed=True, embedding_size=10, embedding_vocab_size=23605, embedding_padding_idx=None), Feature(name='tag_id', feature_type=<FeatureType.CATEGORICAL: 'categorical'>, feature_size=1, needs_embed=True, embedding_size=10, embedding_vocab_size=49658, embedding_padding_idx=None)])
[2025-04-16 21:57:25,384]{model.py:555} - INFO - Artifacts path is /content/dcnv2_artifacts
[2025-04-16 21:57:25,385]{model.py:561} - INFO - Best model path is dcnv2_artifacts/best_model.pt
[2025-04-16 21:57:25,737]{model.py:586} - INFO - Starting training process


Train epoch #0:   0%|          | 0/343 [00:00<?, ?it/s]

[2025-04-16 21:58:50,355]{model.py:471} - INFO - Finished Train Epoch #0, average metrics - [loss: 0.58202]


Validation epoch #0:   0%|          | 0/98 [00:00<?, ?it/s]

[2025-04-16 21:59:17,856]{model.py:471} - INFO - Finished Validation Epoch #0, average metrics - [AUC: 0.79798, log_loss: 0.48530]
[2025-04-16 21:59:28,203]{model.py:619} - INFO - Best model with log_loss = 0.48529880546142506 was saved to dcnv2_artifacts/best_model.pt


Train epoch #1:   0%|          | 0/343 [00:00<?, ?it/s]

[2025-04-16 22:00:50,425]{model.py:471} - INFO - Finished Train Epoch #1, average metrics - [loss: 0.43030]


Validation epoch #1:   0%|          | 0/98 [00:00<?, ?it/s]

[2025-04-16 22:01:28,649]{model.py:471} - INFO - Finished Validation Epoch #1, average metrics - [AUC: 0.87729, log_loss: 0.37211]
[2025-04-16 22:01:39,065]{model.py:619} - INFO - Best model with log_loss = 0.3721055105247786 was saved to dcnv2_artifacts/best_model.pt


Train epoch #2:   0%|          | 0/343 [00:00<?, ?it/s]

[2025-04-16 22:03:03,872]{model.py:471} - INFO - Finished Train Epoch #2, average metrics - [loss: 0.35303]


Validation epoch #2:   0%|          | 0/98 [00:00<?, ?it/s]

[2025-04-16 22:03:41,776]{model.py:471} - INFO - Finished Validation Epoch #2, average metrics - [AUC: 0.90201, log_loss: 0.33310]
[2025-04-16 22:03:52,142]{model.py:619} - INFO - Best model with log_loss = 0.33309537818270374 was saved to dcnv2_artifacts/best_model.pt


Train epoch #3:   0%|          | 0/343 [00:00<?, ?it/s]

[2025-04-16 22:05:18,396]{model.py:471} - INFO - Finished Train Epoch #3, average metrics - [loss: 0.32040]


Validation epoch #3:   0%|          | 0/98 [00:00<?, ?it/s]

[2025-04-16 22:05:56,902]{model.py:471} - INFO - Finished Validation Epoch #3, average metrics - [AUC: 0.91288, log_loss: 0.31783]
[2025-04-16 22:06:07,296]{model.py:619} - INFO - Best model with log_loss = 0.31783066635955576 was saved to dcnv2_artifacts/best_model.pt


Train epoch #4:   0%|          | 0/343 [00:00<?, ?it/s]

[2025-04-16 22:07:31,372]{model.py:471} - INFO - Finished Train Epoch #4, average metrics - [loss: 0.30281]


Validation epoch #4:   0%|          | 0/98 [00:00<?, ?it/s]

[2025-04-16 22:08:10,524]{model.py:471} - INFO - Finished Validation Epoch #4, average metrics - [AUC: 0.91904, log_loss: 0.30972]
[2025-04-16 22:08:20,881]{model.py:619} - INFO - Best model with log_loss = 0.309715503355518 was saved to dcnv2_artifacts/best_model.pt


Train epoch #5:   0%|          | 0/343 [00:00<?, ?it/s]

[2025-04-16 22:09:43,966]{model.py:471} - INFO - Finished Train Epoch #5, average metrics - [loss: 0.29070]


Validation epoch #5:   0%|          | 0/98 [00:00<?, ?it/s]

[2025-04-16 22:10:22,022]{model.py:471} - INFO - Finished Validation Epoch #5, average metrics - [AUC: 0.92324, log_loss: 0.30487]
[2025-04-16 22:10:32,388]{model.py:619} - INFO - Best model with log_loss = 0.30486998605589927 was saved to dcnv2_artifacts/best_model.pt


Train epoch #6:   0%|          | 0/343 [00:00<?, ?it/s]

[2025-04-16 22:11:57,335]{model.py:471} - INFO - Finished Train Epoch #6, average metrics - [loss: 0.28160]


Validation epoch #6:   0%|          | 0/98 [00:00<?, ?it/s]

[2025-04-16 22:12:35,936]{model.py:471} - INFO - Finished Validation Epoch #6, average metrics - [AUC: 0.92607, log_loss: 0.30102]
[2025-04-16 22:12:46,336]{model.py:619} - INFO - Best model with log_loss = 0.3010155244007091 was saved to dcnv2_artifacts/best_model.pt


Train epoch #7:   0%|          | 0/343 [00:00<?, ?it/s]

[2025-04-16 22:14:09,409]{model.py:471} - INFO - Finished Train Epoch #7, average metrics - [loss: 0.27362]


Validation epoch #7:   0%|          | 0/98 [00:00<?, ?it/s]

[2025-04-16 22:14:47,409]{model.py:471} - INFO - Finished Validation Epoch #7, average metrics - [AUC: 0.92807, log_loss: 0.29905]
[2025-04-16 22:14:57,753]{model.py:619} - INFO - Best model with log_loss = 0.2990462041411652 was saved to dcnv2_artifacts/best_model.pt


In [10]:
# Evaluate the fitted model on the test split.
test_eval_options = {"device": "cuda:0", "batch_size": 4096, "num_workers": 2}

test_metrics = model.test(
    features=df_test.drop(columns="label"),
    target=df_test["label"],
    **test_eval_options,
)

Test epoch #-1:   0%|          | 0/49 [00:00<?, ?it/s]

[2025-04-16 22:15:12,187]{model.py:471} - INFO - Finished Test Epoch #-1, average metrics - [AUC: 0.92804, log_loss: 0.29928]


In [11]:
# Run the same evaluation on the training split, for comparison with the test metrics.
train_eval_options = {"device": "cuda:0", "batch_size": 4096, "num_workers": 2}

train_metrics_v2 = model.test(
    features=df_train.drop(columns="label"),
    target=df_train["label"],
    **train_eval_options,
)

Test epoch #-1:   0%|          | 0/343 [00:00<?, ?it/s]

[2025-04-16 22:16:45,171]{model.py:471} - INFO - Finished Test Epoch #-1, average metrics - [AUC: 0.95088, log_loss: 0.25635]


In [12]:
# Display the module repr again after fitting; the recorded output now lists the
# embedding, cross-network and MLP sub-modules.
model

DCNv2(
  (loss_fn): BCEWithLogitsLoss()
  (embedding_layer): EmbeddingLayer(
    (output_proj): Identity()
    (embedding_modules): ModuleDict(
      (user_id): Embedding(16976, 10)
      (item_id): Embedding(23605, 10)
      (tag_id): Embedding(49658, 10)
    )
    (dummy_fn): Identity()
  )
  (crossnet): CrossNetV2(
    (cross_layers): ModuleList(
      (0-4): 5 x Linear(in_features=30, out_features=30, bias=True)
    )
  )
  (parallel_dnn): MLPBlock(
    (mlp): Sequential(
      (0): Linear(in_features=30, out_features=400, bias=True)
      (1): BatchNorm1d(400, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU()
      (3): Dropout(p=0.2, inplace=False)
      (4): Linear(in_features=400, out_features=400, bias=True)
      (5): BatchNorm1d(400, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (6): ReLU()
      (7): Dropout(p=0.2, inplace=False)
      (8): Linear(in_features=400, out_features=400, bias=True)
      (9): BatchNorm1d(400