In [3]:
!unzip /content/MovielensLatest_x1.zip

Archive:  /content/MovielensLatest_x1.zip
  inflating: valid.csv               
  inflating: test.csv                
  inflating: train.csv               


In [1]:
!git clone https://github.com/sn09/ranking.git

Cloning into 'ranking'...
remote: Enumerating objects: 149, done.[K
remote: Counting objects: 100% (149/149), done.[K
remote: Compressing objects: 100% (92/92), done.[K
remote: Total 149 (delta 45), reused 134 (delta 35), pack-reused 0 (from 0)[K
Receiving objects: 100% (149/149), 72.45 KiB | 3.81 MiB/s, done.
Resolving deltas: 100% (45/45), done.


In [2]:
import sys

import pandas as pd

sys.path.append("../ranking/models/")

from dcnv2.model.model import DCNv2
from dcnv2.config.model_config import DCNv2Config, ModelStructure
from common.base.config.training_config import TrainingConfig
from common.features.config import Feature, FeaturesConfig
from common.features.types import FeatureType

In [3]:
# df_train = pd.read_csv("train.csv").iloc[:50_000]
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")
df_val = pd.read_csv("valid.csv")
# df_val = pd.read_csv("valid.csv").iloc[:10_000]

df_train.head()

Unnamed: 0,label,user_id,item_id,tag_id
0,0,84982,58,39525
1,0,87756,8757,22786
2,0,80311,14912,45676
3,1,82036,84509,33556
4,0,66867,57349,15092


In [4]:
import numpy as np


class Vocab:
    def __init__(self, data: np.ndarray, add_oov_idx: bool = True):
        self._data = data

        self.add_oov_idx = add_oov_idx
        self.oov_idx = None

        self.create_mappings()

    def create_mappings(self):
        self.data2id = dict(zip(set(self._data), range(len(self._data))))
        self.id2data = dict(zip(set(self._data), range(len(self._data))))

        if self.add_oov_idx:
            self.oov_idx = len(self.data2id)
        self.vocab_size = len(self.data2id) + 1


vocab_user = Vocab(df_train.user_id.values)
vocab_item = Vocab(df_train.item_id.values)
vocab_tag = Vocab(df_train.tag_id.values)

In [5]:
features = [
    Feature(
        name="user_id",
        feature_type=FeatureType.CATEGORICAL,
        needs_embed=True,
        embedding_size=10,
        embedding_vocab_size=vocab_user.vocab_size,
    ),
    Feature(
        name="item_id",
        feature_type=FeatureType.CATEGORICAL,
        needs_embed=True,
        embedding_size=10,
        embedding_vocab_size=vocab_item.vocab_size,
    ),
    Feature(
        name="tag_id",
        feature_type=FeatureType.CATEGORICAL,
        needs_embed=True,
        embedding_size=10,
        embedding_vocab_size=vocab_tag.vocab_size,
    ),
]
features_config = FeaturesConfig(features=features)

In [6]:
def preprocess_feature(col: pd.Series, vocab: Vocab) -> pd.Series:
    return col.map(vocab.data2id).fillna(vocab.oov_idx)

In [7]:
for df in [df_train, df_val, df_test]:
    df["user_id"] = preprocess_feature(df["user_id"], vocab_user).astype(int)
    df["item_id"] = preprocess_feature(df["item_id"], vocab_item).astype(int)
    df["tag_id"] = preprocess_feature(df["tag_id"], vocab_tag).astype(int)

In [16]:
import torch
from torch import nn


training_config = TrainingConfig(
    batch_size=4096,
    device="cuda:0",
    return_dict_batches=False,
    num_epochs=7,
    eval_metric_name="log_loss",
)

model_config = DCNv2Config(
    model_structure=ModelStructure.PARALLEL,
    # CrossNet parameters
    use_low_rank_mixture=False,
    cross_low_rank_dim=32,
    num_cross_layers=5,
    num_cross_experts=4,
    # parallel
    parallel_hidden_dims=[400, 400, 400],
    parallel_dropout=0.2,
    parallel_use_batch_norm=True,
    parallel_activation=nn.ReLU,
    # stacked
    stacked_hidden_dims=[500, 500, 500],
    stacked_dropout=0.2,
    stacked_use_batch_norm=True,
    stacked_activation=nn.ReLU,
)

model = DCNv2(model_config, features_config, is_dict_input=False)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

In [17]:
model

DCNv2(
  (loss_fn): BCEWithLogitsLoss()
  (embedding_layer): EmbeddingLayer(
    (embedding_modules): ModuleDict(
      (user_id): Embedding(16976, 10)
      (item_id): Embedding(23605, 10)
      (tag_id): Embedding(49658, 10)
    )
    (dummy_fn): Identity()
  )
  (crossnet): CrossNetV2(
    (cross_layers): ModuleList(
      (0-4): 5 x Linear(in_features=30, out_features=30, bias=True)
    )
  )
  (parallel_dnn): MLPBlock(
    (mlp): Sequential(
      (0): Linear(in_features=30, out_features=400, bias=True)
      (1): BatchNorm1d(400, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU()
      (3): Dropout(p=0.2, inplace=False)
      (4): Linear(in_features=400, out_features=400, bias=True)
      (5): BatchNorm1d(400, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (6): ReLU()
      (7): Dropout(p=0.2, inplace=False)
      (8): Linear(in_features=400, out_features=400, bias=True)
      (9): BatchNorm1d(400, eps=1e-05, momentum=0.1, aff

In [18]:
train_metrics, val_metrics = model.fit(
  features=df_train.drop(columns="label"),
  target=df_train["label"],
  config=training_config,
  optimizer=optimizer,
  val_features=df_val.drop(columns="label"),
  val_target=df_val["label"],
)

[2025-02-12 22:45:20,082]{model.py:357} - INFO - Starting training process


Train epoch #0:   0%|          | 0/343 [00:01<?, ?it/s]

[2025-02-12 22:50:48,933]{model.py:307} - INFO - Finished Train Epoch #0, average metrics - [loss: 0.57687]


Validation epoch #0:   0%|          | 0/98 [00:00<?, ?it/s]

[2025-02-12 22:52:41,536]{model.py:307} - INFO - Finished Validation Epoch #0, average metrics - [AUC: 0.80445, log_loss: 0.47914]
[2025-02-12 22:53:42,124]{model.py:390} - INFO - Best model with log_loss = 0.4791399623796479 was saved to /content/artifacts/best_model.pth


Train epoch #1:   0%|          | 0/343 [00:00<?, ?it/s]

[2025-02-12 22:59:12,422]{model.py:307} - INFO - Finished Train Epoch #1, average metrics - [loss: 0.42580]


Validation epoch #1:   0%|          | 0/98 [00:00<?, ?it/s]

[2025-02-12 23:02:05,279]{model.py:307} - INFO - Finished Validation Epoch #1, average metrics - [AUC: 0.88054, log_loss: 0.36876]


Train epoch #2:   0%|          | 0/343 [00:00<?, ?it/s]

[2025-02-12 23:08:35,178]{model.py:307} - INFO - Finished Train Epoch #2, average metrics - [loss: 0.35079]


Validation epoch #2:   0%|          | 0/98 [00:00<?, ?it/s]

[2025-02-12 23:11:24,542]{model.py:307} - INFO - Finished Validation Epoch #2, average metrics - [AUC: 0.90355, log_loss: 0.33250]


Train epoch #3:   0%|          | 0/343 [00:00<?, ?it/s]

[2025-02-12 23:17:51,815]{model.py:307} - INFO - Finished Train Epoch #3, average metrics - [loss: 0.31945]


Validation epoch #3:   0%|          | 0/98 [00:00<?, ?it/s]

[2025-02-12 23:20:43,910]{model.py:307} - INFO - Finished Validation Epoch #3, average metrics - [AUC: 0.91452, log_loss: 0.31548]


Train epoch #4:   0%|          | 0/343 [00:00<?, ?it/s]

[2025-02-12 23:27:13,306]{model.py:307} - INFO - Finished Train Epoch #4, average metrics - [loss: 0.30235]


Validation epoch #4:   0%|          | 0/98 [00:01<?, ?it/s]

[2025-02-12 23:30:06,681]{model.py:307} - INFO - Finished Validation Epoch #4, average metrics - [AUC: 0.92012, log_loss: 0.30833]


Train epoch #5:   0%|          | 0/343 [00:00<?, ?it/s]

[2025-02-12 23:36:35,987]{model.py:307} - INFO - Finished Train Epoch #5, average metrics - [loss: 0.29061]


Validation epoch #5:   0%|          | 0/98 [00:01<?, ?it/s]

[2025-02-12 23:39:28,589]{model.py:307} - INFO - Finished Validation Epoch #5, average metrics - [AUC: 0.92392, log_loss: 0.30349]


Train epoch #6:   0%|          | 0/343 [00:00<?, ?it/s]

[2025-02-12 23:46:01,467]{model.py:307} - INFO - Finished Train Epoch #6, average metrics - [loss: 0.28166]


Validation epoch #6:   0%|          | 0/98 [00:00<?, ?it/s]

[2025-02-12 23:48:54,452]{model.py:307} - INFO - Finished Validation Epoch #6, average metrics - [AUC: 0.92646, log_loss: 0.30013]
[2025-02-12 23:49:55,068]{model.py:400} - INFO - Metrics not increasing during 5 epochs. Stop training


In [19]:
test_metrics = model.test(
  features=df_test.drop(columns="label"),
  target=df_test["label"],
  config=training_config,
)



Test epoch #-1:   0%|          | 0/49 [00:00<?, ?it/s]

[2025-02-12 23:51:06,449]{model.py:307} - INFO - Finished Test Epoch #-1, average metrics - [AUC: 0.92596, log_loss: 0.30109]


In [20]:
train_metrics_v2 = model.test(
  features=df_train.drop(columns="label"),
  target=df_train["label"],
  config=training_config,
)



Test epoch #-1:   0%|          | 0/343 [00:00<?, ?it/s]

[2025-02-12 23:57:37,087]{model.py:307} - INFO - Finished Test Epoch #-1, average metrics - [AUC: 0.94689, log_loss: 0.26443]
