<a href="https://colab.research.google.com/github/saandeep17/two_tower_retrieval/blob/main/tte_retrieval.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# donwload data
!wget https://files.grouplens.org/datasets/movielens/ml-1m.zip
!unzip ml-1m.zip

--2025-04-25 02:28:43--  https://files.grouplens.org/datasets/movielens/ml-1m.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5917549 (5.6M) [application/zip]
Saving to: ‘ml-1m.zip’


2025-04-25 02:28:44 (22.5 MB/s) - ‘ml-1m.zip’ saved [5917549/5917549]

Archive:  ml-1m.zip
   creating: ml-1m/
  inflating: ml-1m/movies.dat        
  inflating: ml-1m/ratings.dat       
  inflating: ml-1m/README            
  inflating: ml-1m/users.dat         


In [12]:
# use MovieLens 1M data
import pandas as pd
# Load ratings file
df = pd.read_csv('ml-1m/ratings.dat', sep='::', engine='python',
                 names=['UserID', 'MovieID', 'Rating', 'Timestamp'])

#pre-process. create an index for each distinct user and movie ids
user_map = {user_id:idx for idx, user_id in enumerate(df['UserID'].unique())}
movie_map = {movie_id:idx for idx, movie_id in enumerate(df['MovieID'].unique())}
df['user_idx'] = df['UserID'].map(user_map)
df['item_idx'] = df['MovieID'].map(movie_map)
df['label'] = (df['Rating'] > 3).astype(int)
df.head()

Unnamed: 0,UserID,MovieID,Rating,Timestamp,user_idx,item_idx,label
0,1,1193,5,978300760,0,0,1
1,1,661,3,978302109,0,1,0
2,1,914,3,978301968,0,2,0
3,1,3408,4,978300275,0,3,1
4,1,2355,5,978824291,0,4,1


In [14]:
import torch
import torch.nn as nn
import numpy as np

num_users = df['user_idx'].nunique()
num_items = df['item_idx'].nunique()
embedding_dim = 32
interactions = len(df)

np.random.seed(42)
user_ids = np.random.randint(0, num_users, interactions)
item_ids = np.random.randint(0, num_items, interactions)

# Simple two-tower model
class TwoTowerModel(nn.Module):
  def __init__(self, num_users, num_items, embedding_dim):
    super().__init__()
    self.user_embedding = nn.Embedding(num_users, embedding_dim)
    self.item_embedding = nn.Embedding(num_items, embedding_dim)

  def forward(self, user_ids, item_ids):
    u = self.user_embedding(user_ids)
    v = self.item_embedding(item_ids)
    score = (u*v).sum(dim=1)
    return score

  def encode_user(self, user_id):
    return self.user_embedding(user_id)

  def encode_item(self, item_id):
    return self.item_embedding(item_id)


# model training
model = TwoTowerModel(num_users, num_items, embedding_dim)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-2)
loss_fn = nn.BCEWithLogitsLoss()

BATCH_SIZE = 10000
EPOCHS = 10

for epoch in range(EPOCHS):
  df_shuffled = df.sample(frac=1)
  for i in range(0, interactions, BATCH_SIZE):
    u = torch.LongTensor(df_shuffled.iloc[i:(i+1)*BATCH_SIZE]['user_idx'].to_numpy())
    v  = torch.LongTensor(df_shuffled.iloc[i:(i+1)*BATCH_SIZE]['item_idx'].to_numpy())
    y = torch.FloatTensor(df_shuffled.iloc[i:(i+1)*BATCH_SIZE]['label'].to_numpy())

    optimizer.zero_grad()
    scores = model(u,v)
    loss = loss_fn(scores, y)
    loss.backward()
    optimizer.step()
  print(f"Epoch {epoch+1} loss: {loss.item():.4f}")


Epoch 1 loss: 0.3393
Epoch 2 loss: 0.2489
Epoch 3 loss: 0.2382
Epoch 4 loss: 0.2137
Epoch 5 loss: 0.2092
Epoch 6 loss: 0.1755
Epoch 7 loss: 0.1507
Epoch 8 loss: 0.1445
Epoch 9 loss: 0.1377
Epoch 10 loss: 0.1020
