In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sentence_transformers import SentenceTransformer

  from tqdm.autonotebook import tqdm, trange


In [2]:
torch.cuda.is_available()

False

In [3]:
def encode_titles(titles, model_name='all-MiniLM-L6-v2'):
    """
    Encode movie titles using sentence transformer
    """
    model = SentenceTransformer(model_name)
    embeddings = model.encode(titles, convert_to_tensor=True)
    return embeddings, model.get_sentence_embedding_dimension()

In [4]:
def create_dataset_tensors(user_data, movie_data, ratings, title_embeddings):
    """
    Create tensors for the dataset
    """
    return {
        'user_ids': torch.tensor(user_data['user_id'].values, dtype=torch.long),
        'gender': torch.tensor(user_data['gender_encoded'].values, dtype=torch.long),
        'age': torch.tensor(user_data['age'].values, dtype=torch.float),
        'occupation': torch.tensor(user_data['occupation'].values, dtype=torch.long),
        'movie_ids': torch.tensor(movie_data['movie_id'].values, dtype=torch.long),
        'genres': torch.tensor(np.stack(movie_data['genres_encoded']), dtype=torch.float),
        'title_embeddings': title_embeddings,
        'ratings': torch.tensor(ratings, dtype=torch.float)
    }

In [5]:
# Reading ratings file
ratings = pd.read_csv('data-1m/ratings.csv', 
                    sep='\t', #Note that the separator here is "\t"
                    encoding='latin-1',
                    engine='python',
                    index_col=0
                     ) 

# Reading users file
users = pd.read_csv('data-1m/users.csv', 
                    sep='\t', #Note that the separator here is "\t"
                    encoding='latin-1',
                    engine='python',
                    index_col=0
                     )

# # Reading movies file
movies = pd.read_csv('data-1m/movies.csv', 
                    sep='\t', #Note that the separator here is "\t"
                    encoding='latin-1',
                    engine='python',
                    index_col=0
                     ) 

ratings = ratings.drop(columns=['user_emb_id', 'movie_emb_id'])

In [6]:
# Create mappings for movie and user IDs
movie_id_map = {old_id: new_id for new_id, old_id in enumerate(sorted(movies['movie_id'].unique()))}
user_id_map = {old_id: new_id for new_id, old_id in enumerate(sorted(users['user_id'].unique()))}

# Save the mappings for later use (optional)
movie_id_reverse_map = {v: k for k, v in movie_id_map.items()}
user_id_reverse_map = {v: k for k, v in user_id_map.items()}

# Apply the mappings
movies['movie_id'] = movies['movie_id'].map(movie_id_map)
users['user_id'] = users['user_id'].map(user_id_map)
ratings['movie_id'] = ratings['movie_id'].map(movie_id_map)
ratings['user_id'] = ratings['user_id'].map(user_id_map)

# Verify the mapping worked correctly
print("Movie ID range:", movies['movie_id'].min(), "to", movies['movie_id'].max())
print("User ID range:", users['user_id'].min(), "to", users['user_id'].max())

Movie ID range: 0 to 3882
User ID range: 0 to 6039


In [16]:
title_embeddings.shape

torch.Size([3883, 384])

In [20]:
title_embeddings

tensor([[-0.0828,  0.0530,  0.0536,  ...,  0.0226,  0.0538,  0.1030],
        [-0.1053,  0.1508, -0.0264,  ...,  0.0106, -0.0726,  0.0086],
        [-0.0988,  0.0177, -0.0527,  ..., -0.0120,  0.0303,  0.0004],
        ...,
        [-0.0730, -0.0164, -0.0606,  ..., -0.0033, -0.0181,  0.0027],
        [-0.0611, -0.0336,  0.0070,  ...,  0.0937, -0.0130,  0.0063],
        [-0.1084,  0.0172, -0.0453,  ...,  0.0170,  0.0158,  0.0596]],
       device='mps:0')

In [17]:
nan_mask = torch.isnan(title_embeddings).any(dim=1)
nan_indices = torch.where(nan_mask)[0]

In [18]:
nan_indices

tensor([], device='mps:0', dtype=torch.int64)

In [21]:
# First move the tensor to CPU
title_embeddings_cpu = title_embeddings.to('cpu')

# Now check for NaNs
nan_mask = torch.isnan(title_embeddings_cpu).any(dim=1)
nan_indices = torch.where(nan_mask)[0]

print(f"Number of embeddings with NaN values: {len(nan_indices)}")

if len(nan_indices) > 0:
    print("\nTitles with NaN embeddings:")
    for idx in nan_indices:
        print(f"Index {idx}: {movies.iloc[idx]['title']}")
    
    # Get detailed statistics about NaN values
    nan_per_dimension = torch.isnan(title_embeddings_cpu).sum(dim=0)
    print(f"\nDimensions with NaNs: {torch.sum(nan_per_dimension > 0).item()}")
    
    # Show a sample of problematic embeddings
    print("\nSample of first problematic embedding:")
    print(title_embeddings_cpu[nan_indices[0]])
else:
    print("No NaN values found after moving tensor to CPU")
    
    # Let's double check with a different method
    print("\nDouble checking with numpy:")
    title_embeddings_np = title_embeddings_cpu.numpy()
    nan_mask_np = np.isnan(title_embeddings_np).any(axis=1)
    nan_indices_np = np.where(nan_mask_np)[0]
    print(f"Number of embeddings with NaN values (numpy check): {len(nan_indices_np)}")

Number of embeddings with NaN values: 0
No NaN values found after moving tensor to CPU

Double checking with numpy:
Number of embeddings with NaN values (numpy check): 0


In [22]:
# Additional checks
print("\nTensor statistics:")
print(f"Min value: {title_embeddings_cpu.min().item()}")
print(f"Max value: {title_embeddings_cpu.max().item()}")
print(f"Mean value: {title_embeddings_cpu.mean().item()}")
print(f"Shape: {title_embeddings_cpu.shape}")

# Check for any infinite values
inf_mask = torch.isinf(title_embeddings_cpu).any(dim=1)
inf_indices = torch.where(inf_mask)[0]
print(f"\nNumber of embeddings with infinite values: {len(inf_indices)}")

# Check for unusually large values
large_mask = (title_embeddings_cpu.abs() > 100).any(dim=1)
large_indices = torch.where(large_mask)[0]
print(f"Number of embeddings with values >100: {len(large_indices)}")


Tensor statistics:
Min value: -0.24947617948055267
Max value: 0.24644768238067627
Mean value: 0.0005378836067393422
Shape: torch.Size([3883, 384])

Number of embeddings with infinite values: 0
Number of embeddings with values >100: 0


In [19]:
nan_mask

tensor([False, False, False,  ..., False, False, False], device='mps:0')

In [7]:
# Encode titles
title_embeddings, title_embedding_dim = encode_titles(movies['title'].values)
movies['genres'] = movies['genres'].str.split('|')
movies['genres'] = movies['genres'].fillna("").astype('str') #Convert genres to string values

# Process genres
mlb = MultiLabelBinarizer()
movies['genres'] = movies['genres'].apply(eval) #List is a string, convert that into actual list
genres_encoded = mlb.fit_transform(movies['genres'])
movies['genres_encoded'] = list(genres_encoded)

# Encode categorical variables
gender_encoder = LabelEncoder()
users['gender_encoded'] = gender_encoder.fit_transform(users['gender'])

# Normalize age
users['age'] = users['age'].astype(float)

# Merge data
data = ratings.merge(users, on='user_id').merge(movies, on='movie_id')

# Split data
train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)

# Create tensors for train and validation sets
train_tensors = create_dataset_tensors(
    train_data[['user_id', 'gender_encoded', 'age', 'occupation']],
    train_data[['movie_id', 'genres_encoded']],
    train_data['rating'].values / 5.0,
    title_embeddings[train_data.index]
)

val_tensors = create_dataset_tensors(
    val_data[['user_id', 'gender_encoded', 'age', 'occupation']],
    val_data[['movie_id', 'genres_encoded']],
    val_data['rating'].values / 5.0,
    title_embeddings[val_data.index]
)

model_dims = {
    'n_users': len(users['user_id'].unique()),
    'n_movies': len(movies['movie_id'].unique()),
    'n_genres': genres_encoded.shape[1],
    'n_genders': len(gender_encoder.classes_),
    'n_occupations': users['occupation'].nunique(),
    'title_embedding_dim': title_embedding_dim
}

In [8]:
class TwoTowerNet(nn.Module):
    def __init__(self, n_users, n_movies, n_genres, n_genders, n_occupations, 
                 title_embedding_dim, embedding_dim=64):
        super(TwoTowerNet, self).__init__()
        
        # Print dimensions during initialization
        print(f"Initializing model with dimensions:")
        print(f"n_users: {n_users}")
        print(f"n_movies: {n_movies}")
        print(f"n_genres: {n_genres}")
        print(f"n_genders: {n_genders}")
        print(f"n_occupations: {n_occupations}")
        print(f"title_embedding_dim: {title_embedding_dim}")
        
        # User embeddings
        self.user_embedding = nn.Embedding(n_users, embedding_dim)
        self.gender_embedding = nn.Embedding(n_genders, embedding_dim // 4)
        self.occupation_embedding = nn.Embedding(n_occupations, embedding_dim // 4)
        
        # User tower - simplified
        self.user_tower = nn.Sequential(
            nn.Linear(embedding_dim + embedding_dim//4 + embedding_dim//4 + 1, 64),
            nn.ReLU(),
        )
        
        # Movie embeddings
        self.movie_embedding = nn.Embedding(n_movies, embedding_dim)
        self.title_projection = nn.Linear(title_embedding_dim, embedding_dim)
        
        # Movie tower - simplified
        self.movie_tower = nn.Sequential(
            nn.Linear(embedding_dim * 2 + n_genres, 64),
            nn.ReLU(),
        )
        
        # Final prediction - simplified
        self.predictor = nn.Sequential(
            nn.Linear(128, 1),
            nn.Sigmoid()
        )
    
    def forward(self, batch):
        # Debug input shapes and values
        for key, value in batch.items():
            if torch.is_tensor(value):
                print(f"\n{key}:")
                print(f"Shape: {value.shape}")
                print(f"Contains NaN: {torch.isnan(value).any().item()}")
                print(f"Min: {value.min().item():.4f}, Max: {value.max().item():.4f}")
        
        # 1. User embeddings
        user_emb = self.user_embedding(batch['user_ids'])
        print("\nAfter user embedding:")
        print(f"Shape: {user_emb.shape}")
        print(f"Contains NaN: {torch.isnan(user_emb).any().item()}")
        
        gender_emb = self.gender_embedding(batch['gender'])
        print("\nAfter gender embedding:")
        print(f"Shape: {gender_emb.shape}")
        print(f"Contains NaN: {torch.isnan(gender_emb).any().item()}")
        
        occ_emb = self.occupation_embedding(batch['occupation'])
        print("\nAfter occupation embedding:")
        print(f"Shape: {occ_emb.shape}")
        print(f"Contains NaN: {torch.isnan(occ_emb).any().item()}")
        
        # 2. Process age
        age = batch['age'].unsqueeze(1) / 100.0
        print("\nAfter age processing:")
        print(f"Shape: {age.shape}")
        print(f"Contains NaN: {torch.isnan(age).any().item()}")
        
        # 3. Combine user features
        user_features = torch.cat([user_emb, gender_emb, occ_emb, age], dim=1)
        print("\nAfter concatenating user features:")
        print(f"Shape: {user_features.shape}")
        print(f"Contains NaN: {torch.isnan(user_features).any().item()}")
        
        # 4. User tower
        user_vector = self.user_tower(user_features)
        print("\nAfter user tower:")
        print(f"Shape: {user_vector.shape}")
        print(f"Contains NaN: {torch.isnan(user_vector).any().item()}")
        
        # 5. Movie embeddings
        movie_emb = self.movie_embedding(batch['movie_ids'])
        print("\nAfter movie embedding:")
        print(f"Shape: {movie_emb.shape}")
        print(f"Contains NaN: {torch.isnan(movie_emb).any().item()}")
        
        title_emb = self.title_projection(batch['title_embeddings'])
        print("\nAfter title projection:")
        print(f"Shape: {title_emb.shape}")
        print(f"Contains NaN: {torch.isnan(title_emb).any().item()}")
        
        # 6. Combine movie features
        movie_features = torch.cat([movie_emb, title_emb, batch['genres']], dim=1)
        print("\nAfter concatenating movie features:")
        print(f"Shape: {movie_features.shape}")
        print(f"Contains NaN: {torch.isnan(movie_features).any().item()}")
        
        # 7. Movie tower
        movie_vector = self.movie_tower(movie_features)
        print("\nAfter movie tower:")
        print(f"Shape: {movie_vector.shape}")
        print(f"Contains NaN: {torch.isnan(movie_vector).any().item()}")
        
        # 8. Final prediction
        combined = torch.cat([user_vector, movie_vector], dim=1)
        print("\nAfter combining vectors:")
        print(f"Shape: {combined.shape}")
        print(f"Contains NaN: {torch.isnan(combined).any().item()}")
        
        output = self.predictor(combined)
        print("\nFinal output:")
        print(f"Shape: {output.shape}")
        print(f"Contains NaN: {torch.isnan(output).any().item()}")
        print(f"Min: {output.min().item():.4f}, Max: {output.max().item():.4f}")
        
        return output.squeeze()

In [9]:
def train_step(model, batch, optimizer, criterion, device):
    """Single training step"""
    # Move batch to device and convert to correct types
    batch = {k: v.to(device) for k, v in batch.items()}
    
    # Zero gradients
    optimizer.zero_grad()
    
    # Forward pass
    predictions = model(batch)
    
    # Print shapes and check for NaNs
    print(f"Predictions shape: {predictions.shape}")
    print(f"Ratings shape: {batch['ratings'].shape}")
    print(f"Contains NaN - Predictions: {torch.isnan(predictions).any()}, Ratings: {torch.isnan(batch['ratings']).any()}")
    
    # Compute loss
    loss = criterion(predictions, batch['ratings'])
    
    # Backward pass
    loss.backward()
    optimizer.step()
    
    return loss.item()

In [10]:
# Initialize model and move everything to CPU
model = TwoTowerNet(**model_dims)
model = model.to('cpu')
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
epochs = 10
best_val_loss = float('inf')

for epoch in range(epochs):
    print(f"epoch: {epoch}")
    # Training
    model.train()
    
    # Forward pass on entire dataset
    predictions = model(train_tensors)
    
    # Compute loss
    loss = criterion(predictions, train_tensors['ratings'])
    
    # Backward pass
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    # Validation
    model.eval()
    with torch.no_grad():
        val_predictions = model(val_tensors)
        val_loss = criterion(val_predictions, val_tensors['ratings'])
    
    print(f'Epoch {epoch+1}/{epochs}:')
    print(f'Training Loss: {loss.item():.4f}')
    print(f'Validation Loss: {val_loss.item():.4f}')
    
    # Save best model
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'best_val_loss': best_val_loss,
        }, 'best_model.pth')

Initializing model with dimensions:
n_users: 6040
n_movies: 3883
n_genres: 18
n_genders: 2
n_occupations: 21
title_embedding_dim: 384
epoch: 0

user_ids:
Shape: torch.Size([800167])
Contains NaN: False
Min: 0.0000, Max: 6039.0000

gender:
Shape: torch.Size([800167])
Contains NaN: False
Min: 0.0000, Max: 1.0000

age:
Shape: torch.Size([800167])
Contains NaN: False
Min: 1.0000, Max: 56.0000

occupation:
Shape: torch.Size([800167])
Contains NaN: False
Min: 0.0000, Max: 20.0000

movie_ids:
Shape: torch.Size([800167])
Contains NaN: False
Min: 0.0000, Max: 3882.0000

genres:
Shape: torch.Size([800167, 18])
Contains NaN: False
Min: 0.0000, Max: 1.0000

title_embeddings:
Shape: torch.Size([800167, 384])
Contains NaN: True
Min: nan, Max: nan

ratings:
Shape: torch.Size([800167])
Contains NaN: False
Min: 0.2000, Max: 1.0000

After user embedding:
Shape: torch.Size([800167, 64])
Contains NaN: False

After gender embedding:
Shape: torch.Size([800167, 16])
Contains NaN: False

After occupation embe

RuntimeError: Tensor for argument weight is on cpu but expected on mps

In [None]:
print("Model dimensions:", model_dims)

In [87]:
def validate(model, val_tensors, criterion, device, batch_size=256):
    """
    Validation step
    """
    model.eval()
    total_loss = 0
    n_batches = 0
    
    with torch.no_grad():
        for i in range(0, len(val_tensors['ratings']), batch_size):
            batch = {k: v[i:i+batch_size].to(device) for k, v in val_tensors.items()}
            predictions = model(batch)
            loss = criterion(predictions, batch['ratings'])
            total_loss += loss.item()
            n_batches += 1
    
    return total_loss / n_batches