In [1]:
!pip install torch torchvision




In [251]:
import torch
import torch.nn as nn
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader
from torch.nn.functional import mse_loss
from tqdm import tqdm


RANDOM_SEED =42


In [266]:
# Load data
df = pd.read_csv('merged.csv')

# Encode categorical variables
user_encoder = LabelEncoder()
movie_encoder = LabelEncoder()
occupation_encoder = LabelEncoder()
zip_code_encoder = LabelEncoder()
release_year_encoder = LabelEncoder()

df['zip_code'] = zip_code_encoder.fit_transform(df['zip_code'])
df['release_year'] = release_year_encoder.fit_transform(df['release_year'])
df['occupation'] = occupation_encoder.fit_transform(df['occupation'])

# One-hot encoding for gender
df = pd.get_dummies(df, columns=['gender'])

# Normalize age
df['age'] = (df['age'] - df['age'].mean()) / df['age'].std()

# Process genres
all_genres = ["unknown", "Action", "Adventure", "Animation", "Children's",
          "Comedy", "Crime", "Documentary", "Drama", "Fantasy",
          "Film-Noir", "Horror", "Musical", "Mystery", "Romance",
          "Sci-Fi", "Thriller", "War", "Western"]

# Train-test split
train_df, test_df = train_test_split(df, test_size=0.2, random_state=RANDOM_SEED)

In [253]:
class MovieDataset(Dataset):
    def __init__(self, dataframe):
        self.ages = torch.tensor(dataframe['age'].values, dtype=torch.float32)
        self.occupations = torch.tensor(dataframe['occupation'].values)
        self.genders = torch.tensor(dataframe[['gender_F', 'gender_M']].values, dtype=torch.float32)
        self.genre_features = torch.tensor(dataframe[list(all_genres)].values, dtype=torch.float32)
        self.zip_codes = torch.tensor(dataframe['zip_code'].values)
        self.release_years = torch.tensor(dataframe['release_year'].values)
        self.ratings = torch.tensor(dataframe['rating'].values, dtype=torch.float32)

    def __len__(self):
        return len(self.ratings)

    def __getitem__(self, idx):
        return (self.zip_codes[idx], self.release_years[idx],
                self.ages[idx], self.occupations[idx],
                self.genders[idx], self.genre_features[idx]), self.ratings[idx]

train_dataset = MovieDataset(train_df)
test_dataset = MovieDataset(test_df)


In [254]:
class RecommenderNet(nn.Module):
    def __init__(self, num_zip_codes, num_release_years, num_occupations, num_genres, embedding_size):
        super(RecommenderNet, self).__init__()
        # Embeddings
        self.zip_code_embedding = nn.Embedding(num_zip_codes, embedding_size)
        self.release_year_embedding = nn.Embedding(num_release_years, embedding_size)
        self.occupation_embedding = nn.Embedding(num_occupations, embedding_size)

        # Linear layers for age and gender
        self.age_lin = nn.Linear(1, embedding_size)
        self.gender_lin = nn.Linear(2, embedding_size)

        # Fully connected layers
        self.fc1 = nn.Linear(embedding_size * 5 + num_genres, 128)
        self.bn1 = nn.BatchNorm1d(128)
        self.fc2 = nn.Linear(128, 64)
        self.bn2 = nn.BatchNorm1d(64)
        self.fc3 = nn.Linear(64, 1)

        # Dropout layer
        self.dropout = nn.Dropout(0.2)

    def forward(self, zip_codes, release_years, ages, occupations, genders, genre_features):
        zip_code_embedding = self.zip_code_embedding(zip_codes)
        release_year_embedding = self.release_year_embedding(release_years)
        occupation_embedding = self.occupation_embedding(occupations)
        age_embedding = self.age_lin(ages.unsqueeze(1))
        gender_embedding = self.gender_lin(genders)

        x = torch.cat([zip_code_embedding, release_year_embedding, occupation_embedding,
                       age_embedding, gender_embedding, genre_features], dim=1)
        x = nn.ReLU()(self.bn1(self.fc1(x)))
        x = self.dropout(x)
        x = nn.ReLU()(self.bn2(self.fc2(x)))
        x = torch.sigmoid(self.fc3(x))
        return x.squeeze()

In [275]:
embedding_size = 50
num_zip_codes = df['zip_code'].nunique()
num_release_years = df['release_year'].nunique()
num_movies = df['film_id'].nunique()
num_occupations = df['occupation'].nunique()
num_generes = len(all_genres)
model = RecommenderNet(num_zip_codes, num_release_years, num_occupations, num_generes, embedding_size)

In [276]:
# Hyperparameters
learning_rate = 0.001
epochs = 50
batch_size = 32
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# DataLoader
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)


# Best model initialization
best_loss = float('inf')
best_model_path = 'best.pth'

# Training loop
model.train()
for epoch in tqdm(range(epochs), desc='Epochs'):
    running_loss = 0.0
    for (zip_codes, release_years, ages, occupations, genders, genre_features), ratings in train_loader:
        # Zero the parameter gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(zip_codes, release_years, ages, occupations, genders, genre_features)

        loss = criterion(outputs * 5, ratings)

        # Backward pass and optimize
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # Calculate average loss for the epoch
    epoch_loss = running_loss / len(train_loader)
    tqdm.write(f'Epoch {epoch+1}/{epochs}, Loss: {epoch_loss}')

    # Save the best model
    if epoch_loss < best_loss:
        best_loss = epoch_loss
        torch.save(model.state_dict(), best_model_path)

print(f'Training complete. Best model saved to {best_model_path}')

Epochs:   2%|▏         | 1/50 [00:11<09:32, 11.68s/it]

Epoch 1/50, Loss: 1.13294924390316


Epochs:   4%|▍         | 2/50 [00:22<09:04, 11.35s/it]

Epoch 2/50, Loss: 1.0553961817264557


Epochs:   6%|▌         | 3/50 [00:33<08:37, 11.01s/it]

Epoch 3/50, Loss: 1.0289430654883385


Epochs:   8%|▊         | 4/50 [00:43<08:10, 10.67s/it]

Epoch 4/50, Loss: 1.0152815279006957


Epochs:  10%|█         | 5/50 [00:54<08:06, 10.80s/it]

Epoch 5/50, Loss: 0.9993054326534271


Epochs:  12%|█▏        | 6/50 [01:05<07:58, 10.88s/it]

Epoch 6/50, Loss: 0.9900121225118637


Epochs:  14%|█▍        | 7/50 [01:16<07:52, 10.98s/it]

Epoch 7/50, Loss: 0.979590152490139


Epochs:  16%|█▌        | 8/50 [01:27<07:36, 10.86s/it]

Epoch 8/50, Loss: 0.9704590464472771


Epochs:  18%|█▊        | 9/50 [01:38<07:24, 10.85s/it]

Epoch 9/50, Loss: 0.9615266884088516


Epochs:  20%|██        | 10/50 [01:49<07:16, 10.91s/it]

Epoch 10/50, Loss: 0.9537577672600747


Epochs:  22%|██▏       | 11/50 [02:00<07:07, 10.96s/it]

Epoch 11/50, Loss: 0.9458072616577149


Epochs:  24%|██▍       | 12/50 [02:11<06:55, 10.94s/it]

Epoch 12/50, Loss: 0.9366094302296638


Epochs:  26%|██▌       | 13/50 [02:21<06:32, 10.62s/it]

Epoch 13/50, Loss: 0.9313637337803841


Epochs:  28%|██▊       | 14/50 [02:32<06:26, 10.74s/it]

Epoch 14/50, Loss: 0.9232876451969146


Epochs:  30%|███       | 15/50 [02:43<06:19, 10.83s/it]

Epoch 15/50, Loss: 0.913116271173954


Epochs:  32%|███▏      | 16/50 [02:54<06:10, 10.90s/it]

Epoch 16/50, Loss: 0.90924215914011


Epochs:  34%|███▍      | 17/50 [03:04<05:51, 10.66s/it]

Epoch 17/50, Loss: 0.9032498761057853


Epochs:  36%|███▌      | 18/50 [03:15<05:42, 10.71s/it]

Epoch 18/50, Loss: 0.898997637963295


Epochs:  38%|███▊      | 19/50 [03:26<05:36, 10.86s/it]

Epoch 19/50, Loss: 0.8894464609503746


Epochs:  40%|████      | 20/50 [03:37<05:29, 10.98s/it]

Epoch 20/50, Loss: 0.8901925159573555


Epochs:  42%|████▏     | 21/50 [03:48<05:20, 11.06s/it]

Epoch 21/50, Loss: 0.8810836313009263


Epochs:  44%|████▍     | 22/50 [03:58<04:59, 10.71s/it]

Epoch 22/50, Loss: 0.874543854033947


Epochs:  46%|████▌     | 23/50 [04:10<04:53, 10.87s/it]

Epoch 23/50, Loss: 0.8736166504383087


Epochs:  48%|████▊     | 24/50 [04:21<04:44, 10.92s/it]

Epoch 24/50, Loss: 0.8733540386199952


Epochs:  50%|█████     | 25/50 [04:32<04:33, 10.95s/it]

Epoch 25/50, Loss: 0.8665073402881622


Epochs:  52%|█████▏    | 26/50 [04:42<04:21, 10.90s/it]

Epoch 26/50, Loss: 0.8612413920164108


Epochs:  54%|█████▍    | 27/50 [04:53<04:08, 10.80s/it]

Epoch 27/50, Loss: 0.8590361646652221


Epochs:  56%|█████▌    | 28/50 [05:04<04:01, 10.99s/it]

Epoch 28/50, Loss: 0.8552828449130059


Epochs:  58%|█████▊    | 29/50 [05:16<03:55, 11.20s/it]

Epoch 29/50, Loss: 0.849537077987194


Epochs:  60%|██████    | 30/50 [05:27<03:44, 11.23s/it]

Epoch 30/50, Loss: 0.8473216050744057


Epochs:  62%|██████▏   | 31/50 [05:38<03:29, 11.04s/it]

Epoch 31/50, Loss: 0.8422981479167938


Epochs:  64%|██████▍   | 32/50 [05:48<03:14, 10.82s/it]

Epoch 32/50, Loss: 0.8383844039440155


Epochs:  66%|██████▌   | 33/50 [05:59<03:05, 10.92s/it]

Epoch 33/50, Loss: 0.836379934489727


Epochs:  68%|██████▊   | 34/50 [06:11<02:55, 10.98s/it]

Epoch 34/50, Loss: 0.8322285580158234


Epochs:  70%|███████   | 35/50 [06:22<02:45, 11.02s/it]

Epoch 35/50, Loss: 0.8307256586670876


Epochs:  72%|███████▏  | 36/50 [06:32<02:30, 10.72s/it]

Epoch 36/50, Loss: 0.8298491563796997


Epochs:  74%|███████▍  | 37/50 [06:43<02:19, 10.76s/it]

Epoch 37/50, Loss: 0.8274852422237396


Epochs:  76%|███████▌  | 38/50 [06:54<02:10, 10.86s/it]

Epoch 38/50, Loss: 0.8248467433035374


Epochs:  78%|███████▊  | 39/50 [07:05<02:00, 10.92s/it]

Epoch 39/50, Loss: 0.8200993158578873


Epochs:  80%|████████  | 40/50 [07:15<01:48, 10.84s/it]

Epoch 40/50, Loss: 0.8169411064982415


Epochs:  82%|████████▏ | 41/50 [07:26<01:35, 10.66s/it]

Epoch 41/50, Loss: 0.8145286303758621


Epochs:  84%|████████▍ | 42/50 [07:37<01:26, 10.79s/it]

Epoch 42/50, Loss: 0.814997122490406


Epochs:  86%|████████▌ | 43/50 [07:48<01:16, 10.86s/it]

Epoch 43/50, Loss: 0.8089045756101608


Epochs:  88%|████████▊ | 44/50 [07:59<01:05, 10.90s/it]

Epoch 44/50, Loss: 0.8092224455595016


Epochs:  90%|█████████ | 45/50 [08:09<00:53, 10.61s/it]

Epoch 45/50, Loss: 0.8094413524627686


Epochs:  92%|█████████▏| 46/50 [08:19<00:42, 10.69s/it]

Epoch 46/50, Loss: 0.8043764558315277


Epochs:  94%|█████████▍| 47/50 [08:31<00:32, 10.85s/it]

Epoch 47/50, Loss: 0.802017207801342


Epochs:  96%|█████████▌| 48/50 [08:42<00:21, 10.93s/it]

Epoch 48/50, Loss: 0.8003044772267341


Epochs:  98%|█████████▊| 49/50 [08:53<00:10, 10.94s/it]

Epoch 49/50, Loss: 0.7984246416091919


Epochs: 100%|██████████| 50/50 [09:03<00:00, 10.87s/it]

Epoch 50/50, Loss: 0.7972482315540314
Training complete. Best model saved to best.pth





In [289]:
import numpy as np

def evaluate_model(model, test_loader):
    model.eval()
    mse = 0
    with torch.no_grad():
        for (zip_codes, release_years, ages, occupations, genders, genre_features), ratings in test_loader:
            outputs =  model(zip_codes, release_years, ages, occupations, genders,genre_features) * 5
            mse += mse_loss(outputs, ratings)
            break
    mse = mse / len(test_loader)
    return np.sqrt(mse)

test_loader = DataLoader(test_dataset, batch_size=batch_size)
rmse = evaluate_model(model, test_loader)
print(f'RMSE on test set: {rmse}')


tensor([62, 58, 67, 65, 68, 61, 66, 69, 11, 68, 62, 69, 68, 70, 58, 40, 54, 69,
        42, 66, 13, 26, 67, 69, 65, 67, 69, 68, 52, 49, 68, 69])
RMSE on test set: 0.039171721786260605


In [339]:
def recommend_movies(model, age, gender, occupation, zip_code, num_recommendations=5):
    # Convert inputs using encoders and normalization
    encoded_occupation = occupation_encoder.transform([occupation])[0]
    encoded_zip_code = zip_code_encoder.transform([zip_code])[0]
    normalized_age = (age - df['age'].mean()) / df['age'].std()

    # Prepare gender input
    gender_input = np.array([[1, 0] if gender == 'F' else [0, 1]])

    # Prepare inputs for all movies
    movie_ids = np.arange(num_movies)
    zip_codes = np.full_like(movie_ids, encoded_zip_code)
    ages = np.full_like(movie_ids, normalized_age, dtype=np.float32)
    occupations = np.full_like(movie_ids, encoded_occupation)
    genders = np.tile(gender_input, (num_movies, 1))

    # Convert to tensors
    zip_codes_tensor = torch.tensor(zip_codes)
    ages_tensor = torch.tensor(ages, dtype=torch.float32)
    occupations_tensor = torch.tensor(occupations)
    genders_tensor = torch.tensor(genders, dtype=torch.float32)

    # Generate predictions for all movies
    model.eval()
    predictions = np.zeros(num_movies)
    with torch.no_grad():
        for i in range(0, num_movies, batch_size):
            if i + batch_size > num_movies:
              i = num_movies - batch_size
            genre_features = torch.tensor(df.loc[:, list(all_genres)].values, dtype=torch.float32)
            release_years = torch.tensor(df['release_year'].tolist())
            batch_predictions = model(zip_codes_tensor[i:i+batch_size],
                                      release_years[i:i+batch_size],
                                      ages_tensor[i:i+batch_size],
                                      occupations_tensor[i:i+batch_size],
                                      genders_tensor[i:i+batch_size],
                                      genre_features[i:i+batch_size])
            predictions[i:i+batch_size] = batch_predictions.numpy()

    # Sort by predicted rating
    sorted_indices = np.argsort(predictions)[::-1]
    top_movie_ids = sorted_indices[:num_recommendations]
    top_movie_ratings = predictions[top_movie_ids]

    # Convert movie IDs back to movie names and pair with their ratings
    movies_data = []
    for movie_id, rating in zip(top_movie_ids, top_movie_ratings * 5):
        movie_title = df[df['film_id'] == movie_id]['title'].iloc[0]
        movies_data.append({"id": movie_id, "title": movie_title, "rating": rating})

    # Convert to DataFrame
    recommended_movies_df = pd.DataFrame(movies_data)
    recommended_movies_df.index = range(1, len(recommended_movies_df) + 1)
    return recommended_movies_df

In [351]:
# Example user details
age = 20
gender = 'F'
occupation = 'writer'
zip_code = '12345'

# Number of recommendations
num_recommendations = 10

# Get recommendations
top_movies_df = recommend_movies(model, age, gender, occupation, zip_code, num_recommendations)
print(top_movies_df)

      id                                      title    rating
1    327                            Cop Land (1997)  4.966209
2    203                          Unforgiven (1992)  4.966209
3    881                         Money Talks (1997)  4.961248
4    566            Clear and Present Danger (1994)  4.961248
5   1614            Reluctant Debutante, The (1958)  4.961248
6    212  Unbearable Lightness of Being, The (1988)  4.961248
7   1251                      A Chef in Love (1996)  4.961248
8    344                        Apostle, The (1997)  4.961248
9    368                            Bio-Dome (1996)  4.961243
10   609                 Father of the Bride (1950)  4.961243
