In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from scipy.sparse import csr_matrix

In [2]:
movies = pd.read_csv("movies.csv")
ratings = pd.read_csv("ratings.csv")

In [3]:
print(movies.isnull().sum())


movieId    0
title      0
genres     0
dtype: int64


In [4]:
print(ratings.isnull().sum())


userId       0
movieId      0
rating       0
timestamp    0
dtype: int64


In [5]:
movies['genres'] = movies['genres'].str.split('|')


In [6]:
movies.head(2)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]"
1,2,Jumanji (1995),"[Adventure, Children, Fantasy]"


# Content Based

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Convert genres list back to a string
movies['genres_str'] = movies['genres'].apply(lambda x: ' '.join(x))

# Apply TF-IDF Vectorization
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(movies['genres_str'])

print(tfidf_matrix.shape)  # Output: (9742, Number of unique genres)


(9742, 23)


In [8]:
from sklearn.metrics.pairwise import cosine_similarity

cosine_sim = cosine_similarity(tfidf_matrix)


In [9]:
def get_content_recommendations(movie_title, movies, cosine_sim):
    idx = movies[movies['title'] == movie_title].index[0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:11]  # Top 10 movies
    movie_indices = [i[0] for i in sim_scores]
    return movies.iloc[movie_indices]['title']

# Example Usage
print(get_content_recommendations("Toy Story (1995)", movies, cosine_sim))


1706                                          Antz (1998)
2355                                   Toy Story 2 (1999)
2809       Adventures of Rocky and Bullwinkle, The (2000)
3000                     Emperor's New Groove, The (2000)
3568                                Monsters, Inc. (2001)
6194                                     Wild, The (2006)
6486                               Shrek the Third (2007)
6948                       Tale of Despereaux, The (2008)
7760    Asterix and the Vikings (Astérix et les Viking...
8219                                         Turbo (2013)
Name: title, dtype: object


# Collaborative Filtering

In [10]:
import numpy as np
from sklearn.metrics.pairwise import pairwise_distances

user_movie_matrix = ratings.pivot(index='userId', columns='movieId', values='rating').fillna(0)

# Compute User-User and Item-Item Similarities
user_similarity = 1 - pairwise_distances(user_movie_matrix, metric="cosine")
item_similarity = 1 - pairwise_distances(user_movie_matrix.T, metric="cosine")



In [11]:
def get_user_based_recommendations(user_id, user_movie_matrix, user_similarity, movies):
    similar_users = np.argsort(user_similarity[user_id])[-5:-1]  # Top 4 similar users
    similar_users_ratings = user_movie_matrix.iloc[similar_users].mean(axis=0)  # Average their ratings
    recommendations = similar_users_ratings.sort_values(ascending=False).head(10)
    return movies[movies['movieId'].isin(recommendations.index)]['title']

print(get_user_based_recommendations(10, user_movie_matrix, user_similarity, movies))


97                     Braveheart (1995)
123                     Apollo 13 (1995)
126                Batman Forever (1995)
134                  Crimson Tide (1995)
277     Shawshank Redemption, The (1994)
287        Star Trek: Generations (1994)
307      Clear and Present Danger (1994)
398                 Fugitive, The (1993)
507    Terminator 2: Judgment Day (1991)
509                        Batman (1989)
Name: title, dtype: object


In [14]:
# !pip install surprise

Collecting surprise
  Downloading surprise-0.1-py2.py3-none-any.whl.metadata (327 bytes)
Collecting scikit-surprise (from surprise)
  Downloading scikit_surprise-1.1.4.tar.gz (154 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.4/154.4 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Downloading surprise-0.1-py2.py3-none-any.whl (1.8 kB)
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (pyproject.toml) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.4-cp311-cp311-linux_x86_64.whl size=2505175 sha256=e61edc0d1bc89d2c93d6a0083c30364e32f72a3a950fe38df5dc37f3a120c4e1
  Stored in directory: /root/.cache/pip/wheels/2a/8f/6e/7e2899163e2d85d8266daab4aa1cdabec7a6c56f83c015b5af
Successfully built scikit-surprise
Install

In [15]:
from surprise import SVD, Dataset, Reader
from surprise.model_selection import cross_validate

# Load Data
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

# Train Model
svd = SVD()
cross_validate(svd, data, cv=5, verbose=True)


Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8806  0.8768  0.8767  0.8632  0.8717  0.8738  0.0060  
MAE (testset)     0.6784  0.6728  0.6744  0.6636  0.6692  0.6717  0.0050  
Fit time          2.61    4.27    3.96    4.14    4.80    3.96    0.73    
Test time         0.46    0.40    0.39    0.28    0.21    0.35    0.09    


{'test_rmse': array([0.88061497, 0.87683513, 0.87672947, 0.8632303 , 0.87165003]),
 'test_mae': array([0.67836912, 0.67275467, 0.67442653, 0.66356715, 0.66924769]),
 'fit_time': (2.60556960105896,
  4.272253513336182,
  3.9649181365966797,
  4.142176866531372,
  4.8000547885894775),
 'test_time': (0.4604363441467285,
  0.3961308002471924,
  0.3944129943847656,
  0.27955150604248047,
  0.21375155448913574)}

## Precision @k Recall @k

In [16]:
from sklearn.metrics import precision_score, recall_score

# Example: Compute Precision and Recall
y_true = [1, 0, 1, 1, 0, 1]  # True ratings (binary)
y_pred = [1, 0, 1, 0, 0, 1]  # Predicted ratings

precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)

print(f"Precision: {precision:.4f}, Recall: {recall:.4f}")


Precision: 1.0000, Recall: 0.7500


In [17]:
## NDCG
from sklearn.metrics import ndcg_score

# Compute NDCG
ndcg = ndcg_score([y_true], [y_pred])
print(f"NDCG: {ndcg:.4f}")


NDCG: 0.9846


In [18]:
from sklearn.metrics import mean_squared_error

# Compute RMSE
rmse = np.sqrt(mean_squared_error(y_true, y_pred))
print(f"RMSE: {rmse:.4f}")


RMSE: 0.4082


## Neural Collaborative Filtering

In [19]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [20]:
ratings = pd.read_csv("ratings.csv")

# Encode userId and movieId into continuous indices
user_mapping = {id: idx for idx, id in enumerate(ratings["userId"].unique())}
item_mapping = {id: idx for idx, id in enumerate(ratings["movieId"].unique())}

ratings["userId"] = ratings["userId"].map(user_mapping)
ratings["movieId"] = ratings["movieId"].map(item_mapping)

# Split data into train and test sets
train_data, test_data = train_test_split(ratings, test_size=0.2, random_state=42)

# Convert to tensors
train_users = torch.tensor(train_data["userId"].values, dtype=torch.long)
train_items = torch.tensor(train_data["movieId"].values, dtype=torch.long)
train_ratings = torch.tensor(train_data["rating"].values, dtype=torch.float32)

test_users = torch.tensor(test_data["userId"].values, dtype=torch.long)
test_items = torch.tensor(test_data["movieId"].values, dtype=torch.long)
test_ratings = torch.tensor(test_data["rating"].values, dtype=torch.float32)

class MovieLensDataset(Dataset):
    def __init__(self, users, items, ratings):
        self.users = users
        self.items = items
        self.ratings = ratings

    def __len__(self):
        return len(self.ratings)

    def __getitem__(self, idx):
        return self.users[idx], self.items[idx], self.ratings[idx]

train_dataset = MovieLensDataset(train_users, train_items, train_ratings)
test_dataset = MovieLensDataset(test_users, test_items, test_ratings)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)


In [21]:
class NeuralCF(nn.Module):
    def __init__(self, num_users, num_items, embedding_size=50):
        super(NeuralCF, self).__init__()
        self.user_embedding = nn.Embedding(num_users, embedding_size)
        self.item_embedding = nn.Embedding(num_items, embedding_size)

        self.fc_layers = nn.Sequential(
            nn.Linear(embedding_size * 2, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )

    def forward(self, user, item):
        user_emb = self.user_embedding(user)
        item_emb = self.item_embedding(item)
        x = torch.cat([user_emb, item_emb], dim=-1)
        return self.fc_layers(x).squeeze()


In [22]:
# Get number of users and items
num_users = ratings["userId"].nunique()
num_items = ratings["movieId"].nunique()

# Initialize model, optimizer, and loss function
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = NeuralCF(num_users, num_items).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.MSELoss()

# Training loop
def train(model, train_loader, optimizer, criterion, epochs=5):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for users, items, ratings in train_loader:
            users, items, ratings = users.to(device), items.to(device), ratings.to(device)

            optimizer.zero_grad()
            predictions = model(users, items)
            loss = criterion(predictions, ratings)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss / len(train_loader):.4f}")

train(model, train_loader, optimizer, criterion, epochs=10)


Epoch 1/10, Loss: 1.0896
Epoch 2/10, Loss: 0.8192
Epoch 3/10, Loss: 0.7574
Epoch 4/10, Loss: 0.7098
Epoch 5/10, Loss: 0.6731
Epoch 6/10, Loss: 0.6410
Epoch 7/10, Loss: 0.6098
Epoch 8/10, Loss: 0.5788
Epoch 9/10, Loss: 0.5477
Epoch 10/10, Loss: 0.5176


In [27]:
# test_dataset

In [33]:
import torch
import numpy as np
from sklearn.metrics import mean_squared_error, precision_score, recall_score

def evaluate(model, test_loader):
    model.eval()
    predictions, true_ratings = [], []

    with torch.no_grad():
        for batch in test_loader:
            if len(batch) == 3:
                users, items, ratings = batch
                users, items, ratings = users.to(device), items.to(device), ratings.to(device)

                preds = model(users, items).cpu().numpy()
                ratings = ratings.cpu().numpy()

                predictions.extend(preds)
                true_ratings.extend(ratings)  # Ensure we collect true ratings

    # Check if we have valid ratings before proceeding
    if len(true_ratings) == 0 or len(predictions) == 0:
        print("No valid ratings available for evaluation!")
        return None, None

    # Compute RMSE
    rmse = mean_squared_error(true_ratings, predictions)
    print(f"RMSE: {rmse:.4f}")

    # Convert ratings into binary classes (1 = liked, 0 = not liked)
    threshold = 3.5
    binary_true = np.array(true_ratings) >= threshold
    binary_preds = np.array(predictions) >= threshold

    # Ensure both arrays have the same length
    if len(binary_true) != len(binary_preds):
        print("Error: Inconsistent lengths between true and predicted ratings!")
        return None, None

    # Precision@K and Recall@K
    precision_at_k = precision_score(binary_true, binary_preds, zero_division=1)
    recall_at_k = recall_score(binary_true, binary_preds, zero_division=1)

    print(f"Precision@K: {precision_at_k:.4f}")
    print(f"Recall@K: {recall_at_k:.4f}")

    return rmse, precision_at_k, recall_at_k

# Run Evaluation
evaluate(model, test_loader)


RMSE: 0.8685
Precision@K: 0.7816
Recall@K: 0.6471


(0.8685087940698402, 0.7815956926089085, 0.6470540562444282)

    Conclusion
    Content-Based Filtering recommends movies based on genres.
    Collaborative Filtering recommends movies based on similar users or similar movies.
    Matrix Factorization (SVD) predicts unknown ratings effectively.
    Evaluation Metrics (Precision, Recall, NDCG, RMSE) measure model performance.
