# Week 10 - Recommendation Systems
## Content-Based Filtering & Collaborative Filtering

In [14]:
# Import required libraries
import pandas as pd
import numpy as np
import json
from sklearn.metrics.pairwise import cosine_similarity
import warnings
warnings.filterwarnings('ignore')

## Part 1: Content-Based Filtering (Questions 1-3)

In [15]:
# Read the movie -> genres mapping from the JSON dataset
with open('dataset_ga_10_part_1.json', 'r') as handle:
    data = json.load(handle)

# Gather every distinct genre across all movies, sorted alphabetically
all_genres = sorted({genre for genre_list in data.values() for genre in genre_list})

print(f"Q1: Unique number of genres: {len(all_genres)}")
print(f"Genres: {all_genres}")

Q1: Unique number of genres: 15
Genres: ['Action', 'Adventure', 'Animation', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Horror', 'Musical', 'Mystery', 'Romance', 'Science Fiction', 'Thriller', 'Western']


In [16]:
# Build a one-hot genre matrix: one row per movie, one column per genre.
# Movies are ordered by the number in their key ("Movie 7" -> 7) so that
# "Movie 10" does not sort ahead of "Movie 2".
ordered_movies = sorted(data.items(), key=lambda entry: int(entry[0].split()[1]))

movie_names = [title for title, _tags in ordered_movies]

# Each row holds 1 where the movie carries that genre, 0 otherwise
movie_genre_matrix = [
    [int(genre in tags) for genre in all_genres]
    for _title, tags in ordered_movies
]

# Wrap the raw rows in a labelled DataFrame
df_movies = pd.DataFrame(movie_genre_matrix, index=movie_names, columns=all_genres)

print("Movie-Genre Matrix Shape:", df_movies.shape)
print("\nFirst 5 movies:", df_movies.head(), sep="\n")

Movie-Genre Matrix Shape: (100, 15)

First 5 movies:
         Action  Adventure  Animation  Comedy  Crime  Documentary  Drama  \
Movie 1       0          0          1       0      0            0      1   
Movie 2       0          0          1       1      0            0      0   
Movie 3       0          0          0       0      1            0      0   
Movie 4       0          0          1       1      0            0      0   
Movie 5       0          0          0       0      1            0      1   

         Fantasy  Horror  Musical  Mystery  Romance  Science Fiction  \
Movie 1        0       0        0        0        1                1   
Movie 2        0       1        0        0        0                0   
Movie 3        1       0        0        0        0                1   
Movie 4        0       0        0        0        0                0   
Movie 5        0       1        0        0        0                0   

         Thriller  Western  
Movie 1         1        0  

In [17]:
# Pairwise cosine similarity between the genre vectors of all movies
cosine_sim_matrix = cosine_similarity(df_movies)

# Label both axes with movie names so pairs can be looked up by title
cosine_sim_df = pd.DataFrame(
    data=cosine_sim_matrix,
    columns=movie_names,
    index=movie_names,
)

# Q2: similarity of the (Movie 1, Movie 10) pair
similarity_1_10 = cosine_sim_df.at['Movie 1', 'Movie 10']
print(f"Q2: Cosine similarity between Movie 1 and Movie 10: {similarity_1_10:.4f}")

Q2: Cosine similarity between Movie 1 and Movie 10: 0.3162


In [18]:
# Q3: Rank all movies by similarity to Movie 50, most similar first
movie_50_similarities = cosine_sim_df['Movie 50'].sort_values(ascending=False)

# A movie is trivially identical to itself, so mask out Movie 50 before
# taking the five best matches
is_other_movie = movie_50_similarities.index != 'Movie 50'
top_5_similar = movie_50_similarities.loc[is_other_movie].iloc[:5]

print("Q3: Top 5 most similar movies to Movie 50:", list(top_5_similar.index), sep="\n")
print("\nSimilarity scores:", top_5_similar, sep="\n")

Q3: Top 5 most similar movies to Movie 50:
['Movie 72', 'Movie 1', 'Movie 2', 'Movie 61', 'Movie 76']

Similarity scores:
Movie 72    0.75000
Movie 1     0.67082
Movie 2     0.57735
Movie 61    0.57735
Movie 76    0.57735
Name: Movie 50, dtype: float64


## Part 2: Collaborative Filtering (Questions 4-5)

In [19]:
# Read the ratings table; each row is one item (A-J) and each column one user
df_ratings = pd.read_csv('dataset_ga_10_part_2.csv')

print("Dataset Shape:", df_ratings.shape)
print("\nFirst few rows:", df_ratings.head(), sep="\n")

Dataset Shape: (10, 10)

First few rows:
   User 1  User 2  User 3  User 4  User 5  User 6  User 7  User 8  User 9  \
0       4       4       2       3       4       4       2       0       5   
1       5       3       5       4       1       4       4       5       1   
2       3       5       4       4       4       4       5       5       5   
3       5       2       1       1       2       5       2       2       5   
4       5       4       1       3       2       3       2       5       1   

   User 10  
0        3  
1        1  
2        3  
3        3  
4        1  


In [20]:
# Flip the table so that users become rows and items become columns
user_item_matrix = df_ratings.transpose()

# Items are labelled A through J; users are numbered from 1
user_item_matrix.columns = list('ABCDEFGHIJ')
user_item_matrix.index = [
    f'User {number}' for number in range(1, user_item_matrix.shape[0] + 1)
]

print("User-Item Matrix:", user_item_matrix, sep="\n")

User-Item Matrix:
         A  B  C  D  E  F  G  H  I  J
User 1   4  5  3  5  5  2  3  3  3  0
User 2   4  3  5  2  4  0  4  5  1  4
User 3   2  5  4  1  1  3  3  2  4  4
User 4   3  4  4  1  3  5  3  5  1  2
User 5   4  1  4  2  2  1  2  5  2  4
User 6   4  4  4  5  3  1  4  2  4  2
User 7   2  4  5  2  2  4  2  2  4  4
User 8   0  5  5  2  5  2  1  4  4  4
User 9   5  1  5  5  1  1  1  1  4  3
User 10  3  1  3  3  1  3  5  2  2  1


In [21]:
# User-user cosine similarity over the rating vectors.
# cosine_similarity cannot handle NaN, so any NaN is replaced by 0 first.
user_item_filled = user_item_matrix.fillna(0)
user_similarity = cosine_similarity(user_item_filled)

# Same user labels on both axes so pairs can be looked up by name
user_labels = user_item_matrix.index
user_sim_df = pd.DataFrame(data=user_similarity, columns=user_labels, index=user_labels)

print("User-User Similarity Matrix:", user_sim_df.round(4), sep="\n")

User-User Similarity Matrix:
         User 1  User 2  User 3  User 4  User 5  User 6  User 7  User 8  \
User 1   1.0000  0.8186  0.7650  0.8229  0.7510  0.9454  0.7950  0.8061   
User 2   0.8186  1.0000  0.8003  0.8572  0.9451  0.8607  0.8043  0.8309   
User 3   0.7650  0.8003  1.0000  0.8536  0.7927  0.8523  0.9721  0.8747   
User 4   0.8229  0.8572  0.8536  1.0000  0.8407  0.7820  0.8753  0.8279   
User 5   0.7510  0.9451  0.7927  0.8407  1.0000  0.8223  0.8233  0.7938   
User 6   0.9454  0.8607  0.8523  0.7820  0.8223  1.0000  0.8636  0.8083   
User 7   0.7950  0.8043  0.9721  0.8753  0.8233  0.8636  1.0000  0.9087   
User 8   0.8061  0.8309  0.8747  0.8279  0.7938  0.8083  0.9087  1.0000   
User 9   0.7674  0.7504  0.7477  0.6370  0.8286  0.8887  0.8132  0.6795   
User 10  0.8134  0.7917  0.7857  0.8242  0.7907  0.8820  0.8015  0.6360   

         User 9  User 10  
User 1   0.7674   0.8134  
User 2   0.7504   0.7917  
User 3   0.7477   0.7857  
User 4   0.6370   0.8242  
User 5   0

In [22]:
# Q4: Compare the candidate user pairs and report the most similar one
pairs = {
    'a': ('User 8', 'User 10'),
    'b': ('User 1', 'User 9'),
    'c': ('User 2', 'User 5'),
    'd': ('User 7', 'User 3')
}

# Look each pair up once; the scores are reused for printing and for the max
pair_scores = {
    label: user_sim_df.loc[first, second]
    for label, (first, second) in pairs.items()
}

print("Q4: Similarity scores for given pairs:")
for label, (first, second) in pairs.items():
    print(f"{label}. {first} & {second}: {pair_scores[label]:.4f}")

# Option whose pair has the largest similarity (first one wins on ties)
max_pair = max(pairs.items(), key=lambda item: pair_scores[item[0]])
print(f"\nHighest similarity: Option {max_pair[0]}")

Q4: Similarity scores for given pairs:
a. User 8 & User 10: 0.6360
b. User 1 & User 9: 0.7674
c. User 2 & User 5: 0.9451
d. User 7 & User 3: 0.9721

Highest similarity: Option d


In [23]:
# Q5: User-User Collaborative Filtering - Predict rating for Item J by User 1
#
# The prediction is the similarity-weighted average of the neighbours' ratings:
#   prediction = sum(sim(target, u) * rating(u, item)) / sum(sim(target, u))
# taken over every other user u who actually rated the item.
target_user = 'User 1'
target_item = 'J'

# Get users who rated item J (excluding target user).
# NOTE(review): the matrix printed above contains no NaN; unrated entries
# appear to be stored as 0 (e.g. User 1's own J rating). dropna() alone would
# therefore keep them and treat 0 as a real rating, so zeros are removed too.
item_ratings = user_item_matrix[target_item].dropna()
item_ratings = item_ratings[item_ratings > 0]
users_rated_J = item_ratings[item_ratings.index != target_user]

# Get similarity scores between target user and users who rated item J
similarities = user_sim_df.loc[target_user, users_rated_J.index]

# Calculate weighted average rating
numerator = (similarities * users_rated_J).sum()
denominator = similarities.sum()

# With no usable neighbours (or zero total similarity) the weighted average
# is undefined; report NaN explicitly instead of dividing by zero.
predicted_rating = numerator / denominator if denominator != 0 else np.nan

print(f"Q5: Predicted rating of item J for {target_user}: {predicted_rating:.2f}")
print(f"\nDetails:")
print(f"Users who rated item J: {list(users_rated_J.index)}")
print(f"Their ratings: {list(users_rated_J.values)}")
print(f"Similarity scores: {list(similarities.values)}")

Q5: Predicted rating of item J for User 1: 3.07

Details:
Users who rated item J: ['User 2', 'User 3', 'User 4', 'User 5', 'User 6', 'User 7', 'User 8', 'User 9', 'User 10']
Their ratings: [np.int64(4), np.int64(4), np.int64(2), np.int64(4), np.int64(2), np.int64(4), np.int64(4), np.int64(3), np.int64(1)]
Similarity scores: [np.float64(0.8185877337601605), np.float64(0.7650438660612602), np.float64(0.8228805661536404), np.float64(0.751030385791759), np.float64(0.9453509010924139), np.float64(0.7950138754255777), np.float64(0.8060894771674497), np.float64(0.7673830673678427), np.float64(0.8134393832333672)]


## Summary of Answers

In [24]:

# Collect the final answers in display order, then print them in one call
summary_lines = [
    "WEEK 10 - RECOMMENDATION SYSTEMS - SUMMARY OF ANSWERS\n",
    # Section banner: title centred in a 70-character line of dashes
    f"\n{'CONTENT-BASED FILTERING':-^70}",
    f"Q1: Unique number of genres: {len(all_genres)}",
    f"Q2: Cosine similarity (Movie 1 & Movie 10): {similarity_1_10:.4f}",
    f"Q3: Top 5 similar movies to Movie 50: {list(top_5_similar.index)}",
    f"\n{'COLLABORATIVE FILTERING':-^70}",
    "Q4: Highest similarity pair - See output above",
    f"Q5: Predicted rating (Item J, User 1): {predicted_rating:.2f}",
]
print(*summary_lines, sep="\n")

WEEK 10 - RECOMMENDATION SYSTEMS - SUMMARY OF ANSWERS


-----------------------CONTENT-BASED FILTERING------------------------
Q1: Unique number of genres: 15
Q2: Cosine similarity (Movie 1 & Movie 10): 0.3162
Q3: Top 5 similar movies to Movie 50: ['Movie 72', 'Movie 1', 'Movie 2', 'Movie 61', 'Movie 76']

-----------------------COLLABORATIVE FILTERING------------------------
Q4: Highest similarity pair - See output above
Q5: Predicted rating (Item J, User 1): 3.07
