### Chapter 3: (ML) Collaborative Filtering

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error
from scipy.sparse.linalg import svds
from sklearn.decomposition import NMF

In [2]:
# Reading ratings file
ratings = pd.read_csv('data-1m/ratings.csv', 
                    sep='\t', #Note that the separator here is "\t"
                    encoding='latin-1',
                    engine='python',
                    index_col=0
                     ) 

# Reading users file
users = pd.read_csv('data-1m/users.csv', 
                    sep='\t', #Note that the separator here is "\t"
                    encoding='latin-1',
                    engine='python',
                    index_col=0
                     )

# # Reading movies file
movies = pd.read_csv('data-1m/movies.csv', 
                    sep='\t', #Note that the separator here is "\t"
                    encoding='latin-1',
                    engine='python',
                    index_col=0
                     )

#Reading the combined file
combined = pd.read_csv('data-1m/dataset_combined.csv')

In [3]:
combined.head()

Unnamed: 0,movie_id,title,genres,user_id,rating,timestamp,gender,age,occupation,zipcode,age_desc,occ_desc
0,1,Toy Story (1995),Animation|Children's|Comedy,1,5,978824268,F,1,10,48067,Under 18,K-12 student
1,1,Toy Story (1995),Animation|Children's|Comedy,6,4,978237008,F,50,9,55117,50-55,homemaker
2,1,Toy Story (1995),Animation|Children's|Comedy,8,4,978233496,M,25,12,11413,25-34,programmer
3,1,Toy Story (1995),Animation|Children's|Comedy,9,5,978225952,M,25,17,61614,25-34,technician/engineer
4,1,Toy Story (1995),Animation|Children's|Comedy,10,5,978226474,F,35,1,95370,35-44,academic/educator


### Data Preparation

Let's first create a **User x Movies matrix**

<img src="img/Screenshot 2024-11-16 at 10.05.41 PM.png" width="750">

In [4]:
#Create user-movie rating matrix
rating_matrix = combined.pivot(index='user_id', columns='movie_id', values='rating').fillna(0)

In [5]:
#Let's print the head of the ratings matrix
rating_matrix.head()

movie_id,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### User-User Collaborative Filtering

Here we find look alike users based on similarity and recommend movies which first user’s look-alike has chosen in past. This algorithm is very effective but takes a lot of time and resources. It requires to compute every user pair information which takes time. Therefore, for big base platforms, this algorithm is hard to implement without a very strong parallelizable system.

<img src="img/ub.png" width="300">

In [9]:
# Calculate user-user similarity matrix
user_similarity = cosine_similarity(rating_matrix)

In [10]:
# Create DataFrame for user similarity
user_similarity_df = pd.DataFrame(
    user_similarity,
    index=rating_matrix.index,
    columns=rating_matrix.index
)

In [11]:
user_similarity_df.head()

user_id,1,2,3,4,5,6,7,8,9,10,...,6031,6032,6033,6034,6035,6036,6037,6038,6039,6040
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.096382,0.12061,0.132455,0.090158,0.179222,0.059678,0.138241,0.226148,0.255288,...,0.170588,0.082006,0.069807,0.033663,0.114877,0.186329,0.135979,0.0,0.174604,0.13359
2,0.096382,1.0,0.151479,0.171176,0.114394,0.100865,0.305787,0.203337,0.190198,0.226861,...,0.112503,0.091222,0.268565,0.014286,0.183384,0.228241,0.206274,0.066118,0.066457,0.218276
3,0.12061,0.151479,1.0,0.151227,0.062907,0.074603,0.138332,0.077656,0.126457,0.213655,...,0.09296,0.125864,0.161507,0.0,0.097308,0.143264,0.107744,0.120234,0.094675,0.133144
4,0.132455,0.171176,0.151227,1.0,0.045094,0.013529,0.130339,0.100856,0.093651,0.120738,...,0.163629,0.093041,0.382803,0.0,0.082097,0.170583,0.127464,0.062907,0.064634,0.137968
5,0.090158,0.114394,0.062907,0.045094,1.0,0.047449,0.126257,0.220817,0.26133,0.117052,...,0.100652,0.035732,0.061806,0.054151,0.179083,0.293365,0.172686,0.020459,0.027689,0.241437


In [12]:
# Find similar users
n_similar_users = 5
user_id = 1
similar_users = user_similarity_df[user_id].sort_values(ascending=False)[1:n_similar_users+1]

# # Get ratings from similar users
similar_user_ratings = rating_matrix.loc[similar_users.index]

# # Calculate weighted average of ratings
weights = similar_users.values.reshape(-1, 1)
weighted_ratings = (similar_user_ratings * weights).sum(axis=0)
norm_weights = weights.sum()

predicted_ratings = weighted_ratings / norm_weights

In [13]:
predicted_ratings.drop(columns = ratings[ratings['user_id'] == 1]['movie_id'].values)

movie_id
1       4.790235
2       0.735169
3       0.000000
4       0.000000
5       0.000000
          ...   
3948    0.000000
3949    0.000000
3950    0.000000
3951    0.000000
3952    0.000000
Length: 3706, dtype: float64

In [14]:
# Get top recommendations
n_recommendations = 5
top_recommendations = predicted_ratings.drop(ratings[ratings['user_id'] == 1]['movie_id'].values). \
sort_values(ascending=False)[:n_recommendations]

print("-" * 60)
print(f"{'Movie Title':<50} Similarity")
print("-" * 60)

for movie_id, pred_rating in top_recommendations.items():
    # Get ratings from similar users for this movie
    similar_ratings = similar_user_ratings[movie_id]    
    print(f"{movies[movies['movie_id'] == movie_id]['title'].values[0]}                 {pred_rating:.2f}")

------------------------------------------------------------
Movie Title                                        Similarity
------------------------------------------------------------
Little Mermaid, The (1989)                 4.21
Jungle Book, The (1967)                 3.44
Silence of the Lambs, The (1991)                 3.37
Lady and the Tramp (1955)                 2.60
Sleeping Beauty (1959)                 2.39


### Item-Item Collaborative Filtering

We try finding movie's look-alike. Once we have movie's look-alike matrix, we can easily recommend alike movies to user who have rated any movie from the dataset. This algorithm is far less resource consuming than user-user collaborative filtering. Hence, for a new user, the algorithm takes far lesser time than user-user collaborate as we don’t need all similarity scores between users. And with fixed number of movies, movie-movie look alike matrix is fixed over time.

<img src="img/ib.png" width="300">

In [6]:
# Calculate item-item similarity matrix using cosine similarity
item_similarity = cosine_similarity(rating_matrix.T)

# # Create DataFrame for item similarity
item_similarity_df = pd.DataFrame(
    item_similarity,
    index=rating_matrix.columns,
    columns=rating_matrix.columns
)

In [7]:
item_similarity_df.head()

movie_id,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.390349,0.267943,0.178789,0.256569,0.347373,0.30149,0.125709,0.10662,0.377459,...,0.099502,0.020966,0.084105,0.081826,0.045949,0.309676,0.186633,0.093479,0.042829,0.182691
2,0.390349,1.0,0.240946,0.155457,0.24997,0.244827,0.262772,0.196521,0.158469,0.3862,...,0.061819,0.015209,0.07531,0.095573,0.074271,0.21365,0.140781,0.087013,0.026063,0.122185
3,0.267943,0.240946,1.0,0.192788,0.30829,0.18702,0.29223,0.092122,0.128378,0.245601,...,0.038492,0.065507,0.049512,0.087377,0.050985,0.190575,0.104837,0.062258,0.010073,0.097786
4,0.178789,0.155457,0.192788,1.0,0.27199,0.12517,0.220024,0.049554,0.060334,0.133707,...,0.055486,0.0533,0.002227,0.025278,0.025204,0.118902,0.096318,0.022588,0.024769,0.095154
5,0.256569,0.24997,0.30829,0.27199,1.0,0.148114,0.305107,0.095512,0.138392,0.237681,...,0.026632,0.083898,0.046399,0.047542,0.016156,0.174554,0.092403,0.051633,0.01075,0.112835


In [8]:
# Get similarity scores for the movie
n_similar = 5
movie_id = 1
similar_scores = item_similarity_df[movie_id]

# Sort similarities in descending order (excluding the movie itself)
similar_movies = similar_scores.sort_values(ascending=False)[1:n_similar+1]

# Print header
print(f"\nMovies similar to '{movies[movies['movie_id'] == movie_id]['title'].values[0]}':")
print("-" * 60)
print(f"{'Movie Title':<50} Similarity")
print("-" * 60)

# Print each similar movie 
for movie_id, similarity in similar_movies.items():
    print(f"{movies[movies['movie_id'] == movie_id]['title'].values[0]}                          {similarity:.3f}")


Movies similar to 'Toy Story (1995)':
------------------------------------------------------------
Movie Title                                        Similarity
------------------------------------------------------------
Toy Story 2 (1999)                          0.633
Groundhog Day (1993)                          0.611
Aladdin (1992)                          0.606
Bug's Life, A (1998)                          0.579
Back to the Future (1985)                          0.570


### Matrix Factorization

<img src="img/Screenshot 2024-11-16 at 10.06.05 PM.png" width="750">

In [18]:
# Fill missing values with 0
R = rating_matrix.fillna(0).values
n_components = 50

# Initialize and fit NMF
model = NMF(n_components=n_components, init='random', random_state=0)

# Fit the model
# W: user latent factor matrix
# H: item latent factor matrix
W = model.fit_transform(R)
H = model.components_

# Reconstruct rating matrix
R_pred = np.dot(W, H)

# Convert to DataFrame for easier handling
predicted_ratings = pd.DataFrame(
    R_pred,
    index=rating_matrix.index,
    columns=rating_matrix.columns
)



In [19]:
predicted_ratings = predicted_ratings.loc[1]

top_recommendations = predicted_ratings.drop(ratings[ratings['user_id'] == 1]['movie_id'].values). \
sort_values(ascending=False)[:n_recommendations]

print("-" * 60)
print(f"{'Movie Title':<50} Similarity")
print("-" * 60)

for movie_id, pred_rating in top_recommendations.items():
    # Get ratings from similar users for this movie
    similar_ratings = similar_user_ratings[movie_id]    
    print(f"{movies[movies['movie_id'] == movie_id]['title'].values[0]}                 {pred_rating:.2f}")

------------------------------------------------------------
Movie Title                                        Similarity
------------------------------------------------------------
Lion King, The (1994)                 2.81
Shawshank Redemption, The (1994)                 2.41
Babe (1995)                 2.31
Little Mermaid, The (1989)                 2.10
Fantasia (1940)                 2.04
