In [12]:
import zipfile
import pandas as pd
import numpy as np
import random
# import re
from scipy.sparse import csr_matrix, coo_matrix
import matplotlib.pyplot as plt
# import seaborn as sns
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.metrics.pairwise import cosine_similarity

import warnings
warnings.filterwarnings('ignore')

In [2]:
movies = pd.read_csv('/kaggle/input/movie-recommendation-system/movies.csv')
ratings = pd.read_csv('/kaggle/input/movie-recommendation-system/ratings.csv').drop(columns=['timestamp'])

In [3]:
print(len(movies))
movies['old_id'] = movies['movieId']
movies['movieId'] = movies.index
movies = movies[['movieId', 'title', 'genres', 'old_id']]
movies.head()

62423


Unnamed: 0,movieId,title,genres,old_id
0,0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1
1,1,Jumanji (1995),Adventure|Children|Fantasy,2
2,2,Grumpier Old Men (1995),Comedy|Romance,3
3,3,Waiting to Exhale (1995),Comedy|Drama|Romance,4
4,4,Father of the Bride Part II (1995),Comedy,5


In [4]:
id_mapper = {k: v for k, v in zip(movies['old_id'], movies['movieId'])}

In [5]:
print(len(ratings))
ratings['movieId'] = ratings['movieId'].map(id_mapper)
ratings['userId'] = ratings['userId'] - 1
ratings.head()

25000095


Unnamed: 0,userId,movieId,rating
0,0,292,5.0
1,0,302,3.5
2,0,303,5.0
3,0,654,5.0
4,0,878,3.5


# Similarity search

In [None]:
def clean(text):
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    return text

In [None]:
# Clean title
movies['clean_title'] = movies['title'].apply(lambda x: clean(x))

# Fit TF-IDF vectorizer for embedding words for similarity check
vectorizer = TfidfVectorizer(ngram_range=(1, 2))
tfidf_matrix = vectorizer.fit_transform(movies['clean_title'])

## Book search engine

In [None]:
# Search movie titles similar to queried text using Cosine similarity
def query_search(query):
    query_vector = vectorizer.transform([query])
    cosine_similarities = cosine_similarity(query_vector, tfidf_matrix).flatten()
    indices = np.argpartition(cosine_similarities, -10)[-10:]
    results = movies.iloc[indices][::-1]
    return results

## Recommendation engine

In [None]:
def get_recommendations(movie_id):

    # Find users who watched the same movie and liked it (i.e. rated above 5)
    similar_users = ratings[(ratings['movieId']==movie_id) & (ratings['rating'] > 4)]['userId'].unique()
    # Find other movies watched by above users and rated highly
    similar_user_recs = ratings[(ratings['userId'].isin(similar_users)) & (ratings['rating'] > 4)]['movieId']


    # Filter out the movies that have been watched by at least 20% of the users
    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
    similar_user_recs = similar_user_recs[similar_user_recs > 0.2]


    # Now that we know how much similar users liked the recommendedations. Now we will get how all users liked the recommendations. Similar and dissimilar to user
    all_users = ratings[(ratings['movieId'].isin(similar_user_recs.index)) & (ratings['rating'] > 4)]
    # Normalize the counts with total number of unique uses there are
    all_user_recs = all_users['movieId'].value_counts() / len(all_users['userId'].unique())


    # Create a score metric (how much similar people like it / how much people like it in general). A Higher score suggests a more targeted recommendation
    rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
    rec_percentages.columns = ['similar', 'all']
    rec_percentages['score'] = rec_percentages['similar'] / rec_percentages['all']
    rec_percentages = rec_percentages.sort_values('score', ascending=False)

    # Return top k recommendations
    return rec_percentages.head(5).merge(movies, left_index=True, right_on='movieId')[['score', 'title', 'genres']]

In [None]:
query = "Willy wonka and the chocolate factory"
results = query_search(query)

movie_id = results.iloc[0]['movieId']
recommendations = get_recommendations(movie_id)

print(f"If you liked: {query}\n\nYou will also like:\n")
try:
    for index in recommendations.index:
        print(f"--> {recommendations['title'].loc[index]}")
except:
    print(recommendations)

If you liked: Willy wonka and the chocolate factory



You will also like:



--> Chocolate (2008)

--> Ong-Bak: The Thai Warrior (Ong Bak) (2003)

--> Ip Man (2008)

--> House of Flying Daggers (Shi mian mai fu) (2004)

--> Kung Fu Hustle (Gong fu) (2004)


# Matrix Factorization

In [6]:
movie_dict = {k: v for k, v in zip(movies.movieId, movies.title)}

# all_genres = [genre for genres in movies['genres'].str.split('|') for genre in genres]
# unique_genres = list(set(all_genres))
# len(unique_genres)

In [28]:
ratings['userId'] = ratings['userId'].astype('category')
ratings['movieId'] = ratings['movieId'].astype('category')

arr = csr_matrix((ratings['rating'].values, (ratings['userId'].cat.codes, ratings['movieId'].cat.codes)))

In [29]:
coom = arr.tocoo()
row_indices = coom.row
column_indices = coom.col
values = coom.data

# Calculate sum and count of values per row
row_sums = np.bincount(row_indices, weights=values)
row_counts = np.bincount(row_indices)

# Compute the mean for each row
row_means = row_sums / row_counts

# Normalize values by subtracting the mean
normalized_values = values - row_means[row_indices]

arr = csr_matrix((normalized_values, (row_indices, column_indices)))

In [55]:
arr.shape

(162541, 59047)

# Training

In [24]:
import torch
import torch.optim as optim
# import torch.nn as nn
import torch.nn.functional as F

In [58]:
torch.manual_seed(9)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
test_P = torch.rand(arr.shape[0], 200, requires_grad=True, device=device)
# test_P = torch.bernoulli(torch.full((arr.shape[0], 20), 0.5, device=device)).requires_grad_(True)
test_Q = torch.rand(200, arr.shape[1], requires_grad=True, device=device)
test_Q.data *= 5
chunk = 20000
size = int(np.ceil(arr.shape[0]/chunk) * np.ceil(arr.shape[1]/chunk))
ctr = 0
# test_set = torch.tensor(arr[:k, :k].todense(), dtype=torch.float32)
# mask = (test_set > 0).int()
# loss_ref = torch.mul(test_set, mask)
size

27

In [59]:
optimizer = optim.AdamW(params=[test_P, test_Q], lr=0.1)

In [60]:
epochs = 20
lambda_reg = 0.01
for epoch in range(epochs):
    # running_loss = torch.zeros((size, ), requires_grad=True, device=device)
    running_loss = 0
    for i in range(0, arr.shape[0], chunk):
        P_chnk = test_P[i:i+chunk, :]
        for j in range(0, arr.shape[1], chunk):
            Q_chnk = test_Q[:, j:j+chunk]
            loss_ref = torch.tensor(arr[i:i+chunk, j:j+chunk].todense(), dtype=torch.float32)
            mask = (loss_ref > 0).int()
            loss_ref = torch.mul(loss_ref, mask).to(device)

            out = P_chnk @ Q_chnk
            loss = F.mse_loss(loss_ref, out) + lambda_reg * (torch.sum(P_chnk**2) + torch.sum(Q_chnk**2))
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            # running_loss[ctr] = loss.item()
            # ctr += 1
            running_loss += loss.item()

    avg_loss = running_loss/size
    print(f"Epoch: {epoch}; Loss: {avg_loss}")



Epoch: 0; Loss: 247417.76909722222
Epoch: 1; Loss: 75407.89814814815
Epoch: 2; Loss: 16643.633879484954
Epoch: 3; Loss: 3424.4671450014466
Epoch: 4; Loss: 577.0381243670428
Epoch: 5; Loss: 75.54351382785373
Epoch: 6; Loss: 7.477562701260602
Epoch: 7; Loss: 0.5881617538355015
Epoch: 8; Loss: 0.05329093840231912
Epoch: 9; Loss: 0.008852184294179702
Epoch: 10; Loss: 0.0023425920344716696
Epoch: 11; Loss: 0.001211185123355352
Epoch: 12; Loss: 0.0010601179460031239
Epoch: 13; Loss: 0.0010805893127528382


KeyboardInterrupt: 

In [61]:
# torch.cuda.empty_cache()
with torch.no_grad():
    P = test_P.cpu().detach().numpy()
    Q = test_Q.cpu().detach().numpy()

np.save('/kaggle/working/P.npy', P)
np.save('/kaggle/working/Q.npy', Q)

# Inference

In [None]:
P = np.load('/kaggle/working/P.npy')
Q = np.load('/kaggle/working/Q.npy')

In [66]:
user = random.randint(0, arr.shape[0])
user_all_movies = sorted({k: v for k, v in enumerate(np.array(arr[user, :].todense())[0]) if v>0}.items(), key=lambda x: x[1], reverse=True)
user_movies = dict(user_all_movies[:5])
user_movie_id = [x[0] for x in user_all_movies]

print("User's top rated movies:")
for k in user_movies.keys():
    print(f"--> {movie_dict[k]}")
    print(f"    Genre: {movies['genres'].loc[k]}", end="\n\n")

User's top rated movies:
--> Babe (1995)
    Genre: Children|Drama

--> Braveheart (1995)
    Genre: Action|Drama|War

--> Interview with the Vampire: The Vampire Chronicles (1994)
    Genre: Drama|Horror

--> Shawshank Redemption, The (1994)
    Genre: Crime|Drama

--> Forrest Gump (1994)
    Genre: Comedy|Drama|Romance|War



In [67]:
preds = P[user, :] @ Q
# preds = [round(x) if x>0 else 0 for x in preds]

for k in user_movie_id:
    preds[k] = 0

preds = sorted({k: v for k, v in enumerate(preds)}.items(), key=lambda x: x[1], reverse=True)[:5]
preds = [x[0] for x in preds]

print("Recommended movies:")
for k in preds:
    print(f"--> {movie_dict[k]}")
    print(f"    Genre: {movies['genres'].loc[k]}", end="\n\n")

Recommended movies:
--> Big and Little Wong Tin Bar (1962)
    Genre: Action

--> Sixty Million Dollar Man (1995)
    Genre: Comedy|Fantasy|Sci-Fi

--> The Snowman (2017)
    Genre: Crime|Thriller

--> Paris in Spring (1935)
    Genre: (no genres listed)

--> Recess: School's Out (2001)
    Genre: Animation|Children

