In [7]:
!pip install scikit-surprise



In [8]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds

from surprise import KNNWithMeans, Dataset, accuracy, Reader
from surprise.model_selection import train_test_split

In [9]:
#df_movies = pd.read_csv("/content/sample_data/movies.csv")
df_movies = pd.read_csv("Data/movies.csv")
df_movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [10]:
#df_ratings = pd.read_csv("/content/sample_data/ratings.csv")
df_ratings = pd.read_csv("Data/ratings.csv")
df_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [11]:
df_ratings.drop(columns='timestamp', inplace=True)

In [12]:
df_cleaned = df_movies.merge(df_ratings, on='movieId')

In [13]:
df_cleaned['num_viewers'] = df_cleaned.groupby('movieId')['userId'].transform('count')


In [14]:
df_clean = df_cleaned.drop_duplicates(subset='title', keep='first')
df_clean.head()

Unnamed: 0,movieId,title,genres,userId,rating,num_viewers
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,215
215,2,Jumanji (1995),Adventure|Children|Fantasy,6,4.0,110
325,3,Grumpier Old Men (1995),Comedy|Romance,1,4.0,52
377,4,Waiting to Exhale (1995),Comedy|Drama|Romance,6,3.0,7
384,5,Father of the Bride Part II (1995),Comedy,6,5.0,49


In [15]:
genres_split = df_clean.genres.apply(lambda x: x.split(sep='|')).apply(pd.value_counts, 1.0).fillna(0.0)
df_clean = pd.concat([df_clean.iloc[:,:], genres_split], axis=1)
df_clean.head()


Unnamed: 0,movieId,title,genres,userId,rating,num_viewers,Comedy,Animation,Adventure,Children,...,Horror,Mystery,Sci-Fi,War,Musical,Documentary,IMAX,Western,Film-Noir,(no genres listed)
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,215,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
215,2,Jumanji (1995),Adventure|Children|Fantasy,6,4.0,110,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
325,3,Grumpier Old Men (1995),Comedy|Romance,1,4.0,52,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
377,4,Waiting to Exhale (1995),Comedy|Drama|Romance,6,3.0,7,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
384,5,Father of the Bride Part II (1995),Comedy,6,5.0,49,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
df_clean['(no genres listed)'].sum()

34.0

In [17]:
df_clean.drop(columns=['(no genres listed)'], inplace=True)

# Hybrid Model

In [18]:
import pandas as pd
import surprise
from surprise import accuracy
from surprise import Dataset, Reader, SVD, KNNWithMeans
from surprise.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import NearestNeighbors

In [20]:
# create a reader to read the dataframe
reader = Reader(rating_scale=(1.0, 5.0))

# create a surprise dataset from the dataframe
data = Dataset.load_from_df(df_clean[['userId', 'movieId', 'rating']], reader)

# split the data into training and testing sets
trainset, testset = train_test_split(data, test_size=0.25)

# train a content-based model using the KNN algorithm
k = 10
sim_options = {'name': 'cosine', 'user_based': False}
algo_cb = KNNWithMeans(k=k, sim_options=sim_options)
algo_cb.fit(trainset)

# train a collaborative filtering model using SVD
algo_cf = SVD()
algo_cf.fit(trainset)

# for each user and item pair in the test set, make predictions using both models
predictions_cf = algo_cf.test(testset)
predictions_cb = algo_cb.test(testset)


Computing the cosine similarity matrix...
Done computing similarity matrix.


In [21]:
# assign weights to the models
weight_cf = 0.5
weight_cb = 0.5

# combine the predictions from both models using a weighted average
predictions_combined = []

# iterate over each prediction in the test set
for pred_cf, pred_cb in zip(predictions_cf, predictions_cb):
    # extract the user ID and item ID from the prediction
    uid, iid = pred_cf.uid, pred_cf.iid
    
    # calculate the weighted sum of the predicted ratings
    rating_combined = (weight_cf * pred_cf.est) + (weight_cb * pred_cb.est)
    
    # create a new prediction object with the combined rating
    pred_combined = surprise.prediction_algorithms.predictions.Prediction(uid, iid, r_ui=None, est=rating_combined, details=None)
    
    # add the combined prediction to the list
    predictions_combined.append(pred_combined)


In [80]:
*******************************************

SyntaxError: ignored

In [23]:
##make predictions for all the movies that the user has not yet rated using the hybrid model

# get the list of all movie IDs that the user has not yet rated
user_id = 1  # replace with the user ID you want to get recommendations for
movie_ids = df_clean[~df_clean['movieId'].isin(df_clean[df_clean['userId'] == user_id]['movieId'])]['movieId']

# create a list of (user_id, movie_id, 0) tuples to make predictions on
testset = [[user_id, movie_id, 0] for movie_id in movie_ids]

# make predictions using both models
#predictions_cf = algo_cf.test(testset)
#predictions_cb = algo_cb.test(testset)

# combine the predictions from both models using a weighted average
predictions_combined = [(weight_cf * pred_cf.est) + (weight_cb * pred_cb.est) for pred_cf, pred_cb in zip(predictions_cf, predictions_cb)]

# add the predictions to the dataframe
df_clean.loc[~df_clean['movieId'].isin(df_clean[df_clean['userId'] == user_id]['movieId']), 'hybrid_score'] = predictions_combined

ValueError: cannot set using a multi-index selection indexer with a different length than the value

In [87]:
#sort the movies by their hybrid scores and select the top 5 movies

# get the top 5 movie recommendations based on the hybrid scores
top_5_movies = df_clean[df_clean['userId'] != user_id].sort_values(by='hybrid_score', ascending=False).head(5)['title']
print(top_5_movies)

43846                         Tea with Mussolini (1999)
67916    Son of the Bride (Hijo de la novia, El) (2001)
61087                   Center of the World, The (2001)
78044                Play Time (a.k.a. Playtime) (1967)
87129                              Son of Rambow (2007)
Name: title, dtype: object


# Recommendation for Top 5

In [None]:
# fit the nearest neighbors model
nn = NearestNeighbors(metric='cosine', algorithm='brute')
nn.fit(df_clean[['rating']])

In [None]:
# create a new DataFrame with movie title, average rating, and number of viewers
movie_stats = df_clean.groupby('title').agg({'rating': [np.size, np.mean]})

# select the 5 movies with the highest average rating
top_movies = movie_stats['rating']['mean'].nlargest(5)

# select the movie titles of the top 5 movies
top_movie_titles = top_movies.index.tolist()

# filter the dataframe to only include the top 5 movies
top_movie_df = df_clean[df_clean['title'].isin(top_movie_titles)]

# group the data by movie title and rating, and count the number of viewers for each rating
top_movie_grouped = top_movie_df.groupby(['title', 'rating']).agg({'userId': 'count'}).reset_index()

# print the top 5 movies with their average rating, number of viewers, and each rating
for movie_title in top_movie_titles:
    print(f'Top 5 ratings for {movie_title}:')
    print(top_movie_grouped[top_movie_grouped['title'] == movie_title])

In [None]:
#########################

PLOTS BELOW

PLOTS ABOVE

# Extra Code for Model Tuning 

In [None]:
import surprise
from surprise import Dataset, Reader, SVD
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Load the dataset using Surprise
reader = Reader(rating_scale=(1.0, 5.0))
data = Dataset.load_from_df(df_clean[['userId', 'title', 'rating']], reader)

In [None]:
train, test = train_test_split(data, test_size=0.2, random_state=42)


In [None]:
from surprise import SVD

algo = SVD()
trainset = data.build_full_trainset()
algo.fit(trainset)

In [None]:
#algo.test(test)

In [None]:
surprise.accuracy.mae(algo.test(test))

In [None]:
surprise.accuracy.rmse(algo.test(test))

In [None]:
from surprise.prediction_algorithms.matrix_factorization import SVD
SVD = SVD()

In [None]:
from surprise.model_selection import RandomizedSearchCV

# Define the search space for hyperparameters
param_distributions = {'n_factors': [50, 100, 200],
                       'n_epochs': [10, 20, 30],
                       'lr_all': [0.002, 0.005, 0.01],
                       'reg_all': [0.02, 0.1, 0.4]}
# Create the randomized search object
rs = RandomizedSearchCV(SVD, param_distributions=, n_iter=10, measures=['rmse', 'mae'], cv=5)

# Run the randomized search
rs.fit(data)

# Get the best RMSE score and the corresponding hyperparameters
#best_rmse = rs.best_score['rmse']
#best_params = rs.best_params['rmse']


In [None]:
print('Best RMSE: ' + str(best_rmse))

In [None]:
print('Best Params: ' + str(best_params))