In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds

from surprise import KNNWithMeans, Dataset, accuracy, Reader
from surprise.model_selection import train_test_split

In [2]:
df_movies = pd.read_csv("Data/movies.csv")
df_movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [3]:
df_ratings = pd.read_csv("Data/ratings.csv")
df_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [4]:
df_ratings.drop(columns='timestamp', inplace=True)

In [5]:
df_cleaned = df_movies.merge(df_ratings, on='movieId')

In [6]:
df_cleaned.head()

Unnamed: 0,movieId,title,genres,userId,rating
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,4.5
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,2.5
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17,4.5


In [7]:
df_cleaned['num_viewers'] = df_cleaned.groupby('movieId')['userId'].transform('count')


In [8]:
df_cleaned.head()

Unnamed: 0,movieId,title,genres,userId,rating,num_viewers
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,215
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,215
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,4.5,215
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,2.5,215
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17,4.5,215


In [9]:
df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100836 entries, 0 to 100835
Data columns (total 6 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   movieId      100836 non-null  int64  
 1   title        100836 non-null  object 
 2   genres       100836 non-null  object 
 3   userId       100836 non-null  int64  
 4   rating       100836 non-null  float64
 5   num_viewers  100836 non-null  int64  
dtypes: float64(1), int64(3), object(2)
memory usage: 5.4+ MB


In [10]:
df_clean = df_cleaned.drop_duplicates(subset='title', keep='first')
df_clean.head()

Unnamed: 0,movieId,title,genres,userId,rating,num_viewers
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,215
215,2,Jumanji (1995),Adventure|Children|Fantasy,6,4.0,110
325,3,Grumpier Old Men (1995),Comedy|Romance,1,4.0,52
377,4,Waiting to Exhale (1995),Comedy|Drama|Romance,6,3.0,7
384,5,Father of the Bride Part II (1995),Comedy,6,5.0,49


In [11]:
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9719 entries, 0 to 100835
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   movieId      9719 non-null   int64  
 1   title        9719 non-null   object 
 2   genres       9719 non-null   object 
 3   userId       9719 non-null   int64  
 4   rating       9719 non-null   float64
 5   num_viewers  9719 non-null   int64  
dtypes: float64(1), int64(3), object(2)
memory usage: 531.5+ KB


In [12]:
genres_split = df_clean.genres.apply(lambda x: x.split(sep='|')).apply(pd.value_counts, 1.0).fillna(0.0)
df_clean = pd.concat([df_clean.iloc[:,:], genres_split], axis=1)
df_clean.head()


Unnamed: 0,movieId,title,genres,userId,rating,num_viewers,Animation,Comedy,Fantasy,Children,...,Horror,Mystery,Sci-Fi,War,Musical,Documentary,IMAX,Western,Film-Noir,(no genres listed)
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,215,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
215,2,Jumanji (1995),Adventure|Children|Fantasy,6,4.0,110,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
325,3,Grumpier Old Men (1995),Comedy|Romance,1,4.0,52,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
377,4,Waiting to Exhale (1995),Comedy|Drama|Romance,6,3.0,7,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
384,5,Father of the Bride Part II (1995),Comedy,6,5.0,49,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9719 entries, 0 to 100835
Data columns (total 26 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   movieId             9719 non-null   int64  
 1   title               9719 non-null   object 
 2   genres              9719 non-null   object 
 3   userId              9719 non-null   int64  
 4   rating              9719 non-null   float64
 5   num_viewers         9719 non-null   int64  
 6   Animation           9719 non-null   float64
 7   Comedy              9719 non-null   float64
 8   Fantasy             9719 non-null   float64
 9   Children            9719 non-null   float64
 10  Adventure           9719 non-null   float64
 11  Romance             9719 non-null   float64
 12  Drama               9719 non-null   float64
 13  Crime               9719 non-null   float64
 14  Action              9719 non-null   float64
 15  Thriller            9719 non-null   float64
 16  Horr

In [14]:
df_clean['(no genres listed)'].sum()

34.0

In [15]:
df_clean.drop(columns=['(no genres listed)'], inplace=True)

In [16]:
df_clean.head()

Unnamed: 0,movieId,title,genres,userId,rating,num_viewers,Animation,Comedy,Fantasy,Children,...,Thriller,Horror,Mystery,Sci-Fi,War,Musical,Documentary,IMAX,Western,Film-Noir
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,215,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
215,2,Jumanji (1995),Adventure|Children|Fantasy,6,4.0,110,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
325,3,Grumpier Old Men (1995),Comedy|Romance,1,4.0,52,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
377,4,Waiting to Exhale (1995),Comedy|Drama|Romance,6,3.0,7,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
384,5,Father of the Bride Part II (1995),Comedy,6,5.0,49,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Baseline Model

In [17]:
import pandas as pd
from surprise import Dataset, Reader
from surprise.model_selection import train_test_split
from surprise import SVD
from surprise import accuracy
from sklearn.preprocessing import MultiLabelBinarizer

In [22]:
# create a MultiLabelBinarizer object and fit it on the genres column
mlb = MultiLabelBinarizer()
genres_binarized = mlb.fit_transform(df_clean['genres'].str.split('|'))

# create a new dataframe with the binarized genres
df_genres = pd.DataFrame(genres_binarized, columns=mlb.classes_)

# merge the new dataframe with the original dataframe
df_clean = pd.concat([df_clean[['userId', 'movieId', 'rating']], df_genres], axis=1)

# create a new dataset using the updated dataframe
reader = Reader(rating_scale=(1.0, 5.0))
data = Dataset.load_from_df(df_clean, reader)

# split the data into training and testing sets
trainset, testset = train_test_split(data, test_size=.25)

# train a model using the SVD algorithm
algo = SVD()
algo.fit(trainset)

# use the fitted model to predict ratings on the testing set
predictions = algo.test(testset)

# evaluate the performance of the model using different metrics
accuracy.mae(predictions)
accuracy.mse(predictions)

ValueError: too many values to unpack (expected 3)

# Tuned Model

In [19]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import NearestNeighbors

In [20]:
# import libraries
from surprise import KNNWithMeans, Dataset, Reader
from surprise.model_selection import train_test_split

# define a reader to read the dataframe
reader = Reader(rating_scale=(0, 1))

# create a dataset from the dataframe
data = Dataset.load_from_df(df_clean[['userId', 'movieId', 'num_viewers']], reader)

# split the data into training and testing sets
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

# train a content-based model using the KNN algorithm
k = 10
sim_options = {'name': 'cosine', 'user_based': False}
algo = KNNWithMeans(k=k, sim_options=sim_options)
algo.fit(trainset)


Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x7f78c010d460>

In [21]:
# use the fitted model to predict ratings on the testing set
predictions = algo.test(testset)

# evaluate the performance of the model using different metrics
accuracy.mae(predictions)
accuracy.mse(predictions)

MAE:  9.7083
MSE: 636.1332


636.1332304526749

# Recommendation for Top 5

In [20]:
#create movie mapping directory
movie_to_idx = {movie_name: i for i, movie_name in enumerate(df_clean['movieId'].unique())}

In [35]:
from sklearn.neighbors import NearestNeighbors

# Fit a Nearest Neighbors model using 'num_viewers'
nn = NearestNeighbors(n_neighbors=6, algorithm='auto')
nn.fit(df_clean[['num_viewers']])

def recommend_movies(movie_name):
    # Get the index of the movie in the dataset
    movie_idx = df_clean[df_clean['title'] == movie_name].index[0]

    # Get the 5 most similar movies based on 'num_viewers'
    distances, indices = nn.kneighbors(df_clean.loc[movie_idx, ['num_viewers']].values.reshape(1, -1))

    # Get the titles and number of viewers of the recommended movies
    recommended_movies = df_clean.iloc[indices[0][1:]][['title', 'num_viewers']].sort_values(by='num_viewers', ascending=False)

    return recommended_movies[:5]

In [36]:
recommend_movies('Fight Club (1999)')

Unnamed: 0,title,num_viewers
15651,Terminator 2: Judgment Day (1991),224
14106,Schindler's List (1993),220
0,Toy Story (1995),215
24643,Star Wars: Episode V - The Empire Strikes Back...,211
2379,"Usual Suspects, The (1995)",204


In [5]:
import matplotlib.pyplot as plt

In [None]:
#########################

PLOTS BELOW

PLOTS ABOVE

# Extra Code for Model Tuning 

In [None]:
import surprise
from surprise import Dataset, Reader, SVD
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Load the dataset using Surprise
reader = Reader(rating_scale=(1.0, 5.0))
data = Dataset.load_from_df(df_clean[['userId', 'title', 'rating']], reader)

In [None]:
train, test = train_test_split(data, test_size=0.2, random_state=42)


In [None]:
from surprise import SVD

algo = SVD()
trainset = data.build_full_trainset()
algo.fit(trainset)

In [None]:
#algo.test(test)

In [None]:
surprise.accuracy.mae(algo.test(test))

In [None]:
surprise.accuracy.rmse(algo.test(test))

In [None]:
from surprise.prediction_algorithms.matrix_factorization import SVD
SVD = SVD()

In [None]:
from surprise.model_selection import RandomizedSearchCV

# Define the search space for hyperparameters
param_distributions = {'n_factors': [50, 100, 200],
                       'n_epochs': [10, 20, 30],
                       'lr_all': [0.002, 0.005, 0.01],
                       'reg_all': [0.02, 0.1, 0.4]}
# Create the randomized search object
rs = RandomizedSearchCV(SVD, param_distributions=, n_iter=10, measures=['rmse', 'mae'], cv=5)

# Run the randomized search
rs.fit(data)

# Get the best RMSE score and the corresponding hyperparameters
#best_rmse = rs.best_score['rmse']
#best_params = rs.best_params['rmse']


In [None]:
print('Best RMSE: ' + str(best_rmse))

In [None]:
print('Best Params: ' + str(best_params))