In [456]:
import pandas as pd
import numpy as np
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from itertools import combinations
from fuzzywuzzy import fuzz
import matplotlib.pyplot as plt
plt.rcParams.update({'figure.max_open_warning': 0})
import seaborn as sns
from collections import defaultdict
sns.set_style('whitegrid')
from wordcloud import WordCloud
import warnings
warnings.filterwarnings("ignore")

In [457]:
ratings_df = pd.read_csv('C:\\Users\\Checkout\\Documents\\SJSU Spring 2022\\CMPE 255\\Project\\ml-latest-small\\ratings.csv')
movies_df = pd.read_csv('C:\\Users\\Checkout\\Documents\\SJSU Spring 2022\\CMPE 255\\Project\\ml-latest-small\\movies.csv')

In [458]:
class HybridRecommender():
    def __init__(self,rating_df,movie_df):
        self.rating_df = rating_df
        self.movie_df = movie_df
        self.user_ids = rating_df['userId'].unique()
        self.movie_ids = rating_df['movieId'].unique()

    def PopularRecommendation(self, top_k = 10):
        grouped_rating = self.rating_df.groupby('movieId')['rating'].mean()
        grouped_count = self.rating_df.groupby('movieId')['movieId'].count()

        df_grouped = pd.DataFrame(grouped_count)
        df_grouped.columns = ['count']

        df_group_avg = pd.DataFrame(grouped_rating)
        df_group_avg.columns = ['avg_rating']
        df_grouped = df_grouped.join(df_group_avg, on ='movieId')
        df_grouped.sort_values(by=['count','avg_rating'], ascending=False)
        top_k_rec = df_grouped.loc[df_grouped['count'] > 200].loc[df_grouped['avg_rating']>4.0][:top_k]
        top_movie_rec = self.movie_df.loc[self.movie_df['movieId'].isin(top_k_rec.index.values)].set_index('movieId')
        self.result_df = top_movie_rec
    def genre_recommendations(self,i, M, items, k=10):
        ix = M.loc[:,i].to_numpy().argpartition(range(-1,-k,-1))
        closest = M.columns[ix[-1:-(k+2):-1]]
        closest = closest.drop(i, errors='ignore')
        return pd.DataFrame(closest).merge(items).head(k)
    
    def ContentRecommendation(self, title):
        movies = self.movie_df
        def extract_title(title):
            year = title[len(title)-5:len(title)-1]
            if year.isnumeric():
                title_no_year = title[:len(title)-7]
                return title_no_year
            else:
                return title

        def extract_year(title):
            year = title[len(title)-5:len(title)-1]
            if year.isnumeric():
                return year
            else:
                return np.nan
        movies.rename(columns={'title':'title_year'}, inplace=True) 
        movies['title_year'] = movies['title_year'].apply(lambda x: x.strip()) 
        movies['title'] = movies['title_year'].apply(extract_title) 
        movies['year'] = movies['title_year'].apply(extract_year)
        r,c = movies[movies['genres']=='(no genres listed)'].shape
        print('The number of movies which do not have info about genres:',r)
        movies = movies[~(movies['genres']=='(no genres listed)')].reset_index(drop=True)
        movies['genres'] = movies['genres'].str.replace('|',' ')
        counts = dict()
        for i in movies.index:
            for g in movies.loc[i,'genres'].split(' '):
                if g not in counts:
                    counts[g] = 1
                else:
                    counts[g] = counts[g] + 1
        movies['genres'] = movies['genres'].str.replace('Sci-Fi','SciFi')
        movies['genres'] = movies['genres'].str.replace('Film-Noir','Noir')

        tfidf_vector = TfidfVectorizer(stop_words='english')
        # apply the object to the genres column
        tfidf_matrix = tfidf_vector.fit_transform(movies['genres'])
        sim_matrix = linear_kernel(tfidf_matrix,tfidf_matrix)
        def matching_score(a,b):
            return fuzz.ratio(a,b)
        def get_title_year_from_index(index):
            return movies[movies.index == index]['title_year'].values[0]
        def get_title_from_index(index):
            return movies[movies.index == index]['title'].values[0]
        def get_index_from_title(title):
            return movies[movies.title == title].index.values[0]
        def find_closest_title(title):
            leven_scores = list(enumerate(movies['title'].apply(matching_score, b=title)))
            sorted_leven_scores = sorted(leven_scores, key=lambda x: x[1], reverse=True)
            closest_title = get_title_from_index(sorted_leven_scores[0][0])
            distance_score = sorted_leven_scores[0][1]
            return closest_title, distance_score
        def contents_based_recommender(movie_user_likes, how_many):
            result = []
            closest_title, distance_score = find_closest_title(movie_user_likes)
            if distance_score == 100:
                movie_index = get_index_from_title(closest_title)
                print('Movies similar to '+'\033[1m'+str(closest_title)+'\033[0m'+'.\n')
                movie = movies[movies.index == movie_index]
                display(movie[['movieId','title_year','genres']])
                movie_list = list(enumerate(sim_matrix[int(movie_index)]))
                similar_movies = list(sorted(movie_list,key=lambda x:x[1], reverse=True)) 
      
                for i,s in similar_movies[:how_many]:
                    result.append(i)
                return result
            else:
                print('Did you mean '+'\033[1m'+str(closest_title)+'\033[0m'+'?','\n')
                print('Movies similar to '+'\033[1m'+str(closest_title)+'\033[0m'+'.\n')
                movie_index = get_index_from_title(closest_title)
                movie = movies[movies.index == movie_index]
                display(movie[['movieId','title_year','genres']])
                movie_list = list(enumerate(sim_matrix[int(movie_index)]))
                similar_movies = list(sorted(movie_list,key=lambda x:x[1], reverse=True))
                for i,s in similar_movies[:how_many]:
                    result.append(i)
                return result
        result = contents_based_recommender(title, 10)
        result_df = pd.DataFrame()
        self.result_df = movies[movies.index.isin(result)][['movieId','title_year','genres']]
        #display(result_df)



    
    def SVDCF(self,userId,top_k = 10):
        #print("Using SVD \n")
        #print("-------------\n")
        svdpred_df = pd.read_csv('C:\\Users\\Checkout\\Documents\\SJSU Spring 2022\\CMPE 255\\Project\\ml-latest-small\\svd_predictions.csv')
        user_pred = svdpred_df[svdpred_df['userId']==userId]
        sorted_pred = user_pred.sort_values(by = 'est',ascending=False)
        result_df = sorted_pred.head(top_k)
        result_df = pd.merge(result_df, self.movie_df, on = "movieId", how = "inner")
        result_df.rename(columns = {'r_ui':'rating'}, inplace = True)
        self.result_df = result_df
        #return result_df[['movieId','title','genres']]
    
    def SVDppCF(self,userId,top_k = 10):
        #print("Using SVD++ \n")
        #print("-------------\n")
        svdpppred_df = pd.read_csv('C:\\Users\\Checkout\\Documents\\SJSU Spring 2022\\CMPE 255\\Project\\ml-latest-small\\svdpp_predictions.csv')
        user_pred = svdpppred_df[svdpppred_df['userId']==userId]
        sorted_pred = user_pred.sort_values(by = 'est',ascending=False)
        result_df = sorted_pred.head(top_k)
        result_df = pd.merge(result_df, self.movie_df, on = "movieId", how = "inner")
        result_df.rename(columns = {'r_ui':'rating'}, inplace = True)
        self.result_df = result_df
        #return result_df[['movieId','title','genres']]
    
    def User_Classification(self,userId):
        if userId not in self.user_ids:
            return '0'
        else:
            num_of_rated_movies = len(self.rating_df.loc[self.rating_df.userId == userId]['movieId'].unique())
            if 1 < num_of_rated_movies < 50:
                return '1-50'
            else:
                return '51'
    def Recommend(self, userId):
        classification = self.User_Classification(userId)
        if classification == '0':
            print("Enter 1 to get movies recommended\n")
            print("Enter 2 to search for a movie\n")
            option = input()
            if option == '1':
                return self.PopularRecommendation()
            elif option == '2':
                print("Enter a movie: \n")
                movie = input()
                return self.ContentRecommendation(movie)
            else:
                print("Invalid option\n")
            
        elif classification == '1-50':
            return self.SVDCF(userId)
        else:
            return self.SVDppCF(userId)
    

In [459]:
Hybrid = HybridRecommender(ratings_df, movies_df)

In [460]:
userId = 123234567
Hybrid.Recommend(userId)

Enter 1 to get movies recommended

Enter 2 to search for a movie

2
Enter a movie: 

Shawshank Redemption
The number of movies which do not have info about genres: 34
Did you mean [1mShawshank Redemption, The[0m? 

Movies similar to [1mShawshank Redemption, The[0m.



Unnamed: 0,movieId,title_year,genres
277,318,"Shawshank Redemption, The (1994)",Crime Drama


In [461]:
print('Recommendations for user with userId: '+str(userId)+'\n')
display(Hybrid.result_df)

Recommendations for user with userId: 123234567



Unnamed: 0,movieId,title_year,genres
15,16,Casino (1995),Crime Drama
29,30,Shanghai Triad (Yao a yao yao dao waipo qiao) ...,Crime Drama
33,36,Dead Man Walking (1995),Crime Drama
86,97,"Hate (Haine, La) (1995)",Crime Drama
102,117,"Young Poisoner's Handbook, The (1995)",Crime Drama
211,247,Heavenly Creatures (1994),Crime Drama
245,283,New Jersey Drive (1995),Crime Drama
251,290,Once Were Warriors (1994),Crime Drama
277,318,"Shawshank Redemption, The (1994)",Crime Drama
348,391,Jason's Lyric (1994),Crime Drama
