In [1]:
import numpy as np
import pandas as pd
import nltk
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
# Fetch the NLTK resources this notebook requests. WordNetLemmatizer (used in
# RecommendationSystem.clean_feature_and_return_ndarray) relies on the "wordnet"
# corpus; "punkt" and "stopwords" are not referenced by any code visible in this
# notebook -- TODO confirm they are still needed.
nltk.download(["punkt","stopwords","wordnet"])

[nltk_data] Downloading package punkt to /Users/sr7037/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sr7037/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/sr7037/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
class RecommendationSystem:
    """Movie recommender working on a movies table and a ratings table.

    Three strategies are offered:

    * ``content_based_filtering`` - movies whose genres resemble the movie the
      user rated most recently.
    * ``collaborative_filtering`` - movies rated by the user whose per-genre
      rating counts are nearest to the given user's.
    * ``based_on_ratings`` - movies whose user ratings correlate with those of
      a given movie.

    Attributes:
        movies: DataFrame read from the movies file; the code uses its
            ``movieId``, ``title`` and ``genres`` columns.
        ratings: DataFrame read from the ratings file; the code uses its
            ``userId``, ``movieId``, ``rating`` and ``timestamp`` columns.
        dataset: ``movies`` merged with ``ratings`` on the columns they share.
    """

    # Sort key for titles that do not end in "(YYYY)"; 0 places them last when
    # sorting newest-first.
    _UNKNOWN_YEAR = 0

    def __init__(self, movies_path="movies.csv", ratings_path="ratings.csv"):
        """Load both CSV files and build the merged movie/rating table.

        Args:
            movies_path: CSV file holding the movies table.
            ratings_path: CSV file holding the ratings table.
        """
        self.movies = pd.read_csv(movies_path)
        self.ratings = pd.read_csv(ratings_path)
        self.dataset = self.movies.merge(self.ratings)

    @staticmethod
    def _release_year(title):
        """Return the year parsed from a title ending in ``(YYYY)``.

        Trailing whitespace is ignored. Titles whose last characters are not a
        parsable year yield ``_UNKNOWN_YEAR`` instead of raising.
        """
        try:
            return int(str(title).rstrip()[-5:-1])
        except ValueError:
            return RecommendationSystem._UNKNOWN_YEAR

    def sort_movies_by_year(self, li):
        """Sort a list of movie titles in place, newest release first.

        The sort is stable, so titles sharing a year keep the relative order
        they had on entry (callers pass lists already ordered by relevance).
        Empty lists and titles without a year are accepted.

        Args:
            li: list of title strings; modified in place.
        """
        li.sort(key=RecommendationSystem._release_year, reverse=True)

    def get_movie_by_id(self, mv_id):
        """Return the title of the movie whose ``movieId`` equals ``mv_id``.

        Raises:
            IndexError: if no row of ``self.movies`` has that id.
        """
        titles = self.movies.loc[self.movies["movieId"] == mv_id, "title"]
        if titles.empty:
            raise IndexError("no movie with movieId={!r}".format(mv_id))
        return titles.iloc[0]

    def clean_feature_and_return_ndarray(self, genres):
        """Turn pipe-separated genre strings into a bag-of-words count matrix.

        Each entry is lower-cased, split on ``|``, lemmatized token by token
        and re-joined with spaces before being fed to a ``CountVectorizer``.

        Args:
            genres: iterable of genre strings such as ``"Action|Comedy"``.

        Returns:
            A tuple ``(X, cv, li)``: the dense count matrix (one row per input
            entry), the fitted ``CountVectorizer`` and the list of cleaned
            genre strings.
        """
        lemmatizer = WordNetLemmatizer()
        cleaned_by_raw = {}  # the same genre strings repeat; clean each once
        li = []
        for raw in genres:
            if raw not in cleaned_by_raw:
                tokens = raw.lower().split("|")
                cleaned_by_raw[raw] = " ".join(lemmatizer.lemmatize(word) for word in tokens)
            li.append(cleaned_by_raw[raw])

        cv = CountVectorizer()
        return cv.fit_transform(li).toarray(), cv, li

    def _latest_movie_id_watched(self, uid):
        """Return the ``movieId`` of the rating with the largest timestamp for ``uid``."""
        user_ratings = self.ratings.loc[self.ratings["userId"] == uid, ["movieId", "timestamp"]]
        if user_ratings.empty:
            raise IndexError("userId {!r} has no ratings".format(uid))
        return user_ratings.sort_values(by="timestamp", ascending=False)["movieId"].iloc[0]

    def content_based_filtering(self, userId, no_of_movies=15):
        """Recommend movies whose genres are closest to the user's latest rated movie.

        Candidates are ranked by cosine similarity of their genre count
        vectors, the top ``no_of_movies`` are kept, and that selection is then
        ordered newest-first before being printed.

        Args:
            userId: id of the user to recommend for.
            no_of_movies: maximum number of titles to recommend.

        Returns:
            The recommended titles (at most ``no_of_movies``), newest first.

        Raises:
            IndexError: if the user has no ratings or the rated movie is not
                present in ``self.movies``.
        """
        X, _, _ = self.clean_feature_and_return_ndarray(self.movies["genres"])

        latest_movie_id = self._latest_movie_id_watched(userId)
        # Row *position* (not index label) of that movie, so it lines up with X.
        positions = np.flatnonzero((self.movies["movieId"] == latest_movie_id).to_numpy())
        if positions.size == 0:
            raise IndexError("movieId {!r} is not in the movies table".format(latest_movie_id))
        movie_pos = positions[0]

        # Only the watched movie's row is needed, so avoid the full N x N matrix.
        scores = cosine_similarity(X[movie_pos:movie_pos + 1], X)[0]
        ranked = np.argsort(-scores, kind="stable")  # stable: ties keep table order
        ranked = ranked[ranked != movie_pos][:no_of_movies]

        all_titles = self.movies["title"].to_numpy()
        li = [all_titles[pos] for pos in ranked]
        self.sort_movies_by_year(li)

        print("Since u have watched --->",self.get_movie_by_id(latest_movie_id),"<--- We recommend you",end="\n\n")
        for title in li:
            print(title)

        return li

    def collaborative_filtering(self, uid, no_of_movies=15):
        """Recommend movies rated by the user most similar to ``uid``.

        Every user is described by how many of their rated movies carry each
        genre token; the nearest other user under ``NearestNeighbors`` is
        chosen, and the titles that user rated but ``uid`` did not are
        returned, newest first.

        Args:
            uid: id of the user to recommend for.
            no_of_movies: maximum number of titles to recommend.

        Returns:
            The recommended titles (at most ``no_of_movies``), newest first.

        Raises:
            IndexError: if ``uid`` has no rows in ``self.dataset``.
        """
        X, cv, _ = self.clean_feature_and_return_ndarray(self.dataset["genres"])
        # get_feature_names() was removed in scikit-learn 1.2; use its
        # replacement when the installed version provides it.
        if hasattr(cv, "get_feature_names_out"):
            feature_names = cv.get_feature_names_out()
        else:
            feature_names = cv.get_feature_names()
        genre_counts = pd.DataFrame(X, columns=feature_names)

        # One row per user, indexed by the real user id.
        profiles = genre_counts.groupby(self.dataset["userId"].to_numpy()).sum()
        if uid not in profiles.index:
            raise IndexError("userId {!r} has no ratings".format(uid))
        user_ids = profiles.index.to_numpy()
        vectors = profiles.to_numpy()
        uid_pos = profiles.index.get_loc(uid)

        model = NearestNeighbors()
        model.fit(vectors)
        # kneighbors returns row positions; two neighbours always include at
        # least one user other than `uid`, even when profiles tie.
        neighbour_pos = model.kneighbors(
            vectors[uid_pos:uid_pos + 1],
            n_neighbors=min(2, len(user_ids)),
            return_distance=False,
        )[0]
        other_ids = [user_ids[pos] for pos in neighbour_pos if user_ids[pos] != uid]

        movies_list = []
        if other_ids:
            already_rated = set(self.dataset.loc[self.dataset["userId"] == uid, "title"])
            candidate_titles = self.dataset.loc[self.dataset["userId"] == other_ids[0], "title"]
            movies_list = [title for title in candidate_titles if title not in already_rated]

        self.sort_movies_by_year(movies_list)
        movies_list = movies_list[:no_of_movies]

        print("U May Like These Movies \n\n")
        for title in movies_list:
            print(title)

        return movies_list

    def based_on_ratings(self, movieId, no_of_movies=15, min_ratings=50):
        """Recommend movies whose ratings correlate with those of ``movieId``.

        A user x title rating matrix is built and every title is correlated
        with the given movie over the users who rated both. Titles with fewer
        than ``min_ratings`` ratings, titles with an undefined correlation and
        the given movie itself are excluded; the rest are ordered by
        correlation, then by number of ratings, both descending.

        Args:
            movieId: id of the reference movie.
            no_of_movies: maximum number of titles to recommend.
            min_ratings: minimum number of ratings a title needs to qualify.

        Returns:
            The recommended titles (at most ``no_of_movies``), best first.
        """
        title = self.get_movie_by_id(movieId)
        rating_counts = self.dataset.groupby("title")["rating"].count()
        users_movie_matrix = self.dataset.pivot_table(index="userId", columns="title", values="rating")

        correlations = users_movie_matrix.corrwith(users_movie_matrix[title])
        summary = pd.DataFrame({
            "Correlation": correlations,
            # Align counts by title rather than by position.
            "number of ratings": rating_counts.reindex(correlations.index),
        })
        summary = summary[summary["number of ratings"] >= min_ratings]
        # Drop the reference movie explicitly instead of assuming it ranks first.
        summary = summary.drop(index=title, errors="ignore")
        summary = summary.dropna(subset=["Correlation"])
        summary = summary.sort_values(by=["Correlation", "number of ratings"], ascending=False)

        recommended_movies = summary.index[:no_of_movies].tolist()
        print("Movies which have similar ratings like given movie --->",title,"<--- are",end="\n\n")
        for recommended in recommended_movies:
            print(recommended)

        return recommended_movies

    def recommend(self, user_id=None, movie_id=None):
        """Run every strategy the supplied ids allow and print the results.

        With a ``user_id`` the content-based and collaborative recommendations
        are printed; with a ``movie_id`` the rating-correlation recommendations
        are printed; with both, all three. With neither, an error message is
        printed and nothing else happens.
        """
        if user_id is None and movie_id is None:
            print("Error, No user id or movie id found")
            return

        if user_id is not None:
            self.content_based_filtering(user_id)
            print("\n\n\n")
            self.collaborative_filtering(user_id)
            if movie_id is not None:
                print("\n\n\n")

        if movie_id is not None:
            self.based_on_ratings(movie_id)
            
        

In [4]:
# Build the recommender; the constructor reads movies.csv and ratings.csv
# from the current working directory.
rs = RecommendationSystem()

# Called with neither a user id nor a movie id, recommend() only prints an
# error message.
rs.recommend()

Error, No user id or movie id found


In [5]:
# Inspect the frame produced in __init__ by merging the movies table with the ratings table.
rs.dataset

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,2,5.0,859046895
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,1303501039
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,8,5.0,858610933
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,11,4.0,850815810
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,14,4.0,851766286
...,...,...,...,...,...,...
105334,148238,A Very Murray Christmas (2015),Comedy,475,3.0,1451213043
105335,148626,The Big Short (2015),Drama,458,4.0,1452014749
105336,148626,The Big Short (2015),Drama,576,4.5,1451687664
105337,148626,The Big Short (2015),Drama,668,4.5,1451148148


In [7]:
# Inspect the ratings table as loaded from ratings.csv.
rs.ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,16,4.0,1217897793
1,1,24,1.5,1217895807
2,1,32,4.0,1217896246
3,1,47,4.0,1217896556
4,1,50,4.0,1217896523
...,...,...,...,...
105334,668,142488,4.0,1451535844
105335,668,142507,3.5,1451535889
105336,668,143385,4.0,1446388585
105337,668,144976,2.5,1448656898
