# Content-Based Recommender
We calculate the similarity of the movie content using cosine similarity and recommend the movie with similar content.

In [2]:
import pandas as pd
import numpy as np
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

class ContentBasedRecommender:
    
    def __init__(self, database):
        
        path_database = database
        self.df = pd.read_csv(path_database, low_memory=True)
        #print("keys:", self.df.keys())
    
    def convert_int(self, x):
        """Function to convert NaN to 0 and all other years to integers"""
        try:
            return int(x)
        except:
            return 0
        
    def get_release_year(self, df):
        
        """Function to ruled-out some movie by precondition

        Args:
            df(object): the dataframe(pandas), which is the dataset
        Return:
            df(object): the dataframe(pandas) with the new collum 'year'

        """
        
        #Convert release_date into pandas datetime format
        df['release_date'] = pd.to_datetime(df['release_date'], errors='coerce')

        #Extract year from the datetime
        df['year'] = df['release_date'].apply(lambda x: str(x).split('-')[0] if x != np.nan else np.nan)
        
        #Apply convert_int to the year feature
        df['year'] = df['year'].apply(self.convert_int)

        return df
    
    def get_genre(self, df):
        
        """Function to ruled-out some movie by precondition

        Args:
            df(object): the dataframe(pandas), which is the dataset
        Return:
            df(object): the dataframe(pandas) with the new collum 'genre' 

        """
        #Convert all NaN into stringified empty lists
        df['genres'] = df['genres'].fillna('[]')

        #Apply literal_eval to convert stringified empty lists to the list object
        df['genres'] = df['genres'].apply(literal_eval)

        #Convert list of dictionaries to a list of strings
        df['genres'] = df['genres'].apply(lambda x: [i['name'].lower() for i in x] if isinstance(x, list) else [])
        
        #Create a new feature by exploding genres
        s = df.apply(lambda x: pd.Series(x['genres']),axis=1).stack().reset_index(level=1, drop=True)

        #Name the new feature as 'genre'
        s.name = 'genre'

        #Create a new dataframe gen_df which by dropping the old 'genres' feature and adding the new 'genre'.
        df = df.join(s)
        
        return df
    
    def precondition(self, df, quantile_num=0.80, runtime=[45,300]):
        
        """Function to ruled-out some movie by precondition

        Args:
            df(object): the dataframe(pandas), which is the dataset
            quantile_num(float): the number to indicate the 80th percentile
            runtime(array): rumtime movies longer than array[0] minutes and shorter than array[1] minutes
            
        Return:
            df_q_movies(object): the movies that made the cut
            
        """
        
        #m (int): the minimum number of votes requred for the movie to be in the chart
        self.m = df['vote_count'].quantile(quantile_num)
        #C (float): the mean rating of all the movies in the dataset
        self.C = df['vote_average'].mean()
        
        #Only consider some movies with runtime conditions 
        df_q_movies = df[(df['runtime'] >= runtime[0]) & (df['runtime'] <= runtime[1])]

        #Only consider movies that have garnered more than m votes
        df_q_movies = df_q_movies[df_q_movies['vote_count'] >= self.m]
        
        return df_q_movies
    
    def weighted_rating(self, df):
        
        """Function to compute the IMDB weighted rating for each movie

        Args:
            df(object): the dataframe(pandas), which is the dataset
        
        Return:
            weight_score(float): the weighted score
            
        """
        v = df['vote_count']
        R = df['vote_average']
        weight_score = (v/(v+self.m) * R) + (self.m/(self.m+v) * self.C)

        return weight_score
    
    def get_preferrence(self):
        
        """Function take useer input to set the user-preferrence """
            
        #Ask for preferred genres
        print("Input preferred genre")
        self.genre = input()

        #Ask for lower limit of duration
        print("Input shortest duration(mins)")
        self.low_time = int(input())

        #Ask for upper limit of duration
        print("Input longest duration(mins)")
        self.high_time = int(input())

        #Ask for lower limit of timeline
        print("Input earliest year")
        self.low_year = int(input())

        #Ask for upper limit of timeline
        print("Input latest year")
        self.high_year = int(input())
    
    def cal_tfidf(self, df, stop_words_list=['english']):
        
        """Function to creat the Term Frequency-Inverse Document Frequency (TF-IDF) matrix

        Args:
            df(object): the dataframe(pandas), which is the dataset that contain 'overview' documents of movies
            stop_words(list): the words that extremly commom in the 'overview' documents of movies
        
        Return:
            tfidf_matrix (tensor): the word vecterized-matrix
        """
        
        #Define a TF-IDF Vectorizer Object. Remove all english stopwords
        tfidf = TfidfVectorizer(stop_words=stop_words_list)

        #Replace NaN with an empty string
        df['overview'] = df['overview'].fillna('')

        #Construct the required TF-IDF matrix by applying the fit_transform method on the overview feature
        tfidf_matrix = tfidf.fit_transform(df['overview'])
        
        return tfidf_matrix
        
    def get_cosine_sim(self, tfidf_matrix):
        
        """Function to compute the cosine similarity matrix 

        Args:
            tfidf_matrix (tensor): the word vecterized-matrix
        
        Return:
            cosine similarity matrix(tensor)
        """
        
        # Compute the cosine similarity matrix
        cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

        return cosine_sim
    
    def get_indices(self, df):
        
        """Function to construct a reverse mapping of indices and movie titles, 
        and drop duplicate titles(if any)"""
        
        indices = pd.Series(df.index, index=df['title']).drop_duplicates()
        
        return indices
        
    def main(self, title_input, see_top =25):
        
        """Function to takes in movie title as input and gives recommendations
        
        Args:
            title_input (string): the movie name
        
        Return:
            recommendation (object):
        """
    
        # Obtain the index of the movie that matches the title
        indices = self.get_indices(self.df)
        idx = indices[title_input]

        # Get the pairwsie similarity scores of all movies with that movie
        # And convert it into a list of tuples 
        tfidf_matrix = self.cal_tfidf(self.df)
        cosine_sim = self.get_cosine_sim(tfidf_matrix)
        sim_scores = list(enumerate(cosine_sim[idx]))

        # Sort the movies based on the cosine similarity scores
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

        # Get the scores of the 'see_top' most similar movies. Ignore the first movie(itself).
        sim_scores = sim_scores[1:see_top+1]

        # Get the movie indices
        movie_indices = [i[0] for i in sim_scores]

        # Return the top 10 most similar movies
        
        return self.df['title'].iloc[movie_indices]

In [3]:
#set the CSV File into df
database = r'C:\Users\MMIL\Panithan\Git_projects\movies_metadata.csv'

recommender= ContentBasedRecommender(database)
recommender.main(title_input='The Lion King', see_top =10)

  self.df = pd.read_csv(path_database, low_memory=True)


34682    How the Lion Cub and the Turtle Sang a Song
9115                  The Lion King 2: Simba's Pride
7658                                The Razor's Edge
9353                                The Lion King 1½
27933              Massaï, les guerriers de la pluie
13079                                     Crows Zero
31208                                     White Lion
14797                                Extreme Justice
28909                            House of Mortal Sin
11244     The Criminal Life of Archibaldo de la Cruz
Name: title, dtype: object

In [4]:
recommender.main(title_input='Iron Man 2', see_top =10)

26558    Avengers: Age of Ultron
2782               The Dark Half
12588                   Iron Man
20047             The Guilt Trip
386                Jason's Lyric
20830                 Iron Man 3
1833          All the King's Men
43540          The Kinematograph
1203               Touch of Evil
19854             Excuse My Dust
Name: title, dtype: object