<a href="https://colab.research.google.com/github/sdannels/movie_recommender/blob/main/final_movie_algorithm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture
!pip install fuzzywuzzy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0


In [23]:
%%capture
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from fuzzywuzzy import process
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from string import punctuation
import string
import gensim


# IMDB movie dataset hosted in the project repository.
# The first CSV column is used as the dataframe index.
DATA_URL = 'https://raw.githubusercontent.com/sdannels/movie_recommender/main/IMDB-Movie-Data.csv'

df = pd.read_csv(DATA_URL, header = 0, index_col = 0)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
# function to score movies based on presence of same actors
def get_actors(df, title):
    '''
    Score every other movie by how closely its actor string fuzzy-matches
    the actor string of `title`.

    df: dataframe with 'Title' and 'Actors' columns
    title: title of the reference movie, as a string

    Returns a new dataframe (df itself is not modified) holding df's columns
    plus 'actor_score', min-max scaled to 0-1. Rows whose Title equals `title`
    are not compared and therefore get NaN. The result comes from pd.merge,
    so it does not keep df's original index.

    Raises IndexError if `title` is not present in df['Title'].
    '''
    is_given = df['Title'] == title

    # get string of actor names from given title
    given_actors = df.loc[is_given, 'Actors'].iloc[0]
    # actor strings of all other movies, still keyed by df's index labels
    compare_actors = df.loc[~is_given, 'Actors']

    # score every candidate; each match is unpacked as (text, score, index label)
    matches = process.extract(given_actors, compare_actors, limit = len(df))

    # keep score and index label ('Rank' is only a temporary merge key)
    scores_df = pd.DataFrame(
        [(score, label) for _, score, label in matches],
        columns = ['actor_score', 'Rank'],
    )

    # normalize score to (0-1); if every score is identical there is no
    # spread to scale by, so use 0 instead of dividing by zero
    lowest = scores_df['actor_score'].min()
    spread = scores_df['actor_score'].max() - lowest
    if spread > 0:
        scores_df['actor_score'] = (scores_df['actor_score'] - lowest) / spread
    else:
        scores_df['actor_score'] = 0.0

    # merge scores into given dataframe
    out = pd.merge(df, scores_df, how = 'left', left_on = df.index, right_on = 'Rank')
    out = out.drop(columns = 'Rank')

    return out
    
#actor_matches = get_actors(df, 'Guardians of the Galaxy')

In [8]:
# function to score movies based on same genres
def get_genres(df, title):
    '''
    Score every other movie by how closely its genre string fuzzy-matches
    the genre string of `title`.

    df: dataframe with 'Title' and 'Genre' columns
    title: title of the reference movie, as a string

    Returns a new dataframe (df itself is not modified) holding df's columns
    plus 'genre_score', min-max scaled to 0-1. Rows whose Title equals `title`
    are not compared and therefore get NaN. The result comes from pd.merge,
    so it does not keep df's original index.

    Raises IndexError if `title` is not present in df['Title'].
    '''
    is_given = df['Title'] == title

    # get string of genres from given title
    given_genres = df.loc[is_given, 'Genre'].iloc[0]
    # genre strings of all other movies, still keyed by df's index labels
    compare_genres = df.loc[~is_given, 'Genre']

    # score every candidate; each match is unpacked as (text, score, index label)
    matches = process.extract(given_genres, compare_genres, limit = len(df))

    # keep score and index label ('Rank' is only a temporary merge key)
    scores_df = pd.DataFrame(
        [(score, label) for _, score, label in matches],
        columns = ['genre_score', 'Rank'],
    )

    # normalize score to (0-1); if every score is identical there is no
    # spread to scale by, so use 0 instead of dividing by zero
    lowest = scores_df['genre_score'].min()
    spread = scores_df['genre_score'].max() - lowest
    if spread > 0:
        scores_df['genre_score'] = (scores_df['genre_score'] - lowest) / spread
    else:
        scores_df['genre_score'] = 0.0

    # merge scores into given dataframe
    out = pd.merge(df, scores_df, how = 'left', left_on = df.index, right_on = 'Rank')
    out = out.drop(columns = 'Rank')

    return out

#genre_matches = get_genres(df, 'Guardians of the Galaxy')

In [9]:
# function to find movies directed by the same person
def get_director(df, title):
    '''
    Flag the movies that share a director with `title`.

    df: dataframe with 'Title' and 'Director' columns
    title: title of the reference movie, as a string

    Returns a copy of df with an added integer column 'director_score':
    1 where Director equals the reference movie's director, 0 otherwise.
    The reference movie itself scores 1. The input dataframe is not modified.

    Raises IndexError if `title` is not present in df['Title'].
    '''
    # work on a copy so the caller's dataframe does not gain columns
    out = df.copy()

    # director of the given title (first match if the title is duplicated)
    given_director = out.loc[out['Title'] == title, 'Director'].iloc[0]

    # 1 if match, 0 if not
    out['director_score'] = (out['Director'] == given_director).astype(int)
    return out

#director_match = get_director(df, 'Guardians of the Galaxy')

In [10]:
# function to compare IMDB ratings
def get_IMDB(df, title):
    '''
    Measure how far each movie's IMDB rating is from the rating of `title`.

    df: dataframe with 'Title' and 'Rating' columns
    title: title of the reference movie, as a string

    Returns a copy of df with an added column 'rating_diff': the absolute
    rating difference, min-max scaled to 0-1 (0 = closest rating,
    1 = furthest). If every movie has the same difference the column is 0.
    Missing ratings stay NaN. The input dataframe is not modified.

    Raises IndexError if `title` is not present in df['Title'].
    '''
    # work on a copy so the caller's dataframe does not gain columns
    out = df.copy()

    # rating of the given title (first match if the title is duplicated)
    given_rating = out.loc[out['Title'] == title, 'Rating'].iloc[0]

    # absolute value of differences in film's rating
    rating_diff = (out['Rating'] - given_rating).abs()

    # normalize to 0-1 scale; Series.min/max skip NaN, and the guard avoids
    # a 0/0 division when all differences are equal
    lowest = rating_diff.min()
    spread = rating_diff.max() - lowest
    if spread > 0:
        out['rating_diff'] = (rating_diff - lowest) / spread
    else:
        out['rating_diff'] = rating_diff * 0.0
    return out

#rating_match = get_IMDB(df, 'Guardians of the Galaxy') 

In [11]:
# function to match descriptions using tfidf and cosine similarity
def tfidf_matches(title, indices, cosine_sim):
    '''
    Rank every movie by its similarity to `title`.

    title: title of the reference movie
    indices: mapping from title to its row position in cosine_sim
    cosine_sim: pairwise similarity matrix, indexable by row position

    Returns a dataframe with columns 'index_value' (row position) and
    'desc_score' (similarity), ordered from most to least similar. All
    movies are returned, including the reference movie itself.
    '''
    # row of similarities belonging to the requested title
    row_position = indices[title]
    similarity_row = cosine_sim[row_position]

    # pair each score with its position, then order by score, highest first
    ranked_pairs = list(enumerate(similarity_row))
    ranked_pairs.sort(key = lambda pair: pair[1], reverse = True)

    return pd.DataFrame(ranked_pairs, columns = ['index_value', 'desc_score'])


# function to compute cosine similarity scores based on tfidf and merge scores into dataframe
def get_description_tfidf(df, title, ngram_max = 4):
    '''
    Score every movie by the TF-IDF cosine similarity of its description to
    the description of `title`.

    df: dataframe with 'Title' and 'Description' columns
    title: title of the reference movie, as a string
    ngram_max: longest word n-gram used as a TF-IDF feature (shortest is 1)

    Returns a new dataframe with a fresh 0..n-1 index, holding df's columns
    plus 'desc_score', min-max scaled to 0-1. Rows whose Title equals `title`
    get NaN. The input dataframe is not modified.

    Raises KeyError if `title` is not present in df['Title'].
    '''
    # positional copy: row i of the TF-IDF matrix is row i of `out`, whatever
    # index labels the caller's dataframe carried
    out = df.reset_index(drop = True)

    # fill missing values
    out['Title'] = out['Title'].fillna('')
    descriptions = out['Description'].fillna('')

    # row position of the given title (first match if the title is duplicated)
    title_rows = np.flatnonzero((out['Title'] == title).to_numpy())
    if title_rows.size == 0:
        raise KeyError(title)
    title_row = int(title_rows[0])

    # TFIDF vectorizer for 1..ngram_max word range, remove meaningless words (the, and, of, etc.)
    tfidf = TfidfVectorizer(stop_words = 'english', ngram_range = (1, ngram_max))

    # vectorize data
    tfidf_matrix = tfidf.fit_transform(descriptions)

    # TfidfVectorizer L2-normalises rows by default, so a plain dot product
    # is the cosine similarity; only the given title's row is needed
    scores = pd.Series(linear_kernel(tfidf_matrix[title_row], tfidf_matrix).ravel())

    # normalize score to (0-1); guard against all scores being identical
    lowest = scores.min()
    spread = scores.max() - lowest
    if spread > 0:
        scores = (scores - lowest) / spread
    else:
        scores = scores * 0.0
    out['desc_score'] = scores.to_numpy()

    # the given title should not be scored against itself
    out.loc[out['Title'] == title, 'desc_score'] = np.nan

    return out

#test = get_description_tfidf(df = df, title = 'Bridesmaids')

In [20]:
# load google news pretrained model
# Word vectors in word2vec format, read straight from the remote archive;
# binary=True because the file is the binary (.bin) variant of the format.
wv = gensim.models.KeyedVectors.load_word2vec_format(
    fname = "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz",
    binary = True,
)

In [None]:
# function to match description with word2vec
def get_description_word2vec(df, title):
    '''
    Score every movie by the word2vec similarity of its description to the
    description of `title`.

    Uses the module-level word vectors `wv`.

    df: dataframe with 'Title', 'Description' and 'Actors' columns
    title: title of the reference movie, as a string

    Returns a new dataframe (the input is not modified) in which
    'Description' is replaced by its cleaned token list and 'desc_score'
    holds the similarity, min-max scaled to 0-1. Rows whose Title equals
    `title`, and rows with no word known to `wv`, get NaN. Rows duplicated
    on (Title, Actors) are dropped.

    Raises IndexError if `title` is not present in df['Title'].
    '''
    # positional copy: scores are attached row by row, so movies that share
    # a title cannot receive each other's score
    out = df.reset_index(drop = True)

    # stop words and punctuation tokens to discard
    stop_tokens = set(stopwords.words('english') + list(punctuation))
    strip_punct = str.maketrans('', '', string.punctuation)

    def _tokenize(text):
        # tokenize, drop stop words, strip punctuation characters from tokens
        words = [word for word in word_tokenize(text) if word not in stop_tokens]
        words = [word.translate(strip_punct) for word in words]
        # drop empty tokens and de-duplicate, keeping first-seen order
        return list(dict.fromkeys(word for word in words if word))

    # convert description to lower case string and tokenize
    out['Description'] = out['Description'].fillna('').str.lower().apply(_tokenize)

    # keep only words the model has a vector for; membership is tested on
    # the KeyedVectors object itself rather than on its `vocab` attribute
    known_words = [[word for word in words if word in wv] for words in out['Description']]

    # get description for given title (first match if the title is duplicated)
    title_rows = np.flatnonzero((out['Title'] == title).to_numpy())
    if title_rows.size == 0:
        raise IndexError('title {!r} not found in dataframe'.format(title))
    given_words = known_words[int(title_rows[0])]

    # compare description to all other descriptions using cosine similarity;
    # an empty word list has no mean vector, so it is scored as missing
    out['desc_score'] = [
        float(wv.n_similarity(words, given_words)) if words and given_words else np.nan
        for words in known_words
    ]

    # the given title should not be scored against itself
    out.loc[out['Title'] == title, 'desc_score'] = np.nan
    out = out.drop_duplicates(subset = ['Title', 'Actors'])

    # normalize score to (0-1); Series.min/max skip the NaN entries, and the
    # guard avoids dividing by zero when all scores are identical
    lowest = out['desc_score'].min()
    spread = out['desc_score'].max() - lowest
    if spread > 0:
        scaled = (out['desc_score'] - lowest) / spread
    else:
        scaled = out['desc_score'] * 0.0
    out = out.assign(desc_score = scaled)

    return out
    
#test = get_description_word2vec(df, 'How to Train Your Dragon')

In [None]:
def get_recommendations(title, df = df, weights = (.3, .1, .25, .05, .3), top_n = 5, word2vec = True, kids = True):

    '''
    This function generates movie recommendations based on Actors, Director,
    Genres, IMDB Rating, and Description.
    The recommendation equation is defined using the given weights as follows:
       Score = w0*Actors + w1*Director + w2*Genres - w3*IMDB + w4*Description

    title : Title of movie entered as string
    df: dataframe with movies
    weights: vector of weights
        0 - Actors
        1 - Director
        2 - Genres
        3 - IMDB Rating (subtracted: a larger rating difference lowers the score)
        4 - Description
    top_n: how many recommendations to make (list top n titles)
    word2vec: if True, score descriptions with word2vec; otherwise with TF-IDF
    kids: if True, never recommend "Sausage Party"

    Returns a pandas Series with the Title of the top_n highest scoring
    movies, best first. The movie given as `title` is never returned.
    '''

    # get actor similarity score
    df = get_actors(df = df, title = title)

    # get director score
    df = get_director(df = df, title = title)

    # get genre score
    df = get_genres(df = df, title = title)

    # compare ratings
    df = get_IMDB(df = df, title = title)

    # get description score
    if word2vec:
        df = get_description_word2vec(df = df, title = title)
    else:
        df = get_description_tfidf(df = df, title = title)

    # calculate recommendation score
    df['recommendation_score'] = (
        weights[0]*df['actor_score']
        + weights[1]*df['director_score']
        + weights[2]*df['genre_score']
        - weights[3]*df['rating_diff']
        + weights[4]*df['desc_score']
    )

    # titles that must not be recommended: the movie that was asked about and,
    # for children, "Sausage Party". Removing the rows (rather than setting
    # the score to 0) keeps them out even for a large top_n or low scores.
    excluded_titles = {title}
    if kids:
        excluded_titles.add('Sausage Party')
    candidates = df[~df['Title'].isin(excluded_titles)]

    # get movie recommendation list
    recommendations = candidates.sort_values('recommendation_score', ascending = False)
    recommendation_list = recommendations['Title'].head(top_n)

    return recommendation_list


In [None]:
# Example run. Weight order: (Actors, Director, Genre, IMDB Rating, Description)
get_recommendations(
    title = "The Dark Knight",
    weights = (.2, .1, .35, .05, .3),
    top_n = 5,
    word2vec = True,
)

124    The Dark Knight Rises
512                  Shooter
791                  Hancock
518                  Chappie
313             The Babadook
Name: Title, dtype: object