# Movie Recommendation

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# `pandas.tools.plotting` was deprecated and later removed from pandas;
# `table` now lives in `pandas.plotting`. Fall back to the old location
# only when running on a very old pandas installation.
try:
    from pandas.plotting import table
except ImportError:
    from pandas.tools.plotting import table

In [2]:
# Load the raw movies table from CSV. The set_index call below is commented
# out, so the frame keeps the default integer index produced by read_csv.
movies = pd.read_csv('data/movies0.csv')
#movies.set_index('tconst', inplace=True)

In [3]:
movies.head(3)

Unnamed: 0,tconst,primaryTitle,startYear,runtimeMinutes,genres,directors,writers,averageRating,numVotes,plot,plot outlines,keywords,synopsis
0,tt0111161,The Shawshank Redemption,1994,142,Drama,nm0001104,"nm0000175,nm0001104",9.3,2057323,Two imprisoned men bond over a number of years...,Chronicles the experiences of a formerly succe...,"wrongful-imprisonment,escape-from-prison,based...","In 1947, Andy Dufresne (Tim Robbins), a banker..."
1,tt0468569,The Dark Knight,2008,152,"Action,Crime,Drama",nm0634240,"nm0634300,nm0634240,nm0333060,nm0004170",9.0,2023734,When the menace known as the Joker emerges fro...,Set within a year after the events of Batman B...,"dc-comics,moral-dilemma,psychopath,clown,scarr...",The movie begins with a gang of men with clown...
2,tt1375666,Inception,2010,148,"Action,Adventure,Sci-Fi",nm0634240,nm0634240,8.8,1802989,A thief who steals corporate secrets through t...,"Dom Cobb is a skilled thief, the absolute best...","dream,subconscious,ambiguous-ending,thief,psyc...","A young man, exhausted and delirious, washes u..."


We don't need directors or writers, so we drop them and save the resulting file in a CSV. It's already been done, so the code in the next cell is commented out.

In [4]:
# movies.drop(labels=['directors', 'writers'], axis='columns', inplace=True) 
# Save our dataframe 
# movies.to_csv(path_or_buf='movies0.csv')

In [5]:
# Reload the movies table from 'movies0.csv' (the copy without the directors
# and writers columns). The set_index call below is commented out, so the
# frame keeps the default integer index produced by read_csv.
movies = pd.read_csv('movies0.csv')
#movies.set_index('tconst', inplace=True)

In [6]:
movies.head(3)

Unnamed: 0,tconst,primaryTitle,startYear,runtimeMinutes,genres,averageRating,numVotes,plot,plot outlines,keywords,synopsis
0,tt0111161,The Shawshank Redemption,1994,142,Drama,9.3,2057323,Two imprisoned men bond over a number of years...,Chronicles the experiences of a formerly succe...,"wrongful-imprisonment,escape-from-prison,based...","In 1947, Andy Dufresne (Tim Robbins), a banker..."
1,tt0468569,The Dark Knight,2008,152,"Action,Crime,Drama",9.0,2023734,When the menace known as the Joker emerges fro...,Set within a year after the events of Batman B...,"dc-comics,moral-dilemma,psychopath,clown,scarr...",The movie begins with a gang of men with clown...
2,tt1375666,Inception,2010,148,"Action,Adventure,Sci-Fi",8.8,1802989,A thief who steals corporate secrets through t...,"Dom Cobb is a skilled thief, the absolute best...","dream,subconscious,ambiguous-ending,thief,psyc...","A young man, exhausted and delirious, washes u..."


For the time being, let's drop the plot, plot outlines, and synopsis columns so we can focus on building a model that only uses keywords.

In [7]:
# Discard the free-text columns; only the keywords column is used by the models below.
movies = movies.drop(labels=['plot', 'plot outlines', 'synopsis'], axis='columns')

In [8]:
movies.head(3)

Unnamed: 0,tconst,primaryTitle,startYear,runtimeMinutes,genres,averageRating,numVotes,keywords
0,tt0111161,The Shawshank Redemption,1994,142,Drama,9.3,2057323,"wrongful-imprisonment,escape-from-prison,based..."
1,tt0468569,The Dark Knight,2008,152,"Action,Crime,Drama",9.0,2023734,"dc-comics,moral-dilemma,psychopath,clown,scarr..."
2,tt1375666,Inception,2010,148,"Action,Adventure,Sci-Fi",8.8,1802989,"dream,subconscious,ambiguous-ending,thief,psyc..."


Done.

Let's temporarily drop the movies that have no keywords; otherwise CountVectorizer won't work.

In [9]:
# Drop movies without keywords, then renumber the rows 0..n-1. Later cells use a
# row's index label as a position into per-movie lists/arrays (corpus, vectors),
# which is only valid when labels and positions coincide.
movies = movies[movies.keywords.notnull()].reset_index(drop=True)

In [10]:
movies.shape

(9877, 8)

###### 1st Model: Recommending with Gensim Similarity (Cosine similarity) and Tfidf Model

A few points before we use gensim similarity.
First, it's often helpful to count words in a document to see what words are mentioned more and hence more important. In our case, since we are using keywords where each keyword is mentioned at most once per movie, that doesn't make sense, so we won't do that.

Second, the normal preprocessing steps for text are as follows:
1. Lowercase the words
2. Take .isalpha() words
3. Remove Stop Words
4. Lemmatize

In our case, we will lowercase the words although it's not really necessary since they look all lowercase. It will be done for certainty.  
We will not take only alpha words because most of the keywords are compound words created with dashes ("-") and taking only alpha words would thus result in us discarding most of the words.  
We will remove stop words for completeness and safety although these are keywords so none should be stopwords.  
We will not lemmatize since doing so changes the meaning of certain keywords. For example, "woods", which indicates the forest, becomes "wood", the material. Or "avengers" becomes "avenger". In both cases, the first word has a meaning that is more than just the plural of the second word. So we will not take this step.

Thus we see that none of these steps is strictly necessary for our data; lowercasing and stop-word removal are applied only as a safeguard.

In [11]:
from collections import Counter
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from gensim.corpora.dictionary import Dictionary



In [12]:
docs = movies['keywords'].tolist()

In [13]:
# Create functions for making alpha, removing stop words, and lemmatizing
def make_alpha(doc):
    """Return the tokens of ``doc`` that consist only of alphabetic characters."""
    alphabetic_tokens = []
    for token in doc:
        if token.isalpha():
            alphabetic_tokens.append(token)
    return alphabetic_tokens
def remove_stops(doc):
    """Return the tokens of ``doc`` that are not English stop words.

    Parameters
    ----------
    doc : iterable of str
        Tokens of a single document.

    Returns
    -------
    list of str
        The tokens, in their original order, with NLTK's English stop words removed.
    """
    # Fetch the stop-word list once per call and use a set for membership tests;
    # the previous version called stopwords.words('english') again for every token.
    english_stops = set(stopwords.words('english'))
    return [t for t in doc if t not in english_stops]
def lemmatize(doc):
    """Return the WordNet lemma of every token in ``doc``.

    Parameters
    ----------
    doc : iterable of str
        Tokens of a single document.

    Returns
    -------
    list of str
        One lemmatized token per input token, in the same order.
    """
    # WordNetLemmatizer is not imported by any visible cell, so calling this
    # function raised NameError; import it locally to keep the function self-contained.
    from nltk.stem import WordNetLemmatizer

    wordnet_lemmatizer = WordNetLemmatizer()
    return [wordnet_lemmatizer.lemmatize(t) for t in doc]
def no_commas(doc):
    """Return the tokens of ``doc`` with every token equal to ``','`` removed."""
    kept_tokens = []
    for token in doc:
        if token != ',':
            kept_tokens.append(token)
    return kept_tokens

In [18]:
import csv

# Load the pre-processed keyword lists: one CSV row per movie, one field per keyword.
# newline='' is the mode the csv module expects for files handed to csv.reader.
with open('processed_docs.csv', 'r', newline='') as f:
    # The file contains empty rows (csv.reader yields [] for them). Filter them by
    # content rather than by position ([0::2]), so the load stays correct even if
    # the file is regenerated without the blank rows.
    processed_docs = [row for row in csv.reader(f) if row]

# Everything downstream pairs processed_docs[i] with the i-th row of `movies`,
# so fail loudly here if the two are out of step.
assert len(processed_docs) == len(movies), (
    "processed_docs.csv has %d documents but movies has %d rows"
    % (len(processed_docs), len(movies))
)

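At this point the processed documents have been loaded from `processed_docs.csv`, so the preprocessing steps do not need to be re-run (the cells that originally generated this file appear to have been removed from the notebook).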

In [19]:
dictionary = Dictionary(processed_docs) # create a dictionary of words from our keywords

#dictionary.token2id # see the words (tokens) and their IDs
#len(dictionary) #Number of words and compound words in dictionary

# Print out first 10 words:
# for i in range(len(dictionary))[0:10]:
#     print(i, dictionary[i])

In [20]:
# Convert each document into its bag-of-words form: a list of (token ID, count) pairs.
corpus = [dictionary.doc2bow(doc) for doc in processed_docs] #create corpus where the corpus is a bag of words for each document
# len(corpus) gives the number of documents (one per movie), not the number of words

This next cell could probably be considered EDA.

In [21]:
# Corpus-wide frequencies: total_word_count maps each word ID to the number of
# times that word occurs across all documents.
from collections import defaultdict
import itertools
total_word_count = defaultdict(int)
for bow in corpus:
    for word_id, word_count in bow:
        total_word_count[word_id] += word_count

# (word ID, total count) pairs ordered from most to least frequent
sorted_word_count = sorted(
    total_word_count.items(),
    key=lambda pair: pair[1],
    reverse=True,
)

# Show the 20 most frequent words together with their totals
for word_id, word_count in sorted_word_count[:20]:
    print(dictionary.get(word_id), word_count)

murder 3455
death 3156
blood 2825
husband-wife-relationship 2724
violence 2678
father-son-relationship 2472
flashback 2442
bare-chested-male 2359
title-spoken-by-character 2242
friendship 2166
mother-son-relationship 2121
kiss 2093
father-daughter-relationship 2072
cigarette-smoking 2038
dog 1882
photograph 1861
female-nudity 1859
pistol 1839
fight 1813
chase 1808


Clearly a lot of violence and relationships in these keywords.

Tf-idf Time.

In [22]:
from gensim.models.tfidfmodel import TfidfModel
tfidf = TfidfModel(corpus) #create tfidf model of the corpus

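In the model summary, the number of docs is the number of movies, and num_nnz is the number of non-zero entries in the bag-of-words corpus (i.e. the number of distinct keywords per movie, summed over all movies).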

In [23]:
#Get the words with the highest tf-idf values for a given movie
sorted_tfidf_weights = sorted(tfidf[corpus[0]], key=lambda w: w[1], reverse=True)
for term_id, weight in sorted_tfidf_weights[:5]:
    print(dictionary.get(term_id), weight)

boom-shot 0.09165174419616995
fresh-fish 0.09165174419616995
killed-by-prison-guard 0.09165174419616995
library-cart 0.09165174419616995
missing-prisoner 0.09165174419616995


A slight issue with tf-idf for keywords is that since there are no word-repeats among our keywords, tf-idf can't take advantage of that to find the more important words for a movie. Still, it can look to see how often a given keyword is used for other movies, so it is still a great model to use.

In [24]:
import gensim
from gensim.similarities import Similarity
from gensim.similarities import MatrixSimilarity

# Create the similarity data structure. This is the most important part where we get the similarities between the movies.
# The index is built over the tf-idf-weighted corpus; querying it with sims[query]
# returns one similarity score per indexed document, in corpus order.
# NOTE(review): MatrixSimilarity presumably holds the whole index as a dense in-memory
# matrix (9877 docs x 112380 features per the printed summary) — confirm against the
# gensim docs and consider SparseMatrixSimilarity / Similarity if memory is a concern.
sims = MatrixSimilarity(tfidf[corpus], num_features=len(dictionary))
print(sims)

MatrixSimilarity<9877 docs, 112380 features>


In [25]:
type(sims)

gensim.similarities.docsim.MatrixSimilarity

Now create a query doc and convert it to tf-idf

Choose between movie and keywords:

In [26]:
def movie_recommendation(movie_title, number_of_hits=5):
    """Print the movies most similar to ``movie_title`` by tf-idf cosine similarity.

    Also prints the five keywords of the movie with the highest tf-idf weight.

    Parameters
    ----------
    movie_title : str
        Exact ``primaryTitle`` of a movie in ``movies``. If several rows share
        the title, the first one is used.
    number_of_hits : int, default 5
        Number of similar movies to print.

    Raises
    ------
    IndexError
        If no row of ``movies`` has the given title.
    """
    # Row positions (0..n-1) of the matching movies. `corpus` and the output of `sims`
    # are ordered by position, which differs from the DataFrame's index labels once
    # rows have been filtered out, so positions are used throughout.
    matching_positions = np.flatnonzero((movies.primaryTitle == movie_title).values)
    if len(matching_positions) == 0:
        raise IndexError("No movie titled %r was found in `movies`" % (movie_title,))
    movie_position = int(matching_positions[0])

    # The query document is the movie's keyword string split into a list of keywords
    query_doc = movies['keywords'].iloc[movie_position].split(',')

    query_doc_bow = dictionary.doc2bow(query_doc)  # bag of words for the query
    query_doc_tfidf = tfidf[query_doc_bow]  # list of (term ID, tf-idf weight) tuples

    # One similarity value per movie, in the same order as the rows of `movies`
    similarity_array = sims[query_doc_tfidf]
    similarity_series = pd.Series(similarity_array.tolist(), index=movies.primaryTitle.values)

    # Exclude the query movie by position rather than assuming it is ranked first
    is_other_movie = np.ones(len(similarity_series), dtype=bool)
    is_other_movie[movie_position] = False
    top_hits = similarity_series[is_other_movie].sort_values(ascending=False).iloc[:number_of_hits]

    # Print the words with the highest tf-idf values for the provided movie
    sorted_tfidf_weights = sorted(tfidf[corpus[movie_position]], key=lambda w: w[1], reverse=True)
    print('The top 5 words associated with this movie by tf-idf are: ')
    for term_id, weight in sorted_tfidf_weights[:5]:
        print(" '%s' with a tf-idf score of %.3f" %(dictionary.get(term_id), weight))

    # Print the top matching movies
    print("Our top %s most similar movies for movie %s are:" %(number_of_hits, movie_title))
    for idx, (title, score) in enumerate(zip(top_hits.index, top_hits)):
        print("%d %s with a similarity score of %.3f" %(idx+1, title, score))

In [27]:
movie_recommendation('The Avengers', 5)

The top 5 words associated with this movie by tf-idf are: 
 'black-eye-patch' with a tf-idf score of 0.101
 'imax,3-dimensional' with a tf-idf score of 0.101
 'superhero-team,2010s' with a tf-idf score of 0.101
 'flying-fortress' with a tf-idf score of 0.093
 'marvel-comic' with a tf-idf score of 0.093
Our top 5 most similar movies for movie The Avengers are:
1 Avengers: Age of Ultron with a similarity score of 0.399
2 Avengers: Infinity War with a similarity score of 0.286
3 Iron Man 2 with a similarity score of 0.274
4 Captain America: Civil War with a similarity score of 0.251
5 Captain America: The Winter Soldier with a similarity score of 0.250


In [28]:
def keywords_recommendation(keywords, number_of_hits=5):
    """Print the movies whose keywords are most similar to a list of query keywords.

    Parameters
    ----------
    keywords : list of str
        Query keywords, e.g. ``['prison', 'escape-from-prison']``.
    number_of_hits : int, default 5
        Number of movies to print. The default matches the other recommenders
        in this notebook.
    """
    query_doc_bow = dictionary.doc2bow(keywords)  # bag of words for the query keywords
    query_doc_tfidf = tfidf[query_doc_bow]  # list of (term ID, tf-idf weight) tuples

    # One similarity value per movie, in the same order as the rows of `movies`
    similarity_array = sims[query_doc_tfidf]

    similarity_series = pd.Series(similarity_array.tolist(), index=movies.primaryTitle.values)
    # Highest similarities first; keep the requested number of movies
    top_hits = similarity_series.sort_values(ascending=False)[:number_of_hits]

    # Print the top matching movies
    print("Our top %s most similar movies for the keywords %s are:" %(number_of_hits, keywords))
    for idx, (movie,score) in enumerate(zip(top_hits.index, top_hits)):
        print("%d '%s' with a similarity score of %.3f" %(idx+1, movie, score))

In [29]:
keywords_recommendation(['wrongful-imprisonment','escape-from-prison','based-on-the-works-of-stephen-king',
                         'prison', 'voice-over-narration', 'caged-bird','reference-to-fort-hancock-texas','aria'], 5)

Our top 5 most similar movies for the keywords ['wrongful-imprisonment', 'escape-from-prison', 'based-on-the-works-of-stephen-king', 'prison', 'voice-over-narration', 'caged-bird', 'reference-to-fort-hancock-texas', 'aria'] are:
1 'The Shawshank Redemption' with a similarity score of 0.164
2 'Down by Law' with a similarity score of 0.150
3 'Old Men in New Cars: In China They Eat Dogs II' with a similarity score of 0.093
4 'Diva' with a similarity score of 0.093
5 'I Am a Fugitive from a Chain Gang' with a similarity score of 0.088


##### 2nd Model: Jaccard Similarity Based on Keyword Sets

In [30]:
def get_jaccard_sim(str1, str2):
    """Jaccard similarity of two comma-separated keyword strings.

    Each string is split on ',' and treated as a set; the result is the size
    of the intersection divided by the size of the union, as a float.
    """
    first, second = set(str1.split(',')), set(str2.split(','))
    shared = len(first & second)
    union_size = len(first) + len(second) - shared
    return float(shared) / union_size

def keyword_string(movie):
    """Return the keyword string of the first row of ``movies`` whose title is ``movie``."""
    title_matches = movies.primaryTitle == movie
    return movies[title_matches].keywords.iloc[0]

def get_jaccard_sim2(movie1, movie2):
    """Jaccard similarity between the keyword sets of two movies given by title."""
    return get_jaccard_sim(keyword_string(movie1), keyword_string(movie2))

In [31]:
def jaccard_recommender(movie_title, number_of_hits=5):
    """Print the movies whose keyword sets have the highest Jaccard similarity to a movie.

    Parameters
    ----------
    movie_title : str
        Exact ``primaryTitle`` of a movie in ``movies``. If several rows share
        the title, the first one is used.
    number_of_hits : int, default 5
        Number of similar movies to print.

    Raises
    ------
    IndexError
        If no row of ``movies`` has the given title.
    """
    matching_positions = np.flatnonzero((movies.primaryTitle == movie_title).values)
    if len(matching_positions) == 0:
        raise IndexError("No movie titled %r was found in `movies`" % (movie_title,))
    movie_position = int(matching_positions[0])
    query_keywords = movies.keywords.iloc[movie_position]

    # The scores get a default 0..n-1 index, i.e. they are keyed by row POSITION in `movies`
    jaccards = pd.Series(
        [get_jaccard_sim(query_keywords, other_keywords) for other_keywords in movies['keywords']]
    )

    # Drop the query movie explicitly instead of assuming it is the first of the top hits
    top_scores = jaccards.drop(movie_position).nlargest(number_of_hits)

    # Positions must be resolved with .iloc; .loc would treat them as index labels,
    # which no longer match positions after rows have been filtered out of `movies`.
    matches = movies.iloc[top_scores.index.values]
    for match, score in zip(matches['primaryTitle'], top_scores):
        print(match, score)

##### 3rd Model: Cosine Similarity Based on Word Counts

In [32]:
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def get_cosine_sim(*strs):
    """Return the pairwise cosine-similarity matrix of the given strings' count vectors."""
    count_vectors = list(get_vectors1(*strs))
    return cosine_similarity(count_vectors)

def get_vectors1(*strs):
    """Return the word-count vectors of the given strings.

    Parameters
    ----------
    *strs : str
        The documents to vectorize.

    Returns
    -------
    numpy.ndarray
        Dense array with one row per input string and one column per vocabulary term.
    """
    text = list(strs)
    # CountVectorizer's first parameter is the `input` option ('filename', 'file' or
    # 'content'), not the documents; passing `text` there is rejected by current
    # scikit-learn releases. The documents belong in fit/transform only.
    vectorizer = CountVectorizer()
    return vectorizer.fit_transform(text).toarray()

def get_vectors2(text):
    """Return the word-count vectors of a collection of documents.

    Parameters
    ----------
    text : iterable of str
        The documents to vectorize.

    Returns
    -------
    numpy.ndarray
        Dense array with one row per document and one column per vocabulary term.
    """
    # CountVectorizer's first parameter is the `input` option ('filename', 'file' or
    # 'content'), not the documents; passing `text` there is rejected by current
    # scikit-learn releases. The documents belong in fit_transform only.
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(text)
    return X.toarray()

In [33]:
vectors = get_vectors2(movies.keywords.tolist())

In [34]:
def cosine_recommender(movie_title, number_of_hits=5):
    """Print the movies whose word-count vectors are most cosine-similar to a movie's.

    Parameters
    ----------
    movie_title : str
        Exact ``primaryTitle`` of a movie in ``movies``. If several rows share
        the title, the first one is used.
    number_of_hits : int, default 5
        Number of similar movies to print.

    Raises
    ------
    IndexError
        If no row of ``movies`` has the given title.
    """
    # `vectors` is ordered by row POSITION in `movies`, so look up the position of the
    # movie rather than its index label (the two differ once rows have been filtered out).
    matching_positions = np.flatnonzero((movies.primaryTitle == movie_title).values)
    if len(matching_positions) == 0:
        raise IndexError("No movie titled %r was found in `movies`" % (movie_title,))
    movie_position = int(matching_positions[0])

    query_vector = vectors[movie_position]
    # Entry [0, 1] of each 2x2 result is the similarity between the query and movie i
    cosines = pd.Series([
        cosine_similarity([query_vector, vectors[i]])[0, 1]
        for i in range(len(vectors))
    ])

    # Drop the query movie explicitly instead of assuming it is the first of the top hits
    top_scores = cosines.drop(movie_position).nlargest(number_of_hits)

    # The scores are keyed by position, so resolve them with .iloc (not label-based .loc)
    matches = movies.iloc[top_scores.index.values]
    for match, score in zip(matches['primaryTitle'], top_scores):
        print(match, score)

Let's do a quick comparison for a film:

In [35]:
cosine_recommender('The Avengers')

Avengers: Infinity War 0.8044695892203602
Avengers: Age of Ultron 0.7913277135173737
Captain America: Civil War 0.7509793475917405
Iron Man 2 0.7470155795225362
Justice League 0.7091790018343553


In [36]:
jaccard_recommender('The Avengers')

Avengers: Age of Ultron 0.27450980392156865
Avengers: Infinity War 0.2370266479663394
Captain America: The Winter Soldier 0.23141891891891891
Captain America: Civil War 0.21246458923512748
Thor: The Dark World 0.20722433460076045


In [37]:
movie_recommendation('The Avengers')

The top 5 words associated with this movie by tf-idf are: 
 'black-eye-patch' with a tf-idf score of 0.101
 'imax,3-dimensional' with a tf-idf score of 0.101
 'superhero-team,2010s' with a tf-idf score of 0.101
 'flying-fortress' with a tf-idf score of 0.093
 'marvel-comic' with a tf-idf score of 0.093
Our top 5 most similar movies for movie The Avengers are:
1 Avengers: Age of Ultron with a similarity score of 0.399
2 Avengers: Infinity War with a similarity score of 0.286
3 Iron Man 2 with a similarity score of 0.274
4 Captain America: Civil War with a similarity score of 0.251
5 Captain America: The Winter Soldier with a similarity score of 0.250


Looks good. They all recommend similar appropriate movies for the same movie, with slight differences in recommendation.