# Setup and Import

In [1]:
import pandas as pd
pd.set_option('display.max_columns', None)

import ast

import numpy as np

In [2]:
movies = pd.read_csv("./data/movies_metadata.csv")
keywords = pd.read_csv("./data/keywords.csv")
credits = pd.read_csv("./data/credits.csv")

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


# Movie Overview-based Recommender Engine

## Cleaning

In [3]:
# def explore_df(df):
#     print('Shape:', df.shape, '\n')
#     print('Columns and dtypes:\n', df.dtypes, '\n')
#     percent_missing = df.isna().mean().round(4) * 100
#     print('Columns with Missingness:\n',
#           percent_missing[percent_missing > 0.00
#                          ].sort_values(ascending=False))


# explore_df(movies) # three columns with > 50% missingness,
#                    # one column with 2% missingness, and
#                    # remaining columns with missingness had < 1%


# explore_df(keywords)          # no missingness
# explore_df(credits)           # no missingness

In [4]:
# drop columns with > 50% missingness
movies.drop(columns=['belongs_to_collection', 'homepage', 'tagline'], inplace=True)

# drop rows with any missing values - 1418 of 45466 dropped, 3%
movies.dropna(inplace=True)

In [5]:
# select only movies with numeric IDs
movies = movies[movies['id'].apply(lambda x: x.find('-') == -1)]

In [6]:
# change movies id column dtype from str to int for merging
movies['id'] = movies['id'].astype(int)

df = movies.merge(keywords, on='id')
df = df.merge(credits, on='id')

In [7]:
# convert stringified JSON object columns to lists of keywords
json_cols = ['genres', 'production_companies', 'production_countries', 'spoken_languages', 'keywords', 'cast', 'crew']

for col in json_cols:
    df[col] = df[col].apply(ast.literal_eval)

## Vectorization

In [8]:
#Import TfIdfVectorizer from scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer

#Define a TF-IDF Vectorizer Object. Remove all English stop words
tfidf = TfidfVectorizer(stop_words='english')

#Construct the required TF-IDF matrix by fitting and transforming the data
tfidf_matrix = tfidf.fit_transform(movies['overview'])

#Output the shape of tfidf_matrix
print("(# of movies, # of words in >= 2 movie overviews)")
tfidf_matrix.shape

(# of movies, # of words in >= 2 movie overviews)


(44048, 75375)

## Cosine Similarity Scores

In [9]:
# Import linear_kernel
from sklearn.metrics.pairwise import linear_kernel

# Compute the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [10]:
# Construct a reverse map of indices and movie titles
indices = pd.Series(df.index, index=df['title']).drop_duplicates()

# Function that takes in movie title as input and outputs most similar movies
def get_recommendations(title, cosine_sim=cosine_sim):
    # Get the index of the movie that matches the title
    idx = indices[title]

    # Get the pairwise similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1].any(), reverse=True)

    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:11]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar movies
    return df['title'].iloc[movie_indices]

# Director/Cast, Keywords and Genres-based Recommender Engine

## Cleaning

In [11]:
# Get the director's name from the crew feature. If director is not listed, return NaN
def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan

In [12]:
# Returns the list top 3 elements or entire list; whichever is more.
def get_list(x):
    if isinstance(x, list):
        names = [i['name'] for i in x]
        #Check if more than 3 elements exist. If yes, return only first three. If no, return entire list.
        if len(names) > 3:
            names = names[:3]
        return names

    #Return empty list in case of missing/malformed data
    return []

In [13]:
# Define new director, cast, genres and keywords features that are in a suitable form.
df['director'] = df['crew'].apply(get_director)

features = ['cast', 'keywords', 'genres']
for feature in features:
    df[feature] = df[feature].apply(get_list)

In [14]:
# Function to convert all strings to lower case and strip names of spaces
def clean_data(x):
    if isinstance(x, list):
        return [str.lower(i.replace(" ", "")) for i in x]
    else:
        #Check if director exists. If not, return empty string
        if isinstance(x, str):
            return str.lower(x.replace(" ", ""))
        else:
            return ''

In [15]:
# Apply clean_data function to your features.
features = ['cast', 'keywords', 'director', 'genres']

for feature in features:
    df[feature] = df[feature].apply(clean_data)

In [16]:
# Create a string that contains all the metadata that we want to feed to our vectorizer
def create_soup(x):
    return ' '.join(x['keywords']) + ' ' + ' '.join(x['cast']) + ' ' + x['director'] + ' ' + ' '.join(x['genres'])
df['soup'] = df.apply(create_soup, axis=1)

## Vectorization

In [17]:
# Import CountVectorizer and create the count matrix
from sklearn.feature_extraction.text import CountVectorizer

count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(df['soup'])

## Cosine Similarity Scores

In [18]:
# Compute the Cosine Similarity matrix based on the count_matrix
from sklearn.metrics.pairwise import cosine_similarity

cosine_sim2 = cosine_similarity(count_matrix, count_matrix)

# Results

In [19]:
random_index = int(df.sample().index[0]) 
random_title    = df.iloc[random_index]["title"]
random_overview = df.iloc[random_index]["overview"]

In [20]:
print(random_title)
print(random_overview, "\n")
print(df.loc[get_recommendations(random_title).index.to_list()].sample(3)['overview'].to_list(), "\n")

Vettaiyaadu Vilaiyaadu
Raghavan (Kamal Haasan) is an honest cop in Tamil Nadu. His wife Kayalvizhi (Kamalinee Mukerji) dies in a violent incident. Raghavan is deputized to investigate the murder of Rani, the daughter of his colleague Arokiya Raj (Prakash Raj) in a remote village in Tamil Nadu. Arokiya moves to New York after his daughter's murder. 

['Former Marine Louanne Johnson lands a gig teaching in a pilot program for bright but underachieving teens at a notorious inner-city high school. After having a terrible first day, she decides she must throw decorum to the wind. When Johnson returns to the classroom, she does so armed with a no-nonsense attitude informed by her military training and a fearless determination to better the lives of her students -- no matter what the cost.', 'Two homicide detectives are on a desperate hunt for a serial killer whose crimes are based on the "seven deadly sins" in this dark and haunting film that takes viewers from the tortured remains of one vi

In [21]:
# Reset index of our main DataFrame and construct reverse mapping as before
df = df.reset_index()
indices = pd.Series(df.index, index=df['title'])

In [22]:
print(random_title)
print(random_overview, "\n")
print(df.loc[get_recommendations(random_title, cosine_sim2).index.to_list()].sample(3)['overview'].to_list())

Vettaiyaadu Vilaiyaadu
Raghavan (Kamal Haasan) is an honest cop in Tamil Nadu. His wife Kayalvizhi (Kamalinee Mukerji) dies in a violent incident. Raghavan is deputized to investigate the murder of Rani, the daughter of his colleague Arokiya Raj (Prakash Raj) in a remote village in Tamil Nadu. Arokiya moves to New York after his daughter's murder. 

['Morgan Adams and her slave, William Shaw, are on a quest to recover the three portions of a treasure map. Unfortunately, the final portion is held by her murderous uncle, Dawg. Her crew is skeptical of her leadership abilities, so she must complete her quest before they mutiny against her. This is made yet more difficult by the efforts of the British crown to end her pirate raids.', 'An agoraphobic psychologist and a female detective must work together to take down a serial killer who copies serial killers from the past.', 'Assassin Robert Rath arrives at a funeral to kill a prominent mobster, only to witness a rival hired gun complete th