Read the data.

In [96]:
import pandas as pd
import numpy as np

# Load the movie metadata and the credits table (cast and crew) from CSV.
movies = pd.read_csv("../data/tmdb_5000_movies.csv")
people = pd.read_csv("../data/tmdb_5000_credits.csv")

Merge the two datasets.

In [97]:
# Relabel the credits columns positionally so the join key is named 'id',
# matching `movies`. The credits title column becomes 'tittle' (presumably
# so it does not clash with the existing 'title' column — confirm).
people.columns = ['id', 'tittle', 'cast', 'crew']
# Join the credits onto the movie metadata via the shared 'id' column.
movies = pd.merge(movies, people, on='id')

Look at the data.

In [98]:
# head() shows the first five rows by default.
movies.head()

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,tittle,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...",...,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...",...,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,Spectre,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...",...,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106,The Dark Knight Rises,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]",...,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124,John Carter,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


Parse the cast, crew, and genre data from stringified lists to usable Python objects.

In [99]:
from ast import literal_eval

# Columns holding stringified lists that must be parsed into Python objects.
features = ['cast', 'crew', 'genres']
for feature in features:
    parsed_column = movies[feature].apply(literal_eval)
    movies[feature] = parsed_column

Define a helper function for extracting the director from the crew.

In [100]:
def get_director(cell):
    """Return the name of the first director listed in a crew cell.

    Parameters
    ----------
    cell : list of dict
        Parsed crew entries; each entry is expected to carry 'job' and
        'name' keys.

    Returns
    -------
    str or float
        The 'name' of the first entry whose 'job' is 'Director', or
        ``np.nan`` when there is no such entry or when ``cell`` is not a
        list/tuple (e.g. a missing value).
    """
    # Missing or malformed cells cannot be iterated; report "no director"
    # instead of raising, in the same spirit as get_list's empty-list fallback.
    if not isinstance(cell, (list, tuple)):
        return np.nan
    for person in cell:
        # .get avoids a KeyError on entries that lack a 'job' key.
        if isinstance(person, dict) and person.get('job') == 'Director':
            return person.get('name', np.nan)
    return np.nan

Define a helper function for generating the top three items in a cell.

In [101]:
def get_list(cell, limit=3):
    """Return the names of the first ``limit`` items in a cell.

    Parameters
    ----------
    cell : list of dict
        Parsed entries, each expected to carry a 'name' key.
    limit : int or None, optional
        Maximum number of names to return (default 3). ``None`` returns the
        names of every item.

    Returns
    -------
    list of str
        The first ``limit`` names, or all of them when the cell holds fewer
        items — whichever is fewer. An empty list is returned when ``cell``
        is not a list (missing or malformed data).
    """
    # Missing/malformed data: fall back to an empty list.
    if not isinstance(cell, list):
        return []
    # Slice before extracting so only the items actually returned are read.
    return [item['name'] for item in cell[:limit]]

Create a new column for directors.

In [102]:
# One director name (or NaN) per movie, taken from its parsed crew list.
director_names = movies['crew'].apply(get_director)
movies['director'] = director_names

Replace the `genres` and `cast` columns with usable data.

In [103]:
# Reduce each parsed list of dicts to its leading names via get_list.
for column in ("genres", "cast"):
    movies[column] = movies[column].apply(get_list)

Display the new features of the data.

In [104]:
# Show the first five rows of the engineered feature columns.
movies.loc[:, ['title', 'cast', 'director', 'genres']].head()

Unnamed: 0,title,cast,director,genres
0,Avatar,"[Sam Worthington, Zoe Saldana, Sigourney Weaver]",James Cameron,"[Action, Adventure, Fantasy]"
1,Pirates of the Caribbean: At World's End,"[Johnny Depp, Orlando Bloom, Keira Knightley]",Gore Verbinski,"[Adventure, Fantasy, Action]"
2,Spectre,"[Daniel Craig, Christoph Waltz, Léa Seydoux]",Sam Mendes,"[Action, Adventure, Crime]"
3,The Dark Knight Rises,"[Christian Bale, Michael Caine, Gary Oldman]",Christopher Nolan,"[Action, Crime, Drama]"
4,John Carter,"[Taylor Kitsch, Lynn Collins, Samantha Morton]",Andrew Stanton,"[Action, Adventure, Science Fiction]"


Define a function for cleaning the data by removing spaces and making all letters lowercase.

In [105]:
def clean_data(cell):
    """Normalise a cell by removing spaces and lower-casing its text.

    A list is processed element by element, a plain string is processed
    directly, and any other value is mapped to the empty string.
    """
    def _squash(text):
        # Drop the spaces first, then fold to lower case.
        return text.replace(" ", "").lower()

    if isinstance(cell, list):
        return list(map(_squash, cell))
    if isinstance(cell, str):
        return _squash(cell)
    return ''

Apply `clean_data` to the features.

In [106]:
# Text features to normalise before they are combined into one string.
features = ['cast', 'director', 'genres']

for feature in features:
    cleaned_column = movies[feature].apply(clean_data)
    movies[feature] = cleaned_column

Create a string for each movie containing all its features.

In [107]:
def create_soup(row):
    """Join a movie's cast, director and genres into one space-separated string."""
    cast_part = ' '.join(row['cast'])
    genre_part = ' '.join(row['genres'])
    # The director is a single string, so it is placed between the two joined lists.
    return ' '.join((cast_part, row['director'], genre_part))
# Build the combined feature string row by row (axis=1 passes each row to create_soup).
movies['soup'] = movies.apply(create_soup, axis=1)

Create a count matrix for each movie with respect to the words in `soup`.

In [108]:
from sklearn.feature_extraction.text import CountVectorizer

# Count how often each token occurs in every movie's soup string.
count = CountVectorizer(stop_words='english')
soup_corpus = movies['soup']
count_matrix = count.fit_transform(soup_corpus)

In [109]:
# Convert the counts to a dense array so they can be shown as a table with
# one column per vocabulary term.
count_array = count_matrix.toarray()
vocabulary = count.get_feature_names_out()
df = pd.DataFrame(count_array, columns=vocabulary)
print(df)

      50cent  aaliyah  aamirkhan  aaranthomas  aaronabrams  aaroneckhart  \
0          0        0          0            0            0             0   
1          0        0          0            0            0             0   
2          0        0          0            0            0             0   
3          0        0          0            0            0             0   
4          0        0          0            0            0             0   
...      ...      ...        ...          ...          ...           ...   
4798       0        0          0            0            0             0   
4799       0        0          0            0            0             0   
4800       0        0          0            0            0             0   
4801       0        0          0            0            0             0   
4802       0        0          0            0            0             0   

      aaronhann  aaronkwok  aaronpaul  aaronruell  ...  àlexpastor  \
0             0  

In [110]:
# Find the row whose `id` is 100 and show it.
example1_index = movies.index[movies["id"] == 100]
print(example1_index)
# Use the label found above instead of a hard-coded position, so the cell
# stays correct if the row order of `movies` ever changes.
print(movies.loc[example1_index[0]])

Int64Index([4268], dtype='int64')
budget                                                            1350000
genres                                                    [comedy, crime]
homepage                http://www.universalstudiosentertainment.com/l...
id                                                                    100
keywords                [{"id": 502, "name": "ambush"}, {"id": 567, "n...
original_language                                                      en
original_title                        Lock, Stock and Two Smoking Barrels
overview                A card sharp and his unwillingly-enlisted frie...
popularity                                                      16.032594
production_companies    [{"name": "Handmade Films Ltd.", "id": 146}, {...
production_countries     [{"iso_3166_1": "GB", "name": "United Kingdom"}]
release_date                                                   1998-03-05
revenue                                                           3897569
runt

In [158]:
# Rows of count_matrix to combine into a single example vector.
# NOTE(review): these are row positions, not movie ids — confirm this is intended.
example_rows = [100, 200, 302]
example_frequencies = count_matrix[example_rows, :]
# Column-wise total: one combined count per vocabulary term.
example_count_matrix = np.sum(example_frequencies, axis=0)
print(example_count_matrix)

[[0 0 0 ... 0 0 0]]


In [157]:
from sklearn.metrics.pairwise import cosine_similarity
# Cosine similarity between the combined example vector and every movie row.
example_vector = np.asarray(example_count_matrix)
similarity_scores = cosine_similarity(example_vector, count_matrix)[0]
with np.printoptions(threshold=np.inf):
    # Pair each score with its row position, then rank from most to least similar.
    similarity_score_tuples = list(enumerate(similarity_scores))
    sim_scores = sorted(similarity_score_tuples, key=lambda pair: pair[1], reverse=True)
    print(sim_scores)
    print(type(count_matrix))

[(200, 0.6803360514166089), (100, 0.6047431568147635), (302, 0.6047431568147635), (102, 0.529150262212918), (183, 0.529150262212918), (426, 0.529150262212918), (185, 0.3779644730092272), (249, 0.3779644730092272), (294, 0.3779644730092272), (310, 0.3779644730092272), (490, 0.3779644730092272), (547, 0.3779644730092272), (1841, 0.3779644730092272), (2348, 0.3779644730092272), (3225, 0.3779644730092272), (3329, 0.3779644730092272), (769, 0.35355339059327373), (1452, 0.35355339059327373), (3492, 0.35355339059327373), (4589, 0.3464101615137755), (1775, 0.3265986323710905), (2003, 0.3265986323710905), (2439, 0.3265986323710905), (8, 0.30237157840738177), (9, 0.30237157840738177), (11, 0.30237157840738177), (14, 0.30237157840738177), (15, 0.30237157840738177), (27, 0.30237157840738177), (29, 0.30237157840738177), (32, 0.30237157840738177), (37, 0.30237157840738177), (40, 0.30237157840738177), (41, 0.30237157840738177), (54, 0.30237157840738177), (63, 0.30237157840738177), (67, 0.302371578407

In [148]:
# Sorting a reversed view in place leaves the underlying array in descending order.
thing = np.array([1, 2, 3, 4])
reversed_view = thing[::-1]
reversed_view.sort()
print(thing)

[4 3 2 1]


Compute the cosine similarity matrix based on the `count_matrix`.

In [113]:
from sklearn.metrics.pairwise import cosine_similarity

# With a single argument, cosine_similarity compares every row with every other row.
similarity_matrix = cosine_similarity(count_matrix)

Reset the index of the dataframe and build a reverse lookup from movie title to row index.

In [114]:
# drop=True discards the old index instead of inserting it as an 'index'
# column. That keeps this cell safe to re-run: a second plain reset_index()
# would fail because the 'index' column already exists.
movies = movies.reset_index(drop=True)
# Reverse lookup: movie title -> row position in `movies`.
indices = pd.Series(movies.index, index=movies['title'])

Implement a function for producing recommendations.

In [115]:
def recommend(title, cosine_sim=similarity_matrix, n=10):
    """Produce up to ``n`` movie recommendations for a given movie title.

    Parameters
    ----------
    title : str
        Title to look up in the module-level ``indices`` series.
    cosine_sim : array-like, optional
        Pairwise similarity matrix indexed by row position; defaults to the
        module-level ``similarity_matrix``.
    n : int, optional
        Number of recommendations to return (default 10).

    Returns
    -------
    pandas.Series
        Titles of the ``n`` most similar movies, most similar first,
        excluding the query movie itself.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    # get the index of the input
    index = indices[title]
    # If several rows share this title the lookup yields a Series of
    # positions; use the first match so `index` is a single row position.
    if isinstance(index, pd.Series):
        index = index.iloc[0]

    # rank every movie by its similarity to the input, most similar first
    ranked = sorted(enumerate(cosine_sim[index]), key=lambda pair: pair[1], reverse=True)

    # Drop the query movie explicitly rather than assuming it is ranked
    # first: a lower-indexed movie with an equal score would sort ahead of it.
    movie_indices = [position for position, _ in ranked if position != index][:n]

    # return the titles of the most similar movies
    return movies['title'].iloc[movie_indices]

Try out the algorithm.

In [116]:
# Example query: list the movies most similar to this title.
recommend(title='Punch-Drunk Love')

4247    Me You and Five Bucks
4358     Next Stop Wonderland
1323               The Master
434          The Longest Yard
445           Just Go with It
515            50 First Dates
552              Funny People
905               Patch Adams
907                 Mr. Deeds
1004     The Boat That Rocked
Name: title, dtype: object