In [119]:
import pandas as pd
import numpy as np
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [120]:
# !curl -o tfidf_recommender_data.csv https://lazyprogrammer.me/course_files/nlp/tmdb_5000_movies.csv

df = pd.read_csv('./tfidf_recommender_data.csv')
df.fillna('', inplace=True)
df.head()

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",2015-10-26,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-07-16,1084939099,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-03-07,284139100,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124


In [169]:
# genres, keywords, production_companies, production_countries, spoken_languages

def strip_jsons(row):
    json_categories = ['genres', 'keywords', 'production_companies', 'production_countries', 'spoken_languages']
    new_str = ''
    for col in json_categories:
        parsed_json = json.loads(row[col])
        for i in parsed_json:
            new_str += i['name'] + ' '
    return new_str

def combine_all_text(row):
    new_str = strip_jsons(row)
    new_str += f'{row.original_title} {row.overview} {row.status} {row.tagline} {row.title}'
    return new_str

In [170]:
df['combined_text'] = df.apply(combine_all_text, axis=1)

In [171]:
df.iloc[0].combined_text

'Action Adventure Fantasy Science Fiction culture clash future space war space colony society space travel futuristic romance space alien tribe alien planet cgi marine soldier battle love affair anti war power relations mind and soul 3d Ingenious Film Partners Twentieth Century Fox Film Corporation Dune Entertainment Lightstorm Entertainment United States of America United Kingdom English Español Avatar In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization. Released Enter the World of Pandora. Avatar'

In [172]:
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(df.combined_text)

movie2idx = pd.Series(df.index, index=df.title)

In [173]:
scream3_vector = X[movie2idx['Scream 3']].toarray()
scores = cosine_similarity(scream3_vector, X)
scores = scores.flatten()

In [174]:
recommendations = scores.argsort() * -1
for i in recommendations[:10]:
    print(df.iloc[i].title)

Bangkok Dangerous
Charlie St. Cloud
Spy Game
The Debt
Meet the Parents
Daybreakers
We Are Marshall
The Gunman
National Treasure
The Indian in the Cupboard


In [175]:
for example_title in ['Mortal Kombat', 'Runaway Bride', 'Avatar']:
    title_vector = X[movie2idx[example_title]].toarray()
    scores = cosine_similarity(title_vector, X)
    scores = scores.flatten()
    recommendations = (-scores).argsort()
    for i in recommendations[1:3]:
        print(df.iloc[i].title)

Mortal Kombat: Annihilation
The Mortal Instruments: City of Bones
Bride Wars
What's Love Got to Do with It
Alien
Aliens


In [177]:
# genres and keywords only

def strip_jsons(row):
    json_categories = ['genres', 'keywords']
    new_str = ''
    for col in json_categories:
        parsed_json = json.loads(row[col])
        for i in parsed_json:
            new_str += i['name'] + ' '
    return new_str

df['genres_keywords'] = df.apply(strip_jsons, axis=1)
df.iloc[0].genres_keywords


'Action Adventure Fantasy Science Fiction culture clash future space war space colony society space travel futuristic romance space alien tribe alien planet cgi marine soldier battle love affair anti war power relations mind and soul 3d '

In [179]:
tfidf_2 = TfidfVectorizer()
X2 = tfidf_2.fit_transform(df.genres_keywords)

for example_title in ['Mortal Kombat', 'Runaway Bride', 'Avatar']:
    title_vector = X2[movie2idx[example_title]].toarray()
    scores = cosine_similarity(title_vector, X2)
    scores = scores.flatten()
    recommendations = (-scores).argsort()
    for i in recommendations[0:5]:
        print(df.iloc[i].title)

Mortal Kombat
Mortal Kombat: Annihilation
Resident Evil: Retribution
Æon Flux
300: Rise of an Empire
Runaway Bride
House of D
Dancer, Texas Pop. 81
Just Married
My Big Fat Greek Wedding 2
Avatar
Planet of the Apes
Aliens
Alien³
Soldier
