In [1]:
import pandas as pd
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import seaborn as sns
import os

%matplotlib inline

In [2]:
sns.set_style('darkgrid')
sns.set_context('talk')

In [3]:
# Read in the movie metadata, credits, keywords files
df_movies = pd.read_csv('movies_metadata.csv', usecols=['id', 'title'])

In [5]:
len(df_movies)

45466

In [6]:
df_ratings = pd.read_csv('ratings_small.csv')

In [22]:
# Drop columns not needed at all
df_mmd.drop(['homepage', 'imdb_id', 'original_title', 'poster_path', 'video', 'belongs_to_collection', 'revenue', 'tagline', 'overview', 'spoken_languages', 'production_countries', 'budget'], axis=1, inplace=True)

# Drop adult movies
adult_index = df_mmd[df_mmd['adult'] == 'True'].index
df_mmd.drop(axis=0, index=adult_index, inplace=True)
df_mmd.drop(['adult'], axis=1, inplace=True)

# Keep only English movies
non_english_index = df_mmd[df_mmd.original_language != 'en'].index
df_mmd.drop(axis=0, index=non_english_index, inplace=True)
df_mmd.drop(['original_language'], axis=1, inplace=True)

# Only released movies
df_mmd.drop(axis=0, index=df_mmd[df_mmd.status != 'Released'].index, inplace=True)
df_mmd.drop(['status'], axis=1, inplace=True)

# Dropping few rows that had nan
df_mmd.dropna(inplace=True, axis=0)

In [23]:
# Extract genres
df_mmd.genres = df_mmd.genres.map(lambda col_str: [col_dict['name'] for col_dict in eval(col_str)])

# Extract production companies
df_mmd.production_companies = df_mmd.production_companies.map(lambda col_str: [col_dict['name'] for col_dict in eval(col_str)])

# Transform release date and keep only year
df_mmd['release_year'] = pd.to_datetime(df_mmd.release_date).dt.year
df_mmd.drop(['release_date'], axis=1, inplace=True)

# Convert popularity to float
df_mmd.popularity = df_mmd.popularity.map(str).map(eval)

In [24]:
df_mmd.head()

Unnamed: 0,genres,id,popularity,production_companies,runtime,title,vote_average,vote_count,release_year
0,"[Animation, Comedy, Family]",862,21.946943,[Pixar Animation Studios],81.0,Toy Story,7.7,5415.0,1995
1,"[Adventure, Fantasy, Family]",8844,17.015539,"[TriStar Pictures, Teitler Film, Interscope Co...",104.0,Jumanji,6.9,2413.0,1995
2,"[Romance, Comedy]",15602,11.7129,"[Warner Bros., Lancaster Gate]",101.0,Grumpier Old Men,6.5,92.0,1995
3,"[Comedy, Drama, Romance]",31357,3.859495,[Twentieth Century Fox Film Corporation],127.0,Waiting to Exhale,6.1,34.0,1995
4,[Comedy],11862,8.387519,"[Sandollar Productions, Touchstone Pictures]",106.0,Father of the Bride Part II,5.7,173.0,1995


In [25]:
nz_genres = sum(df_mmd.genres.map(len) == 0)
nz_pop = sum(df_mmd.popularity == 0)
nz_prodc = sum(df_mmd.production_companies.map(len) == 0)
nz_runtime = sum(df_mmd.popularity == 0)
nz_title = sum(df_mmd.title.map(len) == 0)
nz_ry = sum(df_mmd.release_year == 0)
print(f"Number of zeros in genres: {nz_genres}")
print(f"Number of zeros in popularity: {nz_pop}")
print(f"Number of zeros in production_companies: {nz_prodc}")
print(f"Number of zeros in runtime: {nz_runtime}")
print(f"Number of zeros in title: {nz_title}")
print(f"Number of zeros in release year: {nz_ry}")

Number of zeros in genres: 1606
Number of zeros in popularity: 28
Number of zeros in production_companies: 8366
Number of zeros in runtime: 28
Number of zeros in title: 0
Number of zeros in release year: 0


In [26]:
# Dropping runtime, popularity rows having 0
df_mmd.drop(df_mmd[df_mmd.runtime == 0].index, inplace=True)
df_mmd.drop(df_mmd[df_mmd.popularity == 0].index, inplace=True)

In [39]:
df_credits = pd.read_csv('credits.csv')

In [40]:
df_mmd.set_index('id').join(df_credits.set_index('id'))

Unnamed: 0_level_0,genres,popularity,production_companies,runtime,title,vote_average,vote_count,release_year,cast,crew
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
100,"[Comedy, Crime]",4.607860,"[Handmade Films Ltd., Summit Entertainment, Po...",105.0,"Lock, Stock and Two Smoking Barrels",7.5,1671.0,1998,,
10001,"[Comedy, Science Fiction]",2.562888,[Warner Bros.],91.0,Young Einstein,4.5,46.0,1988,,
100010,"[Drama, War]",0.769266,[Metro-Goldwyn-Mayer (MGM)],116.0,Flight Command,6.0,1.0,1940,,
10002,"[Drama, Crime, Romance]",3.095584,"[The Criterion Collection, HandMade Films]",104.0,Mona Lisa,6.7,62.0,1986,,
100024,"[Horror, Thriller]",1.099911,[],100.0,Bloodwork,4.3,13.0,2012,,
...,...,...,...,...,...,...,...,...,...,...
99934,"[Adventure, Drama, Romance]",0.008084,[Metro-Goldwyn-Mayer (MGM)],100.0,The Flying Fleet,0.0,0.0,1929,,
9994,"[Comedy, Animation, Family]",11.183924,"[Walt Disney Pictures, Silver Screen Partners II]",74.0,The Great Mouse Detective,6.9,334.0,1986,,
99946,[Comedy],0.202315,[Metro-Goldwyn-Mayer (MGM)],77.0,Exit Smiling,8.5,2.0,1926,,
9995,"[Action, Crime, Drama]",1.316179,[New Line Cinema],86.0,Turn It Up,5.0,5.0,2000,,


In [37]:
# Getting first 5 members of the cast
df_credits.cast = df_credits.cast.map(lambda col_str: [col_dict['name'] for col_dict in eval(col_str)][:5])

# Getting directors
df_credits['director'] = df_credits.crew.map(lambda col_str: [col_dict['name'] for col_dict in eval(col_str) if col_dict['job'] == 'Director'])

# Getting writers
df_credits['screenplay'] = df_credits.crew.map(lambda col_str: [col_dict['name'] for col_dict in eval(col_str) if col_dict['job'] == 'Screenplay'])

# Getting writers
df_credits['producer'] = df_credits.crew.map(lambda col_str: [col_dict['name'] for col_dict in eval(col_str) if col_dict['job'] == 'Producer'])

# Getting writers
df_credits['editor'] = df_credits.crew.map(lambda col_str: [col_dict['name'] for col_dict in eval(col_str) if col_dict['job'] == 'Editor'])

# Getting writers
df_credits['art_director'] = df_credits.crew.map(lambda col_str: [col_dict['name'] for col_dict in eval(col_str) if col_dict['job'] == 'Art Direction'])

# Getting writers
df_credits['music'] = df_credits.crew.map(lambda col_str: [col_dict['name'] for col_dict in eval(col_str) if col_dict['job'] == 'Music'])

In [38]:
df_credits.head()

Unnamed: 0,cast,crew,id,director,screenplay,producer,editor,art_director,music
0,"[Tom Hanks, Tim Allen, Don Rickles, Jim Varney...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862,[John Lasseter],"[Joss Whedon, Andrew Stanton, Joel Cohen, Alec...","[Bonnie Arnold, Ralph Guggenheim]","[Lee Unkrich, Robert Gordon]",[Ralph Eggleston],[Randy Newman]
1,"[Robin Williams, Jonathan Hyde, Kirsten Dunst,...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844,[Joe Johnston],"[Jonathan Hensleigh, Greg Taylor, Jim Strain]","[Scott Kroopf, William Teitler]",[Robert Dalva],[],[]
2,"[Walter Matthau, Jack Lemmon, Ann-Margret, Sop...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602,[Howard Deutch],[],[],[],[],[]
3,"[Whitney Houston, Angela Bassett, Loretta Devi...","[{'credit_id': '52fe44779251416c91011acb', 'de...",31357,[Forest Whitaker],"[Ronald Bass, Terry McMillan]","[Ronald Bass, Ezra Swerdlow, Deborah Schindler...",[],[],[]
4,"[Steve Martin, Diane Keaton, Martin Short, Kim...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",11862,[Charles Shyer],"[Nancy Meyers, Albert Hackett]",[Nancy Meyers],[Adam Bernardi],[],[]


In [42]:
df_keywords = pd.read_csv("keywords.csv")

In [43]:
df_keywords.head()

Unnamed: 0,id,keywords
0,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,15602,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,31357,"[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,11862,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


In [44]:
df_keywords.keywords[0]

"[{'id': 931, 'name': 'jealousy'}, {'id': 4290, 'name': 'toy'}, {'id': 5202, 'name': 'boy'}, {'id': 6054, 'name': 'friendship'}, {'id': 9713, 'name': 'friends'}, {'id': 9823, 'name': 'rivalry'}, {'id': 165503, 'name': 'boy next door'}, {'id': 170722, 'name': 'new toy'}, {'id': 187065, 'name': 'toy comes to life'}]"