In [1]:
import numpy as np
import json
import pandas as pd
import matplotlib.pyplot as plt
import scipy

from sklearn.model_selection import train_test_split

In [2]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [3]:
# Random state passed to train_test_split so the train/test split is reproducible.
seed = 3

# Load the data

In [4]:
# Kaggle data (TMDB 5000 movie dataset): one CSV with movie metadata,
# one with the matching cast/crew credits.
path = 'data/tmdb-5000-movie-dataset/'
movies = pd.read_csv(path + 'tmdb_5000_movies.csv')
credits = pd.read_csv(path + 'tmdb_5000_credits.csv')

# The credits file names its key 'movie_id'; align it with the movies file.
credits = credits.rename(columns={'movie_id': 'id'})

# Inner join of the movie metadata with its credits on the shared 'id' key
movies = pd.merge(movies, credits, how='inner', on='id')

movies.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,runtime,spoken_languages,status,tagline,title_x,vote_average,vote_count,title_y,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


# Clean and prepare the data

In [5]:
# List the columns available after the merge.
movies.columns

Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title_x', 'vote_average',
       'vote_count', 'title_y', 'cast', 'crew'],
      dtype='object')

In [6]:
# 'earnings' is the return relative to the budget: (revenue - budget) / budget.
# Rows with a zero budget are removed in a later cell.
movies['earnings'] = movies['revenue'].sub(movies['budget']).div(movies['budget'])

In [7]:
# Keep only the columns used in the rest of the analysis
movies = movies.loc[
    :,
    ['original_title', 'budget', 'revenue', 'earnings', 'production_companies', 'cast', 'crew'],
]

In [8]:
# Discard movies whose budget or revenue is recorded as zero
movies = movies[movies['budget'].ne(0) & movies['revenue'].ne(0)]

In [9]:
# Five movies with the lowest relative earnings
movies.sort_values('earnings', ascending=True).head(5)

Unnamed: 0,original_title,budget,revenue,earnings,production_companies,cast,crew
1655,Chasing Liberty,23000000,12,-0.999999,"[{""name"": ""Alcon Entertainment"", ""id"": 1088}, ...","[{""cast_id"": 1, ""character"": ""Anna Foster"", ""c...","[{""credit_id"": ""563c813992514150af00414f"", ""de..."
2485,The Cookout,16000000,12,-0.999999,"[{""name"": ""Cookout Productions"", ""id"": 3494}]","[{""cast_id"": 1, ""character"": ""blin bling"", ""cr...","[{""credit_id"": ""52fe477f9251416c7509b8ed"", ""de..."
2874,In the Cut,12000000,23,-0.999998,"[{""name"": ""Pathe Productions"", ""id"": 21126}, {...","[{""cast_id"": 1, ""character"": ""Frannie"", ""credi...","[{""credit_id"": ""5628abae9251414cd8001453"", ""de..."
3875,Dreaming of Joseph Lees,2000000,7,-0.999996,"[{""name"": ""Fox Entertainment Group"", ""id"": 857...","[{""cast_id"": 5, ""character"": ""Joseph Lees"", ""c...","[{""credit_id"": ""52fe4a98c3a36847f81d7345"", ""de..."
2068,Death at a Funeral,9000000,46,-0.999995,"[{""name"": ""Sidney Kimmel Entertainment"", ""id"":...","[{""cast_id"": 4, ""character"": ""Daniel"", ""credit...","[{""credit_id"": ""52fe4340c3a36847f8045d0d"", ""de..."


In [10]:
# Five movies with the highest relative earnings
movies.sort_values('earnings', ascending=False).head(5)

Unnamed: 0,original_title,budget,revenue,earnings,production_companies,cast,crew
4238,Modern Times,1,8500000,8499999.0,"[{""name"": ""United Artists"", ""id"": 60}, {""name""...","[{""cast_id"": 8, ""character"": ""A factory worker...","[{""credit_id"": ""5621aeadc3a3680e1d00a09a"", ""de..."
3137,Nurse 3-D,10,10000000,999999.0,"[{""name"": ""Lions Gate"", ""id"": 6644}, {""name"": ...","[{""cast_id"": 5, ""character"": ""Abby Russell"", ""...","[{""credit_id"": ""52fe499cc3a368484e1346b1"", ""de..."
4577,Paranormal Activity,15000,193355800,12889.39,"[{""name"": ""Blumhouse Productions"", ""id"": 3172}...","[{""cast_id"": 3, ""character"": ""Katie"", ""credit_...","[{""credit_id"": ""52fe4477c3a368484e024b01"", ""de..."
4582,Tarnation,218,1162014,5329.339,[],"[{""cast_id"": 2, ""character"": ""Herself"", ""credi...","[{""credit_id"": ""52fe42f7c3a36847f8030443"", ""de..."
4496,The Blair Witch Project,60000,248000000,4132.333,"[{""name"": ""Artisan Entertainment"", ""id"": 2188}...","[{""cast_id"": 41, ""character"": ""Mike"", ""credit_...","[{""credit_id"": ""52fe4364c3a36847f8050c01"", ""de..."


After checking some of the data, we can see that some of the values in the budget and revenue columns are expressed in millions, whereas most of the values are raw numbers. We therefore choose to remove the values expressed in millions, so that future results are more consistent. This is done by keeping only the movies whose budget and revenue are both bigger than 1000 (a larger value is unlikely to be expressed in millions, since that would mean a budget or a revenue of more than a billion dollars).

In [11]:
# Keep only the movies whose budget AND revenue both exceed 1000, then report
# how many rows were removed (absolute count and percentage of the previous size).
prev_len = len(movies)
movies = movies.loc[movies['budget'].gt(1000) & movies['revenue'].gt(1000)]
n_movies = len(movies)
print(prev_len - n_movies, 100 * (prev_len - n_movies) / prev_len)

18 0.5574481263549086


This deletion represents 18 movies (~0.6% of the data), which is an acceptable value.

In [22]:
# Re-inspect the highest relative earnings after the budget/revenue > 1000 filter.
# NOTE(review): this cell carries execution count 22 and its saved output shows the
# later 'features' column, so it was last run out of order; re-run the notebook
# top to bottom to refresh the output.
movies.sort_values('earnings', ascending=False).head(5)

Unnamed: 0,original_title,earnings,features
4577,Paranormal Activity,12889.386667,"[Blumhouse Productions, Solana Films, Katie Fe..."
4496,The Blair Witch Project,4132.333333,"[Artisan Entertainment, Haxan Films, Michael C..."
4724,Eraserhead,699.0,"[American Film Institute (AFI), Libra Films, J..."
4788,Pink Flamingos,499.0,"[Dreamland Productions, Divine, David Lochary,..."
4742,Super Size Me,438.616585,"[Kathbur Pictures, Morgan Spurlock, Daryl Isaa..."


In [12]:
to_keep = 5  # maximum number of cast entries kept per movie

# Decode each movie's cast JSON once and keep only its first `to_keep` entries;
# the actor and character names are then both read from that decoded list
# (previously the JSON was parsed twice and every entry was mapped before slicing).
top_cast = movies['cast'].map(lambda raw: json.loads(raw)[:to_keep])

# `assign` returns a new frame, so no column is written onto the filtered slice
# that `movies` currently is (avoids pandas' SettingWithCopyWarning).
# The 'crew' column is not turned into a feature and is dropped here.
movies = movies.assign(
    actors=top_cast.map(lambda members: [member['name'] for member in members]),
    characters=top_cast.map(lambda members: [member['character'] for member in members]),
    production_companies=movies['production_companies'].map(
        lambda raw: [company['name'] for company in json.loads(raw)]
    ),
)[['original_title', 'earnings', 'production_companies', 'actors', 'characters']]

movies.head()

Unnamed: 0,original_title,earnings,production_companies,actors,characters
0,Avatar,10.763566,"[Ingenious Film Partners, Twentieth Century Fo...","[Sam Worthington, Zoe Saldana, Sigourney Weave...","[Jake Sully, Neytiri, Dr. Grace Augustine, Col..."
1,Pirates of the Caribbean: At World's End,2.203333,"[Walt Disney Pictures, Jerry Bruckheimer Films...","[Johnny Depp, Orlando Bloom, Keira Knightley, ...","[Captain Jack Sparrow, Will Turner, Elizabeth ..."
2,Spectre,2.59459,"[Columbia Pictures, Danjaq, B24]","[Daniel Craig, Christoph Waltz, Léa Seydoux, R...","[James Bond, Blofeld, Madeleine, M, Lucia]"
3,The Dark Knight Rises,3.339756,"[Legendary Pictures, Warner Bros., DC Entertai...","[Christian Bale, Michael Caine, Gary Oldman, A...","[Bruce Wayne / Batman, Alfred Pennyworth, Jame..."
4,John Carter,0.092843,[Walt Disney Pictures],"[Taylor Kitsch, Lynn Collins, Samantha Morton,...","[John Carter, Dejah Thoris, Sola, Tars Tarkas,..."


In [13]:
# Concatenate the three per-movie name lists into a single 'features' list.
# `assign` builds a new frame instead of writing a column onto the column-subset
# slice that `movies` currently is (avoids pandas' SettingWithCopyWarning).
movies = movies.assign(
    features=movies['production_companies'] + movies['actors'] + movies['characters']
)[['original_title', 'earnings', 'features']]
movies.head()

Unnamed: 0,original_title,earnings,features
0,Avatar,10.763566,"[Ingenious Film Partners, Twentieth Century Fo..."
1,Pirates of the Caribbean: At World's End,2.203333,"[Walt Disney Pictures, Jerry Bruckheimer Films..."
2,Spectre,2.59459,"[Columbia Pictures, Danjaq, B24, Daniel Craig,..."
3,The Dark Knight Rises,3.339756,"[Legendary Pictures, Warner Bros., DC Entertai..."
4,John Carter,0.092843,"[Walt Disney Pictures, Taylor Kitsch, Lynn Col..."


# Split the data

We split the data into a training set and a testing set; the predictions will later be made on the movies in the testing set.

In [14]:
# Hold out 25% of the movies for testing; the split is fixed by `seed`.
train, test = train_test_split(
    movies,
    random_state=seed,
    test_size=0.25,
)

# Compute weights of features

In [15]:
# Create the features table: one row per distinct feature found in the training
# set, with its associated weight initialised to 0.
# The names are collected with a set comprehension rather than by summing the
# lists, which re-copies the growing list for every movie.
unique_features = {name for names in train['features'] for name in names}
unique_features.discard('')  # ignore empty names
features = pd.DataFrame(sorted(unique_features), columns=['feature'])
# Float from the start: the weights are later filled with average earnings.
features['weight'] = 0.0
features.head()

Unnamed: 0,feature,weight
0,Larry Mullen Jr.,0
1,"""DIA"" Productions GmbH & Co. KG",0
2,"""Hickory"" / The Tin Man",0
3,"""Hunk"" / The Scarecrow",0
4,"""Whistling"" John Shaw",0


In [16]:
# A DataFrame where each row contains only one feature (actor, character or production company)
# of one training movie, together with that movie's title and earnings.
# Easier for later computations.
# Built with a flat comprehension instead of `.apply(pd.Series).stack()`, which
# creates one Series per movie and a wide NaN-padded intermediate frame.
df = pd.DataFrame(
    [
        (title, earnings, feature)
        for title, earnings, names in zip(
            train['original_title'], train['earnings'], train['features']
        )
        for feature in names
        if pd.notna(feature)  # stack() dropped missing values; keep doing so
    ],
    columns=['original_title', 'earnings', 'feature'],
)
df.head()

Unnamed: 0,original_title,earnings,feature
0,You've Got Mail,2.858792,Warner Bros.
1,You've Got Mail,2.858792,Tom Hanks
2,You've Got Mail,2.858792,Meg Ryan
3,You've Got Mail,2.858792,Katie Sagona
4,You've Got Mail,2.858792,Greg Kinnear


In [17]:
# Weight of a feature = average earnings of the training rows it appears in.
# A single groupby replaces the per-feature scan of `df`, and `map` aligns the
# averages on the feature name rather than on the row position.
mean_earnings = df.groupby('feature')['earnings'].mean()
features['weight'] = features['feature'].map(mean_earnings)

In [20]:
# Five features with the largest weight
features.sort_values('weight', ascending=False).head(5)

Unnamed: 0,feature,weight
661,Amber Armstrong,12889.386667
16648,The Psychic,12889.386667
11591,Micah Sloat,12889.386667
1159,Ashley Palmer,12889.386667
11048,Mark Fredrichs,12889.386667


In [21]:
# Training rows in which the feature 'Amber Armstrong' appears
df.loc[df['feature'].eq('Amber Armstrong')]

Unnamed: 0,original_title,earnings,feature
4903,Paranormal Activity,12889.386667,Amber Armstrong
