# Analysis of The Movies Database

## Import data and packages

In [137]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import plotly.express as px
import re
from sklearn.feature_extraction.text import CountVectorizer
from wordcloud import WordCloud

%matplotlib inline
pd.set_option('display.max_columns', None) #set full rows

In [138]:
df_movies_metadata = pd.read_csv(r'./DATA/45000_plus_movies/movies_metadata.csv')
df_credits = pd.read_csv(r'./DATA/45000_plus_movies/credits.csv')
df_ratings = pd.read_csv(r'./DATA/45000_plus_movies/ratings.csv')
df_cpi = pd.read_csv(r'./DATA/cpi/API_FP.CPI.TOTL.ZG_DS2_en_csv_v2_988671.csv',skiprows=4)
df_gdp = pd.read_csv(r'./DATA/gdp/API_NY.GDP.MKTP.CD_DS2_en_csv_v2_988718.csv',skiprows=4)

print(df_movies_metadata.memory_usage(deep=True).sum()/1000000)
print(df_credits.memory_usage(deep=True).sum()/1000000)
print(df_ratings.memory_usage(deep=True).sum()/1000000)
print(df_cpi.memory_usage(deep=True).sum()/1000000)
print(df_gdp.memory_usage(deep=True).sum()/1000000)


Columns (10) have mixed types.Specify dtype option on import or set low_memory=False.



83.471315
203.294768
832.777376
0.206704
0.201424


## Prepare inflation by year

In [139]:
temp_list = [x for x in df_cpi.columns.tolist() if len(x)<5]

df_cpi_gdp = pd.merge(df_cpi, df_gdp, left_on='Country Name',right_on='Country Name')

for this_year in temp_list:
    df_cpi_gdp["{}_weighted_cpi".format(this_year)] = df_cpi_gdp["{}_x".format(this_year)] * df_cpi_gdp["{}_y".format(this_year)]
    
weighted_cols = [x for x in df_cpi_gdp.columns.tolist() if x.endswith("_weighted_cpi")]    

dicts_cpi = {}
for this_gdp_col, this_wave_col in zip(temp_list,weighted_cols):
    dicts_cpi[this_wave_col] = df_cpi_gdp[this_wave_col].sum() / df_cpi_gdp["{}_y".format(this_gdp_col)].sum()
    
dicts_cpi    


invalid value encountered in double_scalars



{'1960_weighted_cpi': 0.8332556738934528,
 '1961_weighted_cpi': 0.9315936763270694,
 '1962_weighted_cpi': 1.5221661199137895,
 '1963_weighted_cpi': 1.3024874980853909,
 '1964_weighted_cpi': 1.521678146139268,
 '1965_weighted_cpi': 1.7594154026790947,
 '1966_weighted_cpi': 1.9069954589430467,
 '1967_weighted_cpi': 1.7381827144201405,
 '1968_weighted_cpi': 1.9554597917338292,
 '1969_weighted_cpi': 1.8532967294159093,
 '1970_weighted_cpi': 2.520879990730323,
 '1971_weighted_cpi': 2.8091012769128882,
 '1972_weighted_cpi': 3.034524046010322,
 '1973_weighted_cpi': 4.801280725576881,
 '1974_weighted_cpi': 7.970002159173341,
 '1975_weighted_cpi': 6.413744092237752,
 '1976_weighted_cpi': 4.948858163311302,
 '1977_weighted_cpi': 6.6278603656939,
 '1978_weighted_cpi': 5.590222715851748,
 '1979_weighted_cpi': 6.327326932073842,
 '1980_weighted_cpi': 8.595804408250158,
 '1981_weighted_cpi': 11.35285057120662,
 '1982_weighted_cpi': 9.082242627755974,
 '1983_weighted_cpi': 7.966561177098277,
 '1984_w

## Inspect data

In [140]:
print(df_movies_metadata.memory_usage(deep=True).sum()/1000000)
print(df_credits.memory_usage(deep=True).sum()/1000000)
print(df_ratings.memory_usage(deep=True).sum()/1000000)
print(df_cpi.memory_usage(deep=True).sum()/1000000)
print(df_gdp.memory_usage(deep=True).sum()/1000000)

83.471315
203.294768
832.777376
0.206704
0.201424


In [141]:
df_movies_metadata.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.9469,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.0155,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,11.7129,/6ksm1sjKMFLbO7UY2i6G1ju9SML.jpg,"[{'name': 'Warner Bros.', 'id': 6194}, {'name'...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",3.85949,/16XOMpEaLWkrcPqSQqhTmeJuqQl.jpg,[{'name': 'Twentieth Century Fox Film Corporat...,"[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,8.38752,/e64sOI48hQXyru7naBFyssKFxVd.jpg,"[{'name': 'Sandollar Productions', 'id': 5842}...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [142]:
df_movies_metadata.shape

(45466, 24)

In [143]:
df_movies_metadata.dtypes

adult                     object
belongs_to_collection     object
budget                    object
genres                    object
homepage                  object
id                        object
imdb_id                   object
original_language         object
original_title            object
overview                  object
popularity                object
poster_path               object
production_companies      object
production_countries      object
release_date              object
revenue                  float64
runtime                  float64
spoken_languages          object
status                    object
tagline                   object
title                     object
video                     object
vote_average             float64
vote_count               float64
dtype: object

We only want to look at 'released' films.

In [144]:
df_movies_metadata.status.value_counts()

Released           45014
Rumored              230
Post Production       98
In Production         20
Planned               15
Canceled               2
Name: status, dtype: int64

In [145]:
df_movies_metadata = df_movies_metadata[df_movies_metadata.status=='Released']
print(len(df_movies_metadata))

45014


In [146]:
df_movies_metadata['budget'] = df_movies_metadata.apply(lambda row: '0' if row['budget'].endswith('jpg') else row['budget'],axis=1)
df_movies_metadata['budget'] = df_movies_metadata['budget'].astype(float)

Quite a number of films with zero budgets and / or zero revenue. This will make our analysis of revenue vs budget rather tricky.

In [147]:
len(df_movies_metadata[(df_movies_metadata['budget']==0) & (df_movies_metadata['revenue']==0)])

34131

In [148]:
len(df_movies_metadata[(df_movies_metadata['budget']==0) & (df_movies_metadata['revenue']!=0)])

2018

In [149]:
len(df_movies_metadata[(df_movies_metadata['budget']!=0) & (df_movies_metadata['revenue']==0)])

3488

In [150]:
df_movies_metadata.isnull().sum()

adult                        0
belongs_to_collection    40548
budget                       0
genres                       0
homepage                 37308
id                           0
imdb_id                     15
original_language           10
original_title               0
overview                   920
popularity                   0
poster_path                373
production_companies         0
production_countries         0
release_date                78
revenue                      0
runtime                    251
spoken_languages             0
status                       0
tagline                  24714
title                        0
video                        0
vote_average                 0
vote_count                   0
dtype: int64

Seem to have duplicated movie id's.

In [151]:
df_movies_metadata.id.nunique()

44985

In [152]:
df_movies_metadata[df_movies_metadata.duplicated(keep=False)].sort_values('id')

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
676,False,,0.0,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",,105045,tt0111613,de,Das Versprechen,"East-Berlin, 1961, shortly after the erection ...",0.122178,/5WFIrBhOOgc0jGmoLxMZwWqCctO.jpg,"[{'name': 'Studio Babelsberg', 'id': 264}, {'n...","[{'iso_3166_1': 'DE', 'name': 'Germany'}]",1995-02-16,0.0,115.0,"[{'iso_639_1': 'de', 'name': 'Deutsch'}]",Released,"A love, a hope, a wall.",The Promise,False,5.0,1.0
1465,False,,0.0,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",,105045,tt0111613,de,Das Versprechen,"East-Berlin, 1961, shortly after the erection ...",0.122178,/5WFIrBhOOgc0jGmoLxMZwWqCctO.jpg,"[{'name': 'Studio Babelsberg', 'id': 264}, {'n...","[{'iso_3166_1': 'DE', 'name': 'Germany'}]",1995-02-16,0.0,115.0,"[{'iso_639_1': 'de', 'name': 'Deutsch'}]",Released,"A love, a hope, a wall.",The Promise,False,5.0,1.0
24844,False,,0.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",http://www.dealthemovie.com/,11115,tt0446676,en,Deal,As an ex-gambler teaches a hot-shot college ki...,6.88036,/kHaBqrrozaG7rj6GJg3sUCiM29B.jpg,"[{'name': 'Andertainment Group', 'id': 2634}, ...","[{'iso_3166_1': 'US', 'name': 'United States o...",2008-01-29,0.0,85.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Deal,False,5.2,22.0
14012,False,,0.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",http://www.dealthemovie.com/,11115,tt0446676,en,Deal,As an ex-gambler teaches a hot-shot college ki...,6.88036,/kHaBqrrozaG7rj6GJg3sUCiM29B.jpg,"[{'name': 'Andertainment Group', 'id': 2634}, ...","[{'iso_3166_1': 'US', 'name': 'United States o...",2008-01-29,0.0,85.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Deal,False,5.2,22.0
21165,False,,0.0,"[{'id': 14, 'name': 'Fantasy'}, {'id': 18, 'na...",,119916,tt0080000,en,The Tempest,"Prospero, the true Duke of Milan is now living...",1.8e-05,/gLVRTxaLtUDkfscFKPyYrCtRnTk.jpg,[],[],1980-02-27,0.0,123.0,[],Released,,The Tempest,False,0.0,0.0
19890,False,,0.0,"[{'id': 14, 'name': 'Fantasy'}, {'id': 18, 'na...",,119916,tt0080000,en,The Tempest,"Prospero, the true Duke of Milan is now living...",1.8e-05,/gLVRTxaLtUDkfscFKPyYrCtRnTk.jpg,[],[],1980-02-27,0.0,123.0,[],Released,,The Tempest,False,0.0,0.0
13375,False,,0.0,"[{'id': 53, 'name': 'Thriller'}, {'id': 9648, ...",,141971,tt1180333,fi,Blackout,Recovering from a nail gun shot to the head an...,0.411949,/8VSZ9coCzxOCW2wE2Qene1H1fKO.jpg,"[{'name': 'Filmiteollisuus Fine', 'id': 5166}]","[{'iso_3166_1': 'FI', 'name': 'Finland'}]",2008-12-26,0.0,108.0,"[{'iso_639_1': 'fi', 'name': 'suomi'}]",Released,Which one is the first to return - memory or t...,Blackout,False,6.7,3.0
16764,False,,0.0,"[{'id': 53, 'name': 'Thriller'}, {'id': 9648, ...",,141971,tt1180333,fi,Blackout,Recovering from a nail gun shot to the head an...,0.411949,/8VSZ9coCzxOCW2wE2Qene1H1fKO.jpg,"[{'name': 'Filmiteollisuus Fine', 'id': 5166}]","[{'iso_3166_1': 'FI', 'name': 'Finland'}]",2008-12-26,0.0,108.0,"[{'iso_639_1': 'fi', 'name': 'suomi'}]",Released,Which one is the first to return - memory or t...,Blackout,False,6.7,3.0
13261,False,,0.0,"[{'id': 53, 'name': 'Thriller'}, {'id': 9648, ...",,141971,tt1180333,fi,Blackout,Recovering from a nail gun shot to the head an...,0.411949,/8VSZ9coCzxOCW2wE2Qene1H1fKO.jpg,"[{'name': 'Filmiteollisuus Fine', 'id': 5166}]","[{'iso_3166_1': 'FI', 'name': 'Finland'}]",2008-12-26,0.0,108.0,"[{'iso_639_1': 'fi', 'name': 'suomi'}]",Released,Which one is the first to return - memory or t...,Blackout,False,6.7,3.0
19925,False,,0.0,"[{'id': 18, 'name': 'Drama'}, {'id': 878, 'nam...",,152795,tt1821641,en,The Congress,More than two decades after catapulting to sta...,8.53404,/nnKX3ahYoT7P3au92dNgLf4pKwA.jpg,"[{'name': 'Pandora Filmproduktion', 'id': 254}...","[{'iso_3166_1': 'BE', 'name': 'Belgium'}, {'is...",2013-05-16,455815.0,122.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,The Congress,False,6.4,165.0


However there are fewer duplicate rows. 

In [153]:
df_movies_metadata[df_movies_metadata.id.duplicated(keep=False)].sort_values('id')

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
676,False,,0.0,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",,105045,tt0111613,de,Das Versprechen,"East-Berlin, 1961, shortly after the erection ...",0.122178,/5WFIrBhOOgc0jGmoLxMZwWqCctO.jpg,"[{'name': 'Studio Babelsberg', 'id': 264}, {'n...","[{'iso_3166_1': 'DE', 'name': 'Germany'}]",1995-02-16,0.0,115.0,"[{'iso_639_1': 'de', 'name': 'Deutsch'}]",Released,"A love, a hope, a wall.",The Promise,False,5.0,1.0
1465,False,,0.0,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",,105045,tt0111613,de,Das Versprechen,"East-Berlin, 1961, shortly after the erection ...",0.122178,/5WFIrBhOOgc0jGmoLxMZwWqCctO.jpg,"[{'name': 'Studio Babelsberg', 'id': 264}, {'n...","[{'iso_3166_1': 'DE', 'name': 'Germany'}]",1995-02-16,0.0,115.0,"[{'iso_639_1': 'de', 'name': 'Deutsch'}]",Released,"A love, a hope, a wall.",The Promise,False,5.0,1.0
44821,False,"{'id': 34055, 'name': 'Pokémon Collection', 'p...",16000000.0,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",http://movies.warnerbros.com/pk3/,10991,tt0235679,ja,Pokémon 3: The Movie,When Molly Hale's sadness of her father's disa...,6.480376,/5ILjS6XB5deiHop8SXPsYxXWVPE.jpg,"[{'name': 'TV Tokyo', 'id': 3034}, {'name': '4...","[{'iso_3166_1': 'JP', 'name': 'Japan'}]",2000-07-08,68411275.0,93.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Pokémon: Spell of the Unknown,Pokémon: Spell of the Unknown,False,6.0,144.0
4114,False,"{'id': 34055, 'name': 'Pokémon Collection', 'p...",16000000.0,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",http://movies.warnerbros.com/pk3/,10991,tt0235679,ja,Pokémon 3: The Movie,When Molly Hale's sadness of her father's disa...,10.2646,/5ILjS6XB5deiHop8SXPsYxXWVPE.jpg,"[{'name': 'TV Tokyo', 'id': 3034}, {'name': '4...","[{'iso_3166_1': 'JP', 'name': 'Japan'}]",2000-07-08,68411275.0,93.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Pokémon: Spell of the Unknown,Pokémon: Spell of the Unknown,False,6.0,143.0
5710,False,,0.0,"[{'id': 18, 'name': 'Drama'}]",,109962,tt0082992,en,Rich and Famous,Two literary women compete for 20 years: one w...,12.1808,/tOflyY8eUFWubLKJH7fKg4KwpCl.jpg,"[{'name': 'Metro-Goldwyn-Mayer (MGM)', 'id': 8...","[{'iso_3166_1': 'US', 'name': 'United States o...",1981-09-23,0.0,115.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,"From the very beginning, they knew they'd be f...",Rich and Famous,False,4.9,7.0
20899,False,,0.0,"[{'id': 18, 'name': 'Drama'}]",,109962,tt0082992,en,Rich and Famous,Two literary women compete for 20 years: one w...,10.3969,/tOflyY8eUFWubLKJH7fKg4KwpCl.jpg,"[{'name': 'Metro-Goldwyn-Mayer (MGM)', 'id': 8...","[{'iso_3166_1': 'US', 'name': 'United States o...",1981-09-23,0.0,115.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,"From the very beginning, they knew they'd be f...",Rich and Famous,False,4.9,7.0
23534,False,,3512454.0,"[{'id': 18, 'name': 'Drama'}]",,110428,tt2018086,fr,Camille Claudel 1915,"Winter, 1915. Confined by her family to an asy...",0.110065,/sGMPDg6je1zKi0TiX9b4pP6yN02.jpg,"[{'name': 'Canal+', 'id': 5358}, {'name': 'Art...","[{'iso_3166_1': 'FR', 'name': 'France'}]",2013-03-13,115860.0,95.0,"[{'iso_639_1': 'fr', 'name': 'Français'}]",Released,,Camille Claudel 1915,False,7.0,20.0
4356,False,,3512454.0,"[{'id': 18, 'name': 'Drama'}]",,110428,tt2018086,fr,Camille Claudel 1915,"Winter, 1915. Confined by her family to an asy...",0.134014,/sGMPDg6je1zKi0TiX9b4pP6yN02.jpg,"[{'name': 'Canal+', 'id': 5358}, {'name': 'Art...","[{'iso_3166_1': 'FR', 'name': 'France'}]",2013-03-13,115860.0,95.0,"[{'iso_639_1': 'fr', 'name': 'Français'}]",Released,,Camille Claudel 1915,False,7.0,20.0
24844,False,,0.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",http://www.dealthemovie.com/,11115,tt0446676,en,Deal,As an ex-gambler teaches a hot-shot college ki...,6.88036,/kHaBqrrozaG7rj6GJg3sUCiM29B.jpg,"[{'name': 'Andertainment Group', 'id': 2634}, ...","[{'iso_3166_1': 'US', 'name': 'United States o...",2008-01-29,0.0,85.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Deal,False,5.2,22.0
14012,False,,0.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",http://www.dealthemovie.com/,11115,tt0446676,en,Deal,As an ex-gambler teaches a hot-shot college ki...,6.88036,/kHaBqrrozaG7rj6GJg3sUCiM29B.jpg,"[{'name': 'Andertainment Group', 'id': 2634}, ...","[{'iso_3166_1': 'US', 'name': 'United States o...",2008-01-29,0.0,85.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Deal,False,5.2,22.0


In [154]:
temp_dup_all = df_movies_metadata[df_movies_metadata.duplicated(keep=False)].sort_values('id')
print(len(temp_dup_all))

23


In [155]:
temp_dup_id = df_movies_metadata[df_movies_metadata.id.duplicated(keep=False)].sort_values('id')
print(len(temp_dup_id))

57


In [156]:
temp_merged = pd.merge(temp_dup_id, temp_dup_all, left_index=True, right_index=True, how='left')

In [157]:
id_to_inspect = temp_merged[temp_merged['id_y'].isnull()]['id_x'].unique()

We see that this is because of different 'popularity' scores, except for two movies

In [158]:
temp_dup_id[temp_dup_id['id'].isin(id_to_inspect)]

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
44821,False,"{'id': 34055, 'name': 'Pokémon Collection', 'p...",16000000.0,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",http://movies.warnerbros.com/pk3/,10991,tt0235679,ja,Pokémon 3: The Movie,When Molly Hale's sadness of her father's disa...,6.480376,/5ILjS6XB5deiHop8SXPsYxXWVPE.jpg,"[{'name': 'TV Tokyo', 'id': 3034}, {'name': '4...","[{'iso_3166_1': 'JP', 'name': 'Japan'}]",2000-07-08,68411275.0,93.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Pokémon: Spell of the Unknown,Pokémon: Spell of the Unknown,False,6.0,144.0
4114,False,"{'id': 34055, 'name': 'Pokémon Collection', 'p...",16000000.0,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",http://movies.warnerbros.com/pk3/,10991,tt0235679,ja,Pokémon 3: The Movie,When Molly Hale's sadness of her father's disa...,10.2646,/5ILjS6XB5deiHop8SXPsYxXWVPE.jpg,"[{'name': 'TV Tokyo', 'id': 3034}, {'name': '4...","[{'iso_3166_1': 'JP', 'name': 'Japan'}]",2000-07-08,68411275.0,93.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Pokémon: Spell of the Unknown,Pokémon: Spell of the Unknown,False,6.0,143.0
5710,False,,0.0,"[{'id': 18, 'name': 'Drama'}]",,109962,tt0082992,en,Rich and Famous,Two literary women compete for 20 years: one w...,12.1808,/tOflyY8eUFWubLKJH7fKg4KwpCl.jpg,"[{'name': 'Metro-Goldwyn-Mayer (MGM)', 'id': 8...","[{'iso_3166_1': 'US', 'name': 'United States o...",1981-09-23,0.0,115.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,"From the very beginning, they knew they'd be f...",Rich and Famous,False,4.9,7.0
20899,False,,0.0,"[{'id': 18, 'name': 'Drama'}]",,109962,tt0082992,en,Rich and Famous,Two literary women compete for 20 years: one w...,10.3969,/tOflyY8eUFWubLKJH7fKg4KwpCl.jpg,"[{'name': 'Metro-Goldwyn-Mayer (MGM)', 'id': 8...","[{'iso_3166_1': 'US', 'name': 'United States o...",1981-09-23,0.0,115.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,"From the very beginning, they knew they'd be f...",Rich and Famous,False,4.9,7.0
23534,False,,3512454.0,"[{'id': 18, 'name': 'Drama'}]",,110428,tt2018086,fr,Camille Claudel 1915,"Winter, 1915. Confined by her family to an asy...",0.110065,/sGMPDg6je1zKi0TiX9b4pP6yN02.jpg,"[{'name': 'Canal+', 'id': 5358}, {'name': 'Art...","[{'iso_3166_1': 'FR', 'name': 'France'}]",2013-03-13,115860.0,95.0,"[{'iso_639_1': 'fr', 'name': 'Français'}]",Released,,Camille Claudel 1915,False,7.0,20.0
4356,False,,3512454.0,"[{'id': 18, 'name': 'Drama'}]",,110428,tt2018086,fr,Camille Claudel 1915,"Winter, 1915. Confined by her family to an asy...",0.134014,/sGMPDg6je1zKi0TiX9b4pP6yN02.jpg,"[{'name': 'Canal+', 'id': 5358}, {'name': 'Art...","[{'iso_3166_1': 'FR', 'name': 'France'}]",2013-03-13,115860.0,95.0,"[{'iso_639_1': 'fr', 'name': 'Français'}]",Released,,Camille Claudel 1915,False,7.0,20.0
5535,False,"{'id': 34055, 'name': 'Pokémon Collection', 'p...",0.0,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",http://www.pokemon.com/us/movies/movie-pokemon...,12600,tt0287635,ja,劇場版ポケットモンスター セレビィ 時を越えた遭遇（であい）,"All your favorite Pokémon characters are back,...",7.0723,/bqL0PVHbQ8Jmw3Njcl38kW0CoeM.jpg,[],"[{'iso_3166_1': 'JP', 'name': 'Japan'}, {'iso_...",2001-07-06,28023563.0,75.0,"[{'iso_639_1': 'ja', 'name': '日本語'}]",Released,,Pokémon 4Ever: Celebi - Voice of the Forest,False,5.7,82.0
44826,False,"{'id': 34055, 'name': 'Pokémon Collection', 'p...",0.0,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",http://www.pokemon.com/us/movies/movie-pokemon...,12600,tt0287635,ja,劇場版ポケットモンスター セレビィ 時を越えた遭遇（であい）,"All your favorite Pokémon characters are back,...",6.080108,/bqL0PVHbQ8Jmw3Njcl38kW0CoeM.jpg,[],"[{'iso_3166_1': 'JP', 'name': 'Japan'}, {'iso_...",2001-07-06,28023563.0,75.0,"[{'iso_639_1': 'ja', 'name': '日本語'}]",Released,,Pokémon 4Ever: Celebi - Voice of the Forest,False,5.7,82.0
11342,False,,2500.0,"[{'id': 18, 'name': 'Drama'}, {'id': 35, 'name...",,13209,tt0499537,fa,Offside,"Since women are banned from soccer matches, Ir...",1.52896,/nfkOkpudNNIjRrf0mTFVoiGzHyc.jpg,"[{'name': 'Jafar Panahi Film Productions', 'id...","[{'iso_3166_1': 'IR', 'name': 'Iran'}]",2006-05-26,0.0,93.0,"[{'iso_639_1': 'fa', 'name': 'فارسی'}]",Released,,Offside,False,6.7,27.0
15765,False,,2500.0,"[{'id': 18, 'name': 'Drama'}, {'id': 35, 'name...",,13209,tt0499537,fa,Offside,"Since women are banned from soccer matches, Ir...",1.52988,/nfkOkpudNNIjRrf0mTFVoiGzHyc.jpg,"[{'name': 'Jafar Panahi Film Productions', 'id...","[{'iso_3166_1': 'IR', 'name': 'Iran'}]",2006-05-26,0.0,93.0,"[{'iso_639_1': 'fa', 'name': 'فارسی'}]",Released,,Offside,False,6.7,27.0


In [159]:
temp_inspect_1 = temp_dup_id[temp_dup_id['id']=='159849'].iloc[0]
temp_inspect_2 = temp_dup_id[temp_dup_id['id']=='159849'].iloc[1]
temp_inspect_1 == temp_inspect_2

adult                     True
belongs_to_collection     True
budget                    True
genres                    True
homepage                 False
id                        True
imdb_id                   True
original_language         True
original_title            True
overview                  True
popularity               False
poster_path               True
production_companies      True
production_countries      True
release_date              True
revenue                   True
runtime                   True
spoken_languages          True
status                    True
tagline                  False
title                     True
video                     True
vote_average              True
vote_count                True
dtype: bool

In [160]:
temp_inspect_1 = temp_dup_id[temp_dup_id['id']=='99080'].iloc[0]
temp_inspect_2 = temp_dup_id[temp_dup_id['id']=='99080'].iloc[1]
temp_inspect_1 == temp_inspect_2

adult                     True
belongs_to_collection    False
budget                    True
genres                    True
homepage                 False
id                        True
imdb_id                   True
original_language         True
original_title            True
overview                  True
popularity               False
poster_path               True
production_companies      True
production_countries      True
release_date              True
revenue                   True
runtime                   True
spoken_languages          True
status                    True
tagline                   True
title                     True
video                     True
vote_average              True
vote_count                True
dtype: bool

Still appears to be due to popularity, but also some other features. We aren't too concerned about belongs_to_collection.

Overall, to handle duplicate ID's, we will retain rows with highest popularity scores.

In [161]:
df_movies_metadata.popularity = df_movies_metadata.popularity.astype('float')
df_movies_metadata = df_movies_metadata.sort_values('popularity',ascending=False)
df_movies_metadata.drop_duplicates(subset='id', keep='first', inplace=True)

We also only need to retain certain columns

In [183]:
df_movies_metadata = df_movies_metadata[['id', 'adult', 'belongs_to_collection', 'budget', 'genres','original_title', 
       'popularity', 'production_companies',
       'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'title',
       'vote_average', 'vote_count']]

We can bring in cast and crew information

In [184]:
df_movies_metadata = df_movies_metadata.copy()
df_movies_metadata.id = df_movies_metadata.id.astype('int')

df_movies_metadata = pd.merge(df_movies_metadata,df_credits_temp,left_on='id',right_on='id',how='left')

In [187]:
df_movies_metadata.isnull().sum()

id                           0
adult                        0
belongs_to_collection    40559
budget                       0
genres                       0
original_title               0
popularity                   0
production_companies         0
release_date                78
revenue                      0
runtime                    251
spoken_languages             0
status                       0
title                        0
vote_average                 0
vote_count                   0
cast                         1
crew                         1
dtype: int64

In [226]:
df_movies_metadata = df_movies_metadata[~df_movies_metadata.cast.isnull()]

In [227]:
df_movies_metadata.cast

0        [{'cast_id': 22, 'character': 'Scarlet Overkil...
1        [{'cast_id': 0, 'character': 'Wonder Woman / D...
2        [{'cast_id': 174, 'character': 'Belle', 'credi...
3        [{'cast_id': 9, 'character': 'Baby', 'credit_i...
4        [{'cast_id': 23, 'character': 'Baymax (voice)'...
                               ...                        
45023    [{'cast_id': 0, 'character': 'Ecaterina Teodor...
45024                                                   []
45025    [{'cast_id': 1, 'character': 'Fred', 'credit_i...
45026                                                   []
45027                                                   []
Name: cast, Length: 45027, dtype: object

In [233]:
list_cast = []

for row in range(0,len(df_movies_metadata)):
    temp_list = [x[x.find("name")+8:x.find("order")-4] for x in df_movies_metadata.iloc[row]['cast'][1:-1].split("{") if 'name' in x]
    list_cast.extend(temp_list)
    
unique_list_cast = list(set(list_cast))  