In [79]:
import ast
import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [80]:
df_cleaned = pd.read_csv("./cleaned_data.csv")

In [81]:
df_cleaned.columns

Index(['id', 'adult', 'belongs_to_collection', 'genres', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'release_date', 'runtime', 'spoken_languages', 'title', 'vote_average',
       'vote_count', 'keywords'],
      dtype='object')

#### Helper functions

In [82]:
def convert_dict(column_name, key, value, df=None):
    """Collapse a column of stringified dicts down to one field per row.

    Every cell of ``df[column_name]`` is parsed as a Python dict literal.
    The column is then overwritten in place with ``record.get(key)`` for each
    row (``None`` when the parsed dict has no such key), and a lookup table
    mapping the ``key`` field to the ``value`` field is built from the rows
    where ``key`` is present.

    The conversion is not re-runnable on the same column: after the first run
    the cells are no longer strings and parsing raises ``ValueError``.

    Parameters
    ----------
    column_name : str
        Column holding the stringified dicts.
    key : str
        Field kept in the DataFrame and used as the lookup key.
    value : str
        Field stored as the lookup value.
    df : pandas.DataFrame, optional
        Frame to convert in place. Defaults to the notebook-level
        ``df_cleaned``.

    Returns
    -------
    dict
        ``{record[key]: record.get(value)}`` for every row that has ``key``.

    Raises
    ------
    ValueError, SyntaxError
        If a cell is not a valid Python literal.
    """
    if df is None:
        df = df_cleaned

    lookup = {}
    extracted = []
    for cell in df[column_name]:
        # literal_eval only accepts literals, so text coming from the CSV can
        # never execute code the way eval() would. Each cell is parsed once.
        record = ast.literal_eval(cell)
        field = record.get(key)
        extracted.append(field)
        # Rows without `key` must not add a None entry to the lookup table.
        if field is not None:
            lookup[field] = record.get(value)
    # Assign only after every cell parsed, so a bad cell leaves df untouched.
    df[column_name] = extracted
    return lookup

In [83]:
def convert_list_of_dict(column_name, key, value, df=None):
    """Reduce a column of stringified lists of dicts to lists of one field.

    Every cell of ``df[column_name]`` is parsed as a Python list-of-dicts
    literal. The column is overwritten in place with, per row, the list of
    ``record.get(key)`` values, and a lookup table mapping the ``key`` field
    to the ``value`` field is collected across all rows.

    The conversion is not re-runnable on the same column: after the first run
    the cells are lists rather than strings and parsing raises ``ValueError``.

    Parameters
    ----------
    column_name : str
        Column holding the stringified lists of dicts.
    key : str
        Field kept in the per-row lists and used as the lookup key.
    value : str
        Field stored as the lookup value.
    df : pandas.DataFrame, optional
        Frame to convert in place. Defaults to the notebook-level
        ``df_cleaned``.

    Returns
    -------
    dict
        ``{record.get(key): record.get(value)}`` over every parsed record.

    Raises
    ------
    ValueError, SyntaxError
        If a cell is not a valid Python literal.
    """
    if df is None:
        df = df_cleaned

    lookup = {}
    keys_per_row = []
    for cell in df[column_name]:
        # literal_eval only accepts literals, so text coming from the CSV can
        # never execute code the way eval() would.
        records = ast.literal_eval(cell)
        keys_per_row.append([record.get(key) for record in records])
        lookup.update((record.get(key), record.get(value)) for record in records)
    # Assign only after every cell parsed, so a bad cell leaves df untouched.
    df[column_name] = keys_per_row
    return lookup

In [84]:
def store_dict_to_df(d, index_name, value_name, path):
    """Write a dictionary to ``path`` as a two-column CSV file.

    Parameters
    ----------
    d : dict
        Mapping to export; each item becomes one row.
    index_name : str
        Header of the column holding the dictionary keys.
    value_name : str
        Header of the column holding the dictionary values.
    path : str or path-like
        Destination CSV file. Missing parent directories are created so the
        call also works on a fresh checkout.

    Returns
    -------
    None
    """
    target = str(path)
    parent = os.path.dirname(target)
    # A bare file name has no parent; os.makedirs("") would raise.
    if parent:
        os.makedirs(parent, exist_ok=True)
    table = pd.DataFrame(list(d.items()), columns=[index_name, value_name])
    # index=False keeps the positional row index out of the file.
    table.to_csv(target, index=False)

### adult, popularity, release_date, runtime, title, vote_average and vote_count need no conversion here

### Convert belongs_to_collection to just the ids and store the id-to-name dictionary separately
Movies without a collection end up as NaN again (setting them to zero instead might be wrong!)

In [85]:
# Overwrites df_cleaned['belongs_to_collection'] in place with each row's 'id'
# field and returns the id -> name dictionary. Not safe to re-run: after the
# first run the column no longer holds the stringified dicts convert_dict parses.
dict_ids_to_collection = convert_dict('belongs_to_collection','id','name')

In [111]:
store_dict_to_df(dict_ids_to_collection,"collection_id","collection_name","./dataframes/ids_to_collection.csv")

In [112]:
df = pd.read_csv("./dataframes/ids_to_collection.csv")
df

Unnamed: 0,collection_id,collection_name
0,10194,Toy Story Collection
1,119050,Grumpy Old Men Collection
2,96871,Father of the Bride Collection
3,645,James Bond Collection
4,117693,Balto Collection
...,...,...
1686,104774,Tomtar och Trolltyg Collection
1687,400500,Чебурашка и крокодил Гена
1688,148603,Ducobu Collection
1689,152918,Mister Blot Collection


In [88]:
# Persist the collection id -> name mapping alongside the other lookup pickles.
# The path must be the same "./dict/ids_to_collection.pkl" that the next cell
# reads back; writing elsewhere would make that check load a stale or missing file.
with open("./dict/ids_to_collection.pkl","wb") as f:
    pickle.dump(dict_ids_to_collection,f)

In [89]:
with open("./dict/ids_to_collection.pkl","rb") as f:
    d = pickle.load(f)
d[df_cleaned.belongs_to_collection.iloc[0]]

'Toy Story Collection'

### Convert genres to just a list of ids and store the dictionary with name somewhere else


In [92]:
dict_ids_to_genres = convert_list_of_dict('genres','id','name')

In [93]:
with open("./dict/ids_to_genres.pkl","wb") as f:
    pickle.dump(dict_ids_to_genres,f)

In [94]:
with open("./dict/ids_to_genres.pkl","rb") as f:
    d = pickle.load(f)
[d.get(i) for i in df_cleaned.genres.iloc[0]]

['Animation', 'Comedy', 'Family']

In [95]:
store_dict_to_df(dict_ids_to_genres,'genre_id','genre','./dataframes/ids_to_genres.csv')

In [113]:
df = pd.read_csv("./dataframes/ids_to_genres.csv")
df

Unnamed: 0,genre_id,genre
0,16,Animation
1,35,Comedy
2,10751,Family
3,12,Adventure
4,14,Fantasy
5,10749,Romance
6,18,Drama
7,28,Action
8,80,Crime
9,53,Thriller


### Remove the original_title and original_language columns - not that useful, since title and spoken_languages already exist

In [105]:
# Columns to drop from df_cleaned. The drop itself happens further down,
# after the remaining columns have been converted.
remove_col = ['original_language','original_title']

### TODO: decide what to do with the overview column

### production companies - empty list if no information

In [106]:
df_cleaned.production_companies.iloc[1]

"[{'name': 'TriStar Pictures', 'id': 559}, {'name': 'Teitler Film', 'id': 2550}, {'name': 'Interscope Communications', 'id': 10201}]"

In [107]:
dict_ids_to_prod = convert_list_of_dict('production_companies','id','name')

In [109]:
with open("./dict/ids_to_prod.pkl","wb") as f:
    pickle.dump(dict_ids_to_prod,f)

In [110]:
with open("./dict/ids_to_prod.pkl","rb") as f:
    d = pickle.load(f)
[d.get(i) for i in df_cleaned.production_companies.iloc[1]]

['TriStar Pictures', 'Teitler Film', 'Interscope Communications']

In [116]:
store_dict_to_df(dict_ids_to_prod,'production_company_id','production_company','./dataframes/ids_to_prod_companies.csv')

In [117]:
df = pd.read_csv("./dataframes/ids_to_prod_companies.csv")
df

Unnamed: 0,production_company_id,production_company
0,3,Pixar Animation Studios
1,559,TriStar Pictures
2,2550,Teitler Film
3,10201,Interscope Communications
4,6194,Warner Bros.
...,...,...
23570,85400,He and She Films
23571,27570,Neptune Salad Entertainment
23572,27571,Pirie Productions
23573,19653,Sine Olivia


### Spoken Languages

In [122]:
df_cleaned.spoken_languages.iloc[341]

"[{'iso_639_1': 'fr', 'name': 'Français'}, {'iso_639_1': 'en', 'name': 'English'}]"

In [123]:
dict_iso_to_language = convert_list_of_dict("spoken_languages",'iso_639_1','name')

In [125]:
with open("./dict/iso_to_lang.pkl","wb") as f:
    pickle.dump(dict_iso_to_language,f)

In [126]:
# Give every language code in dict_iso_to_language a 1-based integer id
# (in the dictionary's key order) and keep the mapping in both directions.
dict_id_to_iso = dict(enumerate(dict_iso_to_language, start=1))
dict_iso_to_id = {iso_code: lang_id for lang_id, iso_code in dict_id_to_iso.items()}

In [128]:
with open("./dict/id_to_iso.pkl","wb") as f:
    pickle.dump(dict_id_to_iso,f)

In [129]:
# Swap each language code in spoken_languages for its integer id.
# Codes that are missing from dict_iso_to_id come out as None because of .get().
df_cleaned.spoken_languages = [
    [dict_iso_to_id.get(iso_code) for iso_code in codes]
    for codes in df_cleaned.spoken_languages
]

In [131]:
with open("./dict/iso_to_lang.pkl","rb") as f:
    d = pickle.load(f)
with open("./dict/id_to_iso.pkl","rb") as f:
    d_ = pickle.load(f)
[d.get(d_.get(i)) for i in df_cleaned.spoken_languages.iloc[341]]

['Français', 'English']

In [132]:
store_dict_to_df(dict_id_to_iso,'lang_id','iso','./dataframes/ids_to_iso.csv')

In [133]:
df = pd.read_csv("./dataframes/ids_to_iso.csv")
df

Unnamed: 0,lang_id,iso
0,1,en
1,2,fr
2,3,es
3,4,de
4,5,ru
...,...,...
128,129,sn
129,130,bi
130,131,ha
131,132,ig


### Keywords

In [134]:
df_cleaned.keywords.iloc[0]

"[{'id': 931, 'name': 'jealousy'}, {'id': 4290, 'name': 'toy'}, {'id': 5202, 'name': 'boy'}, {'id': 6054, 'name': 'friendship'}, {'id': 9713, 'name': 'friends'}, {'id': 9823, 'name': 'rivalry'}, {'id': 165503, 'name': 'boy next door'}, {'id': 170722, 'name': 'new toy'}, {'id': 187065, 'name': 'toy comes to life'}]"

In [135]:
dict_ids_to_keywords = convert_list_of_dict('keywords','id','name')

In [137]:
with open("./dict/ids_to_keywords.pkl","wb") as f:
    pickle.dump(dict_ids_to_keywords,f)

In [139]:
with open("./dict/ids_to_keywords.pkl","rb") as f:
    d = pickle.load(f)
[d.get(i) for i in df_cleaned.keywords.iloc[0]]

['jealousy',
 'toy',
 'boy',
 'friendship',
 'friends',
 'rivalry',
 'boy next door',
 'new toy',
 'toy comes to life']

In [140]:
store_dict_to_df(dict_ids_to_keywords,'keyword_id','keyword','./dataframes/ids_to_keywords.csv')

In [141]:
df = pd.read_csv("./dataframes/ids_to_keywords.csv")
df

Unnamed: 0,keyword_id,keyword
0,931,jealousy
1,4290,toy
2,5202,boy
3,6054,friendship
4,9713,friends
...,...,...
19913,234625,camping equipment
19914,233372,emt
19915,218070,radio controlled
19916,220150,fictional documentary


In [142]:
# Keep every column except those listed in remove_col; names in remove_col
# that are not present in the frame are ignored.
df_cleaned = df_cleaned.drop(columns=remove_col, errors="ignore")

In [143]:
# Store the adult flag as an integer.
# assign() returns a new frame with a freshly built column, whereas
# `.loc[:, 'adult'] = ...` writes into the existing column and tries to keep
# its previous dtype, so the int conversion is not guaranteed to stick.
df_cleaned = df_cleaned.assign(adult=df_cleaned['adult'].astype(int))

In [144]:
df_cleaned.head()

Unnamed: 0,id,adult,belongs_to_collection,genres,overview,popularity,production_companies,release_date,runtime,spoken_languages,title,vote_average,vote_count,keywords
0,862,0,10194.0,"[16, 35, 10751]","Led by Woody, Andy's toys live happily in his ...",21.946943,[3],1995,81.0,[1],Toy Story,7.7,5415.0,"[931, 4290, 5202, 6054, 9713, 9823, 165503, 17..."
1,8844,0,,"[12, 14, 10751]",When siblings Judy and Peter discover an encha...,17.015539,"[559, 2550, 10201]",1995,104.0,"[1, 2]",Jumanji,6.9,2413.0,"[10090, 10941, 15101, 33467, 158086, 158091]"
2,15602,0,119050.0,"[10749, 35]",A family wedding reignites the ancient feud be...,11.7129,"[6194, 19464]",1995,101.0,[1],Grumpier Old Men,6.5,92.0,"[1495, 12392, 179431, 208510]"
3,31357,0,,"[35, 18, 10749]","Cheated on, mistreated and stepped on, the wom...",3.859495,[306],1995,127.0,[1],Waiting to Exhale,6.1,34.0,"[818, 10131, 14768, 15160, 33455]"
4,11862,0,96871.0,[35],Just when George Banks has recovered from his ...,8.387519,"[5842, 9195]",1995,106.0,[1],Father of the Bride Part II,5.7,173.0,"[1009, 1599, 2246, 4995, 5600, 10707, 13149, 3..."


In [151]:
df_cleaned.to_csv("./final_cleaned.csv",index = False)

In [194]:
df = pd.read_csv("./final_cleaned.csv")

#### Add the credits file (forgot it earlier)

In [195]:
# Load the credits table and attach its columns to the cleaned movies by `id`.
# merge() defaults to an inner join, so only ids found in both tables remain.
df_c = pd.read_csv("./credits.csv")
df_c_new = df.merge(df_c, on='id')

In [196]:
# Peek at the cast names of the first movie. The cell is a stringified list of
# dicts; literal_eval parses it without ever executing text from the CSV.
[member.get('name') for member in ast.literal_eval(df_c_new.cast.iloc[0])]

['Tom Hanks',
 'Tim Allen',
 'Don Rickles',
 'Jim Varney',
 'Wallace Shawn',
 'John Ratzenberger',
 'Annie Potts',
 'John Morris',
 'Erik von Detten',
 'Laurie Metcalf',
 'R. Lee Ermey',
 'Sarah Freeman',
 'Penn Jillette']

In [197]:
# Replace each movie's cast with the list of person ids, sorted by each
# entry's 'order' field, and collect an id -> name lookup for every cast
# member seen. literal_eval parses the stringified lists without executing
# text from the CSV the way eval() would.
dict_ids_to_cast = dict()
cast_ids_per_movie = []
for raw_cast in df_c_new.cast:
    members = sorted(ast.literal_eval(raw_cast), key=lambda member: member['order'])
    cast_ids_per_movie.append([member.get('id') for member in members])
    dict_ids_to_cast.update((member.get('id'), member.get('name')) for member in members)
# Assign once at the end so a malformed cell leaves the column untouched.
df_c_new.cast = cast_ids_per_movie

In [198]:
with open("./dict/ids_to_cast.pkl","wb") as f:
    pickle.dump(dict_ids_to_cast,f)

In [199]:
with open("./dict/ids_to_cast.pkl","rb") as f:
    d = pickle.load(f)
[d.get(i) for i in df_c_new.cast.iloc[0]]

['Tom Hanks',
 'Tim Allen',
 'Don Rickles',
 'Jim Varney',
 'Wallace Shawn',
 'John Ratzenberger',
 'Annie Potts',
 'John Morris',
 'Erik von Detten',
 'Laurie Metcalf',
 'R. Lee Ermey',
 'Sarah Freeman',
 'Penn Jillette']

In [178]:
df_c_new.head()

Unnamed: 0,id,adult,belongs_to_collection,genres,overview,popularity,production_companies,release_date,runtime,spoken_languages,title,vote_average,vote_count,keywords,cast,crew
0,862,0,10194.0,"[16, 35, 10751]","Led by Woody, Andy's toys live happily in his ...",21.946943,[3],1995,81.0,[1],Toy Story,7.7,5415.0,"[931, 4290, 5202, 6054, 9713, 9823, 165503, 17...","[31, 12898, 7167, 12899, 12900, 7907, 8873, 11...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de..."
1,8844,0,,"[12, 14, 10751]",When siblings Judy and Peter discover an encha...,17.015539,"[559, 2550, 10201]",1995,104.0,"[1, 2]",Jumanji,6.9,2413.0,"[10090, 10941, 15101, 33467, 158086, 158091]","[2157, 8537, 205, 145151, 5149, 10739, 58563, ...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de..."
2,15602,0,119050.0,"[10749, 35]",A family wedding reignites the ancient feud be...,11.7129,"[6194, 19464]",1995,101.0,[1],Grumpier Old Men,6.5,92.0,"[1495, 12392, 179431, 208510]","[6837, 3151, 13567, 16757, 589, 16523, 7166]","[{'credit_id': '52fe466a9251416c75077a89', 'de..."
3,31357,0,,"[35, 18, 10749]","Cheated on, mistreated and stepped on, the wom...",3.859495,[306],1995,127.0,[1],Waiting to Exhale,6.1,34.0,"[818, 10131, 14768, 15160, 33455]","[8851, 9780, 18284, 51359, 66804, 352, 87118, ...","[{'credit_id': '52fe44779251416c91011acb', 'de..."
4,11862,0,96871.0,[35],Just when George Banks has recovered from his ...,8.387519,"[5842, 9195]",1995,106.0,[1],Father of the Bride Part II,5.7,173.0,"[1009, 1599, 2246, 4995, 5600, 10707, 13149, 3...","[67773, 3092, 519, 70696, 59222, 18793, 14592,...","[{'credit_id': '52fe44959251416c75039ed7', 'de..."


In [200]:
# Peek at the first five crew names of the first movie; literal_eval parses
# the stringified list without executing text from the CSV.
[member.get('name') for member in ast.literal_eval(df_c_new.crew.iloc[0])][:5]

['John Lasseter',
 'Joss Whedon',
 'Andrew Stanton',
 'Joel Cohen',
 'Alec Sokolow']

In [201]:
store_dict_to_df(dict_ids_to_cast,'cast_id','name','./dataframes/ids_to_cast.csv')

In [202]:
df = pd.read_csv("./dataframes/ids_to_cast.csv")
df

Unnamed: 0,cast_id,name
0,31,Tom Hanks
1,12898,Tim Allen
2,7167,Don Rickles
3,12899,Jim Varney
4,12900,Wallace Shawn
...,...,...
205081,169498,Jeremy Lelliott
205082,1177500,Jason Widener
205083,93930,Kiko Ellsworth
205084,1686379,Darrell Dubovsky


In [203]:
# Reduce each movie's crew to the ids of the entries whose job is 'Director'
# and collect an id -> name lookup for those directors. literal_eval parses
# the stringified lists without executing text from the CSV the way eval() would.
dict_ids_to_crew = dict()
director_ids_per_movie = []
for raw_crew in df_c_new['crew']:
    directors = [
        member for member in ast.literal_eval(raw_crew)
        if member.get('job') == 'Director'
    ]
    director_ids_per_movie.append([member.get('id') for member in directors])
    dict_ids_to_crew.update((member.get('id'), member.get('name')) for member in directors)
# Assign once at the end so a malformed cell leaves the column untouched.
df_c_new['crew'] = director_ids_per_movie

In [204]:
with open("./dict/ids_to_crew.pkl","wb") as f:
    pickle.dump(dict_ids_to_crew,f)

In [205]:
with open("./dict/ids_to_crew.pkl","rb") as f:
    d = pickle.load(f)
[d.get(i) for i in df_c_new.crew.iloc[0]][:5]

['John Lasseter']

In [206]:
df_c_new.head()

Unnamed: 0,id,adult,belongs_to_collection,genres,overview,popularity,production_companies,release_date,runtime,spoken_languages,title,vote_average,vote_count,keywords,cast,crew
0,862,0,10194.0,"[16, 35, 10751]","Led by Woody, Andy's toys live happily in his ...",21.946943,[3],1995,81.0,[1],Toy Story,7.7,5415.0,"[931, 4290, 5202, 6054, 9713, 9823, 165503, 17...","[31, 12898, 7167, 12899, 12900, 7907, 8873, 11...",[7879]
1,8844,0,,"[12, 14, 10751]",When siblings Judy and Peter discover an encha...,17.015539,"[559, 2550, 10201]",1995,104.0,"[1, 2]",Jumanji,6.9,2413.0,"[10090, 10941, 15101, 33467, 158086, 158091]","[2157, 8537, 205, 145151, 5149, 10739, 58563, ...",[4945]
2,15602,0,119050.0,"[10749, 35]",A family wedding reignites the ancient feud be...,11.7129,"[6194, 19464]",1995,101.0,[1],Grumpier Old Men,6.5,92.0,"[1495, 12392, 179431, 208510]","[6837, 3151, 13567, 16757, 589, 16523, 7166]",[26502]
3,31357,0,,"[35, 18, 10749]","Cheated on, mistreated and stepped on, the wom...",3.859495,[306],1995,127.0,[1],Waiting to Exhale,6.1,34.0,"[818, 10131, 14768, 15160, 33455]","[8851, 9780, 18284, 51359, 66804, 352, 87118, ...",[2178]
4,11862,0,96871.0,[35],Just when George Banks has recovered from his ...,8.387519,"[5842, 9195]",1995,106.0,[1],Father of the Bride Part II,5.7,173.0,"[1009, 1599, 2246, 4995, 5600, 10707, 13149, 3...","[67773, 3092, 519, 70696, 59222, 18793, 14592,...",[56106]


In [207]:
store_dict_to_df(dict_ids_to_crew,'crew_id','director_name','./dataframes/ids_to_director_name.csv')

In [208]:
df = pd.read_csv("./dataframes/ids_to_director_name.csv")
df        

Unnamed: 0,crew_id,director_name
0,7879,John Lasseter
1,4945,Joe Johnston
2,26502,Howard Deutch
3,2178,Forest Whitaker
4,56106,Charles Shyer
...,...,...
19728,1644440,Ravi Udyawar
19729,1736944,Shanra J. Kehl
19730,42634,Aaron Osborne
19731,1182809,Hamid Nematollah


In [209]:
df_c_new.to_csv("./final_cleaned_2.csv",index = False)

In [210]:
df = pd.read_csv("./final_cleaned_2.csv")
df

Unnamed: 0,id,adult,belongs_to_collection,genres,overview,popularity,production_companies,release_date,runtime,spoken_languages,title,vote_average,vote_count,keywords,cast,crew
0,862,0,10194.0,"[16, 35, 10751]","Led by Woody, Andy's toys live happily in his ...",21.946943,[3],1995,81.0,[1],Toy Story,7.7,5415.0,"[931, 4290, 5202, 6054, 9713, 9823, 165503, 17...","[31, 12898, 7167, 12899, 12900, 7907, 8873, 11...",[7879]
1,8844,0,,"[12, 14, 10751]",When siblings Judy and Peter discover an encha...,17.015539,"[559, 2550, 10201]",1995,104.0,"[1, 2]",Jumanji,6.9,2413.0,"[10090, 10941, 15101, 33467, 158086, 158091]","[2157, 8537, 205, 145151, 5149, 10739, 58563, ...",[4945]
2,15602,0,119050.0,"[10749, 35]",A family wedding reignites the ancient feud be...,11.712900,"[6194, 19464]",1995,101.0,[1],Grumpier Old Men,6.5,92.0,"[1495, 12392, 179431, 208510]","[6837, 3151, 13567, 16757, 589, 16523, 7166]",[26502]
3,31357,0,,"[35, 18, 10749]","Cheated on, mistreated and stepped on, the wom...",3.859495,[306],1995,127.0,[1],Waiting to Exhale,6.1,34.0,"[818, 10131, 14768, 15160, 33455]","[8851, 9780, 18284, 51359, 66804, 352, 87118, ...",[2178]
4,11862,0,96871.0,[35],Just when George Banks has recovered from his ...,8.387519,"[5842, 9195]",1995,106.0,[1],Father of the Bride Part II,5.7,173.0,"[1009, 1599, 2246, 4995, 5600, 10707, 13149, 3...","[67773, 3092, 519, 70696, 59222, 18793, 14592,...",[56106]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46159,439050,0,,"[18, 10751]",Rising and falling between a man and woman.,0.072051,[],0,90.0,[14],Subdue,4.0,1.0,[10703],"[240240, 1749839, 1619957]",[1182809]
46160,111109,0,,[18],An artist struggles to finish his work while a...,0.178241,[19653],2011,360.0,[32],Century of Birthing,9.0,3.0,"[2679, 14531, 215397]","[1043186, 111636, 1204271, 278923, 1042953, 57...",[1051381]
46161,67758,0,,"[28, 18, 53]","When one of her hits goes wrong, a professiona...",0.903007,[6165],2003,90.0,[1],Betrayal,3.8,6.0,[],"[23764, 2059, 46277, 1736, 58646, 54649, 55270...",[67753]
46162,227506,0,,[],"In a small town live two brothers, one a minis...",0.003503,[88753],1917,87.0,[],Satan Triumphant,0.0,0.0,[],"[544742, 1090923, 1136422, 1261758, 29199]",[1085341]
