In [1]:
import ast
import pickle

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load cleaned_data.csv; the helper functions below read and overwrite columns of this frame.
df_cleaned = pd.read_csv("./new_movie_data/cleaned_data.csv")

In [3]:
# Show which columns are available before transforming any of them.
df_cleaned.columns

Index(['id', 'adult', 'belongs_to_collection', 'genres', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'release_date', 'runtime', 'spoken_languages', 'title', 'vote_average',
       'vote_count', 'keywords'],
      dtype='object')

#### Helper functions for the stringified dict / list-of-dict columns

In [4]:
def convert_dict(column_name,key,value):
    """Reduce a column of stringified dicts to a single field and return a lookup.

    Every cell of ``df_cleaned[column_name]`` must be the text of a Python dict
    literal (``"{}"`` for "no data"). The column of the module-level
    ``df_cleaned`` is overwritten in place with each dict's ``key`` entry, or
    ``None`` when the dict has no such entry.

    Parameters
    ----------
    column_name : str
        Column of ``df_cleaned`` to convert.
    key : hashable
        Dict field that is kept in the DataFrame (e.g. ``'id'``).
    value : hashable
        Dict field stored as the lookup value (e.g. ``'name'``).

    Returns
    -------
    dict
        ``{entry[key]: entry.get(value)}`` for every row whose ``key`` entry is
        not ``None``; later rows overwrite earlier ones for the same key.

    Raises
    ------
    ValueError, SyntaxError
        If a cell is not a valid Python literal.
    """
    # literal_eval only accepts literals, so text coming from the CSV can never
    # be executed as code the way eval() would allow. Parse each cell once.
    parsed = [ast.literal_eval(cell) for cell in df_cleaned[column_name]]

    # Filter on `key` (not a hard-coded 'id') so the helper works for any field.
    mapping = {
        entry.get(key): entry.get(value)
        for entry in parsed
        if entry.get(key) is not None
    }

    # store just the key field; rows without it become None/NaN
    df_cleaned[column_name] = [entry.get(key) for entry in parsed]
    return mapping

In [5]:
def convert_list_of_dict(column_name,key,value):
    """Reduce a column of stringified lists of dicts to lists of one field.

    Every cell of ``df_cleaned[column_name]`` must be the text of a Python list
    literal whose items are dicts (``"[]"`` for "no data"). The column of the
    module-level ``df_cleaned`` is overwritten in place with, per row, the list
    of each dict's ``key`` entry.

    Parameters
    ----------
    column_name : str
        Column of ``df_cleaned`` to convert.
    key : hashable
        Dict field that is kept in the DataFrame (e.g. ``'id'``).
    value : hashable
        Dict field stored as the lookup value (e.g. ``'name'``).

    Returns
    -------
    dict
        ``{item.get(key): item.get(value)}`` over all items of all rows; later
        occurrences overwrite earlier ones for the same key.

    Raises
    ------
    ValueError, SyntaxError
        If a cell is not a valid Python literal.
    """
    mapping = dict()
    keys_per_row = []
    for cell in df_cleaned[column_name]:
        # literal_eval only accepts literals, so text coming from the CSV can
        # never be executed as code the way eval() would allow.
        entries = ast.literal_eval(cell)
        keys_per_row.append([entry.get(key) for entry in entries])
        mapping.update((entry.get(key), entry.get(value)) for entry in entries)
    # store just the list of keys in the df; the dictionary keeps all the mappings
    df_cleaned[column_name] = keys_per_row
    return mapping

### adult, popularity, release_date, runtime, title, vote_average, vote_count: nothing has to be changed

### Convert belongs_to_collection to just the ids and store the id-to-name dictionary somewhere else
Movies without a collection end up as NaN here (setting them to zero might be wrong!!)

In [6]:
# Overwrites df_cleaned.belongs_to_collection with the collection id and keeps the id -> name lookup.
dict_ids_to_collection = convert_dict('belongs_to_collection','id','name')

In [7]:
# Persist the collection id -> name lookup so the ids can be decoded later.
with open("./new_movie_data/dict/ids_to_collection.pkl","wb") as f:
    pickle.dump(dict_ids_to_collection,f)

In [8]:
# Sanity check: reload the pickle and decode the first movie's collection id.
# NOTE: pickle.load can execute code; only use it on files written by this notebook.
with open("./new_movie_data/dict/ids_to_collection.pkl","rb") as f:
    d = pickle.load(f)
# Plain indexing raises KeyError if the first row has no collection id.
d[df_cleaned.belongs_to_collection.iloc[0]]

'Toy Story Collection'

### Convert genres to just a list of ids and store the dictionary with name somewhere else


In [9]:
# Overwrites df_cleaned.genres with lists of genre ids and keeps the id -> name lookup.
dict_ids_to_genres = convert_list_of_dict('genres','id','name')

In [10]:
# Persist the genre id -> name lookup.
with open("./new_movie_data/dict/ids_to_genres.pkl","wb") as f:
    pickle.dump(dict_ids_to_genres,f)

In [11]:
# Sanity check: reload the pickle and decode the first movie's genre ids back to names.
with open("./new_movie_data/dict/ids_to_genres.pkl","rb") as f:
    d = pickle.load(f)
[d.get(i) for i in df_cleaned.genres.iloc[0]]

['Animation', 'Comedy', 'Family']

### Remove the original_title and original_language columns - not that useful, since title and spoken_languages already exist

In [12]:
# Columns to drop; the actual removal happens in a later cell, after the other conversions.
remove_col = ['original_language','original_title']

### TODO: decide what to do with the overview column

### production companies - empty list if no information

In [13]:
# Peek at one raw value to see the format before converting the column.
df_cleaned.production_companies.iloc[1]

"[{'name': 'TriStar Pictures', 'id': 559}, {'name': 'Teitler Film', 'id': 2550}, {'name': 'Interscope Communications', 'id': 10201}]"

In [14]:
# Overwrites df_cleaned.production_companies with lists of company ids and keeps the id -> name lookup.
dict_ids_to_prod = convert_list_of_dict('production_companies','id','name')

In [15]:
# Persist the production company id -> name lookup.
with open("./new_movie_data/dict/ids_to_prod.pkl","wb") as f:
    pickle.dump(dict_ids_to_prod,f)

In [16]:
# Sanity check: reload the pickle and decode the company ids of the second movie.
with open("./new_movie_data/dict/ids_to_prod.pkl","rb") as f:
    d = pickle.load(f)
[d.get(i) for i in df_cleaned.production_companies.iloc[1]]

['TriStar Pictures', 'Teitler Film', 'Interscope Communications']

### Spoken Languages

In [17]:
# Peek at one raw value; entries carry an 'iso_639_1' code and a 'name', but no numeric id.
df_cleaned.spoken_languages.iloc[341]

"[{'iso_639_1': 'fr', 'name': 'Français'}, {'iso_639_1': 'en', 'name': 'English'}]"

In [18]:
# Overwrites df_cleaned.spoken_languages with lists of ISO codes and keeps the code -> language name lookup.
dict_iso_to_language = convert_list_of_dict("spoken_languages",'iso_639_1','name')

In [19]:
# Persist the ISO code -> language name lookup.
with open("./new_movie_data/dict/iso_to_lang.pkl","wb") as f:
    pickle.dump(dict_iso_to_language,f)

In [20]:
# Number the ISO codes 1, 2, 3, ... in the iteration order of dict_iso_to_language,
# and keep the mapping in both directions.
dict_iso_to_id = {
    iso_code: number
    for number, iso_code in enumerate(dict_iso_to_language, start=1)
}
dict_id_to_iso = {number: iso_code for iso_code, number in dict_iso_to_id.items()}

In [21]:
# Persist the integer id -> ISO code lookup (the reverse direction is not saved here).
with open("./new_movie_data/dict/id_to_iso.pkl","wb") as f:
    pickle.dump(dict_id_to_iso,f)

In [22]:
# Swap every ISO code in spoken_languages for its integer id
# (None for a code that is missing from dict_iso_to_id).
df_cleaned['spoken_languages'] = [
    [dict_iso_to_id.get(iso_code) for iso_code in iso_codes]
    for iso_codes in df_cleaned['spoken_languages']
]

In [23]:
# Sanity check: decode integer id -> ISO code -> language name for one movie.
with open("./new_movie_data/dict/iso_to_lang.pkl","rb") as f:
    d = pickle.load(f)
with open("./new_movie_data/dict/id_to_iso.pkl","rb") as f:
    d_ = pickle.load(f)
[d.get(d_.get(i)) for i in df_cleaned.spoken_languages.iloc[341]]

['Français', 'English']

### Keywords

In [24]:
# Peek at one raw value to see the format before converting the column.
df_cleaned.keywords.iloc[0]

"[{'id': 931, 'name': 'jealousy'}, {'id': 4290, 'name': 'toy'}, {'id': 5202, 'name': 'boy'}, {'id': 6054, 'name': 'friendship'}, {'id': 9713, 'name': 'friends'}, {'id': 9823, 'name': 'rivalry'}, {'id': 165503, 'name': 'boy next door'}, {'id': 170722, 'name': 'new toy'}, {'id': 187065, 'name': 'toy comes to life'}]"

In [25]:
# Overwrites df_cleaned.keywords with lists of keyword ids and keeps the id -> name lookup.
dict_ids_to_keywords = convert_list_of_dict('keywords','id','name')

In [26]:
# Persist the keyword id -> name lookup.
with open("./new_movie_data/dict/ids_to_keywords.pkl","wb") as f:
    pickle.dump(dict_ids_to_keywords,f)

In [27]:
# Sanity check: reload the pickle and decode the first movie's keyword ids back to names.
with open("./new_movie_data/dict/ids_to_keywords.pkl","rb") as f:
    d = pickle.load(f)
[d.get(i) for i in df_cleaned.keywords.iloc[0]]

['jealousy',
 'toy',
 'boy',
 'friendship',
 'friends',
 'rivalry',
 'boy next door',
 'new toy',
 'toy comes to life']

In [28]:
# Drop the columns listed in remove_col; labels that are already gone are skipped silently.
df_cleaned = df_cleaned.drop(columns=remove_col, errors='ignore')

In [29]:
# Work on an explicit copy: df_cleaned was produced by selecting a subset of columns.
df_cleaned = df_cleaned.copy()
# Replace the column outright so it really takes the integer dtype.
# `.loc[:, 'adult'] = ...` first tries to write into the existing array, which can
# keep the old dtype or trigger pandas' incompatible-dtype FutureWarning.
df_cleaned['adult'] = df_cleaned['adult'].astype(int)

In [30]:
# Inspect the first rows after all conversions.
df_cleaned.head()

Unnamed: 0,id,adult,belongs_to_collection,genres,overview,popularity,production_companies,release_date,runtime,spoken_languages,title,vote_average,vote_count,keywords
0,862,0,10194.0,"[16, 35, 10751]","Led by Woody, Andy's toys live happily in his ...",21.946943,[3],1995,81.0,[1],Toy Story,7.7,5415.0,"[931, 4290, 5202, 6054, 9713, 9823, 165503, 17..."
1,8844,0,,"[12, 14, 10751]",When siblings Judy and Peter discover an encha...,17.015539,"[559, 2550, 10201]",1995,104.0,"[1, 2]",Jumanji,6.9,2413.0,"[10090, 10941, 15101, 33467, 158086, 158091]"
2,15602,0,119050.0,"[10749, 35]",A family wedding reignites the ancient feud be...,11.7129,"[6194, 19464]",1995,101.0,[1],Grumpier Old Men,6.5,92.0,"[1495, 12392, 179431, 208510]"
3,31357,0,,"[35, 18, 10749]","Cheated on, mistreated and stepped on, the wom...",3.859495,[306],1995,127.0,[1],Waiting to Exhale,6.1,34.0,"[818, 10131, 14768, 15160, 33455]"
4,11862,0,96871.0,[35],Just when George Banks has recovered from his ...,8.387519,"[5842, 9195]",1995,106.0,[1],Father of the Bride Part II,5.7,173.0,"[1009, 1599, 2246, 4995, 5600, 10707, 13149, 3..."


In [31]:
# Save the converted frame without the index column.
# NOTE(review): list-valued columns are written as text, so they presumably load back as strings - confirm before reuse.
df_cleaned.to_csv("./new_movie_data/final_cleaned.csv",index = False)

In [91]:
# Reload the file written above as the starting point for adding the credits.
df = pd.read_csv("./new_movie_data/final_cleaned.csv")

#### Forgot the credits file :(

In [92]:
# Attach cast and crew: inner join of the cleaned movies with credits.csv on the movie id.
# NOTE(review): an id that occurs more than once on either side yields duplicated rows - confirm ids are unique.
df_c = pd.read_csv("./new_movie_data/credits.csv")
df_c_new = df.merge(df_c, how='inner', on='id')

In [93]:
# Preview the cast member names of the first merged movie.
# literal_eval parses the stored list literal without executing text from the CSV as code.
[member.get('name') for member in ast.literal_eval(df_c_new.cast.iloc[0])]

['Tom Hanks',
 'Tim Allen',
 'Don Rickles',
 'Jim Varney',
 'Wallace Shawn',
 'John Ratzenberger',
 'Annie Potts',
 'John Morris',
 'Erik von Detten',
 'Laurie Metcalf',
 'R. Lee Ermey',
 'Sarah Freeman',
 'Penn Jillette']

In [94]:
# Replace each movie's cast with the list of person ids, ordered by the 'order'
# field, and collect a person id -> name lookup along the way.
dict_ids_to_cast = dict()
cast_ids_per_movie = []
for raw_cast in df_c_new.cast:
    # literal_eval parses the stored list literal without executing text from the CSV as code.
    members = sorted(ast.literal_eval(raw_cast), key=lambda member: member['order'])
    cast_ids_per_movie.append([member.get('id') for member in members])
    dict_ids_to_cast.update((member.get('id'), member.get('name')) for member in members)
df_c_new['cast'] = cast_ids_per_movie

In [95]:
# Persist the cast person id -> name lookup.
with open("./new_movie_data/dict/ids_to_cast.pkl","wb") as f:
    pickle.dump(dict_ids_to_cast,f)

In [96]:
# Sanity check: reload the pickle and decode the first movie's cast ids back to names.
with open("./new_movie_data/dict/ids_to_cast.pkl","rb") as f:
    d = pickle.load(f)
[d.get(i) for i in df_c_new.cast.iloc[0]]

['Tom Hanks',
 'Tim Allen',
 'Don Rickles',
 'Jim Varney',
 'Wallace Shawn',
 'John Ratzenberger',
 'Annie Potts',
 'John Morris',
 'Erik von Detten',
 'Laurie Metcalf',
 'R. Lee Ermey',
 'Sarah Freeman',
 'Penn Jillette']

In [97]:
# Inspect the frame after converting cast; crew is still in its raw string form.
df_c_new.head()

Unnamed: 0,id,adult,belongs_to_collection,genres,overview,popularity,production_companies,release_date,runtime,spoken_languages,title,vote_average,vote_count,keywords,cast,crew
0,862,0,10194.0,"[16, 35, 10751]","Led by Woody, Andy's toys live happily in his ...",21.946943,[3],1995,81.0,[1],Toy Story,7.7,5415.0,"[931, 4290, 5202, 6054, 9713, 9823, 165503, 17...","[31, 12898, 7167, 12899, 12900, 7907, 8873, 11...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de..."
1,8844,0,,"[12, 14, 10751]",When siblings Judy and Peter discover an encha...,17.015539,"[559, 2550, 10201]",1995,104.0,"[1, 2]",Jumanji,6.9,2413.0,"[10090, 10941, 15101, 33467, 158086, 158091]","[2157, 8537, 205, 145151, 5149, 10739, 58563, ...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de..."
2,15602,0,119050.0,"[10749, 35]",A family wedding reignites the ancient feud be...,11.7129,"[6194, 19464]",1995,101.0,[1],Grumpier Old Men,6.5,92.0,"[1495, 12392, 179431, 208510]","[6837, 3151, 13567, 16757, 589, 16523, 7166]","[{'credit_id': '52fe466a9251416c75077a89', 'de..."
3,31357,0,,"[35, 18, 10749]","Cheated on, mistreated and stepped on, the wom...",3.859495,[306],1995,127.0,[1],Waiting to Exhale,6.1,34.0,"[818, 10131, 14768, 15160, 33455]","[8851, 9780, 18284, 51359, 66804, 352, 87118, ...","[{'credit_id': '52fe44779251416c91011acb', 'de..."
4,11862,0,96871.0,[35],Just when George Banks has recovered from his ...,8.387519,"[5842, 9195]",1995,106.0,[1],Father of the Bride Part II,5.7,173.0,"[1009, 1599, 2246, 4995, 5600, 10707, 13149, 3...","[67773, 3092, 519, 70696, 59222, 18793, 14592,...","[{'credit_id': '52fe44959251416c75039ed7', 'de..."


In [98]:
# Preview the first five crew member names of the first merged movie.
# literal_eval parses the stored list literal without executing text from the CSV as code.
[member.get('name') for member in ast.literal_eval(df_c_new.crew.iloc[0])][:5]

['John Lasseter',
 'Joss Whedon',
 'Andrew Stanton',
 'Joel Cohen',
 'Alec Sokolow']

In [99]:
# Replace each movie's crew with the list of person ids (original order kept;
# a person with several credits appears several times) and collect a
# person id -> name lookup along the way.
dict_ids_to_crew = dict()
crew_ids_per_movie = []
for raw_crew in df_c_new['crew']:
    # literal_eval parses the stored list literal without executing text from the CSV as code.
    members = ast.literal_eval(raw_crew)
    crew_ids_per_movie.append([member.get('id') for member in members])
    dict_ids_to_crew.update((member.get('id'), member.get('name')) for member in members)
df_c_new['crew'] = crew_ids_per_movie

In [101]:
# Persist the crew person id -> name lookup.
with open("./new_movie_data/dict/ids_to_crew.pkl","wb") as f:
    pickle.dump(dict_ids_to_crew,f)

In [103]:
# Sanity check: reload the pickle and decode the first five crew ids of the first movie.
with open("./new_movie_data/dict/ids_to_crew.pkl","rb") as f:
    d = pickle.load(f)
[d.get(i) for i in df_c_new.crew.iloc[0]][:5]

['John Lasseter',
 'Joss Whedon',
 'Andrew Stanton',
 'Joel Cohen',
 'Alec Sokolow']

In [105]:
# Inspect the frame now that both cast and crew hold id lists.
df_c_new.head()

Unnamed: 0,id,adult,belongs_to_collection,genres,overview,popularity,production_companies,release_date,runtime,spoken_languages,title,vote_average,vote_count,keywords,cast,crew
0,862,0,10194.0,"[16, 35, 10751]","Led by Woody, Andy's toys live happily in his ...",21.946943,[3],1995,81.0,[1],Toy Story,7.7,5415.0,"[931, 4290, 5202, 6054, 9713, 9823, 165503, 17...","[31, 12898, 7167, 12899, 12900, 7907, 8873, 11...","[7879, 12891, 7, 12892, 12893, 12894, 12895, 1..."
1,8844,0,,"[12, 14, 10751]",When siblings Judy and Peter discover an encha...,17.015539,"[559, 2550, 10201]",1995,104.0,"[1, 2]",Jumanji,6.9,2413.0,"[10090, 10941, 15101, 33467, 158086, 158091]","[2157, 8537, 205, 145151, 5149, 10739, 58563, ...","[511, 876, 1729, 4945, 4951, 4952, 8023, 9967,..."
2,15602,0,119050.0,"[10749, 35]",A family wedding reignites the ancient feud be...,11.7129,"[6194, 19464]",1995,101.0,[1],Grumpier Old Men,6.5,92.0,"[1495, 12392, 179431, 208510]","[6837, 3151, 13567, 16757, 589, 16523, 7166]","[26502, 16837, 16837, 1551320]"
3,31357,0,,"[35, 18, 10749]","Cheated on, mistreated and stepped on, the wom...",3.859495,[306],1995,127.0,[1],Waiting to Exhale,6.1,34.0,"[818, 10131, 14768, 15160, 33455]","[8851, 9780, 18284, 51359, 66804, 352, 87118, ...","[2178, 5144, 5144, 21968, 70592, 111118, 11111..."
4,11862,0,96871.0,[35],Just when George Banks has recovered from his ...,8.387519,"[5842, 9195]",1995,106.0,[1],Father of the Bride Part II,5.7,173.0,"[1009, 1599, 2246, 4995, 5600, 10707, 13149, 3...","[67773, 3092, 519, 70696, 59222, 18793, 14592,...","[37, 5506, 17698, 17698, 26160, 56106, 68755]"


In [106]:
# Save the merged frame (movies + cast/crew ids) without the index column.
df_c_new.to_csv("./new_movie_data/final_cleaned_2.csv",index = False)

In [107]:
# Reload the saved file and display it to confirm the write; this rebinds `df` to the merged data.
df = pd.read_csv("./new_movie_data/final_cleaned_2.csv")
df

Unnamed: 0,id,adult,belongs_to_collection,genres,overview,popularity,production_companies,release_date,runtime,spoken_languages,title,vote_average,vote_count,keywords,cast,crew
0,862,0,10194.0,"[16, 35, 10751]","Led by Woody, Andy's toys live happily in his ...",21.946943,[3],1995,81.0,[1],Toy Story,7.7,5415.0,"[931, 4290, 5202, 6054, 9713, 9823, 165503, 17...","[31, 12898, 7167, 12899, 12900, 7907, 8873, 11...","[7879, 12891, 7, 12892, 12893, 12894, 12895, 1..."
1,8844,0,,"[12, 14, 10751]",When siblings Judy and Peter discover an encha...,17.015539,"[559, 2550, 10201]",1995,104.0,"[1, 2]",Jumanji,6.9,2413.0,"[10090, 10941, 15101, 33467, 158086, 158091]","[2157, 8537, 205, 145151, 5149, 10739, 58563, ...","[511, 876, 1729, 4945, 4951, 4952, 8023, 9967,..."
2,15602,0,119050.0,"[10749, 35]",A family wedding reignites the ancient feud be...,11.712900,"[6194, 19464]",1995,101.0,[1],Grumpier Old Men,6.5,92.0,"[1495, 12392, 179431, 208510]","[6837, 3151, 13567, 16757, 589, 16523, 7166]","[26502, 16837, 16837, 1551320]"
3,31357,0,,"[35, 18, 10749]","Cheated on, mistreated and stepped on, the wom...",3.859495,[306],1995,127.0,[1],Waiting to Exhale,6.1,34.0,"[818, 10131, 14768, 15160, 33455]","[8851, 9780, 18284, 51359, 66804, 352, 87118, ...","[2178, 5144, 5144, 21968, 70592, 111118, 11111..."
4,11862,0,96871.0,[35],Just when George Banks has recovered from his ...,8.387519,"[5842, 9195]",1995,106.0,[1],Father of the Bride Part II,5.7,173.0,"[1009, 1599, 2246, 4995, 5600, 10707, 13149, 3...","[67773, 3092, 519, 70696, 59222, 18793, 14592,...","[37, 5506, 17698, 17698, 26160, 56106, 68755]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46159,439050,0,,"[18, 10751]",Rising and falling between a man and woman.,0.072051,[],0,90.0,[14],Subdue,4.0,1.0,[10703],"[240240, 1749839, 1619957]","[1182809, 1182809, 1543705, 1749842, 1749847, ..."
46160,111109,0,,[18],An artist struggles to finish his work while a...,0.178241,[19653],2011,360.0,[32],Century of Birthing,9.0,3.0,"[2679, 14531, 215397]","[1043186, 111636, 1204271, 278923, 1042953, 57...","[1051381, 1051381, 1728582, 1051381, 1051381, ..."
46161,67758,0,,"[28, 18, 53]","When one of her hits goes wrong, a professiona...",0.903007,[6165],2003,90.0,[1],Betrayal,3.8,6.0,[],"[23764, 2059, 46277, 1736, 58646, 54649, 55270...","[67753, 19713, 549355, 549356, 58818]"
46162,227506,0,,[],"In a small town live two brothers, one a minis...",0.003503,[88753],1917,87.0,[],Satan Triumphant,0.0,0.0,[],"[544742, 1090923, 1136422, 1261758, 29199]","[1085341, 1195656]"
