### Get Movie Metadata

This notebook gathers movie metadata from IMDb, including crew information, description, and year, among other details.

In [None]:
!pip install Cinemagoer

In [1]:
import pandas as pd
from imdb import Cinemagoer
import json
import imdb

In [2]:
# Shared IMDb client; extract_info() reads this module-level name.
ia = Cinemagoer()

In [3]:
# Root folder for the screenplay data; every path below is built from it.
METADATA_DIR = "../../data/8_screenplays"

# Read both id columns as strings so pandas does not coerce them to numbers.
df = pd.read_csv(
    f"{METADATA_DIR}/1_validation/clean_validated.csv",
    dtype={"imdb_id": str, "id_merged": str},
)

In [4]:
# Preview the first rows of the validated table.
df.head()

Unnamed: 0,imdb_id,title,match,alt_id,script_url,notes,filename,id_merged,char_fname
0,417385,12 and Holding,1,,https://imsdb.com/scripts/12-and-Holding.html,,12-and-Holding,417385,12-and-Holding_charinfo.txt
1,2024544,12 Years a Slave,1,,https://imsdb.com/scripts/12-Years-a-Slave.html,,12-Years-a-Slave,2024544,12-Years-a-Slave_charinfo.txt
2,1542344,127 Hours,1,,https://imsdb.com/scripts/127-Hours.html,,127-Hours,1542344,127-Hours_charinfo.txt
3,179626,15 Minutes,1,,https://imsdb.com/scripts/15-Minutes.html,,15-Minutes,179626,15-Minutes_charinfo.txt
4,974661,17 Again,1,,https://imsdb.com/scripts/17-Again.html,,17-Again,974661,17-Again_charinfo.txt


In [5]:
# (rows, columns) of the validated table.
df.shape

(1494, 9)

In [6]:
# Number of repeated id_merged values; the fetch loop below makes one lookup per row.
df['id_merged'].duplicated().sum()

0

In [7]:
# Number of rows with a missing id_merged.
df['id_merged'].isnull().sum()

0

In [8]:
# Accumulator that extract_info() appends one record to per call.
# Re-running this cell resets it to an empty list.
all_data = []

In [9]:
def extract_info(imdb_id):
    """Fetch one title from IMDb and flatten it into a plain dict.

    The title is retrieved through the module-level client ``ia`` and every
    key it exposes is copied into the result:

    * list values are copied item by item; string items are kept as-is, and
      items exposing a ``keys()`` mapping interface (presumably Person /
      Company objects -- confirm against Cinemagoer) are turned into plain
      dicts of their fields. Any other item is kept unchanged. An empty list
      leaves the key out of the result.
    * a value that is itself an ``imdb.Movie.Movie`` is replaced by its ID.
    * every other value is stored unchanged.

    Args:
        imdb_id: IMDb identifier passed straight to ``ia.get_movie``.

    Returns:
        dict mapping each key of the fetched title to its flattened value.

    Side effects:
        Appends the returned dict to the module-level list ``all_data``.
    """
    m = ia.get_movie(imdb_id)

    res = {}

    for key in m.keys():
        info = m[key]
        if isinstance(info, list):
            for item in info:
                # Only mapping-like objects can be expanded field by field;
                # anything else (strings, numbers, ...) is stored as found
                # instead of failing on a missing .keys().
                if not isinstance(item, str) and hasattr(item, 'keys'):
                    item = {field: item[field] for field in item.keys()}
                res.setdefault(key, []).append(item)
        elif isinstance(info, imdb.Movie.Movie):
            # Store a reference to the related title rather than the object.
            res[key] = info.getID()
        else:
            res[key] = info

    all_data.append(res)
    return res

In [None]:
# Fetch metadata for every validated title, printing progress as we go.
# NOTE: extract_info() appends to all_data on every call, so re-running this
# cell without resetting all_data first will duplicate records.
for index, imdb_id in df['id_merged'].items():
  response = extract_info(imdb_id)
  print(index, response['imdbID'])

In [None]:
import os

# Create the output folder (and any missing parents); a no-op if it already exists.
os.makedirs(f"{METADATA_DIR}/2_metadata", exist_ok=True)

# Persist the raw records collected in all_data as a single JSON array.
with open(f'{METADATA_DIR}/2_metadata/validated_meta.json', 'w') as f:
  json.dump(all_data, f)

Next, we load the saved JSON metadata back into a DataFrame.

In [12]:
# Reload the exported metadata, keeping imdbID as a string.
# Note: this rebinds `df`, replacing the validated table loaded earlier.
df = pd.read_json(
    f"{METADATA_DIR}/2_metadata/validated_meta.json",
    dtype={"imdbID": str},
)

In [13]:
# Preview the first rows of the metadata frame.
df.head()

Unnamed: 0,localized title,cast,genres,runtimes,countries,country codes,language codes,color info,aspect ratio,sound mix,...,episode,episode of,long imdb episode title,series title,canonical series title,episode title,canonical episode title,smart canonical series title,smart canonical episode title,creator
0,12 and Holding,"[{'name': 'Conor Donovan', 'canonical name': '...",[Drama],[95],[United States],[us],[en],[Color],1.85 : 1,[Dolby Digital],...,,,,,,,,,,
1,12 Years a Slave,"[{'name': 'Chiwetel Ejiofor', 'canonical name'...","[Biography, Drama, History]",[134],"[United States, United Kingdom]","[us, gb]",[en],[Color],2.35 : 1,"[DTS::(Digital DTS Sound), Dolby Digital, SDDS...",...,,,,,,,,,,
2,127 Hours,"[{'name': 'James Franco', 'canonical name': 'F...","[Biography, Drama]",[94],"[United States, United Kingdom, France]","[us, gb, fr]","[en, it]",[Color],1.85 : 1,"[SDDS, Dolby Digital, DTS]",...,,,,,,,,,,
3,15 Minutes,"[{'name': 'Robert De Niro', 'canonical name': ...","[Action, Crime, Drama, Thriller]",[120],"[United States, Germany]","[us, de]","[en, cs, ru, el, af]",[Color],2.35 : 1,"[DTS, Dolby Digital, SDDS]",...,,,,,,,,,,
4,17 Again,"[{'name': 'Zac Efron', 'canonical name': 'Efro...","[Comedy, Drama, Fantasy, Romance, Sport]",[102],[United States],[us],[en],[Color],2.35 : 1,"[SDDS, Dolby Digital, DTS]",...,,,,,,,,,,


In [14]:
# Show any rows whose imdbID repeats an earlier row (an empty frame means none).
df.loc[df["imdbID"].duplicated()]

Unnamed: 0,localized title,cast,genres,runtimes,countries,country codes,language codes,color info,aspect ratio,sound mix,...,episode,episode of,long imdb episode title,series title,canonical series title,episode title,canonical episode title,smart canonical series title,smart canonical episode title,creator


In [15]:
# Tally how many titles fall under each IMDb `kind`.
df['kind'].value_counts()

kind
movie          1456
episode          17
video movie       7
tv series         7
tv movie          5
short             1
Name: count, dtype: int64

In [16]:
# Keep every title whose kind contains the literal text "movie".
# na=False treats a missing kind as "not a movie" instead of raising, and
# .copy() makes df_clean an independent frame rather than a slice of df.
df_clean = df[df['kind'].str.contains('movie', regex=False, na=False)].copy()

In [17]:
# (rows, columns) remaining after the movie filter.
df_clean.shape

(1468, 86)

In [18]:
# Spot-check a single title by its imdbID.
df_clean.loc[df_clean['imdbID'] == '20768712']

Unnamed: 0,localized title,cast,genres,runtimes,countries,country codes,language codes,color info,aspect ratio,sound mix,...,episode,episode of,long imdb episode title,series title,canonical series title,episode title,canonical episode title,smart canonical series title,smart canonical episode title,creator
545,Book Club: The Next Chapter,"[{'name': 'Diane Keaton', 'canonical name': 'K...",[Comedy],[107],[United States],[us],"[en, it]",[Color],1.85 : 1,,...,,,,,,,,,,


We export the metadata for movies only to CSV format.

In [19]:
# Write the movie-only frame to CSV, leaving out the DataFrame index.
df_clean.to_csv(
    f"{METADATA_DIR}/2_metadata/validated_movies.csv",
    index=False,
)