In [1]:
# TODO:
# https://www.imdb.com/title/tt0120783/fullcredits; get more writers, actors
# https://www.imdb.com/title/tt2557478/companycredits; no legendary in pacific rim uprising
# Scrape box office mojo (ex: terminator 2); budget, opening_usa, total_usa, total_world

In [2]:
# Imports
import bs4
import pandas
import re
import requests

### Define

In [3]:
def add_titles(soup, data):
    """Add the display title and the original title to ``data``.

    Parameters
    ----------
    soup : parsed title page; must support ``find`` the way BeautifulSoup does.
    data : dict that receives the ``'title'`` and ``'original_title'`` keys.

    Returns
    -------
    The same ``data`` dict, updated in place.

    Raises
    ------
    TypeError / AttributeError if the ``og:title`` meta tag or the ``<h1>``
    is missing from the page.
    """
    # title (consider using: "Also Known As")
    title = soup.find('meta', attrs={'property': 'og:title'})['content']
    # Remove the trailing parenthetical that carries the year, whatever label
    # precedes it ("(1975)", "(TV Movie 1980)", "(Video 2014)", ...).  A fixed
    # seven-character slice only works for the bare "(YYYY)" form.
    title = re.sub(r'\s*\([^()]*\d{4}[^()]*\)\s*$', '', title)
    data['title'] = title.strip()

    # original_title: the <h1> text up to the first non-breaking space
    h1 = soup.find('h1').text
    data['original_title'] = h1.strip().split('\xa0')[0]
    return data

In [4]:
def add_release_dates(soup, data):
    """Add ``'release_year'`` and ``'release_date'`` to ``data`` and return it.

    The year is the second non-breaking-space separated chunk of the first
    ``<h1>`` with its parentheses removed.  The date is the text of the link
    titled "See more release dates", cut at the first "(" and with the
    "TV Movie" / "Video" labels removed.
    """
    # release_year
    heading_parts = soup.find('h1').text.strip().split('\xa0')
    data['release_year'] = heading_parts[1].replace('(', '').replace(')', '')

    # release_date
    date_text = soup.find('a', attrs={'title': "See more release dates"}).text
    date_text = date_text.split('(', 1)[0]   # Most release dates contain "(USA)"
    for label in ('TV Movie', 'Video'):      # ex. tt0080789, tt3060952
        date_text = date_text.replace(label, '')
    data['release_date'] = date_text.strip()
    return data

In [5]:
def add_genres(soup, data):
    """Store the stripped text of every link in the genre ``<div>`` under ``'genres'``."""
    genre_block = soup.find('div', attrs={'itemprop': 'genre'})
    data['genres'] = [link.text.strip() for link in genre_block.find_all('a')]
    return data

In [6]:
def add_studios(soup, data):
    """Add the list of production companies to ``data`` under ``'studios'``.

    Looks at every ``div.txt-block`` that has an inline ``<h4>`` and takes the
    first one whose text mentions "Production".  The first non-empty line of
    that block is treated as the heading and dropped; each remaining line is
    one studio.

    Parameters
    ----------
    soup : parsed title page; must support ``find_all`` like BeautifulSoup.
    data : dict that receives the ``'studios'`` key.

    Returns
    -------
    The same ``data`` dict.  ``'studios'`` is an empty list when the page has
    no production block.
    """
    blocks = soup.find_all('div', class_='txt-block')
    production_texts = [
        block.text
        for block in blocks
        if block.find('h4', class_='inline') is not None and 'Production' in block.text
    ]
    if not production_texts:
        # No production section on the page: record "no studios" instead of
        # failing on an empty list lookup.
        data['studios'] = []
        return data

    text = production_texts[0].split('See more', 1)[0]   # Remove everything after 'See more'

    cleaned = []
    for line in text.split('\n'):
        line = re.sub(' +', ' ', line)            # Collapse runs of spaces
        line = line.replace(';', '')              # Remove semi-colons
        line = line.strip().strip(',').strip()    # Remove surrounding whitespace & commas
        # Filter only after stripping so whitespace-only lines cannot survive
        # as empty studio names.
        if line:
            cleaned.append(line)

    data['studios'] = cleaned[1:]   # Remove 'Production Co:'
    return data

In [7]:
def add_directors(soup, data):
    """Store the director names under ``'directors'`` and return ``data``.

    Names come from the ``<span>`` elements marked ``itemprop="director"``;
    surrounding whitespace and then surrounding commas are removed.
    """
    person_filter = {'itemprop': 'director',
                     'itemtype': "http://schema.org/Person"}
    data['directors'] = [
        span.text.strip().strip(',')
        for span in soup.find_all('span', attrs=person_filter)
    ]
    return data

In [8]:
def add_writers(soup, data):
    """Store the writer names under ``'writers'`` and return ``data``.

    Names come from the ``<span>`` elements marked ``itemprop="creator"``;
    each is stripped of surrounding whitespace, then of surrounding commas.
    """
    writers = []
    creator_spans = soup.find_all('span', attrs={'itemprop': 'creator',
                                                 'itemtype': "http://schema.org/Person"})
    for creator in creator_spans:
        name = creator.text.strip()
        writers.append(name.strip(','))
    data['writers'] = writers
    return data

In [9]:
def add_actors(soup, data):
    """Store the cast names under ``'actors'`` and return ``data``.

    Names are the raw text of every ``span.itemprop`` inside the
    ``table.cast_list`` element; the text is stored as-is, not stripped.
    """
    cast_table = soup.find('table', class_='cast_list')
    actors = []
    for name_span in cast_table.find_all('span', class_='itemprop'):
        actors.append(name_span.text)
    data['actors'] = actors
    return data

In [10]:
def add_duration(soup, data):
    """Add the running time text to ``data`` under ``'duration'``.

    Parameters
    ----------
    soup : parsed title page; must support ``find`` like BeautifulSoup.
    data : dict that receives the ``'duration'`` key.

    Returns
    -------
    The same ``data`` dict.  It is returned unchanged when the page has no
    ``<time itemprop="duration">`` element, the same way ``add_mpaa_rating``
    and ``add_locations`` skip fields that are absent.
    """
    duration = soup.find('time', attrs={'itemprop': 'duration'})
    if duration is None:
        # No runtime on the page: leave the key out instead of failing on
        # ``None.text``.
        return data
    data['duration'] = duration.text.strip()
    return data

In [11]:
def add_mpaa_rating(soup, data):
    """Copy the ``contentRating`` meta tag's content into ``data['mpaa_rating']``.

    When the page has no such meta tag, ``data`` is returned unchanged.
    """
    rating_tag = soup.find('meta', attrs={'itemprop': 'contentRating'})
    if rating_tag is not None:
        data['mpaa_rating'] = rating_tag.attrs['content']
    return data

In [12]:
def add_locations(soup, data):
    """Add the filming location text to ``data`` under ``'locations'``.

    Looks at every ``div.txt-block`` that has an inline ``<h4>`` and takes the
    first one whose text mentions "Filming Locations".  Everything after the
    heading's colon, up to the "See more" link text, is stored as one string.

    Parameters
    ----------
    soup : parsed title page; must support ``find_all`` like BeautifulSoup.
    data : dict that receives the ``'locations'`` key.

    Returns
    -------
    The same ``data`` dict.  It is returned unchanged when the page has no
    filming-locations block or the block has no text after the heading.
    """
    blocks = soup.find_all('div', class_='txt-block')
    location_texts = [
        block.text
        for block in blocks
        if block.find('h4', class_='inline') is not None
        and 'Filming Locations' in block.text
    ]
    if not location_texts:
        return data

    text = location_texts[0].replace('\n', '')
    text = re.sub(' +', ' ', text)          # Collapse runs of spaces
    text = text.split('See more', 1)[0]     # Remove everything after 'See more'
    # Split on the FIRST colon only, so a location that itself contains a
    # colon is kept whole, and strip the remainder so no whitespace that
    # followed the heading leaks into the stored value.
    _, _, locations = text.partition(':')
    locations = locations.strip()
    if locations:
        data['locations'] = locations
    return data

In [13]:
# def financials(soup, data):
#     blocks = soup.find_all('div', class_='txt-block')
#     blocks = [block for block in blocks if block.find('h4', class_='inline') is not None]
#     box_office = {}
#     for block in blocks:
#         if '$' in block.text:
#             [span.extract() for span in block.find_all('span')]   # Remove span        
#             text = block.text
#             text = text.replace('\n', '').replace(',', '').replace(': ', ':')
#             text = text.replace('$', '')
#             text = text.strip()
#             text = text.split(':')
#             box_office[text[0]] = int(text[1])

#     box_office = pandas.Series(box_office)
#     columns = {'Budget' : 'budget',
#                'Opening Weekend USA' : 'opening_usa',
#                'Gross USA' : 'total_usa',
#                'Cumulative Worldwide Gross' : 'total_world'}
#     box_office = box_office.rename(index=columns)
#     data = data.append(box_office)
#     return data

### Testing

In [14]:
# Sample IMDb title id; the soup set-up cell below builds its URL from it.
imdb_id = 'tt5996252'

In [15]:
# Set up soup
url = 'https://www.imdb.com/title/' + imdb_id
r = requests.get(url, timeout=5)
r.raise_for_status()   # Stop here on an HTTP error instead of parsing an error page
# Same parser as the scraping loop in the Run section, so these spot checks
# exercise the same parse tree the real run sees.
soup = bs4.BeautifulSoup(r.text, 'html.parser')

In [16]:
# title (consider using: "Also Known As")
# Read og:title, drop the 'TV Movie ' label, cut the last seven characters, trim.
title = (
    soup.find('meta', attrs={'property': 'og:title'})['content']
    .replace('TV Movie ', '')[:-7]
    .strip()
)
title

'Trader'

In [17]:
# original_title
# The <h1> text is split on non-breaking spaces; the first chunk is the title.
h1 = soup.find('h1').text.strip().split('\xa0')
original_title = h1[0]
original_title

'Trader'

### Run

In [18]:
id_list = pandas.read_csv('raw_data/imdb_ids.csv')
# Sanity check: list every imdb_id whose number of non-null Title values is
# not exactly one.  The comparison operator must be written `!=` with no
# space inside it for the query expression to parse.
id_list.groupby('imdb_id').count().query("Title != 1")

Unnamed: 0_level_0,Title,Release Year
imdb_id,Unnamed: 1_level_1,Unnamed: 2_level_1


In [19]:
# Keep the non-null ids, skipping the '-' placeholder entries
id_list = [
    imdb_id
    for imdb_id in id_list['imdb_id'].dropna().values
    if imdb_id != '-'
]

In [20]:
# already_done = pandas.read_csv('raw_data/imdb_raw.csv')
# already_done = already_done['imdb_id'].dropna().values
# already_done = [imdb_id for imdb_id in already_done if imdb_id != '-']
# id_list = [imdb_id for imdb_id in id_list if imdb_id not in already_done]

In [21]:
%%time
# Iterate IDs
# Rows are collected in a plain list and turned into a DataFrame once, after
# the loop.  Growing a DataFrame row by row copies it on every iteration, and
# DataFrame.append no longer exists in pandas 2.x.  If the loop stops early,
# the rows scraped so far are still available in `records`.
records = []
for imdb_id in id_list:
    # Set up soup
    url = 'https://www.imdb.com/title/' + imdb_id
    r = requests.get(url, timeout=5)
    r.raise_for_status()   # Fail with the HTTP error rather than on a missing tag later
    soup = bs4.BeautifulSoup(r.text, 'html.parser')

    # Get data
    data = {}
    data['imdb_id'] = imdb_id
    data = add_titles(soup, data)
    data = add_release_dates(soup, data)
    data = add_genres(soup, data)
    data = add_studios(soup, data)
    data = add_directors(soup, data)
    data = add_writers(soup, data)
    data = add_actors(soup, data)
    data = add_duration(soup, data)
    data = add_mpaa_rating(soup, data)
    data = add_locations(soup, data)

    # Append
    print([data['imdb_id'], data['title'], data['release_year']])
    records.append(data)

# One row per title; keys missing from a record become NaN in that row
all_data = pandas.DataFrame(records)

['tt0050613', 'Throne of Blood', '1957']
['tt0053946', 'Inherit the Wind', '1960']
['tt0070034', 'Enter the Dragon', '1973']
['tt0073195', 'Jaws', '1975']
['tt0076759', 'Star Wars: Episode IV - A New Hope', '1977']
['tt0080179', 'Drunken Master', '1978']
['tt0078748', 'Alien', '1979']
['tt0080789', "Gideon's Trumpet", '1980']
['tt0080684', 'Star Wars: Episode V - The Empire Strikes Back', '1980']
['tt0083658', 'Blade Runner', '1982']
['tt0084827', 'TRON', '1982']
['tt0086250', 'Scarface', '1983']
['tt0086190', 'Star Wars: Episode VI - Return of the Jedi', '1983']
['tt0086567', 'WarGames', '1983']
['tt0087538', 'The Karate Kid', '1984']
['tt0088247', 'The Terminator', '1984']
['tt0089927', 'Rocky IV', '1985']
['tt0090605', 'Aliens', '1986']
['tt0092099', 'Top Gun', '1986']
['tt0093779', 'The Princess Bride', '1987']
['tt0095016', 'Die Hard', '1988']
['tt0097165', 'Dead Poets Society', '1989']
['tt0097441', 'Glory', '1989']
['tt0102057', 'Hook', '1991']
['tt0103064', 'Terminator 2', '199

['tt1136608', 'District 9', '2009']
['tt1013752', 'Fast & Furious', '2009']
['tt1046173', 'G.I. Joe: The Rise of Cobra', '2009']
['tt1034032', 'Gamer', '2009']
['tt0361748', 'Inglourious Basterds', '2009']
['tt0448011', 'Knowing', '2009']
['tt0892782', 'Monsters vs. Aliens', '2009']
['tt0988045', 'Sherlock Holmes', '2009']
['tt0796366', 'Star Trek', '2009']
['tt0438488', 'Terminator Salvation', '2009']
['tt1119646', 'The Hangover', '2009']
['tt1041829', 'The Proposal', '2009']
['tt1055369', 'Transformers: Revenge of the Fallen', '2009']
['tt1049413', 'Up', '2009']
['tt0409459', 'Watchmen', '2009']
['tt0458525', 'X-Men Origins: Wolverine', '2009']
['tt1156398', 'Zombieland', '2009']
['tt1014759', 'Alice in Wonderland', '2010']
['tt1323594', 'Despicable Me', '2010']
['tt1226229', 'Get Him to the Greek', '2010']
['tt0892769', 'How to Train Your Dragon', '2010']
['tt1375666', 'Inception', '2010']
['tt1001526', 'Megamind', '2010']
['tt1053424', 'Repo Men', '2010']
['tt0944835', 'Salt', '201

['tt0790636', 'Dallas Buyers Club', '2013']
['tt0217869', 'Unbreakable', '2000']
['tt0098258', 'Say Anything...', '1989']
['tt0087332', 'Ghostbusters', '1984']
['tt1411238', 'No Strings Attached', '2011']
['tt2975590', 'Batman v Superman: Dawn of Justice', '2016']
['tt0382932', 'Ratatouille', '2007']
['tt1182345', 'Moon', '2009']
['tt1899353', 'The Raid: Redemption', '2011']
['tt3850214', 'Dope', '2015']
['tt3498820', 'Captain America: Civil War', '2016']
['tt1663202', 'The Revenant', '2015']
['tt2267968', 'Kung Fu Panda 3', '2016']
['tt0180093', 'Requiem for a Dream', '2000']
['tt0083929', 'Fast Times at Ridgemont High', '1982']
['tt2948356', 'Zootopia', '2016']
['tt0088128', 'Sixteen Candles', '1984']
['tt0083131', 'Stripes', '1981']
['tt3385516', 'X-Men: Apocalypse', '2016']
['tt1895587', 'Spotlight', '2015']
['tt1628841', 'Independence Day: Resurgence', '2016']
['tt4094724', 'The Purge: Election Year', '2016']
['tt0093105', 'Good Morning, Vietnam', '1987']
['tt3878542', 'Justice Le

### Finalize

In [22]:
# Keep only the scraped fields, in this fixed column order
output_columns = [
    'imdb_id', 'title', 'original_title', 'release_year', 'release_date',
    'genres', 'studios', 'directors', 'writers', 'actors',
    'duration', 'mpaa_rating', 'locations',
    # 'budget', 'opening_usa', 'total_usa', 'total_world',
    # 'plot', 'rating_imdb', 'rating_mc', 'rating_rt',
    # 'imdb_votes', 'type'
]
all_data = all_data[output_columns]

In [23]:
# Save the scraped table as CSV; index=False leaves the row index out of the file.
all_data.to_csv('raw_data/imdb_raw.csv', index=False)

In [24]:
# all_data.to_csv('raw_data/imdb_raw.csv', header=False, index=False, mode='a')