In [None]:
# TODO:
# https://www.imdb.com/title/tt0120783/fullcredits; get more writers, actors
# https://www.imdb.com/title/tt2557478/companycredits; no legendary in pacific rim uprising
# Scrape box office mojo (ex: terminator 2); budget, opening_usa, total_usa, total_world

# TODO:
# Get original title vs. revised title (probably use revised title)
# ex: tt1104001, tt0364569, tt0290334, tt0398373
# <meta content="Blade of Kings (2004)" property="og:title"/>

In [None]:
# Imports
import bs4
import pandas
import re
import requests

### Define

In [None]:
def add_title(soup, data):
    """Store the page's headline title in ``data['title']`` and return ``data``.

    The text of the first ``<h1>`` is stripped and split on non-breaking
    spaces; the piece before the first one is used as the title.
    """
    heading_parts = soup.find('h1').text.strip().split('\xa0')
    data['title'] = heading_parts[0]

    # Original-title extraction is currently disabled:
    # if soup.find('div', class_='originalTitle') is not None:
    #     original_title = soup.find('div', class_='originalTitle').text
    #     original_title = original_title[:original_title.index('(')]
    #     original_title = original_title.strip()

    return data

In [None]:
def add_release_datetime(soup, data):
    """Add ``release_year`` and ``release_date`` to ``data`` when the page has them.

    ``release_year`` comes from the second non-breaking-space-separated piece
    of the first ``<h1>``, with the surrounding parentheses removed.
    ``release_date`` comes from the link titled "See more release dates": a
    trailing parenthesised part (e.g. "(USA)") is cut off and the
    'TV Movie' / 'Video' labels are removed.

    A piece that is not present on the page is skipped (its key is not set)
    instead of raising, matching how the other optional fields are handled.

    Returns the same ``data`` mapping that was passed in.
    """
    # release_year
    heading_parts = soup.find('h1').text.strip().split('\xa0')
    if len(heading_parts) > 1:
        data['release_year'] = heading_parts[1].replace('(', '').replace(')', '')

    # release_date
    link = soup.find('a', attrs={'title': "See more release dates"})
    if link is None:
        return data
    release_date = link.text
    if '(' in release_date:
        release_date = release_date[:release_date.index('(')]   # Most release dates contain "(USA)"
    release_date = release_date.replace('TV Movie', '')   # ex. tt0080789
    release_date = release_date.replace('Video', '')   # ex. tt3060952
    data['release_date'] = release_date.strip()
    return data

In [None]:
def add_genres(soup, data):
    """Add the page's genre names to ``data['genres']`` as a list of strings.

    The genres are the stripped texts of the links inside the first
    ``<div itemprop="genre">``. When the page has no such element the key is
    left unset instead of raising, matching the other optional fields.

    Returns the same ``data`` mapping that was passed in.
    """
    container = soup.find('div', attrs={'itemprop': 'genre'})
    if container is None:
        return data
    data['genres'] = [link.text.strip() for link in container.find_all('a')]
    return data

In [None]:
def add_studios(soup, data):
    """Add the production companies listed on the page to ``data['studios']``.

    Looks at every ``<div class="txt-block">`` that has an
    ``<h4 class="inline">`` heading and uses the first one whose text contains
    'Production'. The first non-empty line of that block (the heading, e.g.
    'Production Co:') is dropped; every following line becomes one studio
    name with surrounding whitespace/commas and any semi-colons removed.

    When no such block exists the key is left unset instead of raising,
    matching the other optional fields. Lines that are empty after cleaning
    (whitespace-only or comma-only lines) are not reported as studios.

    Returns the same ``data`` mapping that was passed in.
    """
    blocks = soup.find_all('div', class_='txt-block')
    production_texts = [
        block.text for block in blocks
        if block.find('h4', class_='inline') is not None and 'Production' in block.text
    ]
    if not production_texts:
        return data

    text = re.sub(' +', ' ', production_texts[0].strip())   # Collapse runs of spaces
    if 'See more' in text:
        text = text[:text.index('See more')]   # Remove everything after 'See more'
    lines = [line for line in text.strip().split('\n') if line != '']
    entries = lines[1:]   # Remove 'Production Co:'
    cleaned = [entry.strip().strip(',').replace(';', '') for entry in entries]
    # Filter again AFTER cleaning: a line holding only a space or a comma is
    # not '' before stripping, but is once cleaned.
    data['studios'] = [studio for studio in cleaned if studio != '']
    return data

In [None]:
def add_directors(soup, data):
    """Store the director names found on the page in ``data['directors']``.

    Every ``<span>`` with ``itemprop="director"`` and the schema.org Person
    ``itemtype`` contributes its text, stripped of surrounding whitespace and
    then of leading/trailing commas. Returns the same ``data`` mapping.
    """
    person_filter = {'itemprop': 'director',
                     'itemtype': "http://schema.org/Person"}
    data['directors'] = [
        span.text.strip().strip(',')
        for span in soup.find_all('span', attrs=person_filter)
    ]
    return data

In [None]:
def add_writers(soup, data):
    """Store the writer names found on the page in ``data['writers']``.

    Every ``<span>`` with ``itemprop="creator"`` and the schema.org Person
    ``itemtype`` contributes its text, stripped of surrounding whitespace and
    then of leading/trailing commas. Returns the same ``data`` mapping.
    """
    person_filter = {'itemprop': 'creator',
                     'itemtype': "http://schema.org/Person"}
    data['writers'] = [
        span.text.strip().strip(',')
        for span in soup.find_all('span', attrs=person_filter)
    ]
    return data

In [None]:
def add_actors(soup, data):
    """Add the cast names from the page's cast table to ``data['actors']``.

    The names are the texts of the ``<span class="itemprop">`` elements inside
    the first ``<table class="cast_list">``. When the page has no such table
    the key is left unset instead of raising, matching the other optional
    fields.

    Returns the same ``data`` mapping that was passed in.
    """
    cast_table = soup.find('table', class_='cast_list')
    if cast_table is None:
        return data
    data['actors'] = [span.text for span in cast_table.find_all('span', class_='itemprop')]
    return data

In [None]:
def add_duration(soup, data):
    """Add the page's running time text to ``data['duration']``.

    The value is the stripped text of the first ``<time itemprop="duration">``
    element. When the page has no such element the key is left unset instead
    of raising, matching the other optional fields.

    Returns the same ``data`` mapping that was passed in.
    """
    time_tag = soup.find('time', attrs={'itemprop': 'duration'})
    if time_tag is None:
        return data
    data['duration'] = time_tag.text.strip()
    return data

In [None]:
def add_mpaa_rating(soup, data):
    """Copy the page's content rating into ``data['mpaa_rating']`` if present.

    The rating is the ``content`` attribute of the first
    ``<meta itemprop="contentRating">`` tag; without that tag ``data`` is
    returned untouched.
    """
    rating_tag = soup.find('meta', attrs={'itemprop': 'contentRating'})
    if rating_tag is not None:
        data['mpaa_rating'] = rating_tag.attrs['content']
    return data

In [None]:
def add_locations(soup, data):
    """Add the page's filming-location text to ``data['locations']`` if present.

    Looks at every ``<div class="txt-block">`` that has an
    ``<h4 class="inline">`` heading and uses the first one whose text contains
    'Filming Locations'. Newlines are removed, runs of spaces are collapsed,
    anything from 'See more' onwards is cut, and the text after the FIRST
    colon (the one ending the heading) is stored, stripped of surrounding
    whitespace. Later colons belong to the location itself and are kept.

    ``data`` is returned unchanged when there is no such block or the block
    has no colon to separate heading from value.
    """
    blocks = soup.find_all('div', class_='txt-block')
    location_texts = [
        block.text for block in blocks
        if block.find('h4', class_='inline') is not None and 'Filming Locations' in block.text
    ]
    if not location_texts:
        return data

    text = location_texts[0].replace('\n', '')
    text = re.sub(' +', ' ', text)         # Collapse runs of spaces
    if 'See more' in text:
        text = text[:text.index('See more')]   # Remove everything after 'See more'
    # Split on the first colon only, so a location that itself contains ':'
    # is not truncated.
    _heading, separator, locations = text.strip().partition(':')
    if not separator:
        return data
    data['locations'] = locations.strip()
    return data

In [None]:
# def financials(soup, data):
#     blocks = soup.find_all('div', class_='txt-block')
#     blocks = [block for block in blocks if block.find('h4', class_='inline') is not None]
#     box_office = {}
#     for block in blocks:
#         if '$' in block.text:
#             [span.extract() for span in block.find_all('span')]   # Remove span        
#             text = block.text
#             text = text.replace('\n', '').replace(',', '').replace(': ', ':')
#             text = text.replace('$', '')
#             text = text.strip()
#             text = text.split(':')
#             box_office[text[0]] = int(text[1])

#     box_office = pandas.Series(box_office)
#     columns = {'Budget' : 'budget',
#                'Opening Weekend USA' : 'opening_usa',
#                'Gross USA' : 'total_usa',
#                'Cumulative Worldwide Gross' : 'total_world'}
#     box_office = box_office.rename(index=columns)
#     data = data.append(box_office)
#     return data

### Testing

In [None]:
# IMDb title ID used to build the page URL for the manual checks in this section.
imdb_id = 'tt0066999'

In [None]:
# Set up soup
# Parse with 'html.parser', the same parser the Run section's loop uses, so
# that what is explored here matches what the scraping loop will see.
url = 'https://www.imdb.com/title/' + imdb_id
r = requests.get(url, timeout=5)
soup = bs4.BeautifulSoup(r.text, 'html.parser')

In [None]:
# title ("original title")
# Same extraction as add_title: strip the <h1> text, split it on non-breaking
# spaces and keep the first piece.
h1 = soup.find('h1').text
h1 = h1.strip().split('\xa0')
title = h1[0]
# Last expression of the cell: display the raw <h1> tag for inspection.
soup.find('h1')

In [None]:
# title (revised title / official title / the title I probably know it by)
# Read from the "content" attribute of the <meta property="og:title"> tag.
soup.find('meta', attrs={'property': 'og:title'})['content']

In [None]:
# Exercise the real extractor rather than an inline copy of its body, so this
# check cannot drift from what the scraping loop actually runs.
# .get() leaves `studios` as None if the extractor did not set the key.
studios = add_studios(soup, {}).get('studios')

In [None]:
# Display the studio list extracted in the previous cell.
studios

### Run

In [None]:
# IDs to scrape: the 'imdb_id' column of the input CSV, without missing
# values and without the '-' placeholder entries.
id_list = [
    imdb_id
    for imdb_id in pandas.read_csv('raw_data/imdb_ids.csv')['imdb_id'].dropna().values
    if imdb_id != '-'
]

In [None]:
# already_done = pandas.read_csv('raw_data/imdb_raw.csv')
# already_done = already_done['imdb_id'].dropna().values
# already_done = [imdb_id for imdb_id in already_done if imdb_id != '-']
# id_list = [imdb_id for imdb_id in id_list if imdb_id not in already_done]

In [None]:
%%time
# Iterate IDs
all_data = pandas.DataFrame()
for imdb_id in id_list:
    # Set up soup
    url = 'https://www.imdb.com/title/' + imdb_id
    r = requests.get(url, timeout=5)
    soup = bs4.BeautifulSoup(r.text, 'html.parser')
    
    # Get data
    data = {}
    data['imdb_id'] = imdb_id
    data = add_title(soup, data)
    data = add_release_datetime(soup, data)
    data = add_genres(soup, data)
    data = add_studios(soup, data)
    data = add_directors(soup, data)
    data = add_writers(soup, data)
    data = add_actors(soup, data)
    data = add_duration(soup, data)
    data = add_mpaa_rating(soup, data)
    data = add_locations(soup, data)
    data = pandas.Series(data)
    
    # Append
    print([data['imdb_id'], data['title'], data['release_year']])
    all_data = all_data.append(data, ignore_index=True)

### Finalize

In [None]:
# Select and order the output columns.
# reindex (rather than all_data[[...]]) keeps this cell from raising KeyError
# when an optional field such as 'mpaa_rating' or 'locations' was never found
# for any scraped title; such a column is created and left empty (NaN).
all_data = all_data.reindex(columns=[
    'imdb_id', 'title', 'release_year', 'release_date', 'genres',
    'studios', 'directors', 'writers', 'actors',
    'duration', 'mpaa_rating', 'locations',
    # 'budget', 'opening_usa', 'total_usa', 'total_world',
    # 'plot', 'rating_imdb', 'rating_mc', 'rating_rt',
    # 'imdb_votes', 'type'
])

In [None]:
# Write the scraped table to CSV without the row index.
all_data.to_csv('raw_data/imdb_raw.csv', index=False)

In [None]:
# all_data.to_csv('raw_data/imdb_raw.csv', header=False, index=False, mode='a')