# IMDB Crawler

In [1]:
import urllib
from bs4 import BeautifulSoup as bs

In [2]:
import urllib.request

## Search

In [3]:
# IMDb "find" URL template; the `{title}` placeholder is filled with
# `str.format` using the URL-quoted movie name.
TITLE_QUERY = ''.join([
    'http://www.imdb.com/find',
    '?q={title}&s=tt&ttype=ft&exact=true&ref_=fn_tt_ex',
])

In [4]:
movie_name="Black or White"

In [5]:
def convert_title(title):
    """Return *title* percent-encoded for use in a URL, in lower case.

    Note that the percent escapes themselves are lower-cased as well
    (for example ``%C3`` becomes ``%c3``).
    """
    quoted = urllib.parse.quote(title)
    return quoted.lower()

In [6]:
convert_title(movie_name)

'black%20or%20white'

In [7]:
# Build the search URL for the chosen title and parse the result page.
query = TITLE_QUERY.format(title=convert_title(movie_name))
# BeautifulSoup reads the whole response when it is constructed, so the HTTP
# connection can be closed as soon as parsing is done.
with urllib.request.urlopen(query) as response:
    search_res = bs(response, "html.parser")

In [8]:
res_table = search_res.find_all("table", {"class": "findList"})[0]

In [12]:
# For every row of the result table, report whether its markup mentions
# '2014', then show the row itself.
for result_row in res_table.find_all("tr"):
    mentions_2014 = '2014' in str(result_row)
    print(mentions_2014)
    print(result_row)

False
<tr class="findResult odd"> <td class="primary_photo"> <a href="/title/tt0086465/?ref_=fn_ft_tt_1"><img src="https://images-na.ssl-images-amazon.com/images/M/MV5BYTEzMjBiMzktMjQyMS00YzBhLTgzNWQtNzA0NmEwMTNmMDQ2XkEyXkFqcGdeQXVyNDk3NzU2MTQ@._V1_UX32_CR0,0,32,44_AL_.jpg"/></a> </td> <td class="result_text"> <a href="/title/tt0086465/?ref_=fn_ft_tt_1">Trading Places</a> (1983) <br/>aka <i>"Black or White"</i> </td> </tr>
True
<tr class="findResult even"> <td class="primary_photo"> <a href="/title/tt2883434/?ref_=fn_ft_tt_2"><img src="https://images-na.ssl-images-amazon.com/images/M/MV5BMTYyMzE2NTE5MV5BMl5BanBnXkFtZTgwNDI3ODI2MzE@._V1_UX32_CR0,0,32,44_AL_.jpg"/></a> </td> <td class="result_text"> <a href="/title/tt2883434/?ref_=fn_ft_tt_2">Black or White</a> (2014) </td> </tr>
False
<tr class="findResult odd"> <td class="primary_photo"> <a href="/title/tt6270540/?ref_=fn_ft_tt_3"><img src="http://ia.media-imdb.com/images/G/01/imdb/images/nopicture/32x44/film-3119741174._CB522736599_.p

In [9]:
first_row = res_table.find_all("tr")[0]

In [10]:
first_row

<tr class="findResult odd"> <td class="primary_photo"> <a href="/title/tt0078748/?ref_=fn_ft_tt_1"><img src="https://images-na.ssl-images-amazon.com/images/M/MV5BNDNhN2IxZWItNGEwYS00ZDNhLThiM2UtODU3NWJlZjBkYjQxXkEyXkFqcGdeQXVyMTQxNzMzNDI@._V1_UX32_CR0,0,32,44_AL_.jpg"/></a> </td> <td class="result_text"> <a href="/title/tt0078748/?ref_=fn_ft_tt_1">Alien</a> (1979) </td> </tr>

### Extracting the movie code

In [11]:
import re

In [12]:
MOVIE_CODE_REGEX = r'/title/([a-z0-9]+)/'

In [13]:
movie_code = re.findall(MOVIE_CODE_REGEX, str(first_row))[0]

In [14]:
movie_code

'tt0078748'

## Movie Profile

In [None]:
PROFILE_URL = 'http://www.imdb.com/title/{code}/' #?region=us

In [None]:
cur_profile_url = PROFILE_URL.format(code=movie_code)

In [None]:
# Fetch and parse the movie's profile page. Only the `urllib.request` module
# is imported in this notebook, so `urlopen` has to be reached through it
# (a bare `urlopen` raises NameError). The response is closed once parsed.
with urllib.request.urlopen(cur_profile_url) as response:
    prof_page = bs(response, "html.parser")

### Rating

In [None]:
prof_page.find_all("span", {"itemprop": "ratingValue"})

In [None]:
rating = float(prof_page.find_all("span", {"itemprop": "ratingValue"})[0].contents[0])

In [None]:
rating

In [None]:
rating_count = int(prof_page.find_all("span", {"itemprop": "ratingCount"})[0].contents[0].replace(',', ''))

In [None]:
rating_count

### Genres

In [None]:
genres = []

In [None]:
# Append the first child of every genre <span> on the profile page to `genres`.
genres.extend(
    tag.contents[0]
    for tag in prof_page.find_all("span", {"itemprop": "genre"})
)

In [None]:
genres

### Review counts

In [None]:
REVIEW_COUNT_REGEX = r'([0-9,]+) ([a-zA-Z]+)'

In [None]:
user_review_count = 0
critic_review_count = 0

In [None]:
# Each reviewCount <span> starts with text of the form "<number> <word>";
# the word decides which counter receives the (comma-stripped) number.
for span in prof_page.find_all("span", {"itemprop": "reviewCount"}):
    count_text, label = re.findall(REVIEW_COUNT_REGEX, span.contents[0])[0]
    if label == 'user':
        user_review_count = int(count_text.replace(',', ''))
    elif label == 'critic':
        critic_review_count = int(count_text.replace(',', ''))

In [None]:
user_review_count

In [None]:
critic_review_count

### Metascore

In [None]:
metascore = int(prof_page.find_all("div", {"class": "metacriticScore"})[0].contents[1].contents[0])

In [None]:
metascore

### Year

In [None]:
year = int(prof_page.find_all("span", {"id": "titleYear"})[0].contents[1].contents[0])

In [None]:
year

### Duration

In [None]:
MOVIE_DURATION_REGEX = r'PT([0-9]+)M'

In [None]:
duration_str = prof_page.find_all("time", {"itemprop": "duration"})[0]['datetime']

In [None]:
duration_in_minutes = int(re.findall(MOVIE_DURATION_REGEX, duration_str)[0])

In [None]:
duration_in_minutes

### Box office section

In [None]:
BOX_CONTENT_REGEX = r"<h3.*>Box Office</h3>([\s\S]+?)<h3"

In [None]:
box_contents = re.findall(BOX_CONTENT_REGEX, str(prof_page))[0]

In [None]:
box_contents

#### Budget

In [None]:
BUDGET_REGEX = r"<h4.*>Budget:</h4>\s*\$([0-9,]+)"

In [None]:
budget = int(re.findall(BUDGET_REGEX, box_contents)[0].replace(',', ''))

In [None]:
budget

#### Opening Weekend

In [None]:
from datetime import datetime

In [None]:
OPEN_DATE_REGEX = r"<h4.*>Opening Weekend:</h4>[\s\S]*?\([A-Z]+\)[\s\S]*?\(([0-9a-zA-Z\s]+)\)[\s\S]*?<h4"

In [None]:
open_date_str = re.findall(OPEN_DATE_REGEX, box_contents)[0]

In [None]:
open_date = datetime.strptime(open_date_str, "%d %B %Y").date()

In [None]:
open_date

In [None]:
OPEN_PROF_REGEX = r"<h4.*>Opening Weekend:</h4>\s*[\$\£]([0-9,]+)"

In [None]:
opening_weekend_income = int(re.findall(OPEN_PROF_REGEX, box_contents)[0].replace(',', ''))

In [None]:
opening_weekend_income

In [None]:
OPEN_PROF_CURRENCY_REGEX = r"<h4.*>Opening Weekend:</h4>\s*([\$\£])[0-9,]+"

In [None]:
opening_weekend_currency = re.findall(OPEN_PROF_CURRENCY_REGEX, box_contents)[0]
opening_weekend_currency

#### Gross

In [None]:
GROSS_DATE_REGEX = r"<h4.*>Gross:</h4>[\s\S]*?\(USA\)[\s\S]*?\(([0-9a-zA-Z\s]+)\)"

In [None]:
gross_date_str = re.findall(GROSS_DATE_REGEX, box_contents)[0]

In [None]:
gross_date = datetime.strptime(gross_date_str, "%d %B %Y").date()

In [None]:
gross_date

In [None]:
GROSS_REGEX = r"<h4.*>Gross:</h4>\s*\$([0-9,]+)[\s\S]*?\(USA\)"

In [None]:
gross = int(re.findall(GROSS_REGEX, box_contents)[0].replace(',', ''))

In [None]:
gross

## Ratings page

In [None]:
# URL template of a title's ratings page; `{code}` is the IMDb title code.
RATINGS_URL = 'http://www.imdb.com/title/{code}/ratings'
cur_ratings_url = RATINGS_URL.format(code=movie_code)
# Only the `urllib.request` module is imported, so a bare `urlopen` would
# raise NameError; the response is closed as soon as it has been parsed.
with urllib.request.urlopen(cur_ratings_url) as response:
    ratings_page = bs(response, "html.parser")

In [None]:
tables = ratings_page.find_all("table")

In [None]:
def extract_table(table):
    """Return the text of a table's cells as a list of rows.

    The first ``<tr>`` of *table* is skipped; every remaining row becomes a
    list holding the ``get_text()`` of each of its ``<td>`` cells.
    """
    body_rows = table.find_all("tr")[1:]
    return [
        [cell.get_text() for cell in tr.find_all("td")]
        for tr in body_rows
    ]

### Rating Frequency

In [None]:
hist_table = tables[0]

In [None]:
hist_content = extract_table(hist_table)

In [None]:
# Map the integer in each row's third cell to the integer in its first cell.
rating_freq = {int(cells[2]): int(cells[0]) for cells in hist_content}
rating_freq

### Demographic breakdown

In [None]:
demog_table = tables[1]
demog_content = extract_table(demog_table)
demog_content

In [None]:
votes_per_demo = {}
avg_rating_per_demo = {}

In [None]:
# Fill the per-demographic vote-count and average-rating dicts from the
# demographic table rows (cell 0: label, cell 1: votes, cell 2: rating).
for row in demog_content:
    try:
        demographic = row[0].strip()
        votes = int(row[1])
        avg_rating = float(row[2])
    except IndexError:
        # Rows with fewer than three cells carry no complete record; skip
        # them entirely so the two dicts always share the same keys.
        continue
    votes_per_demo[demographic] = votes
    avg_rating_per_demo[demographic] = avg_rating
print(votes_per_demo)
print(avg_rating_per_demo)

## Business page

In [None]:
# URL template of a title's business page; `{code}` is the IMDb title code.
BUSINESS_URL = 'http://www.imdb.com/title/{code}/business'
cur_business_url = BUSINESS_URL.format(code=movie_code)
# Only the `urllib.request` module is imported, so a bare `urlopen` would
# raise NameError; the response is closed as soon as it has been parsed.
with urllib.request.urlopen(cur_business_url) as response:
    busi_page = bs(response, "html.parser")
# The regex-based extraction in the following cells works on the raw markup.
busi_str = str(busi_page)

In [None]:
# #### Budget
# BUDGET_REGEX = r"<h5>Budget</h5>\n\s*\$([0-9,]+)"
# budget_dollar = int(re.findall(BUDGET_REGEX, busi_str)[0].replace(',', ''))

#### Number of screens (weekends)

In [None]:
WEEKEND_CONTENT_REGEX = r"<h5>Weekend Gross</h5>([\s\S]+?)<h5>"
weekend_contents = re.findall(WEEKEND_CONTENT_REGEX, busi_str)[0]
weekend_contents

In [None]:
US_OPEN_WEEKEND_REGEX = r"\$[\s\S]*?\(USA\)[\s\S]*?\(([0-9,]*) Screens\)"
num_screens_list = [int(match.replace(',','')) for match in re.findall(US_OPEN_WEEKEND_REGEX, weekend_contents)]
num_screens_list

In [None]:
import math

In [None]:
# Summary statistics over the per-weekend screen counts.
max_screens = max(num_screens_list)
num_weekends = len(num_screens_list)
avg_screens = sum(num_screens_list) / num_weekends

In [None]:
# ### Gross Earnings
# GROSS_CONTENT_REGEX = r"<h5>Gross</h5>([\s\S]+?)<h5>"
# gross_contents = re.findall(GROSS_CONTENT_REGEX, busi_str)[0]
# GROSS_REGEX = r"<h5>Gross</h5>\n\s*\$([0-9,]+)\s*\(USA\)"
# gross_inc_dollar = int(re.findall(GROSS_REGEX, busi_str)[0].replace(',', ''))

## Release Info Page

In [18]:
# URL template of a title's release-info page; `{code}` is the IMDb title code.
RELEASE_URL = 'http://www.imdb.com/title/{code}/releaseinfo'
cur_release_url = RELEASE_URL.format(code=movie_code)
# BeautifulSoup reads the whole response when it is constructed, so the HTTP
# connection can be closed as soon as parsing is done.
with urllib.request.urlopen(cur_release_url) as response:
    release_page = bs(response, "html.parser")

In [38]:
release_table = release_page.find_all("table", {"id": "release_dates"})[0]

In [51]:
# Keep the markup of every release-table row (header row excluded) that
# mentions 'USA'.
row_markup = (str(tr) for tr in release_table.find_all("tr")[1:])
us_rows = [markup for markup in row_markup if 'USA' in markup]

In [68]:
# Matches a release-table row mentioning USA and captures (day, month name,
# year); the row must also contain an empty <td></td> cell after the date.
# Written as a raw string: `\s`, `\S` and `\d` are not valid escapes in a
# normal string literal and trigger an invalid-escape warning there.
USA_ROW_REGEX = r"<tr[\s\S]*?USA[\s\S]*?(\d\d?)\s+([a-zA-Z]+)[\s\S]*?(\d\d\d\d)[\s\S]*?<td></td>[\s\S]*?</tr>"

In [73]:
# Take the release day / month name / year from every USA row that matches
# the pattern from its start; a later matching row overwrites an earlier one.
for row in us_rows:
    matched = re.match(USA_ROW_REGEX, row)
    if matched:
        release = matched.groups()
        release_day = int(release[0])
        release_month = release[1]
        release_year = int(release[2])

In [74]:
release_day

22

In [75]:
release_month

'June'

In [76]:
release_year

1979

# Uniting Dataframes

In [None]:
import pandas as pd

In [None]:
df = pd.read_csv('/Users/shaypalachy/clones/rotten_needles/data/movie_profiles.csv')

In [None]:
df.columns

In [None]:
import os
from rotten_needles.imdb_crawl.jsondate import (load, dump)

In [None]:
# Load every saved movie profile into one DataFrame (one row per file).
PROFILES_DIR = '/Users/shaypalachy/clones/rotten_needles/data/movie_profiles'
profiles = []
# os.listdir returns names in arbitrary order; sorting keeps the DataFrame's
# row order reproducible between runs.
for profile_file in sorted(os.listdir(PROFILES_DIR)):
    print('Reading {}'.format(profile_file))
    file_path = os.path.join(PROFILES_DIR, profile_file)
    with open(file_path, 'r') as json_file:
        # Each file is parsed with the project's jsondate `load`.
        profiles.append(load(json_file))
df = pd.DataFrame(profiles)

In [None]:
# Show the first record. `.ix` has been removed from pandas; with the default
# integer index used here, `.iloc[0]` selects the same row.
df.iloc[0]

In [None]:
# Demographic labels in human-readable form.
# NOTE(review): the following cell rebinds DEMOGRAPHICS to the snake_case
# variants of these labels, so this value is overwritten when the notebook is
# run top to bottom.
DEMOGRAPHICS = ['Aged under 18', 'Males under 18', 'Males Aged 45+', 'Females', 'Males Aged 18-29', 'IMDb staff', 'IMDb users', 'Males', 'Aged 30-44', 'Females Aged 45+', 'Aged 18-29', 'Females Aged 18-29', 'Aged 45+', 'Males Aged 30-44', 'Top 1000 voters', 'Females under 18', 'Females Aged 30-44', 'US users', 'Non-US users']

In [None]:
# Demographic keys in snake_case; passed below as the allowed sub-columns
# when the per-demographic dict columns are decomposed.
DEMOGRAPHICS = [
    'aged_under_18',
    'males_under_18',
    'males_aged_45+',
    'females',
    'males_aged_18-29',
    'imdb_staff',
    'imdb_users',
    'males',
    'aged_30-44',
    'females_aged_45+',
    'aged_18-29',
    'females_aged_18-29',
    'aged_45+',
    'males_aged_30-44',
    'top_1000_voters',
    'females_under_18',
    'females_aged_30-44',
    'us_users',
    'non-us_users',
]

In [None]:
def _parse_string(string):
    """Return *string* lower-cased, stripped, with each space turned into '_'."""
    normalized = string.lower().strip()
    return '_'.join(normalized.split(' '))

In [None]:
def decompose_dict_column(df, colname, allowed_cols):
    """Replace a column of dicts with one column per allowed dict key.

    Args:
        df: DataFrame holding the column to expand; it is not modified.
        colname: name of the column whose values are expanded via
            ``pd.Series``.
        allowed_cols: keys to keep; every other expanded column is dropped.

    Returns:
        A new DataFrame without ``colname``, followed by the kept columns,
        each named ``"<colname>.<key>"``.
    """
    expanded = df[colname].apply(pd.Series)
    rejected = [key for key in expanded.columns if key not in allowed_cols]
    expanded = expanded.drop(rejected, axis=1)
    expanded.columns = [colname + '.' + key for key in expanded.columns]
    remaining = df.drop([colname], axis=1)
    return pd.concat([remaining, expanded], axis=1)

In [None]:
decompose_dict_column(df, 'avg_rating_per_demo', DEMOGRAPHICS);

In [None]:
decompose_dict_column(df, 'votes_per_demo', DEMOGRAPHICS);

In [None]:
decompose_dict_column(df, 'rating_freq', [str(i) for i in range(1,11)])

In [None]:
# Distinct genres across all non-missing genre lists in the frame.
genre_set = {
    genre
    for genre_list in df.genres.dropna()
    for genre in genre_list
}
genre_set

In [None]:
def dummy_list_column(df, colname):
    """Expand a column of value lists into one 0/1 indicator column per value.

    Args:
        df: DataFrame holding the list column. It is left unmodified; the
            conversion is done on a copy.
        colname: name of the column whose entries are lists of values.

    Returns:
        A new DataFrame in which ``colname`` is replaced by one column per
        distinct value (named ``"<colname>.<value>"`` by
        ``decompose_dict_column``), holding 1 where the row's list contains
        the value and 0 otherwise.
    """
    value_set = {
        value
        for value_list in df[colname].dropna()
        for value in value_list
    }

    def value_list_to_dict(value_list):
        try:
            return {value: int(value in value_list) for value in value_set}
        except TypeError:
            # Entries that are not containers (e.g. a missing value) cannot
            # be tested for membership; they get 0 for every value.
            return {value: 0 for value in value_set}

    # Convert on a copy so the caller's frame keeps its original list column.
    converted = df.copy()
    converted[colname] = converted[colname].apply(value_list_to_dict)
    return decompose_dict_column(converted, colname, list(value_set))

In [None]:
dummy_list_column(df, 'genres')

In [None]:
df.replace?

In [None]:
# NOTE(review): `genre_list_to_dict` is not defined anywhere in the visible
# notebook (the similar helper `value_list_to_dict` is local to
# `dummy_list_column`), so this cell presumably raises NameError — confirm
# whether it is a leftover and can be removed.
df['genres'] = df['genres'].apply(genre_list_to_dict)