# IMDB Crawler

In [85]:
from urllib.request import urlopen
from bs4 import BeautifulSoup as bs

## Search

In [86]:
TITLE_QUERY = (
    'http://www.imdb.com/find'
    '?q={title}&s=tt&ttype=ft&exact=true&ref_=fn_tt_ex'
)

In [376]:
movie_name="Amélie"

In [377]:
def convert_title(title):
    return urllib.parse.quote(title).lower()

In [378]:
convert_title(movie_name)

'am%c3%a9lie'

In [372]:
query = TITLE_QUERY.format(title=convert_title(movie_name))
search_res = bs(urlopen(query), "html.parser")

In [373]:
res_table = search_res.find_all("table", {"class": "findList"})[0]

In [374]:
first_row = res_table.find_all("tr")[0]

In [375]:
first_row

<tr class="findResult odd"> <td class="primary_photo"> <a href="/title/tt0063522/?ref_=fn_ft_tt_1"><img src="https://images-na.ssl-images-amazon.com/images/M/MV5BMjE3NzE4NzkyNl5BMl5BanBnXkFtZTgwNTYyODgwNzE@._V1_UX32_CR0,0,32,44_AL_.jpg"/></a> </td> <td class="result_text"> <a href="/title/tt0063522/?ref_=fn_ft_tt_1">Rosemary's Baby</a> (1968) </td> </tr>

### Extracting the movie code

In [342]:
import re

In [343]:
MOVIE_CODE_REGEX = r'/title/([a-z0-9]+)/'

In [344]:
movie_code = re.findall(MOVIE_CODE_REGEX, str(first_row))[0]

In [345]:
movie_code

'tt6304046'

## Movie Profile

In [346]:
PROFILE_URL = 'http://www.imdb.com/title/{code}/' #?region=us

In [347]:
cur_profile_url = PROFILE_URL.format(code=movie_code)

In [348]:
prof_page = bs(urlopen(cur_profile_url), "html.parser")

### Rating

In [351]:
prof_page


<!DOCTYPE html>

<html xmlns:fb="http://www.facebook.com/2008/fbml" xmlns:og="http://ogp.me/ns#">
<head>
<meta charset="utf-8">
<meta content="IE=edge" http-equiv="X-UA-Compatible">
<meta content="app-id=342792525, app-argument=imdb:///title/tt6304046?src=mdot" name="apple-itunes-app">
<script type="text/javascript">var ue_t0=window.ue_t0||+new Date();</script>
<script type="text/javascript">
                var ue_mid = "A1EVAM02EL8SFB"; 
                var ue_sn = "www.imdb.com";  
                var ue_furl = "fls-na.amazon.com";
                var ue_sid = "000-0000000-0000000";
                var ue_id = "1C2KMM27XVPKCFY5FTRB";
                (function(e){var c=e;var a=c.ue||{};a.main_scope="mainscopecsm";a.q=[];a.t0=c.ue_t0||+new Date();a.d=g;function g(h){return +new Date()-(h?0:a.t0)}function d(h){return function(){a.q.push({n:h,a:arguments,t:a.d()})}}function b(m,l,h,j,i){var k={m:m,f:l,l:h,c:""+j,err:i,fromOnError:1,args:arguments};c.ueLogError(k);return false}b.skipTra

In [350]:
prof_page.find_all("span", {"itemprop": "ratingValue"})

[]

In [349]:
rating = float(prof_page.find_all("span", {"itemprop": "ratingValue"})[0].contents[0])

IndexError: list index out of range

In [None]:
rating

In [306]:
rating_count = int(prof_page.find_all("span", {"itemprop": "ratingCount"})[0].contents[0].replace(',', ''))

In [307]:
rating_count

289889

### Genres

In [308]:
genres = []

In [309]:
for span in prof_page.find_all("span", {"itemprop": "genre"}):
    genres.append(span.contents[0])

In [310]:
genres

['Drama', 'Western']

### Review counts

In [311]:
REVIEW_COUNT_REGEX = r'([0-9,]+) ([a-zA-Z]+)'

In [312]:
user_review_count = 0
critic_review_count = 0

In [313]:
for span in prof_page.find_all("span", {"itemprop": "reviewCount"}):
    span_str = span.contents[0]
    res = re.findall(REVIEW_COUNT_REGEX, span_str)[0]
    if res[1] == 'user':
        user_review_count = int(res[0].replace(',', ''))
    elif res[1] == 'critic':
        critic_review_count = int(res[0].replace(',', ''))

In [314]:
user_review_count

502

In [315]:
critic_review_count

132

### Metascore

In [316]:
metascore = int(prof_page.find_all("div", {"class": "metacriticScore"})[0].contents[1].contents[0])

In [317]:
metascore

82

### Year

In [318]:
year = int(prof_page.find_all("span", {"id": "titleYear"})[0].contents[1].contents[0])

In [319]:
year

1992

### Duration

In [320]:
MOVIE_DURATION_REGEX = r'PT([0-9]+)M'

In [321]:
duration_str = prof_page.find_all("time", {"itemprop": "duration"})[0]['datetime']

In [322]:
duration_in_minutes = int(re.findall(MOVIE_DURATION_REGEX, duration_str)[0])

In [323]:
duration_in_minutes

131

### Box office section

In [324]:
BOX_CONTENT_REGEX = r"<h3.*>Box Office</h3>([\s\S]+?)<h3"

In [325]:
box_contents = re.findall(BOX_CONTENT_REGEX, str(prof_page))[0]

In [326]:
box_contents

'\n<div class="txt-block">\n<h4 class="inline">Budget:</h4>        $14,400,000        \n\n      <span class="attribute">(estimated)</span>\n</div>\n<div class="txt-block">\n<h4 class="inline">Gross:</h4>        $101,157,447        \n\n      <span class="attribute">(USA)</span>\n</div>\n<span class="see-more inline">\n<a href="business?ref_=tt_dt_bus" itemprop="url">See more</a>\xa0»\n  </span>\n<hr/>\n'

#### Budget

In [327]:
BUDGET_REGEX = r"<h4.*>Budget:</h4>\s*\$([0-9,]+)"

In [328]:
budget = int(re.findall(BUDGET_REGEX, box_contents)[0].replace(',', ''))

In [329]:
budget

14400000

#### Opening Weekend

In [330]:
from datetime import datetime

In [331]:
OPEN_DATE_REGEX = r"<h4.*>Opening Weekend:</h4>[\s\S]*?\([A-Z]+\)[\s\S]*?\(([0-9a-zA-Z\s]+)\)[\s\S]*?<h4"

In [332]:
open_date_str = re.findall(OPEN_DATE_REGEX, box_contents)[0]

IndexError: list index out of range

In [333]:
open_date = datetime.strptime(open_date_str, "%d %B %Y").date()

In [334]:
open_date

datetime.date(1994, 11, 11)

In [268]:
OPEN_PROF_REGEX = r"<h4.*>Opening Weekend:</h4>\s*[\$\£]([0-9,]+)"

In [269]:
opening_weekend_income = int(re.findall(OPEN_PROF_REGEX, box_contents)[0].replace(',', ''))

In [270]:
opening_weekend_income

4243233

In [287]:
OPEN_PROF_CURRENCY_REGEX = r"<h4.*>Opening Weekend:</h4>\s*([\$\£])[0-9,]+"

In [289]:
opening_weekend_income = re.findall(OPEN_PROF_CURRENCY_REGEX, box_contents)[0]

'£'

#### Gross

In [285]:
GROSS_DATE_REGEX = r"<h4.*>Gross:</h4>[\s\S]*?\(USA\)[\s\S]*?\(([0-9a-zA-Z\s]+)\)"

In [286]:
gross_date_str = re.findall(GROSS_DATE_REGEX, box_contents)[0]

IndexError: list index out of range

In [279]:
gross_date = datetime.strptime(gross_date_str, "%d %B %Y").date()

TypeError: strptime() argument 1 must be str, not tuple

In [280]:
gross_date

datetime.date(2011, 10, 28)

In [282]:
GROSS_REGEX = r"<h4.*>Gross:</h4>\s*\$([0-9,]+)[\s\S]*?\(USA\)"

In [283]:
gross = int(re.findall(GROSS_REGEX, box_contents)[0].replace(',', ''))

In [284]:
gross

107930000

In [158]:
# ## Business page
# BUSINESS_URL = 'http://www.imdb.com/title/{code}/business?ref_=tt_dt_bus'
# cur_business_url = BUSINESS_URL.format(code=MOVIE_CODE)
# busi_page = bs(urlopen(cur_business_url), "html.parser")
# busi_str = str(busi_page)
# #### Budget
# BUDGET_REGEX = r"<h5>Budget</h5>\n\s*\$([0-9,]+)"
# budget_dollar = int(re.findall(BUDGET_REGEX, busi_str)[0].replace(',', ''))
# ### Opening Weekend (USA)
# OPEN_WEEKEND_CONTENT_REGEX = r"<h5>Opening Weekend</h5>([\s\S]+?)<h5>"
# open_weekend_contents = re.findall(OPEN_WEEKEND_CONTENT_REGEX, busi_str)[0]
# US_OPEN_WEEKEND_REGEX = r"\$([0-9,]+)\s*\(USA\)"
# us_open_weekend = int(re.findall(US_OPEN_WEEKEND_REGEX, open_weekend_contents)[0].replace(',', ''))
# ### Gross Earnings
# GROSS_CONTENT_REGEX = r"<h5>Gross</h5>([\s\S]+?)<h5>"
# gross_contents = re.findall(GROSS_CONTENT_REGEX, busi_str)[0]
# GROSS_REGEX = r"<h5>Gross</h5>\n\s*\$([0-9,]+)\s*\(USA\)"
# gross_inc_dollar = int(re.findall(GROSS_REGEX, busi_str)[0].replace(',', ''))

## Ratings page

In [159]:
RATINGS_URL = 'http://www.imdb.com/title/{code}/ratings'
cur_ratings_url = RATINGS_URL.format(code=MOVIE_CODE)
ratings_page = bs(urlopen(cur_ratings_url), "html.parser")

In [160]:
tables = ratings_page.find_all("table")

In [161]:
def extract_table(table):
    content = []
    for row in table.find_all("tr")[1:]:
        content.append([td.get_text() for td in row.find_all("td")])
    return content

### Rating Frequency

In [162]:
hist_table = tables[0]

In [163]:
hist_content = extract_table(hist_table)

In [164]:
rating_freq = {}
for row in hist_content:
    rating_freq[int(row[2])] = int(row[0])
rating_freq

{1: 14142,
 2: 5014,
 3: 6656,
 4: 9297,
 5: 19323,
 6: 42557,
 7: 121004,
 8: 273966,
 9: 384290,
 10: 397719}

### Demographic breakdown

In [165]:
demog_table = tables[1]
demog_content = extract_table(demog_table)
demog_content

[[' Males ', '\xa0856845', '\xa08.8'],
 [' Females ', '\xa0151613', '\xa08.3'],
 [' Aged under 18 ', '\xa03878', '\xa08.5'],
 [' Males under 18 ', '\xa03197', '\xa08.6'],
 [' Females under 18 ', '\xa0652', '\xa08.3'],
 [' Aged 18-29 ', '\xa0419542', '\xa08.7'],
 [' Males Aged 18-29 ', '\xa0347236', '\xa08.7'],
 [' Females Aged 18-29 ', '\xa068657', '\xa08.2'],
 [' Aged 30-44 ', '\xa0451497', '\xa08.8'],
 [' Males Aged 30-44 ', '\xa0385615', '\xa08.8'],
 [' Females Aged 30-44 ', '\xa060572', '\xa08.4'],
 [' Aged 45+ ', '\xa093238', '\xa08.4'],
 [' Males Aged 45+ ', '\xa077458', '\xa08.5'],
 [' Females Aged 45+ ', '\xa014325', '\xa08.1'],
 [' IMDb staff ', '\xa098', '\xa08.7'],
 [' Top 1000 voters ', '\xa0942', '\xa08.2'],
 [' US users ', '\xa0212695', '\xa08.6'],
 [' Non-US users ', '\xa0574337', '\xa08.7'],
 ['\xa0'],
 [' IMDb users                         ', '\xa01273968', '\xa08.7']]

In [166]:
votes_per_demo = {}
avg_rating_per_demo = {}

In [167]:
for row in demog_content:
    try:
        votes_per_demo[row[0].strip()] = int(row[1])
        avg_rating_per_demo[row[0].strip()] = float(row[2])
    except IndexError:
        pass
print(votes_per_demo)
print(avg_rating_per_demo)

{'Top 1000 voters': 942, 'Aged 45+': 93238, 'Females Aged 18-29': 68657, 'US users': 212695, 'Males Aged 45+': 77458, 'Aged under 18': 3878, 'Aged 30-44': 451497, 'Males': 856845, 'Males under 18': 3197, 'IMDb staff': 98, 'Aged 18-29': 419542, 'Males Aged 30-44': 385615, 'IMDb users': 1273968, 'Females under 18': 652, 'Non-US users': 574337, 'Females Aged 45+': 14325, 'Males Aged 18-29': 347236, 'Females Aged 30-44': 60572, 'Females': 151613}
{'Top 1000 voters': 8.2, 'Aged 45+': 8.4, 'Females Aged 18-29': 8.2, 'US users': 8.6, 'Males Aged 45+': 8.5, 'Aged under 18': 8.5, 'Aged 30-44': 8.8, 'Males': 8.8, 'Males under 18': 8.6, 'IMDb staff': 8.7, 'Aged 18-29': 8.7, 'Males Aged 30-44': 8.8, 'IMDb users': 8.7, 'Females under 18': 8.3, 'Non-US users': 8.7, 'Females Aged 45+': 8.1, 'Males Aged 18-29': 8.7, 'Females Aged 30-44': 8.4, 'Females': 8.3}
