# Metacritic Crawler

In [94]:
import re
import urllib
import urllib.request
from datetime import datetime

from bs4 import BeautifulSoup as bs

## Search

In [14]:
# Title of the movie to look up; passed to the search helper below.
movie_name = "Kingsman: The Secret Service"

In [15]:
# Regex character class of punctuation stripped from a title before it is
# used in a search URL.  Written as a raw string so the backslashes reach the
# regex engine verbatim instead of being treated as (invalid) string escape
# sequences, which newer Python versions warn about.
CHARS_TO_REMOVE = r"[\:\;,\.'/\!]"

In [16]:
def _parse_name_for_search(movie_name):
    """Convert a movie title into the segment inserted into the search URL.

    Characters matched by ``CHARS_TO_REMOVE`` are dropped, then the remainder
    is percent-encoded with spaces written as ``+``.

    Args:
        movie_name: The human-readable movie title.

    Returns:
        An ASCII-only string that is safe to embed in a URL.
    """
    # Local import keeps this block self-contained; ``urllib.parse`` is part
    # of the standard library.
    from urllib.parse import quote_plus

    stripped = re.sub(CHARS_TO_REMOVE, '', movie_name)
    # quote_plus turns spaces into '+' (as before) and additionally escapes
    # non-ASCII and reserved characters, which would otherwise produce an
    # invalid request URL.
    return quote_plus(stripped)

In [40]:
# Site root; relative links taken from result pages are appended to it.
METACRITIC_URL = "http://www.metacritic.com"

# Search endpoint template; ``{movie_name}`` is filled in with the
# URL-ready movie title.
SEARCH_URL = (
    "http://www.metacritic.com/search/all/{movie_name}/results"
    "?cats%5Bmovie%5D=1&search_type=advanced"
)

# Request headers sent with every page fetch.
_HEADERS = {"User-Agent": "Mozilla/5.0"}

In [43]:
def _get_movie_url_by_name(movie_name):
    """Return the Metacritic page URL of the first search result for a title.

    Fetches the search results page for ``movie_name`` and takes the first
    link inside the element marked as the first result.

    Args:
        movie_name: The human-readable movie title to search for.

    Returns:
        ``METACRITIC_URL`` concatenated with the ``href`` of the first link
        of the first search result.

    Raises:
        IndexError: If the results page has no first-result element, or that
            element contains no link.
    """
    query = SEARCH_URL.format(movie_name=_parse_name_for_search(movie_name))
    request = urllib.request.Request(query, headers=_HEADERS)
    # Parse inside ``with`` so the HTTP response is closed once it is read.
    with urllib.request.urlopen(request) as response:
        search_res = bs(response, "html.parser")

    first_results = search_res.find_all("li", {"class": "result first_result"})
    if not first_results:
        # Same exception type as the bare ``[0]`` lookup, but with context.
        raise IndexError(
            "no search result found for {!r}".format(movie_name))
    links = first_results[0].find_all("a")
    if not links:
        raise IndexError(
            "first search result for {!r} contains no link".format(movie_name))
    return METACRITIC_URL + links[0]['href']

In [45]:
# Resolve the movie's page URL from its title via the search helper, then
# display the result.
movie_url = _get_movie_url_by_name(movie_name)
movie_url

'http://www.metacritic.com/movie/kingsman-the-secret-service'

## Critics Reviews Page

In [38]:
# Path suffix appended to a movie page URL to reach its critic reviews.
CRITICS_REVIEWS_URL_SUFFIX = "/critic-reviews"

In [46]:
# Full URL of the critic reviews page for the selected movie.
critics_url = movie_url + CRITICS_REVIEWS_URL_SUFFIX

In [47]:
# Download and parse the critic reviews page.  The response is used as a
# context manager so the underlying connection is closed once it is parsed.
critics_request = urllib.request.Request(critics_url, headers=_HEADERS)
with urllib.request.urlopen(critics_request) as critics_response:
    critics_page = bs(critics_response, "html.parser")

In [48]:
# Class strings used to locate the overall metascore element: one shared
# prefix followed by each of the three rating buckets.
SCORE_CLASSES = [
    "metascore_w larger movie " + bucket
    for bucket in ("positive", "mixed", "negative")
]

In [54]:
# Take the first span carrying one of the score classes and convert its
# first child (the score text) to an integer, then display it.
metascore_span = critics_page.find_all("span", {"class": SCORE_CLASSES})[0]
metascore = int(metascore_span.contents[0])
metascore

58

In [117]:
# Maps each three-letter month abbreviation to the full month name, so that
# dates can be parsed with the ``%B`` strptime directive.
MONTH_SHORTHAND_MAP = {
    full_name[:3]: full_name
    for full_name in (
        "January", "February", "March", "April", "May", "June",
        "July", "August", "September", "October", "November", "December",
    )
}

In [111]:
def _parse_date_str(date_str):
    """Expand an abbreviated month name in a date string to its full name.

    Only whole-word abbreviations are replaced, so a string that already
    contains a full month name (e.g. ``"January 5, 2015"``) is not mangled
    into ``"Januaryuary 5, 2015"``.

    Args:
        date_str: A date string such as ``"Feb 13, 2015"``.

    Returns:
        The string with the first matching abbreviation from
        ``MONTH_SHORTHAND_MAP`` replaced by the full month name, or
        ``date_str`` unchanged when no abbreviation occurs as a whole word.
    """
    for short, full in MONTH_SHORTHAND_MAP.items():
        # \b anchors keep "Jan" from matching inside "January".
        pattern = r"\b" + re.escape(short) + r"\b"
        expanded, replaced = re.subn(pattern, full, date_str)
        if replaced:
            return expanded
    # No abbreviation found: hand the input back instead of returning None,
    # so callers always receive a string.
    return date_str

In [137]:
def _get_critic_review_props(review):
    """Extract the properties of one critic review element.

    Args:
        review: A parsed ``div`` element with class ``review`` taken from the
            critic reviews page.

    Returns:
        A dict with keys ``review_date`` (``datetime.date``), ``score``
        (int), ``summary`` (str), and ``publication`` / ``critic`` (the first
        child of the matching link, or ``None`` when no such link exists).

    Raises:
        IndexError: If the date, score or summary element is missing.
        ValueError: If the date or score text cannot be parsed.
    """
    review_props = {}
    date_str = review.find_all("span", {"class": "date"})[0].contents[0]
    date_str = _parse_date_str(date_str)
    review_props['review_date'] = datetime.strptime(date_str, "%B %d, %Y").date()
    review_props['score'] = int(review.find_all("div", {"class": "metascore_w"})[0].contents[0])
    review_props['summary'] = review.find_all('a', {'class': 'no_hover'})[0].contents[0].strip()
    review_props['publication'] = None
    review_props['critic'] = None
    for link in review.find_all("a"):
        # Anchors without an href or without any content cannot name a
        # publication or critic; skip them instead of raising and losing the
        # whole review.
        href = link.get('href') or ''
        if not href or not link.contents:
            continue
        if 'publication' in href:
            review_props['publication'] = link.contents[0]
        if 'critic' in href:
            review_props['critic'] = link.contents[0]
    return review_props

In [124]:
# Build the list of parsed critic reviews.  Any element that fails to parse
# is skipped on purpose (best-effort scraping).
reviews = []
for review in critics_page.find_all("div", {"class": "review"}):
    try:
        parsed_review = _get_critic_review_props(review)
    except Exception:
        pass
    else:
        reviews.append(parsed_review)

In [128]:
# Number of critic reviews that were parsed successfully.
len(reviews)

39

## User Reviews Page

In [129]:
# Path suffix appended to a movie page URL to reach its user reviews.
# NOTE(review): ``page=0`` is hard-coded, so presumably only the first page
# of user reviews is fetched — confirm whether pagination is needed.
USERS_REVIEWS_URL_SUFFIX = "/user-reviews?page=0"

In [130]:
# Full URL of the user reviews page for the selected movie.
users_url = movie_url + USERS_REVIEWS_URL_SUFFIX

In [131]:
# Download and parse the user reviews page.  The response is used as a
# context manager so the underlying connection is closed once it is parsed.
users_request = urllib.request.Request(users_url, headers=_HEADERS)
with urllib.request.urlopen(users_request) as users_response:
    users_page = bs(users_response, "html.parser")

In [None]:
# Reference note (markup seen on the page): <div class="review pad_top1">

In [134]:
# All ``div`` elements on the user reviews page matched by class "review".
review_elements = users_page.find_all("div", {"class": "review"})

In [170]:
def _get_user_review_props(review):
    """Extract the properties of one user review element.

    Args:
        review: A parsed ``div`` element with class ``review`` taken from the
            user reviews page.

    Returns:
        A dict with keys ``review_date`` (``datetime.date``), ``score``
        (int), ``text`` (str), ``user``, ``total_reactions`` (int),
        ``pos_reactions`` (int) and ``neg_reactions`` (total minus positive).
    """
    def first(tag_name, css_class):
        # First descendant with the given tag and class; raises IndexError
        # when the review has no such element.
        return review.find_all(tag_name, {'class': css_class})[0]

    raw_date = first("span", "date").contents[0]
    review_date = datetime.strptime(_parse_date_str(raw_date), "%B %d, %Y").date()
    score = int(first("div", "metascore_w").contents[0])

    # Prefer the expanded blurb; fall back to the review body when the
    # expanded blurb is absent or empty.
    try:
        text = first('span', 'blurb blurb_expanded').contents[0].strip()
    except IndexError:
        text = first('div', 'review_body').contents[1].contents[0].strip()

    user = first('span', 'author').contents[0].contents[0]
    total = int(first('span', 'total_count').contents[0])
    positive = int(first('span', 'yes_count').contents[0])

    return {
        'review_date': review_date,
        'score': score,
        'text': text,
        'user': user,
        'total_reactions': total,
        'pos_reactions': positive,
        'neg_reactions': total - positive,
    }

In [171]:
# Accumulator for the parsed user reviews; filled by the loop below.
user_reviews = []

In [172]:
# Parse every user review element.  Any element that fails to parse is
# skipped on purpose (best-effort scraping).
for review in review_elements:
    try:
        parsed_review = _get_user_review_props(review)
    except Exception:
        pass
    else:
        user_reviews.append(parsed_review)

In [173]:
# Number of user reviews that were parsed successfully.
len(user_reviews)

100