# Predicting Best Picture Winners & Nominees
*An Analysis by Sean Osier*

## Data Scraping with Beautiful Soup

In [1]:
# Import Dependencies
import pickle
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import re

# For display
import pprint
import matplotlib.pyplot as plt
%matplotlib inline

In [524]:
# Pickling functions
def pickle_it(data, filename, python_version=3):
    """
    Save (pickle) an object to a file.

    In:
    data = the object you want to pickle (save)
    filename = path of the file to write; it is opened in binary write mode
    python_version = value handed to pickle as its `protocol` argument (default 3)

    Out:
    Returns nothing; the pickled data is written to the filename you specify
    """
    with open(filename, mode="wb") as out_file:
        pickler = pickle.Pickler(out_file, protocol=python_version)
        pickler.dump(data)

def load_pickle(filename):
    """
    Read a pickle file back into memory.

    In:
    filename = name of the pickle file you want to open (e.g. "my_pickle.pkl")

    Out:
    The object stored in the pickle file, ready to assign to a variable of your choice

    Note: unpickling can execute arbitrary code, so only open pickle files you trust.
    """
    with open(filename, mode="rb") as in_file:
        contents = pickle.load(in_file)
    return contents

In [720]:
def get_HTML(url, timeout=30):
    """
    Download a web page and parse it with Beautiful Soup.

    In:
    url = address of the website whose contents you want to scrape
    timeout = seconds to wait for the server before giving up (passed to requests.get);
              without one, a single stalled connection can hang a long scraping run forever

    Out:
    html = the page's HTML parsed into a BeautifulSoup object (using the "lxml" parser)

    Raises:
    requests.exceptions.HTTPError if the response status code is not in the 2xx range
    """
    response = requests.get(url, timeout=timeout)

    # Explicit check instead of `assert`: assertions are stripped when Python runs with -O,
    # which would silently let error pages through to the parser.
    if not 200 <= response.status_code < 300:
        raise requests.exceptions.HTTPError(
            "Request to %s failed with HTTP status %s" % (url, response.status_code),
            response=response,
        )

    return BeautifulSoup(response.text, "lxml")

### Scrape IMDB Movie Index (List) Pages

In [230]:
def get_num_movies_in_year(html):
    """
    Read the total result count from an IMDB movie index list page.

    In:
    html = HTML of an IMDB movie index list page

    Out:
    num_movies = the total number of movies in the IMDB movie index list across all pages
    """
    # The element with id "left" holds the results summary; its third
    # whitespace-separated token is the total, possibly with thousands commas.
    summary_tokens = html.find(id="left").text.split()
    total_token = summary_tokens[2]
    without_commas = total_token.replace(",", "")
    return int(without_commas)

In [257]:
# Exceptions raised when an optional piece of a result row is absent: a failed lookup
# returns None (AttributeError / TypeError on use), an HTML attribute is missing
# (KeyError), or a credit line is missing (IndexError).
_ROW_FIELD_MISSING = (AttributeError, TypeError, KeyError, IndexError)


def _row_class_text(node, css_class):
    """Return the text of the first element in `node` with class `css_class`, or "" if absent."""
    try:
        return node.find(class_=css_class).text
    except _ROW_FIELD_MISSING:
        return ""


def scrape_year_page(html):
    """
    Extract one row of data per movie from an IMDB movie year index list page.

    In:
    html = HTML of an IMDB movie year index list page

    Out:
    movie_data = a list of lists of movie data, each sub-list being a list of the following movie datapoints: title,
                 year, link, user_rating_long, user_rating_short, outline, director, starring, genre, pg_rating,
                 runtime

    Title, year and link are required (a row without them raises); every other datapoint falls back to "" when
    it is missing from the row. Only "field is missing" errors are treated that way, so Ctrl-C and unrelated
    failures are no longer swallowed.
    """
    # Extract relevant part of the page and pull out the td's for individual movies.
    # Rows are gathered even-first, then odd, so the output is not in page order.
    results_table = html.find(class_="results")
    even_trs = results_table.find_all(class_="even detailed")
    odd_trs = results_table.find_all(class_="odd detailed")
    tds = []
    for tr in even_trs + odd_trs:
        tds += tr.find_all(class_="title")

    movie_data = []
    for td in tds:
        # Movie title, year and link to the individual movie page (all required)
        anchor = td.find("a")
        title = anchor.text
        year = td.find(class_="year_type").text
        link = anchor["href"]

        # User (IMDB) rating: long form lives in a "title" attribute, short form is plain text
        try:
            user_rating_long = td.find(class_="rating rating-list")["title"]
        except _ROW_FIELD_MISSING:
            user_rating_long = ""
        user_rating_short = _row_class_text(td, "value")

        # Movie outline / summary
        outline = _row_class_text(td, "outline")

        # Director / Stars: the credit text has one line per role
        try:
            credit_lines = td.find(class_="credit").text[1:-1].strip().split("\n")
        except _ROW_FIELD_MISSING:
            credit_lines = []
        try:
            director = credit_lines[0][5:]   # drop the leading 5-character "Dir: " label
        except _ROW_FIELD_MISSING:
            director = ""
        try:
            starring = credit_lines[1].strip()[6:]   # drop the leading 6-character "With: " label
        except _ROW_FIELD_MISSING:
            starring = ""

        # Genre
        genre = _row_class_text(td, "genre")

        # Parental Guidance rating
        try:
            pg_rating = td.find(class_="certificate").find("span")["title"]
        except _ROW_FIELD_MISSING:
            pg_rating = ""

        # Movie runtime
        runtime = _row_class_text(td, "runtime")

        movie_data.append([title, year, link, user_rating_long, user_rating_short,
                           outline, director, starring, genre, pg_rating, runtime])

    return movie_data

In [258]:
def scrape_IMBD_movie_list(start_year, end_year):
    """
    Scrape every IMDB movie index list page for each year in a range.

    In:
    start_year = year to start scraping IMDB for
    end_year = last year to scrape from IMDB (inclusive)

    Out:
    movie_data = a list of lists of movie data, each sub-list being a list of the following movie datapoints: title,
                 year, link, user_rating_long, user_rating_short, outline, director, starring, genre, pg_rating,
                 runtime
    """
    page_url = "http://www.imdb.com/search/title?sort=moviemeter,asc&start=%s&title_type=feature&year=%s,%s"
    page_step = 50
    collected = []

    # For each year, walk through all of the index pages and scrape the movies on each one
    for year in range(start_year, end_year + 1):
        first_result = 1
        total_movies = None   # not known until the first page of the year has been fetched

        while total_movies is None or first_result <= total_movies:
            html = get_HTML(page_url % (first_result, year, year))
            if total_movies is None:
                total_movies = get_num_movies_in_year(html)
            collected += scrape_year_page(html)
            first_result += page_step

    return collected

In [None]:
"""Uncomment this if you want to scrape the data"""
# movie_data = scrape_IMBD_movie_list(1990, 2014)

In [None]:
# Check and make sure you have the expected number of movies in your results
# NOTE: `movie_data` is only defined if the commented-out scrape call in the cell above has been run
len(movie_data)

In [None]:
"""Uncomment this when you want to save the data you scraped"""
# pickle_it(movie_data, "movie_data.pkl")
!ls

### Scrape Wikipedia for Best Picture Winner and Nominees

In [359]:
def scrape_wiki_best_pic_page(html):
    """
    Pull the Best Picture winner and nominees out of every "wikitable" on the page.

    In:
    html = html for the wikipedia Best Picture page

    Out:
    nominee_data = list of [title, year, status] lists, where status is "W" for the Best Picture winner
                   and "N" for a nominee
    """
    nominee_data = []

    for award_table in html.find_all(class_="wikitable"):
        # The year is the text of the link inside the table caption
        ceremony_year = award_table.find("caption").find("big").find("a").text

        # Skip the first row of the table; the first row after it is treated as the winner
        film_rows = award_table.find_all("tr")[1:]
        nominee_data.extend([
            [row.find("td").find("a").text, ceremony_year, "W" if position == 0 else "N"]
            for position, row in enumerate(film_rows)
        ])

    return nominee_data

In [None]:
"""Uncomment these if you want to scrape the data"""
# wiki_html = get_HTML("https://en.wikipedia.org/wiki/Academy_Award_for_Best_Picture")
# nominees_and_winners_raw = scrape_wiki_best_pic_page(wiki_html)

# Check to make sure the appropriate number of movies were scraped
# NOTE: `nominees_and_winners_raw` is only defined once the commented-out scrape lines above have been run
len(nominees_and_winners_raw)

In [None]:
"""Uncomment this when you want to save the data you scraped"""
# pickle_it(nominees_and_winners_raw, "nominees_and_winners.pkl")
!ls

### Scrape Individual IMDB Movie Pages

In [823]:
# Exceptions that mean "this optional part of the page is absent": a failed lookup
# returned None, a regex did not match, or an expected list element is missing.
_PAGE_FIELD_MISSING = (AttributeError, TypeError, KeyError, IndexError)


def _detail_field(details_text, label, split_on=None, prefix=""):
    """
    Return the stripped text between `label` (+ `prefix`) and the next "|#|" marker in `details_text`.

    Returns "" when the label is absent or is not followed by a "|#|" marker. When `split_on` is given,
    a found value is returned as a list split on that separator.
    """
    match = re.search(re.escape(label) + re.escape(prefix) + r"(.*?)\|\#\|", details_text)
    if match is None:
        return ""
    value = match.group(1).strip()
    return value if split_on is None else value.split(split_on)


def scrape_individual_movie_page(html):
    """
    Extract the detailed datapoints from a single IMDB movie page.

    In:
    html = HTML for an individual IMDB movie page

    Out:
    List of movie data containing data points for: release_date, critic_rating, critic_rating_n, writer, country,
            language, budget, opening_weekend_gross, production_company, sound_mix, color, aspect_ratio

    Any datapoint that cannot be found is returned as "". writer, country, language, production_company,
    sound_mix and color are lists when found; the rest are strings.
    """
    # Release Date
    try:
        release_date = html.find(class_="infobar").find(class_="nobr").text.strip().split("\n")[0]
    except _PAGE_FIELD_MISSING:
        release_date = ""

    # Critic Rating (Metascore) and n-size
    try:
        rating_details = html.find(class_="star-box-details").text
    except _PAGE_FIELD_MISSING:
        rating_details = ""
    try:
        # Both values are blanked together if either pattern fails to match
        critic_rating = re.search(r"Metascore: .*/100", rating_details).group().split()[-1].split("/")[0]
        critic_rating_n = re.search(r"\|.*\n.*\n.*from\n.*Metacritic.com", rating_details).group().split("\n")[1].strip()
    except _PAGE_FIELD_MISSING:
        critic_rating = ""
        critic_rating_n = ""

    # Writer
    try:
        writer = [a.text for a in html.find(itemprop="creator").find_all("a")]
        # Drop a trailing link whose text mentions " credit" (it is not a writer name)
        if " credit" in writer[-1]:
            writer = writer[:-1]
    except _PAGE_FIELD_MISSING:
        writer = ""

    # Flatten the "titleDetails" section into one line, with "|#|" marking the end of each field
    try:
        details_text = html.find(id="titleDetails").text
        details_text = details_text.replace("\n\n", "|#|")
        details_text = details_text.replace("See full technical specs", "|#|")
        details_text = details_text.replace("\n", " ")
        details_text = details_text.replace("\t", " ")
        details_text = details_text.replace("  ", " ")
    except _PAGE_FIELD_MISSING:
        details_text = ""

    country = _detail_field(details_text, "Country:", split_on=" | ")
    language = _detail_field(details_text, "Language:", split_on=" | ")
    budget = _detail_field(details_text, "Budget:")
    opening_weekend_gross = _detail_field(details_text, "Opening Weekend:")

    # Production Company: the label is followed directly by a "|#|" marker, then the company list
    production_company = _detail_field(details_text, "Production Co:", split_on=", ", prefix="|#|")
    if production_company:
        last = production_company[-1]
        if "See more" in last:
            # Strip the trailing "See more »" link text from the final company name
            production_company[-1] = last.replace("See more", "").replace("\xa0", "").replace("»", "")
        production_company = [company.strip() for company in production_company]

    sound_mix = _detail_field(details_text, "Sound Mix:", split_on=" | ")
    color = _detail_field(details_text, "Color:", split_on=" | ")
    aspect_ratio = _detail_field(details_text, "Aspect Ratio:")

    return [release_date, critic_rating, critic_rating_n, writer, country, language,
            budget, opening_weekend_gross, production_company, sound_mix, color, aspect_ratio]

In [4]:
def scrape_multiple_individual_movie_pages(links, checkpoint_every=None,
                                           checkpoint_filename="detailed_movie_data.pkl"):
    """
    Scrape a list of individual IMDB movie pages, optionally saving progress along the way.

    In:
    links = list of links to individual IMDB movie pages to scrape
    checkpoint_every = if set (e.g. 1000), pickle the results gathered so far every time that many pages
                       have been scraped, and once more after the last page. Scraping individual pages is
                       quite time consuming, so this keeps you from losing everything if a page fails or
                       your connection is interrupted. Default None = never save (the original behavior).
    checkpoint_filename = file the checkpoints are pickled to (only used when checkpoint_every is set)

    Out:
    detailed_movie_data = list of list of movie data, with each sub-list corresponding to a single movie. Each sublist
                          contains the link followed by release_date, critic_rating, critic_rating_n, writer,
                          country, language, budget, opening_weekend_gross, production_company, sound_mix, color,
                          aspect_ratio
    """
    detailed_movie_data = []

    # Go through the list of links and scrape each individual page
    for links_visited, link in enumerate(links, start=1):
        movie_html = get_HTML(link)
        data = [link]
        data += scrape_individual_movie_page(movie_html)
        detailed_movie_data.append(data)

        # Periodic save so a failure on some later page does not lose everything
        if checkpoint_every and links_visited % checkpoint_every == 0:
            pickle_it(detailed_movie_data, checkpoint_filename)

    # Final save once all links have been visited (as another failsafe)
    if checkpoint_every:
        pickle_it(detailed_movie_data, checkpoint_filename)

    return detailed_movie_data

In [824]:
# Load individual IMDB page links to scrape from pickle
# NOTE(review): "all_links.pkl" is not created anywhere in this section; presumably it is built
# elsewhere from the links gathered by the index-page scrape. Confirm it exists before running.
all_links = load_pickle("all_links.pkl")

"""Uncomment these if you want to scrape the data"""
# detailed_movie_data = scrape_multiple_individual_movie_pages(all_links)

In [None]:
"""Uncomment this when you want to save the data you scraped"""
# pickle_it(detailed_movie_data, "detailed_movie_data.pkl")
!ls