# Predicting Best Picture Winners & Nominees
*An Analysis by Sean Osier*

## Data Scraping with Selenium

In [117]:
# Import Dependencies
import pickle
import os
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.keys import Keys
import time

In [1]:
# Pickling functions
def pickle_it(data, filename, python_version=3):
    """
    Serialize an object to disk with pickle.

    In:
    data = the object you want to save
    filename = path of the file to write (opened in binary write mode, so an existing file is overwritten)
    python_version = passed straight through as the pickle protocol number (default 3)

    Out:
    Returns nothing; the pickled bytes are written to the file you specify
    """
    with open(filename, "wb") as handle:
        # Pickler(...).dump(...) is the object form of pickle.dump(...).
        writer = pickle.Pickler(handle, protocol=python_version)
        writer.dump(data)

def load_pickle(filename):
    """
    Read back an object that was saved with pickle.

    In:
    filename = path of the pickle file to read (e.g. "my_pickle.pkl")

    Out:
    The unpickled object stored in that file

    Note: unpickling can execute arbitrary code, so only load files you trust.
    """
    with open(filename, "rb") as handle:
        reader = pickle.Unpickler(handle)
        contents = reader.load()
    return contents

In [2]:
# Selenium class and methods
class Selenium(object):
    """
    Thin wrapper around a Chrome Selenium session that exposes the few commands needed for scraping:
    open a page, collect the text of matching elements, and shut the browser down.
    """

    def __init__(self, url, chromedriver="/Users/seanosier/Downloads/chromedriver", delay=2):
        """
        In:
        url = website you want to go to first when the browser / Selenium session opens
        chromedriver = path to the chromedriver executable (defaults to the original hard-coded location;
                       pass your own path instead of editing this file)
        delay = seconds to sleep after every page load

        Out:
        Starts browser / Selenium session and loads the first page
        """
        self.chromedriver = chromedriver
        # The path lives on the instance; referring to a bare `chromedriver` name here
        # (as the code previously did) raised NameError before the browser ever started.
        os.environ["webdriver.chrome.driver"] = self.chromedriver
        self.driver = webdriver.Chrome(self.chromedriver)
        self.delay = delay  # seconds
        self.open_page(url)

    def open_page(self, url):
        """
        In:
        url = webpage you want to go to

        Out:
        Goes to requested webpage, then sleeps for self.delay seconds
        """
        self.driver.get(url)
        time.sleep(self.delay)

    def find(self, tag, attr, value):
        """
        In:
        tag = HTML tag you want to find
        attr = attribute of the tag you want to check (e.g. href, class, id, etc.)
        value = substring the attribute must contain (e.g. "XYZ" in class="XYZ")

        Out:
        List of the text contents of every element on the current page matching the
        tag / attr / value combination (empty list when nothing matches)
        """
        # contains() makes this a substring match on the attribute, not an exact match.
        selector = "//%s[contains(@%s, '%s')]" % (tag, attr, value)
        return [item.text for item in self.driver.find_elements_by_xpath(selector)]

    def close(self):
        """
        In:
        None

        Out:
        Ends the Selenium session and closes the browser
        """
        # quit() ends the whole session (all windows plus the driver process);
        # close() would only close the current window and leave the driver running.
        self.driver.quit()

In [107]:
# Selenium functions
def selenium_single_scrape(urls, tag, attr, value):
    """
    Scrape the same tag / attr / value combination from each page, reusing one browser session.

    In:
    urls = list of webpage(s) you wish to scrape
    tag = HTML tag you want to find
    attr = attribute of the tag you want to check (e.g. href, class, id, etc.)
    value = attribute value you want to find (e.g. "XYZ" in class="XYZ")

    Out:
    Tuple of results with each tuple entry being the list of found contents for each url scraped,
    in the same order as urls. An empty urls gives an empty tuple and never opens a browser.
    """
    sel = None  # created lazily so that an empty urls never starts a browser
    all_results = []
    try:
        for url in urls:
            if sel is None:
                # The first url doubles as the page the session opens on.
                sel = Selenium(url)
            else:
                sel.open_page(url)
            all_results.append(sel.find(tag, attr, value))
    finally:
        # Always release the browser, even if a page load or lookup raised.
        if sel is not None:
            sel.close()
    return tuple(all_results)

def scrape_google_frequently_mentioned_bar_names(urls):
    """
    Collect the names shown in Google's "X frequently mentioned on the web" bar.

    In:
    urls = list of Google search results urls containing such a bar

    Out:
    Tuple with one entry per url, each entry being the list of texts found on that page
    """
    # The names are read from <div> elements whose class attribute contains "kltat".
    return selenium_single_scrape(urls, "div", "class", "kltat")

In [112]:
def scrape_directors_actors_actresses():
    """
    Scrape the frequently mentioned directors, actors and actresses from three Google searches.

    In:
    None

    Out:
    directors, actors, actresses = one list of names per search, in that order
    (the original author reports 51 names each; the actual count depends on what the page returns)
    """
    search_urls = [
        "https://www.google.com/#q=list%20of%20top%20directors",
        "https://www.google.com/#q=list%20of%20top%20actors",
        "https://www.google.com/#q=list%20of%20top%20actresses",
    ]
    # Results come back in url order, so they unpack as directors, actors, actresses.
    top_directors, top_actors, top_actresses = scrape_google_frequently_mentioned_bar_names(search_urls)
    return top_directors, top_actors, top_actresses

# Runs the scrape as soon as this cell executes (starts a Selenium browser session) and
# keeps the three name lists as module-level variables.
directors, actors, actresses = scrape_directors_actors_actresses()

In [None]:
"""Uncomment this when you want to save the data you scraped"""
# pickle_it(tuple([directors, actors, actresses]), "directors_actors_actresses.pkl")
!ls

In [122]:
def scrape_writers():
    """
    Scrape the frequently mentioned movie writers from a single Google search.

    In:
    None

    Out:
    writers = list of writer names found on the page
    (the original author reports 51 names; the actual count depends on what the page returns)
    """
    search_urls = ["https://www.google.com/#q=list%20of%20top%20movie%20writers"]
    per_page_results = scrape_google_frequently_mentioned_bar_names(search_urls)
    # Only one url was scraped, so the first entry is the list of writers.
    return per_page_results[0]

# Runs the scrape as soon as this cell executes (starts a Selenium browser session) and
# keeps the resulting list of names as a module-level variable.
writers = scrape_writers()

In [None]:
"""Uncomment this when you want to save the data you scraped"""
# pickle_it(writers, "writers.pkl")
!ls