# Predicting Best Picture Winners & Nominees
*An Analysis by Sean Osier*

### Data Cleaning and Processing

In [143]:
# Import Dependencies
import pickle
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import re

# import csv
# from collections import defaultdict
# import dateutil.parser

# For display
import pprint
import matplotlib.pyplot as plt
%matplotlib inline

In [144]:
# Pickling functions
def pickle_it(data, filename, python_version=3):
    """Serialize ``data`` to the file at ``filename`` using pickle.

    ``python_version`` is handed to ``pickle.dump`` as the pickle protocol
    number (default 3). An existing file at ``filename`` is overwritten.
    """
    with open(filename, mode="wb") as sink:
        pickle.dump(data, sink, python_version)

def load_pickle(filename):
    """Return the object stored in the pickle file at ``filename``.

    NOTE: unpickling can execute arbitrary code, so only call this on files
    from a trusted source.
    """
    with open(filename, mode="rb") as source:
        restored = pickle.load(source)
    return restored

In [184]:
def process_IMDB_data(movie_data):
    """Turn raw scraped IMDB records into a cleaned, filtered DataFrame.

    Parameters
    ----------
    movie_data : sequence of 11-field records
        Fields, in order: title, year, link, user_rating_long,
        user_rating_short, outline, director, starring, genre, pg_rating,
        runtime. The code treats every field as a string, with "" standing
        in for a missing value.

    Returns
    -------
    pandas.DataFrame
        Columns: key, title, year, user_rating_short, user_rating_n,
        director, starring, genre, runtime, pg_rating, link.
        ``director``, ``starring`` and ``genre`` hold lists of strings; the
        rating and runtime columns are numeric with NaN where the source text
        was empty or unparseable. Rows whose year is 2015, 2016 or unknown
        ("????"), or whose runtime is known but outside 45-360, are dropped.
        The original row labels are preserved (the index is not reset).
    """
    headers = ["title", "year", "link", "user_rating_long", "user_rating_short",
               "outline", "director", "starring", "genre", "pg_rating", "runtime"]
    excluded_years = ["2015", "2016", "????", "???? "]
    min_runtime, max_runtime = 45, 360

    def vote_count_token(text):
        # The vote count is the 5th whitespace-separated token, written like
        # "(1,303,923"; strip the leading "(" and the thousands separators.
        tokens = text.split()
        return tokens[4][1:].replace(",", "") if len(tokens) > 4 else ""

    def first_token(text):
        # Leading token of e.g. "148 mins."; "" for blank input.
        tokens = text.split()
        return tokens[0] if tokens else ""

    # .copy() gives an independent frame so the column assignments below do
    # not write into a slice of the de-duplicated data.
    df = pd.DataFrame(movie_data, columns=headers).drop_duplicates().copy()

    # Some initial cleaning: normalise one malformed year label.
    df["year"] = df["year"].replace("(2012 Documentary)", "(2012)")

    # Key: "<title> (<year>)" -- the year still carries its parentheses here.
    df["key"] = df["title"] + " " + df["year"]

    # Year: strip the surrounding parentheses.
    df["year"] = df["year"].str[1:-1]

    # Link: make the relative link absolute.
    df["link"] = "http://www.imdb.com" + df["link"]

    # User rating. pd.to_numeric(errors="coerce") replaces the removed
    # Series.convert_objects(convert_numeric=True); unparseable text -> NaN.
    df["user_rating_short"] = pd.to_numeric(df["user_rating_short"], errors="coerce")

    # User rating n-size (number of votes).
    df["user_rating_n"] = pd.to_numeric(
        df["user_rating_long"].apply(vote_count_token), errors="coerce")

    # Director(s), lead actors and genres become lists. Plain str.split is
    # used on purpose: " | " must be treated literally, not as a regex.
    df["director"] = df["director"].apply(lambda text: text.split(", "))
    df["starring"] = df["starring"].apply(lambda text: text.split(", "))
    df["genre"] = df["genre"].apply(lambda text: text.split(" | "))

    # Runtime: keep the leading number only.
    df["runtime"] = pd.to_numeric(df["runtime"].apply(first_token), errors="coerce")

    # Remove unneeded columns (outline, user_rating_long) and fix the order.
    df = df[["key", "title", "year", "user_rating_short", "user_rating_n",
             "director", "starring", "genre", "runtime", "pg_rating", "link"]]

    # More cleaning: drop excluded/unknown years, then implausible runtimes
    # (rows with an unknown runtime are kept).
    df = df[~df["year"].isin(excluded_years)]
    runtime_ok = df["runtime"].between(min_runtime, max_runtime) | df["runtime"].isnull()
    return df[runtime_ok]

In [175]:
def process_nominee_and_winner_data(data):
    """Build the nominee/winner table from raw (title, year, status) records.

    Duplicate records are dropped, a handful of titles are respelled, and a
    ``key`` of the form "<title> (<year>)" is built. ``status_year`` is a copy
    of ``year``. Finally four keys are rewritten to carry the previous year
    (presumably so they line up with the keys of the movie table they are
    later merged with -- confirm against that data).

    Returns a DataFrame with columns key, title, year, status, status_year.
    ``year`` must hold strings, since it is concatenated into the key.
    """
    title_fixes = {
        "The Godfather Part III": "The Godfather: Part III",
        "Good Night, and Good Luck": "Good Night, and Good Luck.",
        "Precious: Based on the Novel \"Push\" by Sapphire": "Precious",
        "Extremely Loud and Incredibly Close": "Extremely Loud & Incredibly Close",
        "Birdman or (The Unexpected Virtue of Ignorance)":
            "Birdman: Or (The Unexpected Virtue of Ignorance)",
    }
    key_fixes = {
        "Il Postino: The Postman (1995)": "Il Postino: The Postman (1994)",
        "Life Is Beautiful (1998)": "Life Is Beautiful (1997)",
        "Crash (2005)": "Crash (2004)",
        "The Hurt Locker (2009)": "The Hurt Locker (2008)",
    }

    nominees = pd.DataFrame(data, columns=["title", "year", "status"])
    nominees.drop_duplicates(inplace=True)

    # Respell titles first so the key is built from the corrected title.
    nominees["title"] = nominees["title"].replace(title_fixes)
    nominees["key"] = nominees["title"] + " (" + nominees["year"] + ")"

    # Status year (year nominated for) starts out equal to the listed year.
    nominees["status_year"] = nominees["year"]

    nominees = nominees[["key", "title", "year", "status", "status_year"]]

    # Only the key is rewritten; year and status_year keep the listed year.
    nominees["key"] = nominees["key"].replace(key_fixes)

    return nominees

In [171]:
def merge_in_nominees_and_wins(df, nom_win_df, rows_to_clear=(10186, 57111)):
    """Left-join nominee/winner status onto the movie frame by ``key``.

    Parameters
    ----------
    df : pandas.DataFrame
        Movie frame containing a ``key`` column.
    nom_win_df : pandas.DataFrame
        Nominee/winner frame with ``key``, ``year``, ``status`` and
        ``status_year`` columns; only rows with ``year`` >= 1990 are merged.
    rows_to_clear : iterable of row labels, optional
        Labels in the merged frame whose ``status`` and ``status_year`` are
        reset to NaN after the merge. The defaults are the two labels the
        notebook hard-coded (presumably unrelated movies that share a key with
        a nominee -- confirm against the data). Labels that are not present in
        the merged frame are ignored.

    Returns
    -------
    pandas.DataFrame
        ``df`` with ``status`` and ``status_year`` added; NaN where no nominee
        row matched.
    """
    recent = nom_win_df[nom_win_df["year"].astype(int) >= 1990]
    recent = recent[["key", "status", "status_year"]]

    new_df = pd.merge(df, recent, on="key", how="left")

    # Use a single .loc assignment: chained indexing such as
    # new_df["status"][label] = ... may write to a temporary copy and leave
    # new_df unchanged.
    present = [label for label in rows_to_clear if label in new_df.index]
    if present:
        new_df.loc[present, ["status", "status_year"]] = np.nan

    return new_df

In [172]:
# Load in raw scraped data from the pickle files in the working directory.
# movie_data is the input to process_IMDB_data; nominees_and_winners_raw is
# the input to process_nominee_and_winner_data.
movie_data = load_pickle("movie_data.pkl")
nominees_and_winners_raw = load_pickle("nominees_and_winners.pkl")

In [None]:
# Process the data: clean the IMDB records, clean the nominee/winner records,
# then left-join the nominee/winner status onto the movie frame by key.
df = process_IMDB_data(movie_data)
nom_win_df = process_nominee_and_winner_data(nominees_and_winners_raw)
df = merge_in_nominees_and_wins(df, nom_win_df)
# Row count of the merged frame, as a quick sanity check.
print(len(df))

In [193]:
# Collect every movie's absolute IMDB link as an array.
links = df["link"].values
# Dump of the links, left disabled (all_links.pkl already shows up in the
# directory listing, so presumably it was written once -- confirm).
# pickle_it(links, "all_links.pkl")
# List the working directory to see which pickle files exist.
!ls

Data_Processing.ipynb          detailed_movie_data3.pkl
Data_Scraping.ipynb            detailed_movie_data4.pkl
Selenium_Scraping.ipynb        detailed_movie_data5.pkl
all_links.pkl                  detailed_movie_data6.pkl
detailed_movie_data1.pkl       detailed_movie_data7.pkl
detailed_movie_data10.pkl      detailed_movie_data7b.pkl
detailed_movie_data11.pkl      detailed_movie_data8.pkl
detailed_movie_data12.pkl      detailed_movie_data9.pkl
detailed_movie_data13.pkl      directors_actors_actresses.pkl
detailed_movie_data14.pkl      movie_data.pkl
detailed_movie_data2.pkl       nominees_and_winners.pkl


In [190]:
# Spot-check the first row of the current frame.
df.head(1)

Unnamed: 0,key,title,year,user_rating_short,user_rating_n,director,starring,genre,runtime,pg_rating,link,status,status_year
0,Inception (2010),Inception,2010,8.8,1303923,[Christopher Nolan],"[Leonardo DiCaprio, Joseph Gordon-Levitt, Elle...","[Action, Mystery, Sci-Fi, Thriller]",148,PG_13,http://www.imdb.com/title/tt1375666/,N,2010


In [198]:
# Group rows by pg_rating label; uncomment by.size() to see the row count
# for each label.
by = df.groupby("pg_rating")
# by.size()

In [None]:
def final_clean(df):
    """Normalise ratings and derive the status columns used for modelling.

    Works on a copy, so the frame passed in is left unchanged.

    Steps:
      * drop rows whose ``pg_rating`` is "APPROVED"; relabel "X" as "NC_17"
        and both "NOT_RATED" and "" as "UNRATED";
      * add ``status_score``: 10 for status "W", 5 for "N", 0 where status is
        missing (any other status value is carried over unchanged);
      * fill missing ``status_year`` values from ``year``;
      * add ``num_nominees``: "5" when ``status_year`` <= 2008, otherwise ">5".

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns pg_rating, status, status_year and year, where
        ``year`` / ``status_year`` are values ``int()`` can parse.

    Returns
    -------
    pandas.DataFrame
        The cleaned copy with ``status_score`` and ``num_nominees`` added.
    """
    # Parental Guidance (MPAA) Rating.
    # .copy() makes the filtered rows an independent frame, so the column
    # assignments below do not write into a slice of the caller's data.
    df = df[df["pg_rating"] != "APPROVED"].copy()
    df["pg_rating"] = df["pg_rating"].replace(
        {"X": "NC_17", "NOT_RATED": "UNRATED", "": "UNRATED"})

    # Status and Status Score.
    # Cast to object first so numeric scores can replace the text labels
    # whatever string dtype the status column arrived with.
    df["status_score"] = (
        df["status"].astype(object).replace({"W": 10, "N": 5}).fillna(0)
    )

    # Status Year: assign the filled result back. An in-place fillna on
    # df["status_year"] may operate on a temporary copy and leave NaN behind,
    # which would then break int() below.
    df["status_year"] = df["status_year"].fillna(df["year"])

    # Number Nominees (2008 cutoff presumably marks the last year with a
    # five-film field -- confirm).
    df["num_nominees"] = df["status_year"].apply(
        lambda year: "5" if int(year) <= 2008 else ">5")

    return df
    
df = final_clean(df)

In [202]:
# Spot-check the first row again after final_clean (status_score and
# num_nominees should now be present).
df.head(1)

Unnamed: 0,key,title,year,user_rating_short,user_rating_n,director,starring,genre,runtime,pg_rating,link,status,status_year,status_score,num_nominees
0,Inception (2010),Inception,2010,8.8,1303923,[Christopher Nolan],"[Leonardo DiCaprio, Joseph Gordon-Levitt, Elle...","[Action, Mystery, Sci-Fi, Thriller]",148,PG_13,http://www.imdb.com/title/tt1375666/,N,2010,5,>5


In [203]:
# Count how many rows fall into each num_nominees bucket ("5" vs ">5").
by = df.groupby("num_nominees")
by.size()

num_nominees
5     73834
>5    46902
dtype: int64

In [219]:
# People who get their own indicator column in the cells that follow.
# NOTE: unpickling can execute arbitrary code -- only load trusted files.
directors, actors, actresses = load_pickle("directors_actors_actresses.pkl")
stars = actors + actresses

# Genres that should not become indicator columns.
genres_to_exclude = ["News", "Talk-Show", "Game-Show", "Reality-TV", "Documentary", "Adult"]

# Sorted list of every distinct genre label in df, minus the excluded ones.
# The empty-string label (produced when a movie has no genre text) is removed
# explicitly. Slicing list(set(...))[1:] before sorting would drop an
# arbitrary genre instead, because set iteration order is not defined.
genres = sorted({
    genre
    for genre_list in df["genre"].values
    for genre in genre_list
    if genre != "" and genre not in genres_to_exclude
})

In [220]:
def make_boolean_columns(df, old_column, new_columns):
    """Add one 0/1 indicator column to ``df`` per entry of ``new_columns``.

    For each entry, the new column is named ``<old_column>_<entry>`` with the
    entry lower-cased and spaces turned into underscores; it holds 1 where the
    entry is contained in that row's ``old_column`` value and 0 elsewhere.

    ``df`` is modified in place and also returned.
    """
    source = df[old_column]
    for entry in new_columns:
        flag_name = old_column + "_" + entry.lower().replace(" ", "_")
        df[flag_name] = source.apply(lambda members: int(entry in members))

    return df

In [221]:
# Add one 0/1 indicator column per listed director, per listed star (actors
# plus actresses) and per retained genre. make_boolean_columns modifies df in
# place, so re-running this cell simply overwrites the same columns.
df = make_boolean_columns(df, "director", directors)
df = make_boolean_columns(df, "starring", stars)
df = make_boolean_columns(df, "genre", genres)

In [226]:
# Drop the descriptive and list-valued source columns now that the indicator
# columns exist. Not re-runnable: a second run raises because the columns are
# already gone, and any later cell reading df["genre"] etc. will fail.
df = df.drop(["title", "year", "user_rating_n", "director", "starring", "genre", "link", "status"], axis=1)

In [228]:
# Leftover spot checks, disabled (test_df is not defined in this notebook as
# shown):
# test_df[test_df["title"] == "Her"].head(1)
# df.head()
# Row count of the frame after the column drop.
len(df)

120736

In [229]:
# Dump of the finished frame, left disabled (df_for_regression.pkl appears in
# the listing, so presumably it was written once -- confirm before re-running).
# pickle_it(df, "df_for_regression.pkl")
# List the working directory to check which pickle files exist.
!ls

Data_Processing.ipynb          detailed_movie_data3.pkl
Data_Scraping.ipynb            detailed_movie_data4.pkl
Selenium_Scraping.ipynb        detailed_movie_data5.pkl
all_links.pkl                  detailed_movie_data6.pkl
detailed_movie_data1.pkl       detailed_movie_data7.pkl
detailed_movie_data10.pkl      detailed_movie_data7b.pkl
detailed_movie_data11.pkl      detailed_movie_data8.pkl
detailed_movie_data12.pkl      detailed_movie_data9.pkl
detailed_movie_data13.pkl      df_for_regression.pkl
detailed_movie_data14.pkl      directors_actors_actresses.pkl
detailed_movie_data15.pkl      movie_data.pkl
detailed_movie_data2.pkl       nominees_and_winners.pkl


In [205]:
def make_hist_dict(column, frame=None):
    """Count how often each element occurs across a list-valued column.

    Parameters
    ----------
    column : str
        Name of a column whose values are iterables (e.g. lists of genres).
    frame : pandas.DataFrame, optional
        Frame to read ``column`` from. When omitted, the notebook-global
        ``df`` is used, which keeps existing ``make_hist_dict(column)`` calls
        working; passing the frame explicitly avoids depending on whatever
        ``df`` happens to hold at call time.

    Returns
    -------
    dict
        Maps each distinct element to the number of times it appears, summed
        over all rows.
    """
    if frame is None:
        frame = df  # fall back to the notebook-global frame

    counts = {}
    for items in frame[column].values:
        for item in items:
            counts[item] = counts.get(item, 0) + 1

    return counts

In [121]:
# Inspect the rows whose genre list contains "Adult".
# NOTE(review): this reads df["genre"], which the earlier df.drop(...) cell
# removes, so in file order this cell fails on a fresh run; it appears to have
# been executed before that drop -- confirm and move or delete.
criterion = df["genre"].map(lambda x: "Adult" in x)
df[criterion].head()

Unnamed: 0,key,title,year,user_rating_short,user_rating_n,director,starring,genre,runtime,pg_rating,link,status,status_year,status_score
6610,The Writers (2011),The Writers,2011,3.7,58,[Todd Michael Smith],"[Ben Kacon, Steven Michael McKenzie, Shane Miles]","[Adult, Crime, Drama, Thriller]",90,UNRATED,http://www.imdb.com/title/tt1879084/,,2011,0
42080,The Japanese Wife Next Door (2004),The Japanese Wife Next Door,2004,5.3,861,[Yutaka Ikejima],"[Reiko Yamaguchi, Kaoru Akitsu, Naohiro Hirakawa]","[Comedy, Adult]",60,UNRATED,http://www.imdb.com/title/tt1439456/,,2004,0
42432,Locked Up (2004),Locked Up,2004,5.1,830,[JÃ¶rg Andreas],"[Marcel Schlutt, Mike Sale, Ralph Steel]","[Adult, Crime, Drama, Romance]",96,UNRATED,http://www.imdb.com/title/tt0410006/,,2004,0
45743,Queer FT 2 (2005),Queer FT 2,2005,4.8,33,"[Laurence Chanfro, Mino D.C.]","[GÃ©rald Marix, Ãrik RÃ©mÃ¨s, Jean-Marc S.]","[Fantasy, Adult]",75,UNRATED,http://www.imdb.com/title/tt0480722/,,2005,0
53345,Queer FT 3 (2007),Queer FT 3,2007,4.3,20,"[Angelstud, Mino D.C.]","[Mino D.C., Manu Lebon, HervÃ© Joseph Lebrun]",[Adult],54,UNRATED,http://www.imdb.com/title/tt0970833/,,2007,0


In [142]:
# Row count of df. NOTE(review): the value recorded below differs from the
# count shown earlier in the notebook, so this output looks stale -- re-run
# or remove.
len(df)

87147