# Predicting Best Picture Winners & Nominees
*An Analysis by Sean Osier*

### Data Cleaning and Processing

In [308]:
# Import Dependencies
import pickle
import datetime
import dateutil.parser


import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import re


# import csv
# from collections import defaultdict


# For display
import pprint
import matplotlib.pyplot as plt
%matplotlib inline

In [259]:
# Pickling functions
def pickle_it(data, filename, python_version=3):
    with open(filename, "wb") as picklefile:
        pickle.dump(data, picklefile, protocol=python_version)

def load_pickle(filename):
    with open(filename, "rb") as picklefile: 
        return pickle.load(picklefile)

In [260]:
def process_IMDB_data(movie_data):
    headers = ["title", "year", "link", "user_rating_long", "user_rating_short",
                           "outline", "director", "starring", "genre", "pg_rating", "runtime"]
    df = pd.DataFrame(movie_data, columns=headers)
    df.drop_duplicates(inplace=True)
    
    # Some initial cleaning
    df["year"] = df["year"].replace("(2012 Documentary)", "(2012)")
    
    # Key
    df["key"] = df["title"] + " " + df["year"]
    
    # Year
    df["year"] = df["year"].apply(lambda x: x[1:-1])
    
    # Link
    df["link"] = "http://www.imdb.com" + df["link"]
    
    # User Rating
    df["user_rating_short"] = df["user_rating_short"].convert_objects(convert_numeric=True)
    
    # User Rating n-size
    df["user_rating_n"] = df["user_rating_long"].apply(lambda x: x.split()[4] if x != "" else "")
    df["user_rating_n"] = df["user_rating_n"].apply(lambda x: x[1:].replace(",", "") if x != "" else "")
    df["user_rating_n"] = df["user_rating_n"].convert_objects(convert_numeric=True)
    
    # Director(s)
    df["director"] = df["director"].apply(lambda x: x.split(", "))
    
    # Lead Actors
    df["starring"] = df["starring"].apply(lambda x: x.split(", "))
    
    # Genre
    df["genre"] = df["genre"].apply(lambda x: x.split(" | "))
    
    # Runtime
    df["runtime"] = df["runtime"].apply(lambda x: x.split()[0] if x != "" else "")
    df["runtime"] = df["runtime"].convert_objects(convert_numeric=True)
    
    # Removed unneed columns
    df = df[["key", "title", "year", "user_rating_short", "user_rating_n", "director", "starring", "genre", \
            "runtime", "pg_rating", "link"]]
    
    # More cleaning
    df = df[(df["year"] != "2015") & (df["year"] != "2016") & (df["year"] != "????") & (df["year"] != "???? ")]
    keep_criterion = df["runtime"].map(lambda x: ((x >= 45) and (x <= 360)) or (pd.isnull(x)))
    df = df[keep_criterion]
    
    return df

In [261]:
def process_nominee_and_winner_data(data):
    headers = ["title", "year", "status"]
    df = pd.DataFrame(data, columns=headers)
    df.drop_duplicates(inplace=True)
    
    # Initial cleaning
    df["title"] = df["title"].replace("The Godfather Part III", "The Godfather: Part III")
    df["title"] = df["title"].replace("Good Night, and Good Luck", "Good Night, and Good Luck.")
    df["title"] = df["title"].replace("Precious: Based on the Novel \"Push\" by Sapphire", "Precious")
    df["title"] = df["title"].replace("Extremely Loud and Incredibly Close", "Extremely Loud & Incredibly Close")
    df["title"] = df["title"].replace("Birdman or (The Unexpected Virtue of Ignorance)", 
                                      "Birdman: Or (The Unexpected Virtue of Ignorance)")
    
    # Key
    df["key"] = df["title"] + " (" + df["year"] + ")"
    
    # Status Year (Year Nominated For)
    df["status_year"] = df["year"]
    
    df = df[["key", "title", "year", "status", "status_year"]]
    
    # More cleaning
    df["key"] = df["key"].replace("Il Postino: The Postman (1995)", "Il Postino: The Postman (1994)")
    df["key"] = df["key"].replace("Life Is Beautiful (1998)", "Life Is Beautiful (1997)")
    df["key"] = df["key"].replace("Crash (2005)", "Crash (2004)")
    df["key"] = df["key"].replace("The Hurt Locker (2009)", "The Hurt Locker (2008)")
    
    return df

In [382]:
def eval_division_string(s):
    numbers = s.split("/")
    result = ""
    for i, n in enumerate(numbers):
        if i == 0:
            result = float(n)
        else:
            result /= float(n)
    return result

In [428]:
def process_detailed_movie_data(data):
    headers = ["link", "release_date", "critic_rating", "critic_rating_n", "writer", "country", "language", \
            "budget", "opening_weekend_gross", "production_company", "sound_mix", "color", "aspect_ratio"]
    df = pd.DataFrame(data, columns=headers)
    # df.drop_duplicates(inplace=True)
    
    # Release date
    df["release_date_details"] = df["release_date"].apply(lambda x: len(x.split()))
    default = datetime.datetime(1000, 1, 1)
    df["release_date_datetime"] = df["release_date"].apply(lambda x: dateutil.parser.parse(x, default=default))
    df["release_month"] = df[df["release_date_details"] >= 2]["release_date_datetime"].apply(lambda x: x.month)
    df["release_day_in_year"] = df[df["release_date_details"] >= 3]["release_date_datetime"].apply( \
        lambda x: x.timetuple().tm_yday)
    
    # Budget
    df["budget"] = df["budget"].apply(lambda x: x.replace(",", ""))
    df["budget_currency"] = df["budget"].apply(lambda x: x[0] if x != "" else x)
    
    # Gross
    df["opening_weekend_gross"] = df["opening_weekend_gross"].apply(lambda x: x.replace(",", ""))
    df["opening_weekend_gross_currency"] = df["opening_weekend_gross"].apply(lambda x: x[0] if x != "" else x)
    
    # Color
    def extract_color(color_list):
        colors = []
        for color in color_list:
            if ("Color" in color) or ("color" in color):
                colors.append("Color")
            if ("Black" in color) or ("White" in color) or ("B&W" in color):
                colors.append("B&W")
        colors = list(set(colors))
        return colors
    
    df["color"] = df["color"].apply(lambda x: extract_color(x))
    
    # Aspect ratio
    df["aspect_ratio"] = df["aspect_ratio"].apply(lambda x: x.replace("x", ":"))
    df["aspect_ratio"] = df["aspect_ratio"].apply(lambda x: "!!!!!" if (":" not in x) and (x != "") else x)
    df["aspect_ratio"] = df["aspect_ratio"].apply(lambda x: re.sub(r"[a-zA-Z]", "", x))
    df["aspect_ratio"] = df["aspect_ratio"].apply(lambda x: x.replace(",", "."))
    df["aspect_ratio"] = df["aspect_ratio"].apply(lambda x: x.replace("2:35", "2.35"))
    df["aspect_ratio"] = df["aspect_ratio"].apply(lambda x: x.replace(" : ", ":"))
    df["aspect_ratio"] = df["aspect_ratio"].apply(lambda x: x.replace("4: 3", "4:3"))
    df["aspect_ratio"] = df["aspect_ratio"].apply(lambda x: x.replace("16: 9", ":"))
    df["aspect_ratio"] = df["aspect_ratio"].apply(lambda x: x.split()[0] if x != "" else x)
    df["aspect_ratio"] = df["aspect_ratio"].apply(lambda x: x.strip(":"))
    df["aspect_ratio"] = df["aspect_ratio"].apply(lambda x: x.replace(":", "/"))
    df["aspect_ratio"] = df["aspect_ratio"].apply(lambda x: x.replace("!!!!!", "Other"))
    df["aspect_ratio"] = df["aspect_ratio"].apply(lambda x: eval_division_string(x) \
                                                  if (x != "" and x != "Other") else x)
    
    return df

In [429]:
detailed_df = process_detailed_movie_data(detailed_movie_data)

In [430]:
detailed_df[detailed_df["opening_weekend_gross"] != ""].head(25)
# g = detailed_df.groupby("aspect_ratio")
# g.size()
# n = [name for name, group in g.groups.items()]
#n
# writer_hist = make_hist_dict(detailed_df, "production_company")
# countries = list(writer_hist.items())
# countries = sorted(countries, key=lambda x: x[1], reverse=True)[:200]
# countries

Unnamed: 0,link,release_date,critic_rating,critic_rating_n,writer,country,language,budget,opening_weekend_gross,production_company,sound_mix,color,aspect_ratio,release_date_details,release_date_datetime,release_month,release_day_in_year,budget_currency,opening_weekend_gross_currency
35,http://www.imdb.com/title/tt0496595/,23 December 2005,,,[Flávio de Souza],[Brazil],[Portuguese],,$278593,"[Diler & Associados, Globo Filmes, Twister Stu...",,[Color],,3,2005-12-23 00:00:00,12.0,357.0,,$
104,http://www.imdb.com/title/tt0391726/,19 December 2003,,,[Flávio de Souza],[Brazil],[Portuguese],,$555600,"[Diler & Associados, Globo Filmes, Labo Cine d...","[DTS, Dolby Digital, SDDS]",[Color],,3,2003-12-19 00:00:00,12.0,353.0,,$
128,http://www.imdb.com/title/tt4009460/,14 November 2014,18.0,9.0,"[Darren Doane, Cheston Hervey]",[USA],[English],,$651276,"[Camfam Studios, Provident Films]",,[Color],,3,2014-11-14 00:00:00,11.0,318.0,,$
135,http://www.imdb.com/title/tt0294543/,14 December 2001,,,"[Vivian Perl, Wagner de Assis]",[Brazil],[Portuguese],,$367343,"[Diler & Associados, Globo Filmes, Labo Cine d...",[Dolby Digital],[Color],,3,2001-12-14 00:00:00,12.0,348.0,,$
144,http://www.imdb.com/title/tt0484964/,6 July 2006,,,"[Salvador Valverde Freire, Salvador Valverde C...",[Argentina],[Spanish],,$1947464,"[Argentina Sono Film S.A.C.I., Televisión Fede...",[Dolby Digital],[Color],,3,2006-07-06 00:00:00,7.0,187.0,,$
196,http://www.imdb.com/title/tt2344678/,29 March 2013,,,"[K. Raghavendra Rao, Sajid Khan]",[India],[Hindi],,$197770,"[Puja Entertainment (India), UTV Motion Pictures]",,[Color],2.35,3,2013-03-29 00:00:00,3.0,88.0,,$
209,http://www.imdb.com/title/tt0470833/,6 January 2006,,,[Tayfun Güneyer],[Turkey],[Turkish],,$1340723,[Energy Prodüksiyon],,[Color],,3,2006-01-06 00:00:00,1.0,6.0,,$
234,http://www.imdb.com/title/tt0270846/,27 August 2004,9.0,19.0,"[Steven Paul, Gregory Poppen]","[Germany, UK, USA]",[English],,$3251856,"[ApolloMedia Distribution, Crystal Sky Worldwi...","[DTS, Dolby Digital, SDDS]",[Color],2.35,3,2004-08-27 00:00:00,8.0,240.0,,$
235,http://www.imdb.com/title/tt0804492/,21 February 2008,7.0,18.0,[Heidi Ferrer],[USA],[English],,$27696,"[Purple Pictures, Adrenalina Films, Nevinny / ...",[Dolby Digital],[Color],1.85,3,2008-02-21 00:00:00,2.0,52.0,,$
246,http://www.imdb.com/title/tt1213644/,29 August 2008,15.0,12.0,"[Jason Friedberg, Aaron Seltzer]",[USA],[English],$25000000,$6945535,"[Lionsgate, Grosvenor Park Media, 3 in the Box]","[SDDS, Dolby Digital, DTS]",[Color],1.85,3,2008-08-29 00:00:00,8.0,242.0,$,$


In [None]:
country_hist = make_hist_dict(detailed_df, "country")
countries = list(country_hist.items())
countries = sorted(countries, key=lambda x: x[1])[-25:]
countries = [x[0] for x in countries]

In [None]:
language_hist = make_hist_dict(detailed_df, "language")
languages = list(language_hist.items())
languages = sorted(languages, key=lambda x: x[1])[-25:]
languages = [x[0] for x in languages]
languages

In [262]:
def merge_in_nominees_and_wins(df, nom_win_df):
    nom_win_for_merge = nom_win_df[nom_win_df["year"].astype(int) >= 1990]
    nom_win_for_merge = nom_win_for_merge[["key", "status", "status_year"]]
    
    new_df = pd.merge(df, nom_win_for_merge, on="key", how="left")

    new_df["status"][10186] = np.nan
    new_df["status_year"][10186] = np.nan
    new_df["status"][57111] = np.nan
    new_df["status_year"][57111] = np.nan
    
    return new_df

In [None]:
def merge_in_detailed_movie_data(df, detailed_df):
    new_df = pd.merge(df, detailed_df, on="key", how="left")
    
    return new_df

In [278]:
# Load in raw scraped data
movie_data = load_pickle("movie_data.pkl")
nominees_and_winners_raw = load_pickle("nominees_and_winners.pkl")
detailed_movie_data = load_pickle("detailed_movie_data.pkl")

In [264]:
# Process the data
df = process_IMDB_data(movie_data)
nom_win_df = process_nominee_and_winner_data(nominees_and_winners_raw)

df = merge_in_nominees_and_wins(df, nom_win_df)
print(len(df))

A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


120763


In [265]:
links = df["link"].values
# pickle_it(links, "all_links.pkl")
!ls

Data_Processing.ipynb          detailed_movie_data2.pkl
Data_Scraping.ipynb            detailed_movie_data3.pkl
Regression.ipynb               detailed_movie_data4.pkl
Selenium_Scraping.ipynb        detailed_movie_data5.pkl
all_links.pkl                  detailed_movie_data6.pkl
detailed_movie_data.pkl        detailed_movie_data7.pkl
detailed_movie_data1.pkl       detailed_movie_data7b.pkl
detailed_movie_data10.pkl      detailed_movie_data8.pkl
detailed_movie_data11.pkl      detailed_movie_data9.pkl
detailed_movie_data12.pkl      df_for_regression.pkl
detailed_movie_data13.pkl      directors_actors_actresses.pkl
detailed_movie_data14.pkl      movie_data.pkl
detailed_movie_data15.pkl      nominees_and_winners.pkl


In [266]:
df.head(1)

Unnamed: 0,key,title,year,user_rating_short,user_rating_n,director,starring,genre,runtime,pg_rating,link,status,status_year
0,Inception (2010),Inception,2010,8.8,1303923,[Christopher Nolan],"[Leonardo DiCaprio, Joseph Gordon-Levitt, Elle...","[Action, Mystery, Sci-Fi, Thriller]",148,PG_13,http://www.imdb.com/title/tt1375666/,N,2010


In [267]:
def final_clean(df):
    # Parental Guidance (MPAA) Rating
    df = df[df["pg_rating"] != "APPROVED"]
    df["pg_rating"] = df["pg_rating"].replace("X", "NC_17")
    df["pg_rating"] = df["pg_rating"].replace("NOT_RATED", "UNRATED")
    df["pg_rating"] = df["pg_rating"].replace("", "UNRATED")

    # Status and Status Score
    df["status_score"] = df["status"]
    df["status_score"] = df["status_score"].replace("W", 10)
    df["status_score"] = df["status_score"].replace("N", 5)
    df["status_score"] = df["status_score"].replace(np.nan, 0)
    
    # Status Year
    df["status_year"].fillna(df["year"], inplace=True)
    
    # Number Nominees
    df["num_nominees"] = df["status_year"].apply(lambda x: "5" if int(x) <= 2008 else ">5")

    return df
    
df = final_clean(df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [268]:
df.head(1)

Unnamed: 0,key,title,year,user_rating_short,user_rating_n,director,starring,genre,runtime,pg_rating,link,status,status_year,status_score,num_nominees
0,Inception (2010),Inception,2010,8.8,1303923,[Christopher Nolan],"[Leonardo DiCaprio, Joseph Gordon-Levitt, Elle...","[Action, Mystery, Sci-Fi, Thriller]",148,PG_13,http://www.imdb.com/title/tt1375666/,N,2010,5,>5


In [269]:
by = df.groupby("num_nominees")
by.size()

num_nominees
5     73834
>5    46902
dtype: int64

In [270]:
directors, actors, actresses = load_pickle("directors_actors_actresses.pkl")
stars = actors + actresses

genres = sorted(list(set([genre for genre_list in df["genre"].values for genre in genre_list]))[1:])
genres_to_exclude = ["News", "Talk-Show", "Game-Show", "Reality-TV", "Documentary", "Adult"]
genres = [genre for genre in genres if genre not in genres_to_exclude]

In [271]:
def make_boolean_columns(df, old_column, new_columns):
    for column in new_columns:
        column_name = old_column + "_"
        column_name += column.lower().replace(" ", "_").replace("-", "_").replace(".", "")
        df[column_name] = df[old_column].apply(lambda x: 1 if column in x else 0)
        
    return df

In [272]:
df = make_boolean_columns(df, "director", directors)
df = make_boolean_columns(df, "starring", stars)
df = make_boolean_columns(df, "genre", genres)

In [273]:
df = df.drop(["title", "year", "user_rating_n", "director", "starring", "genre", "link", "status"], axis=1)

In [274]:
# test_df[test_df["title"] == "Her"].head(1)
# df.head()
len(df)

120736

In [277]:
# pickle_it(df, "df_for_regression.pkl")
!ls

Data_Processing.ipynb          detailed_movie_data2.pkl
Data_Scraping.ipynb            detailed_movie_data3.pkl
Regression.ipynb               detailed_movie_data4.pkl
Selenium_Scraping.ipynb        detailed_movie_data5.pkl
all_links.pkl                  detailed_movie_data6.pkl
detailed_movie_data.pkl        detailed_movie_data7.pkl
detailed_movie_data1.pkl       detailed_movie_data7b.pkl
detailed_movie_data10.pkl      detailed_movie_data8.pkl
detailed_movie_data11.pkl      detailed_movie_data9.pkl
detailed_movie_data12.pkl      df_for_regression.pkl
detailed_movie_data13.pkl      directors_actors_actresses.pkl
detailed_movie_data14.pkl      movie_data.pkl
detailed_movie_data15.pkl      nominees_and_winners.pkl


In [332]:
def make_hist_dict(df, column):
    d = {}
    for lst in df[column].values:
        for x in lst:
            d[x] = d.get(x, 0) + 1
    
    return d