# Predicting Best Picture Winners & Nominees
*An Analysis by Sean Osier*

### Data Cleaning and Processing

In [627]:
# Import Dependencies
import pickle
import datetime
import dateutil.parser
import string

import requests
from bs4 import BeautifulSoup

import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.options.display.mpl_style = 'default'

import numpy as np
import re


# import csv
# from collections import defaultdict


# For display
import pprint
import matplotlib.pyplot as plt
%matplotlib inline

In [628]:
# Pickling functions
def pickle_it(data, filename, python_version=3):
    """Serialize ``data`` to the file ``filename`` with pickle.

    ``python_version`` is handed to pickle as the protocol number (default 3).
    The file is opened in binary write mode, so existing content is replaced.
    """
    with open(filename, "wb") as sink:
        pickle.Pickler(sink, protocol=python_version).dump(data)

def load_pickle(filename):
    """Read ``filename`` in binary mode and return the unpickled object.

    NOTE: unpickling can execute code embedded in the file, so only load
    pickle files that come from a trusted source.
    """
    with open(filename, "rb") as source:
        restored = pickle.load(source)
    return restored

In [629]:
def process_IMDB_data(movie_data):
    """Turn the scraped IMDB movie rows into a cleaned DataFrame.

    Parameters
    ----------
    movie_data : iterable of 11-item records
        Each record holds, in order: title, year, link, user_rating_long,
        user_rating_short, outline, director, starring, genre, pg_rating,
        runtime. The fields are treated as strings, with "" meaning "missing".

    Returns
    -------
    pandas.DataFrame
        Columns: key, title, year, user_rating_short, user_rating_n, director,
        starring, genre, runtime, pg_rating, link.

        * ``key`` is "<title> <year as scraped>", e.g. "Inception (2010)".
        * ``year`` is the scraped year with its first and last character
          (the parentheses) removed; it stays a string.
        * ``user_rating_short``, ``user_rating_n`` and ``runtime`` are numeric,
          with NaN wherever the scraped text could not be parsed.
        * ``director``, ``starring`` and ``genre`` are lists of strings.

        Exact duplicate records are dropped, as are rows whose year is
        "2015", "2016" or an unknown-year placeholder, and rows whose runtime
        is known but outside 45-360.
    """
    headers = ["title", "year", "link", "user_rating_long", "user_rating_short",
               "outline", "director", "starring", "genre", "pg_rating", "runtime"]
    df = pd.DataFrame(movie_data, columns=headers).drop_duplicates().copy()

    def _vote_count(rating_text):
        # Assumes the vote count is the 5th whitespace-separated token and looks
        # like "(1,303,923" -- TODO confirm against the scraper's output.
        tokens = rating_text.split()
        if len(tokens) < 5:
            return ""  # missing or unexpected text -> becomes NaN below
        return tokens[4][1:].replace(",", "")

    def _first_token(text):
        tokens = text.split()
        return tokens[0] if tokens else ""

    # Some initial cleaning
    df["year"] = df["year"].replace("(2012 Documentary)", "(2012)")

    # Key (built before the parentheses are stripped from the year)
    df["key"] = df["title"] + " " + df["year"]

    # Year
    df["year"] = df["year"].apply(lambda x: x[1:-1])

    # Link
    df["link"] = "http://www.imdb.com" + df["link"]

    # User Rating -- pd.to_numeric replaces Series.convert_objects, which has
    # been removed from pandas; unparseable text becomes NaN.
    df["user_rating_short"] = pd.to_numeric(df["user_rating_short"], errors="coerce")

    # User Rating n-size
    df["user_rating_n"] = pd.to_numeric(df["user_rating_long"].apply(_vote_count), errors="coerce")

    # Director(s), lead actors and genres become lists
    df["director"] = df["director"].apply(lambda x: x.split(", "))
    df["starring"] = df["starring"].apply(lambda x: x.split(", "))
    df["genre"] = df["genre"].apply(lambda x: x.split(" | "))

    # Runtime: keep only the leading token of the scraped text
    df["runtime"] = pd.to_numeric(df["runtime"].apply(_first_token), errors="coerce")

    # Remove unneeded columns
    df = df[["key", "title", "year", "user_rating_short", "user_rating_n", "director", "starring", "genre",
             "runtime", "pg_rating", "link"]]

    # More cleaning: drop excluded / unknown years ...
    excluded_years = ["2015", "2016", "????", "???? "]
    df = df[~df["year"].isin(excluded_years)]

    # ... and keep a row when its runtime is within 45-360 or is unknown
    runtime_ok = df["runtime"].between(45, 360) | df["runtime"].isnull()
    df = df[runtime_ok]

    return df

In [630]:
def process_nominee_and_winner_data(data):
    """Build the nominee/winner table and align its keys with the IMDB data.

    ``data`` is an iterable of (title, year, status) records; ``year`` must be
    a string because it is concatenated into the key. Exact duplicates are
    dropped. The result has the columns key, title, year, status and
    status_year, where ``key`` is "<title> (<year>)" after the title and key
    corrections below, and ``status_year`` is a copy of ``year``.
    """
    # Titles spelled differently in the nominee list than in the IMDB data
    title_fixes = {
        "The Godfather Part III": "The Godfather: Part III",
        "Good Night, and Good Luck": "Good Night, and Good Luck.",
        "Precious: Based on the Novel \"Push\" by Sapphire": "Precious",
        "Extremely Loud and Incredibly Close": "Extremely Loud & Incredibly Close",
        "Birdman or (The Unexpected Virtue of Ignorance)": "Birdman: Or (The Unexpected Virtue of Ignorance)",
    }
    # Keys whose year is shifted back by one so they match the IMDB key
    key_fixes = {
        "Il Postino: The Postman (1995)": "Il Postino: The Postman (1994)",
        "Life Is Beautiful (1998)": "Life Is Beautiful (1997)",
        "Crash (2005)": "Crash (2004)",
        "The Hurt Locker (2009)": "The Hurt Locker (2008)",
    }

    nominees = pd.DataFrame(data, columns=["title", "year", "status"])
    nominees = nominees.drop_duplicates()

    # Initial cleaning
    nominees["title"] = nominees["title"].replace(title_fixes)

    # Key, then the per-key corrections
    raw_key = nominees["title"] + " (" + nominees["year"] + ")"
    nominees["key"] = raw_key.replace(key_fixes)

    # Status Year (Year Nominated For)
    nominees["status_year"] = nominees["year"]

    return nominees[["key", "title", "year", "status", "status_year"]]

In [631]:
def extract_color(color_list):
    """Reduce scraped color descriptions to "Color" / "Black and White" labels.

    Parameters
    ----------
    color_list : iterable of str
        Color descriptions for one movie.

    Returns
    -------
    list of str
        Each label at most once, always in the order
        ``["Color", "Black and White"]``; empty when nothing matches.
    """
    has_color = False
    has_black_and_white = False
    for description in color_list:
        if ("Color" in description) or ("color" in description):
            has_color = True
        if ("Black" in description) or ("White" in description) or ("B&W" in description):
            has_black_and_white = True

    # A fixed order keeps the output reproducible: de-duplicating through set()
    # would order the labels by string hash, which changes between interpreter runs.
    colors = []
    if has_color:
        colors.append("Color")
    if has_black_and_white:
        colors.append("Black and White")
    return colors

In [632]:
def eval_division_string(s):
    """Evaluate a string such as "16/9" as a left-to-right chain of divisions.

    The text is split on "/"; the first piece is the starting value and each
    following piece divides it. A string without "/" is returned as a float.
    """
    head, *divisors = s.split("/")
    value = float(head)
    for divisor in divisors:
        value /= float(divisor)
    return value

In [633]:
def remove_numbers(s):
    """Return ``s`` with every ASCII digit (0-9) removed."""
    kept = []
    for char in s:
        if char in string.digits:
            continue
        kept.append(char)
    return "".join(kept)

In [634]:
# Lookup used by convert_to_USD: indexed by the currency value passed to that
# function, and each entry is multiplied with the amount being converted.
exchange_rates = load_pickle("exchange_rates.pkl")

In [635]:
def convert_to_USD(n, currency):
    """Multiply the amount ``n`` by the ``exchange_rates`` entry for ``currency``.

    Raises whatever ``exchange_rates`` raises for an unknown ``currency``.
    """
    rate = exchange_rates[currency]
    return n * rate

In [636]:
def get_inflation_multipliers():
    """Return cumulative inflation multipliers keyed by year (1990-2014).

    The multiplier for a year is the product of ``1 + rate/100`` over that year
    and every later year up to and including 2014, so ``result[2014]`` is just
    2014's own factor and ``result[1990]`` spans the whole table.

    Returns
    -------
    dict of int -> float
        Keys are inserted from 2014 down to 1990.
    """
    # Annual inflation rates in percent, one per year starting at 1990
    inflation_rates = ["5.4", "4.2", "3", "3", "2.6", "2.8", "3", "2.3", "1.6", "2.2", "3.4", "2.8", "1.6", "2.3",
                       "2.7", "3.4", "3.2", "2.8", "3.8", "-0.4", "1.6", "3.2", "2.1", "1.5", "1.6"]
    years = range(1990, 2015)

    # Walk from the latest year backwards keeping one running product. This
    # gives every year's multiplier in a single pass, without a nested loop
    # that re-binds the outer loop's variables.
    inflation_multipliers = {}
    running_product = 1
    for year, percent in sorted(zip(years, inflation_rates), reverse=True):
        running_product *= float(percent) / 100 + 1
        inflation_multipliers[year] = running_product

    return inflation_multipliers

inflation_multipliers = get_inflation_multipliers()

In [637]:
# Currency markers as scraped -> the codes handed to convert_to_USD
_CURRENCY_CODES = {"$": "USD", "£": "GBP", "€": "EUR", "€.": "EUR"}


def _add_usd_columns(df, column):
    """Add ``<column>_currency`` and ``<column>_USD`` for one scraped money column.

    ``df[column]`` itself is rewritten without thousands separators. Rows with
    no amount, or with no recognisable currency marker, get "" in
    ``<column>_USD``.
    """
    df[column] = df[column].apply(lambda x: x.replace(",", ""))

    # What is left of the first token once its digits are removed is the currency marker
    markers = df[column].apply(lambda x: remove_numbers(x.split()[0]) if x != "" else x)
    df[column + "_currency"] = markers.apply(lambda marker: _CURRENCY_CODES.get(marker, marker))

    # Skip the leading character and read the last token as the amount
    amounts = df[column].apply(lambda x: float(x[1:].split()[-1]) if x != "" else x)
    df[column + "_USD"] = [
        convert_to_USD(amount, currency) if (amount != "") and (currency != "") else ""
        for amount, currency in zip(amounts, df[column + "_currency"])
    ]
    return df


def _clean_aspect_ratio(raw):
    """Convert one scraped aspect-ratio string to a float.

    Returns "Other" for non-empty text without a ratio separator and "" for
    empty text; the caller turns both into NaN.
    """
    text = raw.replace("x", ":")
    if (":" not in text) and (text != ""):
        return "Other"
    text = re.sub(r"[a-zA-Z]", "", text)
    text = text.replace(",", ".")
    text = text.replace("2:35", "2.35")
    text = text.replace(" : ", ":")
    text = text.replace("4: 3", "4:3")
    # NOTE(review): this used to replace "16: 9" with ":" (which ends up as a
    # missing value); assumed to be a typo for "16:9" -- confirm.
    text = text.replace("16: 9", "16:9")
    tokens = text.split()
    text = tokens[0] if tokens else ""
    text = text.strip(":").replace(":", "/")
    if text == "":
        return ""
    return eval_division_string(text)


def process_detailed_movie_data(data):
    """Turn the scraped per-movie detail rows into a cleaned DataFrame.

    Parameters
    ----------
    data : iterable of 13-item records
        In order: link, release_date, critic_rating, critic_rating_n, writer,
        country, language, budget, opening_weekend_gross, production_company,
        sound_mix, color, aspect_ratio. The text fields processed here
        (release_date, budget, opening_weekend_gross, aspect_ratio) are treated
        as strings with "" meaning "missing"; ``color`` is passed to
        ``extract_color``.

    Returns
    -------
    pandas.DataFrame
        The input columns plus, in this order: release_date_details,
        release_date_datetime, release_month, release_day_in_year,
        budget_currency, budget_USD, opening_weekend_gross_currency,
        opening_weekend_gross_USD. ``critic_rating`` and ``aspect_ratio`` are
        numeric (NaN when unparseable), ``color`` is the list returned by
        ``extract_color``, and the ``*_USD`` columns hold "" where no amount
        could be converted.
    """
    headers = ["link", "release_date", "critic_rating", "critic_rating_n", "writer", "country", "language",
               "budget", "opening_weekend_gross", "production_company", "sound_mix", "color", "aspect_ratio"]
    df = pd.DataFrame(data, columns=headers)

    # Release date. The token count says how much of the date was scraped, and
    # parts missing from the text are taken from the sentinel date below.
    default = datetime.datetime(1000, 1, 1)
    df["release_date_details"] = df["release_date"].apply(lambda x: len(x.split()))
    # Blank dates get the sentinel directly instead of being handed to the parser.
    df["release_date_datetime"] = df["release_date"].apply(
        lambda x: dateutil.parser.parse(x, default=default) if x.strip() else default)
    parsed = df["release_date_datetime"]
    # The month is only filled for dates with at least two tokens and the
    # day-of-year for at least three; other rows are left missing.
    df["release_month"] = parsed[df["release_date_details"] >= 2].apply(lambda x: x.month)
    df["release_day_in_year"] = parsed[df["release_date_details"] >= 3].apply(
        lambda x: x.timetuple().tm_yday)

    # Critic rating -- pd.to_numeric replaces Series.convert_objects, which has
    # been removed from pandas; unparseable text becomes NaN.
    df["critic_rating"] = pd.to_numeric(df["critic_rating"], errors="coerce")

    # Budget
    df = _add_usd_columns(df, "budget")

    # Gross
    df = _add_usd_columns(df, "opening_weekend_gross")

    # Color
    df["color"] = df["color"].apply(extract_color)

    # Aspect ratio ("Other" and "" become NaN)
    df["aspect_ratio"] = pd.to_numeric(df["aspect_ratio"].apply(_clean_aspect_ratio), errors="coerce")

    return df

In [638]:
def merge_in_nominees_and_wins(df, nom_win_df, false_match_rows=(10186, 57111)):
    """Left-join nominee/winner status onto the movie table by ``key``.

    Parameters
    ----------
    df : pandas.DataFrame
        Movie table with a ``key`` column.
    nom_win_df : pandas.DataFrame
        Nominee table with ``key``, ``year``, ``status`` and ``status_year``;
        only rows whose ``year`` is 1990 or later are merged in.
    false_match_rows : iterable of int, optional
        Row labels of the merged result whose ``status`` / ``status_year`` are
        blanked afterwards. Presumably these are rows where a different film
        shares a nominee's key -- TODO confirm. The defaults are tied to one
        particular scrape; labels that do not exist in the result are ignored.

    Returns
    -------
    pandas.DataFrame
        ``df`` with ``status`` and ``status_year`` columns added (NaN for
        movies without a match).
    """
    recent = nom_win_df[nom_win_df["year"].astype(int) >= 1990]
    recent = recent[["key", "status", "status_year"]]

    new_df = pd.merge(df, recent, on="key", how="left")

    # One .loc assignment on the frame itself. Chained indexing
    # (new_df["status"][label] = ...) writes to a temporary object under pandas
    # Copy-on-Write and would leave new_df unchanged.
    rows = new_df.index.intersection(list(false_match_rows))
    new_df.loc[rows, ["status", "status_year"]] = np.nan

    return new_df

In [639]:
def merge_in_detailed_movie_data(df, detailed_df):
    """Left-join ``detailed_df`` onto ``df`` using the shared ``link`` column.

    Every row of ``df`` is kept; movies without a matching link get missing
    values in the columns that come from ``detailed_df``.
    """
    return df.merge(detailed_df, how="left", on="link")

In [640]:
# Load in raw scraped data
# Each pickle is the input of one processing function:
#   movie_data               -> process_IMDB_data
#   nominees_and_winners_raw -> process_nominee_and_winner_data
#   detailed_movie_data      -> process_detailed_movie_data
movie_data = load_pickle("movie_data.pkl")
nominees_and_winners_raw = load_pickle("nominees_and_winners.pkl")
detailed_movie_data = load_pickle("detailed_movie_data.pkl")

In [None]:
# Process the data
# One cleaned frame per raw source.
df = process_IMDB_data(movie_data)
nom_win_df = process_nominee_and_winner_data(nominees_and_winners_raw)
detailed_df = process_detailed_movie_data(detailed_movie_data)

# Left-join the nominee/winner status (on "key") and then the detail fields
# (on "link") onto the movie table, so every movie row is kept.
df = merge_in_nominees_and_wins(df, nom_win_df)
df = merge_in_detailed_movie_data(df, detailed_df)

In [642]:
# Every movie link in the merged frame, as an array
links = df["link"].values
# Saving the links is left disabled:
# pickle_it(links, "all_links.pkl")
# List the working directory (IPython shell escape)
!ls

Data_Processing.ipynb          directors_actors_actresses.pkl
Data_Scraping.ipynb            exchange_rates.pkl
README.md                      get_exchange_rates.py
Regression.ipynb               [34mimg[m[m
Selenium_Scraping.ipynb        movie_data.pkl
all_links.pkl                  nominees_and_winners.pkl
detailed_movie_data.pkl        writers.pkl
df_for_regression.pkl


In [643]:
# Preview the first row of the merged frame
df.head(1)

Unnamed: 0,key,title,year,user_rating_short,user_rating_n,director,starring,genre,runtime,pg_rating,link,status,status_year,release_date,critic_rating,critic_rating_n,writer,country,language,budget,opening_weekend_gross,production_company,sound_mix,color,aspect_ratio,release_date_details,release_date_datetime,release_month,release_day_in_year,budget_currency,budget_USD,opening_weekend_gross_currency,opening_weekend_gross_USD
0,Inception (2010),Inception,2010,8.8,1303923,[Christopher Nolan],"[Leonardo DiCaprio, Joseph Gordon-Levitt, Elle...","[Action, Mystery, Sci-Fi, Thriller]",148,PG_13,http://www.imdb.com/title/tt1375666/,N,2010,16 July 2010,74,42,[Christopher Nolan],"[USA, UK]","[English, Japanese, French]",$160000000,£5912814,"[Warner Bros., Legendary Pictures, Syncopy]","[Dolby Digital, DTS, SDDS]",[Color],2.35,3,2010-07-16 00:00:00,7,197,USD,160000000.0,GBP,9059174


In [None]:
def final_clean(df):
    """Apply the last cleaning steps and add the derived modelling columns.

    Works on a copy, so the frame passed in is left untouched.

    Steps
    -----
    * Drop rows whose ``pg_rating`` is "APPROVED"; map "X" to "NC_17" and
      "NOT_RATED" / "" to "UNRATED".
    * ``status_score``: 10 for status "W", 5 for "N", 0 for anything else
      (including a missing status).
    * ``status_year``: missing values are filled from ``year``.
    * ``num_nominees``: "5" when ``status_year`` <= 2008, otherwise ">5".
    * ``budget_USD_real`` / ``opening_weekend_gross_USD_real``: the nominal USD
      amount times ``inflation_multipliers`` for the movie's ``year``. The
      result is NaN when the amount is missing or non-numeric, or when the
      year has no entry in ``inflation_multipliers``.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame with the columns above added.
    """
    # Parental Guidance (MPAA) Rating
    df = df[df["pg_rating"] != "APPROVED"].copy()
    df["pg_rating"] = df["pg_rating"].replace({"X": "NC_17", "NOT_RATED": "UNRATED", "": "UNRATED"})

    # Status and Status Score
    status_points = {"W": 10, "N": 5}
    df["status_score"] = df["status"].map(status_points).fillna(0).astype(int)

    # Status Year -- assign the filled column back. An in-place fillna on
    # df["status_year"] only changes a temporary object under pandas
    # Copy-on-Write and would leave the NaNs in place.
    df["status_year"] = df["status_year"].fillna(df["year"])

    # Number Nominees
    df["num_nominees"] = df["status_year"].apply(lambda x: "5" if int(x) <= 2008 else ">5")

    # Inflation multiplier for each row's year
    multiplier = df["year"].astype(int).map(inflation_multipliers)

    # Budget ("" placeholders become NaN before multiplying)
    df["budget_USD_real"] = pd.to_numeric(df["budget_USD"], errors="coerce") * multiplier

    # Opening Weekend Gross
    df["opening_weekend_gross_USD_real"] = \
        pd.to_numeric(df["opening_weekend_gross_USD"], errors="coerce") * multiplier

    return df

df = final_clean(df)

In [645]:
# Preview the first row after final_clean. This cell used to reference
# `test_df`, a name that no visible cell defines, so it failed on a fresh kernel.
df.head(1)

Unnamed: 0,key,title,year,user_rating_short,user_rating_n,director,starring,genre,runtime,pg_rating,link,status,status_year,release_date,critic_rating,critic_rating_n,writer,country,language,budget,opening_weekend_gross,production_company,sound_mix,color,aspect_ratio,release_date_details,release_date_datetime,release_month,release_day_in_year,budget_currency,budget_USD,opening_weekend_gross_currency,opening_weekend_gross_USD,status_score,num_nominees,budget_USD_real,opening_weekend_gross_USD_real
0,Inception (2010),Inception,2010,8.8,1303923,[Christopher Nolan],"[Leonardo DiCaprio, Joseph Gordon-Levitt, Elle...","[Action, Mystery, Sci-Fi, Thriller]",148,PG_13,http://www.imdb.com/title/tt1375666/,N,2010,16 July 2010,74,42,[Christopher Nolan],"[USA, UK]","[English, Japanese, French]",$160000000,£5912814,"[Warner Bros., Legendary Pictures, Syncopy]","[Dolby Digital, DTS, SDDS]",[Color],2.35,3,2010-07-16 00:00:00,7,197,USD,160000000.0,GBP,9059174,5,>5,176635900.0,10001090.0


In [646]:
# Names that get their own indicator column further down
directors, actors, actresses = load_pickle("directors_actors_actresses.pkl")
stars = actors + actresses
writers = load_pickle("writers.pkl")

# Every genre that occurs in the data, sorted, without the excluded genres and
# without "" (the entry produced when a movie has no genre text). The blank is
# filtered by value: dropping "the first element" of a set removes an arbitrary
# entry whenever the blank does not happen to come first.
genres_to_exclude = ["News", "Talk-Show", "Game-Show", "Reality-TV", "Documentary", "Adult"]
all_genres = {genre for genre_list in df["genre"].values for genre in genre_list}
genres = sorted(genre for genre in all_genres if genre != "" and genre not in genres_to_exclude)

In [647]:
def make_hist_dict(df, column):
    """Count how often each item occurs across the iterables stored in ``df[column]``.

    Returns a dict mapping item -> number of occurrences, with keys in order
    of first appearance.
    """
    counts = {}
    for items in df[column].values:
        for item in items:
            if item in counts:
                counts[item] += 1
            else:
                counts[item] = 1

    return counts

In [648]:
# The 25 most frequent countries in the detailed data, ordered by ascending
# count (so the most frequent one comes last)
country_hist = make_hist_dict(detailed_df, "country")
countries = [
    name for name, _count in sorted(country_hist.items(), key=lambda pair: pair[1])[-25:]
]

In [649]:
# The 25 most frequent languages in the detailed data, ordered by ascending
# count (so the most frequent one comes last)
language_hist = make_hist_dict(detailed_df, "language")
languages = [
    name for name, _count in sorted(language_hist.items(), key=lambda pair: pair[1])[-25:]
]

In [650]:
def make_boolean_columns(df, old_column, new_columns):
    """Add one 0/1 indicator column to ``df`` per value in ``new_columns``.

    Parameters
    ----------
    df : pandas.DataFrame
        Modified in place and also returned.
    old_column : str
        Column whose cells are containers (e.g. lists of names).
    new_columns : iterable of str
        Values to test for. Each gets a column named
        ``<old_column>_<value>``, with the value lower-cased, spaces and
        hyphens turned into underscores and periods removed.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object. A new column is 1 where the cell contains the
        value and 0 otherwise; cells that are not containers at all (e.g. the
        NaN left by a left merge) count as 0.
    """
    def _contains(cell, value):
        # Missing cells (NaN / None) support neither membership tests nor iteration
        if not (hasattr(cell, "__contains__") or hasattr(cell, "__iter__")):
            return 0
        return 1 if value in cell else 0

    for value in new_columns:
        suffix = value.lower().replace(" ", "_").replace("-", "_").replace(".", "")
        df[old_column + "_" + suffix] = [_contains(cell, value) for cell in df[old_column]]

    return df

In [651]:
# Add the indicator columns, one source column at a time
for _source_column, _values in [
    ("director", directors),
    ("starring", stars),
    ("genre", genres),
    ("writer", writers),
    ("country", countries),
    ("language", languages),
    ("color", ["Color", "Black and White"]),
]:
    df = make_boolean_columns(df, _source_column, _values)

In [652]:
# Preview the first row with the indicator columns added
df.head(1)

Unnamed: 0,key,title,year,user_rating_short,user_rating_n,director,starring,genre,runtime,pg_rating,link,status,status_year,release_date,critic_rating,critic_rating_n,writer,country,language,budget,opening_weekend_gross,production_company,sound_mix,color,aspect_ratio,release_date_details,release_date_datetime,release_month,release_day_in_year,budget_currency,budget_USD,opening_weekend_gross_currency,opening_weekend_gross_USD,status_score,num_nominees,budget_USD_real,opening_weekend_gross_USD_real,director_martin_scorsese,director_quentin_tarantino,director_steven_spielberg,director_christopher_nolan,director_alfred_hitchcock,director_david_fincher,director_stanley_kubrick,director_james_cameron,director_clint_eastwood,director_peter_jackson,director_francis_ford_coppola,director_ridley_scott,director_woody_allen,director_david_lynch,director_paul_thomas_anderson,director_akira_kurosawa,director_tim_burton,director_sergio_leone,director_charlie_chaplin,director_roman_polanski,director_tony_tarantino,director_wes_anderson,director_ingmar_bergman,director_orson_welles,director_david_cronenberg,director_joel_coen,director_william_wyler,director_john_ford,director_steven_soderbergh,director_federico_fellini,director_michael_mann,director_miloš_forman,director_jean_luc_godard,director_oliver_stone,director_hayao_miyazaki,director_ang_lee,director_george_lucas,director_frank_capra,director_andrei_tarkovsky,director_terrence_malick,director_alejandro_gonzález_iñárritu,director_sidney_lumet,director_elia_kazan,director_david_lean,director_sam_mendes,director_werner_herzog,director_alfonso_cuarón,director_brian_de_palma,director_spike_jonze,director_frank_darabont,director_darren_aronofsky,starring_johnny_depp,starring_leonardo_dicaprio,starring_robert_downey_jr,starring_brad_pitt,starring_tom_hanks,starring_jack_nicholson,starring_denzel_washington,starring_will_smith,starring_al_pacino,starring_tom_cruise,starring_christian_bale,starring_robert_de_niro,starring_daniel_day_l
ewis,starring_morgan_freeman,starring_clint_eastwood,starring_matt_damon,starring_russell_crowe,starring_liam_neeson,starring_george_clooney,starring_dustin_hoffman,starring_anthony_hopkins,starring_marlon_brando,starring_harrison_ford,starring_sean_penn,starring_michael_caine,starring_shah_rukh_khan,starring_gary_oldman,starring_laurence_olivier,starring_hugh_jackman,starring_ralph_fiennes,starring_bruce_willis,starring_sean_connery,starring_samuel_l_jackson,starring_kevin_spacey,starring_mark_wahlberg,starring_mel_gibson,starring_heath_ledger,starring_robert_duvall,starring_ryan_gosling,starring_keanu_reeves,starring_nicolas_cage,starring_salman_khan,starring_amitabh_bachchan,starring_robin_williams,starring_adam_sandler,starring_philip_seymour_hoffman,starring_humphrey_bogart,starring_javier_bardem,starring_jeff_bridges,starring_edward_norton,starring_aamir_khan,starring_natalie_portman,starring_scarlett_johansson,starring_jennifer_lawrence,starring_angelina_jolie,starring_charlize_theron,starring_emma_stone,starring_mila_kunis,starring_sandra_bullock,starring_nicole_kidman,starring_keira_knightley,starring_emma_watson,starring_meryl_streep,starring_jennifer_aniston,starring_anne_hathaway,starring_jessica_alba,starring_kate_winslet,starring_cate_blanchett,starring_megan_fox,starring_audrey_hepburn,starring_julianne_moore,starring_katharine_hepburn,starring_penélope_cruz,starring_julia_roberts,starring_amanda_seyfried,starring_rachel_mcadams,starring_kristen_stewart,starring_marilyn_monroe,starring_ingrid_bergman,starring_amy_adams,starring_jodie_foster,starring_kate_beckinsale,starring_sophia_loren,starring_olivia_wilde,starring_marion_cotillard,starring_bette_davis,starring_zooey_deschanel,starring_grace_kelly,starring_reese_witherspoon,starring_naomi_watts,starring_emily_blunt,starring_helen_mirren,starring_cameron_diaz,starring_olivia_de_havilland,starring_michelle_pfeiffer,starring_vivien_leigh,starring_halle_berry,starring_ellen_page,starring_jane_fonda,star
ring_amber_heard,starring_gwyneth_paltrow,starring_sigourney_weaver,genre_action,genre_adventure,genre_animation,genre_biography,genre_comedy,genre_crime,genre_drama,genre_family,genre_fantasy,genre_history,genre_horror,genre_music,genre_musical,genre_mystery,genre_romance,genre_sci_fi,genre_sport,genre_thriller,genre_war,genre_western,writer_woody_allen,writer_charlie_kaufman,writer_christopher_nolan,writer_stephen_king,writer_quentin_tarantino,writer_truman_capote,writer_spike_jonze,writer_steven_spielberg,writer_david_cronenberg,writer_roman_polanski,writer_stanley_kubrick,writer_william_goldman,writer_james_cameron,writer_rob_reiner,writer_leonardo_dicaprio,writer_billy_wilder,writer_will_ferrell,writer_jane_campion,writer_guillermo_del_toro,writer_francis_ford_coppola,writer_sam_raimi,writer_wes_anderson,writer_wes_craven,writer_hal_hartley,writer_paul_thomas_anderson,writer_ridley_scott,writer_sidney_lumet,writer_alan_rudolph,writer_steve_martin,writer_david_mamet,writer_peter_jackson,writer_john_carpenter,writer_owen_wilson,writer_m_night_shyamalan,writer_ingmar_bergman,writer_jason_reitman,writer_stephen_frears,writer_william_s_burroughs,writer_seth_rogen,writer_ben_affleck,writer_terry_rossio,writer_oliver_stone,writer_walt_disney,writer_martin_scorsese,writer_alfred_hitchcock,writer_noah_baumbach,writer_robert_altman,writer_michael_mann,writer_peter_benchley,writer_brian_helgeland,writer_johnny_depp,country_poland,country_austria,country_iran,country_turkey,country_sweden,country_belgium,country_brazil,country_netherlands,country_australia,country_china,country_south_korea,country_argentina,country_russia,country_mexico,country_hong_kong,country_philippines,country_spain,country_italy,country_germany,country_canada,country_uk,country_france,country_japan,country_india,country_usa,language_indonesian,language_bengali,language_polish,language_swedish,language_dutch,language_persian,language_turkish,language_tamil,language_arabic,language_portuguese,language_
telugu,language_malayalam,language_korean,language_cantonese,language_mandarin,language_tagalog,language_filipino,language_russian,language_italian,language_german,language_hindi,language_french,language_japanese,language_spanish,language_english,color_color,color_black_and_white
0,Inception (2010),Inception,2010,8.8,1303923,[Christopher Nolan],"[Leonardo DiCaprio, Joseph Gordon-Levitt, Elle...","[Action, Mystery, Sci-Fi, Thriller]",148,PG_13,http://www.imdb.com/title/tt1375666/,N,2010,16 July 2010,74,42,[Christopher Nolan],"[USA, UK]","[English, Japanese, French]",$160000000,£5912814,"[Warner Bros., Legendary Pictures, Syncopy]","[Dolby Digital, DTS, SDDS]",[Color],2.35,3,2010-07-16 00:00:00,7,197,USD,160000000.0,GBP,9059174,5,>5,176635900.0,10001093.489419,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,1,0


In [653]:
# Raw and intermediate columns that are removed now that the derived
# columns exist
columns_to_drop = [
    "title", "year", "user_rating_n", "director", "starring", "genre", "link", "status",
    "release_date", "critic_rating_n", "writer", "country", "language",
    "budget", "opening_weekend_gross", "production_company", "sound_mix", "color",
    "release_date_details", "release_date_datetime",
    "budget_currency", "budget_USD",
    "opening_weekend_gross_currency", "opening_weekend_gross_USD",
]
df = df.drop(columns_to_drop, axis=1)

In [654]:
# Preview the first row of the trimmed frame
df.head(1)

Unnamed: 0,key,user_rating_short,runtime,pg_rating,status_year,critic_rating,aspect_ratio,release_month,release_day_in_year,status_score,num_nominees,budget_USD_real,opening_weekend_gross_USD_real,director_martin_scorsese,director_quentin_tarantino,director_steven_spielberg,director_christopher_nolan,director_alfred_hitchcock,director_david_fincher,director_stanley_kubrick,director_james_cameron,director_clint_eastwood,director_peter_jackson,director_francis_ford_coppola,director_ridley_scott,director_woody_allen,director_david_lynch,director_paul_thomas_anderson,director_akira_kurosawa,director_tim_burton,director_sergio_leone,director_charlie_chaplin,director_roman_polanski,director_tony_tarantino,director_wes_anderson,director_ingmar_bergman,director_orson_welles,director_david_cronenberg,director_joel_coen,director_william_wyler,director_john_ford,director_steven_soderbergh,director_federico_fellini,director_michael_mann,director_miloš_forman,director_jean_luc_godard,director_oliver_stone,director_hayao_miyazaki,director_ang_lee,director_george_lucas,director_frank_capra,director_andrei_tarkovsky,director_terrence_malick,director_alejandro_gonzález_iñárritu,director_sidney_lumet,director_elia_kazan,director_david_lean,director_sam_mendes,director_werner_herzog,director_alfonso_cuarón,director_brian_de_palma,director_spike_jonze,director_frank_darabont,director_darren_aronofsky,starring_johnny_depp,starring_leonardo_dicaprio,starring_robert_downey_jr,starring_brad_pitt,starring_tom_hanks,starring_jack_nicholson,starring_denzel_washington,starring_will_smith,starring_al_pacino,starring_tom_cruise,starring_christian_bale,starring_robert_de_niro,starring_daniel_day_lewis,starring_morgan_freeman,starring_clint_eastwood,starring_matt_damon,starring_russell_crowe,starring_liam_neeson,starring_george_clooney,starring_dustin_hoffman,starring_anthony_hopkins,starring_marlon_brando,starring_harrison_ford,starring_sean_penn,starring_michael_caine,starring_shah_rukh_khan,sta
rring_gary_oldman,starring_laurence_olivier,starring_hugh_jackman,starring_ralph_fiennes,starring_bruce_willis,starring_sean_connery,starring_samuel_l_jackson,starring_kevin_spacey,starring_mark_wahlberg,starring_mel_gibson,starring_heath_ledger,starring_robert_duvall,starring_ryan_gosling,starring_keanu_reeves,starring_nicolas_cage,starring_salman_khan,starring_amitabh_bachchan,starring_robin_williams,starring_adam_sandler,starring_philip_seymour_hoffman,starring_humphrey_bogart,starring_javier_bardem,starring_jeff_bridges,starring_edward_norton,starring_aamir_khan,starring_natalie_portman,starring_scarlett_johansson,starring_jennifer_lawrence,starring_angelina_jolie,starring_charlize_theron,starring_emma_stone,starring_mila_kunis,starring_sandra_bullock,starring_nicole_kidman,starring_keira_knightley,starring_emma_watson,starring_meryl_streep,starring_jennifer_aniston,starring_anne_hathaway,starring_jessica_alba,starring_kate_winslet,starring_cate_blanchett,starring_megan_fox,starring_audrey_hepburn,starring_julianne_moore,starring_katharine_hepburn,starring_penélope_cruz,starring_julia_roberts,starring_amanda_seyfried,starring_rachel_mcadams,starring_kristen_stewart,starring_marilyn_monroe,starring_ingrid_bergman,starring_amy_adams,starring_jodie_foster,starring_kate_beckinsale,starring_sophia_loren,starring_olivia_wilde,starring_marion_cotillard,starring_bette_davis,starring_zooey_deschanel,starring_grace_kelly,starring_reese_witherspoon,starring_naomi_watts,starring_emily_blunt,starring_helen_mirren,starring_cameron_diaz,starring_olivia_de_havilland,starring_michelle_pfeiffer,starring_vivien_leigh,starring_halle_berry,starring_ellen_page,starring_jane_fonda,starring_amber_heard,starring_gwyneth_paltrow,starring_sigourney_weaver,genre_action,genre_adventure,genre_animation,genre_biography,genre_comedy,genre_crime,genre_drama,genre_family,genre_fantasy,genre_history,genre_horror,genre_music,genre_musical,genre_mystery,genre_romance,genre_sci_fi,genre_sport,genre_
thriller,genre_war,genre_western,writer_woody_allen,writer_charlie_kaufman,writer_christopher_nolan,writer_stephen_king,writer_quentin_tarantino,writer_truman_capote,writer_spike_jonze,writer_steven_spielberg,writer_david_cronenberg,writer_roman_polanski,writer_stanley_kubrick,writer_william_goldman,writer_james_cameron,writer_rob_reiner,writer_leonardo_dicaprio,writer_billy_wilder,writer_will_ferrell,writer_jane_campion,writer_guillermo_del_toro,writer_francis_ford_coppola,writer_sam_raimi,writer_wes_anderson,writer_wes_craven,writer_hal_hartley,writer_paul_thomas_anderson,writer_ridley_scott,writer_sidney_lumet,writer_alan_rudolph,writer_steve_martin,writer_david_mamet,writer_peter_jackson,writer_john_carpenter,writer_owen_wilson,writer_m_night_shyamalan,writer_ingmar_bergman,writer_jason_reitman,writer_stephen_frears,writer_william_s_burroughs,writer_seth_rogen,writer_ben_affleck,writer_terry_rossio,writer_oliver_stone,writer_walt_disney,writer_martin_scorsese,writer_alfred_hitchcock,writer_noah_baumbach,writer_robert_altman,writer_michael_mann,writer_peter_benchley,writer_brian_helgeland,writer_johnny_depp,country_poland,country_austria,country_iran,country_turkey,country_sweden,country_belgium,country_brazil,country_netherlands,country_australia,country_china,country_south_korea,country_argentina,country_russia,country_mexico,country_hong_kong,country_philippines,country_spain,country_italy,country_germany,country_canada,country_uk,country_france,country_japan,country_india,country_usa,language_indonesian,language_bengali,language_polish,language_swedish,language_dutch,language_persian,language_turkish,language_tamil,language_arabic,language_portuguese,language_telugu,language_malayalam,language_korean,language_cantonese,language_mandarin,language_tagalog,language_filipino,language_russian,language_italian,language_german,language_hindi,language_french,language_japanese,language_spanish,language_english,color_color,color_black_and_white
0,Inception (2010),8.8,148,PG_13,2010,74,2.35,7,197,5,>5,176635900.0,10001093.489419,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,1,0


In [655]:
# Save the final frame to df_for_regression.pkl
pickle_it(df, "df_for_regression.pkl")
# List the working directory (IPython shell escape)
!ls

Data_Processing.ipynb          directors_actors_actresses.pkl
Data_Scraping.ipynb            exchange_rates.pkl
README.md                      get_exchange_rates.py
Regression.ipynb               [34mimg[m[m
Selenium_Scraping.ipynb        movie_data.pkl
all_links.pkl                  nominees_and_winners.pkl
detailed_movie_data.pkl        writers.pkl
df_for_regression.pkl
