In [24]:
# imports
import pandas as pd
import seaborn as sns
import numpy as np

## Data preprocessing

### Import

In [50]:
# Read oscar_movies.csv (path is relative to the working directory) into a
# DataFrame and display it as the cell output.
movie_df = pd.read_csv(filepath_or_buffer="oscar_movies.csv")
movie_df

Unnamed: 0,year,movie,movie_id,certificate,duration,genre,rate,metascore,synopsis,votes,...,New_York_Film_Critics_Circle_nominated,New_York_Film_Critics_Circle_nominated_categories,Los_Angeles_Film_Critics_Association_won,Los_Angeles_Film_Critics_Association_won_categories,Los_Angeles_Film_Critics_Association_nominated,Los_Angeles_Film_Critics_Association_nominated_categories,release_date.year,release_date.month,release_date.day-of-month,release_date.day-of-week
0,2001,Kate & Leopold,tt0035423,PG-13,118,Comedy|Fantasy|Romance,6.4,44.0,An English Duke from 1876 is inadvertedly drag...,66660,...,0,,0,,0,,2001.0,12.0,25.0,2.0
1,2000,Chicken Run,tt0120630,G,84,Animation|Adventure|Comedy,7.0,88.0,When a cockerel apparently flies into a chicke...,144475,...,1,Best Animated Film,1,Best Animation,1,Best Animation,2000.0,6.0,23.0,5.0
2,2005,Fantastic Four,tt0120667,PG-13,106,Action|Adventure|Family,5.7,40.0,A group of astronauts gain superpowers after a...,273203,...,0,,0,,0,,2005.0,7.0,8.0,5.0
3,2002,Frida,tt0120679,R,123,Biography|Drama|Romance,7.4,61.0,"A biography of artist Frida Kahlo, who channel...",63852,...,0,,0,,0,,2002.0,11.0,22.0,5.0
4,2001,The Lord of the Rings: The Fellowship of the Ring,tt0120737,PG-13,178,Adventure|Drama|Fantasy,8.8,92.0,A meek Hobbit from the Shire and eight compani...,1286275,...,0,,1,Best Music,2,Best Music|Best Production Design,2001.0,12.0,19.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1178,2017,Call Me by Your Name,tt5726616,R,132,Drama|Romance,8.3,93.0,"In Northern Italy in 1983, seventeen year-old ...",38170,...,1,Best Actor,3,Best Actor|Best Director|Best Picture,3,Best Actor|Best Director|Best Picture,,,,
1179,2017,Phantom Thread,tt5776858,R,130,Drama|Romance,8.4,90.0,"Set in 1950's London, Reynolds Woodcock is a r...",7380,...,1,Best Screenplay,1,Best Music,1,Best Music,,,,
1180,2017,Victoria & Abdul,tt5816682,PG-13,111,Biography|Drama|History,6.8,58.0,Queen Victoria strikes up an unlikely friendsh...,12888,...,0,,0,,0,,2017.0,10.0,6.0,5.0
1181,2017,"Roman J. Israel, Esq.",tt6000478,PG-13,122,Crime|Drama,6.3,58.0,"Roman J. Israel, Esq., a driven, idealistic de...",3205,...,0,,0,,0,,,,,


### Parse

In [51]:
# guard against fully duplicated rows (rows where every column is identical)

assert not movie_df.duplicated().any()

In [52]:
# Convert the Oscar flag columns from "Yes"/"No" strings to booleans.
# These are all columns starting with "Oscar_" except "Oscar_nominated" and
# "Oscar_nominated_categories", which are excluded by name below.
for column in movie_df.columns:
    # get relevant Oscar flag columns
    if column.startswith("Oscar_") and column not in ["Oscar_nominated", "Oscar_nominated_categories"]:
        # replace() leaves any value other than "Yes"/"No" untouched.
        # pandas 2.2 deprecated the implicit downcast that replace() performs on
        # object columns, so infer_objects() is called explicitly to end up with
        # a real bool dtype regardless of the pandas version.
        movie_df[column] = movie_df[column].replace({"Yes": True, "No": False}).infer_objects()

# expected to show dtype('bool') for a converted flag column
movie_df["Oscar_Best_Picture_won"].dtype

dtype('bool')

In [53]:
# The four numeric "release_date.____" columns hold floats and have gaps.
# Put 0 into the gaps (used as a placeholder by the following processing),
# then store the columns as ints instead of floats.
release_part_columns = [
    "release_date.year",
    "release_date.month",
    "release_date.day-of-month",
    "release_date.day-of-week",
]
release_parts_filled = movie_df[release_part_columns].fillna(0)
movie_df[release_part_columns] = release_parts_filled.astype(int)

# parse the "release_date" column into datetime values
movie_df["release_date"] = pd.to_datetime(movie_df["release_date"], dayfirst=False)

# expected output: dtype('<M8[ns]'), which is a datetime dtype
movie_df["release_date"].dtype

dtype('<M8[ns]')

In [54]:
# Flag the rows whose release information is missing: a null value in the
# parsed "release_date" column, or the 0 placeholder in the numeric part columns.
no_release_date = movie_df["release_date"].isna()

no_year = movie_df["release_date.year"].eq(0)
no_month = movie_df["release_date.month"].eq(0)
no_day_month = movie_df["release_date.day-of-month"].eq(0)
no_day_week = movie_df["release_date.day-of-week"].eq(0)

# every part mask must select exactly the same rows as the missing-date mask
assert all(
    (no_release_date == part_mask).all()
    for part_mask in (no_year, no_month, no_day_month, no_day_week)
)

# titles of the movies without an entered release date
movie_df.loc[no_release_date, "movie"]

1045                                     Mudbound
1060                                       Wonder
1165                                    Lady Bird
1169    Three Billboards Outside Ebbing, Missouri
1173                                     Marshall
1177                          The Florida Project
1178                         Call Me by Your Name
1179                               Phantom Thread
1181                        Roman J. Israel, Esq.
Name: movie, dtype: object

In [55]:
# TODO: fill in this information manually? It's only nine movies.

In [56]:
# turn the "|"-separated "categories" strings (nominated/won) into lists of strings
def _split_categories(entry):
    """Return the list form of a single "categories" cell.

    A string is split on "|", an existing list is returned unchanged, and any
    other value (no nominations) becomes an empty list.
    """
    if isinstance(entry, str):
        return entry.split("|")
    if isinstance(entry, list):
        return entry
    return []


for column in movie_df.columns:
    if "categories" not in column:
        continue
    movie_df[column] = movie_df[column].apply(_split_categories)

# sanity check (Oscar columns only): the number of listed categories should
# equal the value stored in "Oscar_nominated"; the table below shows any mismatches
oscar_category_counts = movie_df["Oscar_nominated_categories"].apply(len)
oscar_mismatch_rows = movie_df[oscar_category_counts != movie_df["Oscar_nominated"]]
oscar_mismatch_rows[["Oscar_nominated_categories", "Oscar_nominated"]]

Unnamed: 0,Oscar_nominated_categories,Oscar_nominated


In [57]:
# "genre" holds "|"-separated strings; convert each entry to a list of strings
assert movie_df["genre"].notna().all()  # no movie is missing its genre


def _split_genres(entry):
    """Return the list form of a single "genre" cell.

    A string is split on "|", an existing list is returned unchanged, and any
    other value becomes an empty list (not expected, given the assert above).
    """
    if isinstance(entry, str):
        return entry.split("|")
    if isinstance(entry, list):
        return entry
    return []


movie_df["genre"] = movie_df["genre"].apply(_split_genres)

# confirm every entry is a list and still has at least one genre
genre_is_valid = movie_df["genre"].apply(lambda genres: len(genres) > 0 and isinstance(genres, list))
assert all(genre_is_valid)

movie_df["genre"]

0           [Comedy, Fantasy, Romance]
1       [Animation, Adventure, Comedy]
2          [Action, Adventure, Family]
3          [Biography, Drama, Romance]
4          [Adventure, Drama, Fantasy]
                     ...              
1178                  [Drama, Romance]
1179                  [Drama, Romance]
1180       [Biography, Drama, History]
1181                    [Crime, Drama]
1182       [Biography, Drama, History]
Name: genre, Length: 1183, dtype: object