### Calculate coverage of dataset in academy awards

First, we get the academy awards data from Wikipedia.

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs

In [114]:
page = requests.get("https://en.wikipedia.org/wiki/List_of_Academy_Award%E2%80%93winning_films")

In [115]:
# parse page with beautifulsoup
soup = bs(page.content, 'html.parser')

In [116]:
table = soup.find("table", class_="wikitable")

In [117]:
all_rows = table.find_all("tr")

In [118]:
all_data = [[td.text.strip() for td in row.find_all("td")] + [row.has_attr('style')] for row in all_rows[1:]]

In [119]:
data_df = pd.DataFrame(all_data, columns=["Film", "Year", "Awards", "Nominations", "Win"])

In [120]:
data_df.to_csv("academy_award_winners.csv", index=False)

In [2]:
data_df = pd.read_csv("academy_award_winners.csv")

In [4]:
data_df['decade'] = data_df['Year'].apply(lambda x: str(x)[:-1] + "0s")

In [7]:
data_df.groupby('decade').size()

decade
1927/20s     12
1928/20s      7
1929/30s      6
1930/30s      6
1930s        81
1931/30s     10
1932/30s      9
1940s       180
1950s       154
1960s       147
1970s       134
1980s       136
1990s       139
2000s       147
2010s       149
2020/20s     15
2020s        41
dtype: int64

In [121]:
data_df

Unnamed: 0,Film,Year,Awards,Nominations,Win
0,Oppenheimer,2023,7,13,True
1,Poor Things,2023,4,11,False
2,The Zone of Interest,2023,2,5,False
3,American Fiction,2023,1,5,False
4,Anatomy of a Fall,2023,1,5,False
...,...,...,...,...,...
1368,The Yankee Doodle Mouse,1943,1,1,False
1369,The Yearling,1946,2,7,False
1370,"Yesterday, Today and Tomorrow (Ieri, oggi, dom...",1964,1,1,False
1371,You Can't Take It with You,1938,2,7,True


##### We read our metadata file and proceed with matching

In [128]:
df_x = pd.read_csv("../data/finalized_data/metadata_with_screenplay_subtitles_tmdb.csv", dtype={"movie_id": str})

In [129]:
df_x.shape

(3263, 15)

In [130]:
from thefuzz import fuzz

In [166]:
meta_df = df_x[['title', 'year']]

In [167]:
meta_df.loc[:, 'awarded_film'] = None

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  meta_df.loc[:, 'awarded_film'] = None


In [168]:
# get decade
meta_df['decade'] = meta_df['year'] // 10 * 10

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  meta_df['decade'] = meta_df['year'] // 10 * 10


In [169]:
data_df['Year'] = data_df['Year'].str.replace(r"(\/.*)", "", regex=True).str.strip()

In [170]:
data_df['decade'] = data_df['Year'].astype(int) // 10 * 10

In [171]:
counter = 0
matches = []

for index, row in meta_df.iterrows():
    decade_films = data_df.loc[data_df['decade'] == row['decade']]
    for film in decade_films['Film']:
        if fuzz.ratio(row['title'], film) > 90:
            meta_df.loc[index, 'awarded_film'] = film
            matches.append((row['title'], film))
            counter += 1

In [172]:
meta_df['awarded_film'].notnull().sum()

55

In [173]:
# drop nulls in awarded_film column
meta_df_clean = meta_df.dropna(subset=['awarded_film'])

In [174]:
count_df_xray = meta_df_clean.groupby('decade').count()
count_df_xray

Unnamed: 0_level_0,title,year,awarded_film
decade,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1940.0,1,1,1
1950.0,4,4,4
1960.0,5,5,5
1970.0,6,6,6
1980.0,1,1,1
1990.0,7,7,7
2000.0,8,8,8
2010.0,20,20,20
2020.0,3,3,3


In [179]:
meta_df_clean[meta_df_clean['decade'] == 2010]

Unnamed: 0,title,year,awarded_film,decade
61,The Salesman,2016.0,The Salesman,2010.0
295,Arrival,2016.0,Arrival,2010.0
510,The King's Speech,2010.0,The King's Speech,2010.0
965,Manchester by the Sea,2016.0,Manchester by the Sea,2010.0
1067,Ida,2013.0,Ida,2010.0
1097,Vice,2018.0,Vice,2010.0
1472,Frozen,2010.0,Frozen,2010.0
1487,Alex in Wonderland,2019.0,Alice in Wonderland,2010.0
1874,Hacksaw Ridge,2016.0,Hacksaw Ridge,2010.0
1911,Fences,2016.0,Fences,2010.0


##### Perform matching for screenplays

In [180]:
sc_meta = pd.read_csv("../data/8_screenplays/2_metadata/validated_movies.csv", dtype=str)

In [192]:
sc_meta_df = sc_meta[['title', 'year']]
sc_meta_df.loc[:, 'awarded_film'] = None
sc_meta_df['decade'] = sc_meta_df['year'].astype(int) // 10 * 10

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sc_meta_df.loc[:, 'awarded_film'] = None
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sc_meta_df['decade'] = sc_meta_df['year'].astype(int) // 10 * 10


In [193]:
counter = 0
matches = []

for index, row in sc_meta_df.iterrows():
    decade_films = data_df.loc[data_df['decade'] == row['decade']]
    for film in decade_films['Film']:
        if fuzz.ratio(row['title'], film) > 90:
            sc_meta_df.loc[index, 'awarded_film'] = film
            matches.append((row['title'], film))
            counter += 1

In [194]:
sc_meta_df['awarded_film'].notnull().sum()

260

In [196]:
sc_meta_df_clean = sc_meta_df.dropna(subset=['awarded_film'])

In [197]:
count_df_sc = sc_meta_df_clean.groupby('decade').count()
count_df_sc

Unnamed: 0_level_0,title,year,awarded_film
decade,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1920,1,1,1
1930,7,7,7
1940,6,6,6
1950,10,10,10
1960,9,9,9
1970,26,26,26
1980,27,27,27
1990,47,47,47
2000,42,42,42
2010,76,76,76
