In [4]:
%pip install pandas requests numpy lxml beautifulsoup4 html5lib

Collecting html5lib
  Downloading html5lib-1.1-py2.py3-none-any.whl.metadata (16 kB)
Downloading html5lib-1.1-py2.py3-none-any.whl (112 kB)
Installing collected packages: html5lib
Successfully installed html5lib-1.1
Note: you may need to restart the kernel to use updated packages.


In [6]:
import requests
import pandas as pd
import numpy as np
import io


In [7]:
url_wiki = "https://kt.ijs.si/~ljupco/lectures/appr-2324/dn1-viri/List%20of%20Academy%20Award-winning%20films%20-%20Wikipedia.html"
headers = {"User-Agent": "Mozilla/5.0"}

resp_wiki = requests.get(url_wiki, headers=headers)
resp_wiki.raise_for_status()

# Essa página tem mais de uma tabela; a que queremos é a com colunas Film / Year / Awards / Nominations
tables_wiki = pd.read_html(io.StringIO(resp_wiki.text))

oscars = tables_wiki[0].copy()
oscars

Unnamed: 0,Film,Year,Awards,Nominations
0,Everything Everywhere All at Once,2022,7,11
1,All Quiet on the Western Front,2022,4,9
2,The Whale,2022,2,3
3,Top Gun: Maverick,2022,1,6
4,Black Panther: Wakanda Forever,2022,1,5
...,...,...,...,...
1355,The Yankee Doodle Mouse,1943,1,1
1356,The Yearling,1946,2,7
1357,"Yesterday, Today and Tomorrow (Ieri, oggi, dom...",1964,1,1
1358,You Can't Take It with You,1938,2,7


In [8]:
# Remove qualquer texto entre parênteses em Awards (casos tipo "0 (1)")
oscars["Awards_clean"] = (
    oscars["Awards"]
    .astype(str)
    .str.replace(r"\s*\(.*\)", "", regex=True)
)

# Converte para numérico, forçando erros para NaN
oscars["Awards_clean"] = pd.to_numeric(oscars["Awards_clean"], errors="coerce")
oscars["Nominations_clean"] = pd.to_numeric(oscars["Nominations"], errors="coerce")

# Remove linhas sem números válidos
oscars_num = oscars.dropna(subset=["Awards_clean", "Nominations_clean"]).copy()

# ---------------------------------------------------
# 2) Min–max normalization de Awards e Nominations
# ---------------------------------------------------

def minmax(col):
    return (col - col.min()) / (col.max() - col.min())

oscars_num["Awards_norm"] = minmax(oscars_num["Awards_clean"])
oscars_num["Nominations_norm"] = minmax(oscars_num["Nominations_clean"])

# Exemplo de score composto (50% wins, 50% indicações)
oscars_num["Score_norm"] = 0.5 * oscars_num["Awards_norm"] + 0.5 * oscars_num["Nominations_norm"]

# Top 10 filmes com maior score normalizado
oscars_num.sort_values("Score_norm", ascending=False).head(10)

Unnamed: 0,Film,Year,Awards,Nominations,Awards_clean,Nominations_clean,Awards_norm,Nominations_norm,Score_norm
365,Titanic,1997,11,14,11.0,14.0,1.0,1.0,1.0
855,Ben-Hur,1959,11,12,11.0,12.0,1.0,0.857143,0.928571
283,The Lord of the Rings: The Return of the King,2003,11,11,11.0,11.0,1.0,0.785714,0.892857
1328,West Side Story,1961,10,11,10.0,11.0,0.909091,0.785714,0.847403
376,The English Patient,1996,9,12,9.0,12.0,0.818182,0.857143,0.837662
977,From Here to Eternity,1953,8,13,8.0,13.0,0.727273,0.928571,0.827922
994,Gone with the Wind,1939,8 (2),13,8.0,13.0,0.727273,0.928571,0.827922
1132,My Fair Lady,1964,8,12,8.0,12.0,0.727273,0.857143,0.792208
1154,On the Waterfront,1954,8,12,8.0,12.0,0.727273,0.857143,0.792208
353,Shakespeare in Love,1998,7,13,7.0,13.0,0.636364,0.928571,0.782468


In [10]:
# ---------------------------------------------------
# 3) Tabela: Best Picture Oscar Winners (The Numbers)
# ---------------------------------------------------

url_numbers = "https://www.the-numbers.com/movies/comparisons/Best-Picture-Oscar-Winners"
resp_num = requests.get(url_numbers, headers=headers)
resp_num.raise_for_status()

tables_num = pd.read_html(io.StringIO(resp_num.text))

# Procure a tabela com filmes/ano/budget/box office etc.
for i, t in enumerate(tables_num):
    print(i, t.columns)

# Suponha que a tabela principal seja tables_num[0] (ajuste após inspeção)
bp = tables_num[0].copy()
bp.head()


0 Index(['Release Date', 'Movie', 'Production Budget',
       'Domestic Opening Weekend', 'Domestic Box Office',
       'Worldwide Box Office', 'Trailer'],
      dtype='str')


Unnamed: 0,Release Date,Movie,Production Budget,Domestic Opening Weekend,Domestic Box Office,Worldwide Box Office,Trailer
0,"Oct 22, 1964",My Fair Lady,"$17,000,000",,"$72,000,000","$72,073,063",
1,"Mar 2, 1965",The Sound of Music,"$8,200,000",,"$164,815,523","$287,814,441",
2,"Dec 12, 1966",A Man for All Seasons,"$3,900,000",,"$28,350,000","$28,350,000",
3,"Aug 2, 1967",In the Heat of the Night,"$2,000,000",,"$24,379,978","$24,407,647",
4,"Dec 11, 1968",Oliver!,"$10,000,000",,"$37,402,877","$37,402,877",


In [14]:
# Realiza o merge (join) entre os dataframes
oscars_num = oscars_num.merge(
    bp[['Movie', 'Worldwide Box Office']], # Selecionamos apenas as colunas necessárias de bp
    left_on='Film',                       # Coluna de busca no oscars_num
    right_on='Movie',                     # Coluna de busca no bp
    how='left'                            # Mantém todos os filmes da lista de Oscars
)

# Remove a coluna 'Movie' que fica duplicada após o merge
oscars_num = oscars_num.drop(columns='Movie')

# Remove as linhas em que não há o valor para essa(s) coluna(s)
oscars_num = oscars_num.dropna(subset=['Worldwide Box Office']).copy()

# Visualiza o resultado
oscars_num.head()

Unnamed: 0,Film,Year,Awards,Nominations,Awards_clean,Nominations_clean,Awards_norm,Nominations_norm,Score_norm,Worldwide Box Office_x,Worldwide Box Office_y,Worldwide Box Office
13,CODA,2021,3,3,3.0,3.0,0.272727,0.214286,0.243506,"$2,237,618","$2,237,618","$2,237,618"
28,Nomadland,2020/21,3,6,3.0,6.0,0.272727,0.428571,0.350649,"$38,728,987","$38,728,987","$38,728,987"
59,Green Book,2018,3,5,3.0,5.0,0.272727,0.357143,0.314935,"$319,995,019","$319,995,019","$319,995,019"
74,The Shape of Water,2017,4,13,4.0,13.0,0.363636,0.928571,0.646104,"$195,790,794","$195,790,794","$195,790,794"
89,Moonlight,2016,3,8,3.0,8.0,0.272727,0.571429,0.422078,"$64,828,447","$64,828,447","$64,828,447"
