In [68]:
from dotenv import load_dotenv
# Load variables from a .env file into the process environment.
# NOTE(review): presumably required by the `save` helper called in the last cell — confirm.
load_dotenv()

True

In [69]:
import pandas as pd
import os

In [70]:
# Execute the helper notebook in this kernel's namespace.
# NOTE(review): presumably this is what defines `save`, which the last cell calls — confirm.
%run ./utils/save_result.ipynb

In [71]:
# load table from wikipedia
# read_html returns one DataFrame per table found on the page; header=0 takes
# each table's first row as its column names.
tables = pd.read_html("https://en.wikipedia.org/wiki/List_of_Marvel_Cinematic_Universe_films", header=0)

# The tables at positions 1 and 2 are combined into a single filmography
# (positional lookup, so this needs revisiting if the page layout changes).
# ignore_index=True gives the combined frame a unique 0..n-1 index; without it
# each table keeps its own 0-based labels, and any later label-based drop would
# remove rows from both tables at once.
filmography_table = pd.concat([tables[1], tables[2]], ignore_index=True)

# Keep only the columns used downstream (film, release date, director).
filmography_table = filmography_table.drop(
    columns=["Screenwriter(s)", "Producer(s)", "Unnamed: 5", "Status"]
)

In [72]:
# display a preview of the table (first rows only, rather than dumping the whole frame)
filmography_table.head(10)

Unnamed: 0,Film,U.S. release date,Director(s)
0,Phase One[24],Phase One[24],Phase One[24]
1,Iron Man,"May 2, 2008",Jon Favreau[27]
2,The Incredible Hulk,"June 13, 2008",Louis Leterrier[29]
3,Iron Man 2,"May 7, 2010",Jon Favreau[31]
4,Thor,"May 6, 2011",Kenneth Branagh[33]
5,Captain America: The First Avenger,"July 22, 2011",Joe Johnston[35]
6,The Avengers,"May 4, 2012",Joss Whedon[37]
7,Phase Two[24],Phase Two[24],Phase Two[24]
8,Iron Man 3,"May 3, 2013",Shane Black[38]
9,Thor: The Dark World,"November 8, 2013",Alan Taylor[40]


In [73]:
# filter out phase rows
# Phase heading rows repeat the heading text (e.g. "Phase One[24]") in every column.
# Select by boolean mask instead of dropping by index label: the frame is a
# concatenation of two tables, so index labels can repeat, and a label-based
# drop would also delete real films that happen to share a heading row's label.
is_phase_row = filmography_table["Film"].str.startswith("Phase")
filmography_table = filmography_table[~is_phase_row]

In [74]:
# parse director column
filmography_table = filmography_table.rename(columns={"Director(s)": "Director"})

# Remove wikipedia reference tags ("Jon Favreau[27]" -> "Jon Favreau").
# The .str accessor passes missing cells through as NaN instead of raising the
# way x.split() would, and strip() removes whitespace left in front of a tag so
# the "TBA" comparison below still matches.
filmography_table["Director"] = (
    filmography_table["Director"].str.split("[").str[0].str.strip()
)

# Keep only rows with a known director: drop missing values and "TBA" placeholders.
has_director = filmography_table["Director"].notna() & (filmography_table["Director"] != "TBA")
filmography_table = filmography_table[has_director]

filmography_table.iloc[0]

Film                    Iron Man
U.S. release date    May 2, 2008
Director             Jon Favreau
Name: 1, dtype: object

In [75]:
# parse release date column
filmography_table = filmography_table.rename(columns={"U.S. release date": "Year"})

# Remove wikipedia reference tags and surrounding whitespace,
# e.g. "May 2, 2008[25]" -> "May 2, 2008".
release_text = filmography_table["Year"].str.split("[").str[0].str.strip()

# The year is the last four characters of the cleaned date. Stripping first
# matters: a space left before a reference tag would shift the slice to
# something like "008 ", which int() silently parses as 8.
filmography_table["Year"] = release_text.str[-4:].astype(int)

In [76]:
# Turn the frame into a plain list of dicts for export.
# Rows are read positionally, so the first three columns map, in order,
# to "title", "year" and "director".
record_keys = ("title", "year", "director")
films = [dict(zip(record_keys, row)) for row in filmography_table.values.tolist()]

films[0]

{'title': 'Iron Man', 'year': 2008, 'director': 'Jon Favreau'}

In [77]:
# Publish the collection through the `save` helper: name, description,
# source page, and the list of film records, in that order.
collection_name = "Marvel Cinematic Universe"
collection_description = "All feature films in the Marvel Cinematic Universe (all phases)"
source_url = "https://en.wikipedia.org/wiki/List_of_Marvel_Cinematic_Universe_films"

save(collection_name, collection_description, source_url, films) # type: ignore

https://film-collections.simse.dev/marvel-cinematic-universe.json
