 Getting Marvel Data from Wikipedia Web Page

In [None]:
%pip install pandas

%pip install requests

%pip install beautifulsoup4

%pip install python-dotenv

%pip install boto3


In [None]:
import pandas as pd
import os
import requests
from bs4 import BeautifulSoup
import re
from dotenv import load_dotenv
import boto3
import base64


In [None]:
wiki_url = "https://en.wikipedia.org/wiki/List_of_Marvel_Cinematic_Universe_films"
response = requests.get(wiki_url)
var = BeautifulSoup(response.content, 'html.parser')
table = var.find_all('table', class_='wikitable')

In [None]:
column = []
for tbl in table[:1]:
    header_row = tbl.find_all('th', scope='col')
    for col in header_row:
        column.append(col.get_text().strip().replace('\n', ''))
column


In [None]:
wiki_data = []
film_names = []
for tbl in table[:7]:
    rows = tbl.find_all('th', scope='row')
    for row in rows:
        film_names.append(row.get_text().replace('\n', ''))
    for row in rows:
        data = row.find_next_siblings('td')
        text = [i.get_text().replace('\n', '').replace('\xa0', ' ') for i in data]
        text = [re.sub(r'\(.*?\)', '', t) for t in text]  # Remove hidden date part
        wiki_data.append(text)

# Append film names to the corresponding wiki_data sublists
for i, film_name in enumerate(film_names):
    wiki_data[i].insert(0, film_name)

# Ensure each list in wiki_data has exactly 5 elements
for row in wiki_data:
    while len(row) < len(column):
        row.append(None)  # Append None for missing values
    while len(row) > len(column):
        row.pop()  # Remove extra elements

# Create DataFrame with the correct shape
wiki_data_df = pd.DataFrame(wiki_data, columns=column)
wiki_data_df

Cleaning & Modifying the Pandas Dataframe

In [None]:
# Define new colum names
new_column_names = {
    'Film[30]': 'film_name',
    'U.S. release date': 'us_release_date',
    'Director': 'director_name',
    'Screenwriter(s)': 'screen_writer',
    'Producer(s)': 'producers'
}

# Rename columns
wiki_data_df.rename(columns=new_column_names, inplace=True)

# Forward fill missing screen_writer values
wiki_data_df['screen_writer'] = wiki_data_df['screen_writer'].ffill()

# Forward fill missing producers values
wiki_data_df['producers'] = wiki_data_df['producers'].ffill()

# Remove numbers in square brackets from all columns
wiki_data_df.replace(to_replace=r'\[\d+\]', value='', regex=True, inplace=True)

# Convert 'us_release_date' to datetime
# wiki_data_df['us_release_date'] = pd.to_datetime(wiki_data_df['us_release_date'])

wiki_data_df.tail(10)


Getting OMDB API Data

In [None]:
# Load environment variables from .env file in the current directory
load_dotenv()
# Access the environment variables
api_key = os.getenv('API_KEY')
response_list = []
for film_name in wiki_data_df['film_name']:
    url = f'http://www.omdbapi.com/?t={film_name}&apikey={api_key}'
    response = requests.get(url)
    response_list.append(response.json())
response_list

Cleaning and making a OMDB Dataframe

In [47]:
for i in response_list:
    if i.get('Error') == 'Movie not found!':
        i.pop('Error', None)

omdb_data_df = pd.DataFrame(response_list)

# Select only the columns you want to keep
columns_to_keep = ['Title', 'Rated', 'Released', 'Runtime', 'Genre', 'Actors', 'Awards', 'imdbRating', 'BoxOffice', 'Type']
omdb_data_df = omdb_data_df[columns_to_keep]

new_column_names = {
    'Title': 'film_name',
    'Rated': 'rated',
    'Released': 'release_date',
    'Runtime': 'runtime',
    'Genre': 'genre',
    'Actors': 'actors',
    'Awards': 'awards',
    'imdbRating': 'imdb_rating',
    'BoxOffice': 'box_office_collection',
    'Type': 'type',
}
# Rename columns to merge with wiki_data_df
omdb_data_df.rename(columns=new_column_names, inplace=True)

# Remove rows with null values in the film_name column
omdb_data_df.dropna(subset=['film_name'], inplace=True)

omdb_data_df.head()

Unnamed: 0,film_name,rated,release_date,runtime,genre,actors,awards,imdb_rating,box_office_collection,type
0,Iron Man,PG-13,02 May 2008,126 min,"Action, Adventure, Sci-Fi","Robert Downey Jr., Gwyneth Paltrow, Terrence H...",Nominated for 2 Oscars. 24 wins & 73 nominatio...,7.9,"$319,034,126",movie
1,The Incredible Hulk,PG-13,13 Jun 2008,112 min,"Action, Adventure, Sci-Fi","Edward Norton, Liv Tyler, Tim Roth",2 wins & 10 nominations,6.6,"$134,806,913",movie
2,Iron Man 2,PG-13,07 May 2010,124 min,"Action, Sci-Fi","Robert Downey Jr., Mickey Rourke, Gwyneth Paltrow",Nominated for 1 Oscar. 7 wins & 45 nominations...,6.9,"$312,433,331",movie
3,Thor,PG-13,06 May 2011,115 min,"Action, Fantasy","Chris Hemsworth, Anthony Hopkins, Natalie Portman",5 wins & 30 nominations,7.0,"$181,030,624",movie
4,Captain America: The First Avenger,PG-13,22 Jul 2011,124 min,"Action, Adventure, Sci-Fi","Chris Evans, Hugo Weaving, Samuel L. Jackson",4 wins & 50 nominations,6.9,"$176,654,505",movie


In [53]:
# Joining the two dataframes to get the final result
final_df = wiki_data_df.merge(omdb_data_df, on='film_name', how='left')
final_df.fillna('N/A', inplace=True)
final_df.to_csv('final_df.csv', index=False)
final_df.tail(20)

Unnamed: 0,film_name,us_release_date,director_name,screen_writer,producers,rated,release_date,runtime,genre,actors,awards,imdb_rating,box_office_collection,type
25,Eternals,"November 5, 2021",Chloé Zhao,Chloé Zhao and Chloé Zhao & Patrick Burleigh a...,Kevin Feige and Nate Moore,PG-13,05 Nov 2021,156 min,"Action, Adventure, Fantasy","Gemma Chan, Richard Madden, Angelina Jolie",7 wins & 18 nominations,6.3,"$164,870,234",movie
26,Spider-Man: No Way Home,"December 17, 2021",Jon Watts,Chris McKenna & Erik Sommers,Kevin Feige and Amy Pascal,PG-13,17 Dec 2021,148 min,"Action, Adventure, Fantasy","Tom Holland, Zendaya, Benedict Cumberbatch",Nominated for 1 Oscar. 35 wins & 71 nomination...,8.2,"$814,866,759",movie
27,Doctor Strange in the Multiverse of Madness,"May 6, 2022",Sam Raimi,Michael Waldron,Kevin Feige,PG-13,06 May 2022,126 min,"Action, Adventure, Fantasy","Benedict Cumberbatch, Elizabeth Olsen, Chiwete...",10 wins & 30 nominations,6.9,"$411,331,607",movie
28,Thor: Love and Thunder,"July 8, 2022",Taika Waititi,Taika Waititi & Jennifer Kaytin Robinson,Kevin Feige and Brad Winderbaum,PG-13,08 Jul 2022,118 min,"Adventure, Comedy, Romance","Chris Hemsworth, Natalie Portman, Christian Bale",3 wins & 22 nominations,6.2,"$343,256,830",movie
29,Black Panther: Wakanda Forever,"November 11, 2022",Ryan Coogler,Ryan Coogler & Joe Robert Cole,Kevin Feige and Nate Moore,PG-13,11 Nov 2022,161 min,"Action, Adventure, Drama","Letitia Wright, Lupita Nyong'o, Danai Gurira",Won 1 Oscar. 50 wins & 175 nominations total,6.7,"$453,829,060",movie
30,Ant-Man and the Wasp: Quantumania,"February 17, 2023",Peyton Reed,Jeff Loveness,Kevin Feige and Stephen Broussard,PG-13,17 Feb 2023,124 min,"Action, Adventure, Comedy","Paul Rudd, Evangeline Lilly, Michael Douglas",14 nominations,6.0,"$214,504,909",movie
31,Guardians of the Galaxy Vol. 3,"May 5, 2023",James Gunn,Kevin Feige,Kevin Feige and Stephen Broussard,PG-13,05 May 2023,150 min,"Action, Adventure, Comedy","Chris Pratt, Chukwudi Iwuji, Bradley Cooper",Nominated for 1 Oscar. 11 wins & 88 nomination...,7.9,"$358,995,815",movie
32,The Marvels,"November 10, 2023",Nia DaCosta,Nia DaCosta and Megan McDonnell and Elissa Kar...,Kevin Feige and Stephen Broussard,PG-13,10 Nov 2023,105 min,"Action, Adventure, Fantasy","Brie Larson, Teyonah Parris, Iman Vellani",4 wins & 11 nominations,5.5,"$84,500,223",movie
33,Deadpool & Wolverine,"July 26, 2024",Shawn Levy,Ryan Reynolds & Rhett Reese & Paul Wernick & Z...,"Kevin Feige, Lauren Shuler Donner, Ryan Reynol...",,,,,,,,,
34,Captain America: Brave New World,"February 14, 2025",Julius Onah,Julius Onah & Peter Glanz and Matthew Orton[be...,Kevin Feige and Nate Moore,,14 Feb 2025,,"Action, Adventure, Sci-Fi","Rosa Salazar, Harrison Ford, Liv Tyler",,,,movie
