In [1]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import re

## Wikipedia Table of Super Bowl 2024 Ads

In [2]:
# get the html for the page and format it as text

url = 'https://en.wikipedia.org/wiki/List_of_Super_Bowl_commercials'

# a timeout keeps a stalled connection from hanging the kernel, and
# raise_for_status() stops an HTTP error page from being parsed as the article
r = requests.get(url, timeout=30)
r.raise_for_status()
text = r.text

In [3]:
# create a beautiful soup object; naming the parser explicitly keeps the parse
# tree the same on every machine instead of depending on which parsers happen
# to be installed (and avoids bs4's "no parser was explicitly specified" warning)

soup = BeautifulSoup(text, "html.parser")

# find the heading for 2024

heading = soup.find("h3", {"id": "2024_(LVIII)"})
if heading is None:
    raise ValueError('no <h3 id="2024_(LVIII)"> heading found; the page markup may have changed')

# find the next table after that heading

table_2024 = heading.find_next("table", {"class": "wikitable"})
if table_2024 is None:
    raise ValueError('no "wikitable" table found after the 2024 heading; the page markup may have changed')

In [4]:
rows = []
current_product_type = None

# column index -> (text, number of following rows it still has to fill) for
# cells whose rowspan reaches into rows below the one that declared them
carried = {}


def _cell_span(cell, attr):
    """Return the cell's `rowspan`/`colspan` attribute as an int, defaulting to 1."""
    try:
        return max(int(cell.get(attr, 1)), 1)
    except (TypeError, ValueError):
        return 1


# loop through all rows in the table, expanding rowspan/colspan so that every
# output row lists one value per table column

for row in table_2024.find_all("tr"):
    cells = row.find_all("td")

    # skip the header row (it has no <td> cells)

    if not cells:
        continue

    row_data = []
    unplaced = list(cells)
    col = 0

    while unplaced or col in carried:
        if col in carried:
            # this column is covered by a cell from an earlier row: reuse its
            # text and drop it once its rowspan is used up, so it cannot leak
            # into rows that come after the spanned group
            text, rows_left = carried.pop(col)
            if rows_left > 1:
                carried[col] = (text, rows_left - 1)
            row_data.append(text)
            col += 1
            continue

        cell = unplaced.pop(0)
        text = cell.text.strip()
        rowspan = _cell_span(cell, "rowspan")

        # a colspan cell fills several adjacent columns with the same text
        for _ in range(_cell_span(cell, "colspan")):
            if rowspan > 1:
                carried[col] = (text, rowspan - 1)
            row_data.append(text)
            col += 1

    # product type (first column) of the row that was just parsed
    current_product_type = row_data[0]
    rows.append(row_data)

def remove_citations(text):
    """Return `text` with every square-bracketed span (e.g. "[12]") deleted.

    Each removed span runs from a "[" to the nearest following "]" without
    crossing a line break; text outside the brackets is returned untouched.
    """
    bracketed_span = re.compile(r'\[.*?\]')
    return bracketed_span.sub('', text)

# strip citation markers from the fourth column of every row; rows with fewer
# than four cells have no such column, so they are left alone instead of
# raising IndexError
for row in rows:
    if len(row) > 3:
        row[3] = remove_citations(row[3])

# manually fix rows that scraped incorrectly
# NOTE: these overrides are keyed by position in `rows`, so they need to be
# re-checked whenever the Wikipedia table gains, loses, or reorders entries

rows[8] = ['Film', 'Wicked', 'Trailer', 'Trailer']
rows[22] = ['Retail', 'Bass Pro Shops', '"Making Memories on the Water"', 'Promotes Tracker fishing and pontoon boats.']
rows[23] = ['Pharmaceutical', 'Pfizer', "Here's to Science", "Various paintings, sculptures, and photographs of members of the science community lip synch to Queen's Don't Stop Me Now."]
rows[24] = ['Restaurant', 'Popeyes', '"Popeyes Finally Has Wings"', 'Howie (played by Ken Jeong) was released from a fifty-year  cryosleep when a group scientists gave him a box of Popeyes wings while he discovers the future such as kick scooters, drones, etc.']
rows[25] = ['Shoes', 'Skechers', '"There\'s No \'T\' in Skechers"', 'Mr. T shows Tony Romo how he wears Skechers Slip-Ins where there\'s no "T" in Skechers.']
rows[62] = ['Political', 'American Values 2024', '"American Values"', 'In an advertisement for the Robert F. Kennedy Jr. 2024 presidential campaign, it reuses the advertisement of his uncle John F. Kennedy’s presidential campaign, with images of Robert Jr. replacing images of John.']


In [5]:
# extract headers, dropping any bracketed citation marker such as "[558]";
# matching the marker by pattern (rather than by its current number) keeps
# this working when Wikipedia renumbers its references

headers = [remove_citations(header.text).strip() for header in table_2024.find_all("th")]

# create a DataFrame from the rows and headers

df = pd.DataFrame(rows, columns=headers)

# looks perfect! save locally.

# df.to_csv('wikipedia_ad_list.csv', index = False)
