In [None]:
import csv
import time
from io import StringIO
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

N = 1  # How many companies to process; set to a higher number for full run

# Step 1: Get S&P 500 companies table
# Download the Wikipedia list page once; `headers` is reused for every later
# request so all traffic carries the same identifying User-Agent.
url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
headers = {"User-Agent": "Mozilla/5.0 (compatible; sp500-fetcher/2.0; +https://example.com)"}
resp = requests.get(url, headers=headers, timeout=20)
resp.raise_for_status()

soup = BeautifulSoup(resp.text, "html.parser")
# The first "wikitable" on the page is taken to be the constituents table.
table = soup.find("table", {"class": "wikitable"})
if table is None:
    # Fail loudly here instead of letting read_html choke on the string "None".
    raise RuntimeError(
        "Could not find a table with class 'wikitable' on the S&P 500 page; "
        "the page layout may have changed."
    )

# Passing a literal HTML string to read_html is deprecated in recent pandas;
# wrapping it in StringIO avoids the FutureWarning.
df = pd.read_html(StringIO(str(table)))[0]

  df = pd.read_html(str(table))[0]
Fetching intros: 100%|██████████| 1/1 [00:00<00:00,  2.09it/s]

Rows before dedupe: 1, after dedupe: 1 (unique symbols)
✅ Exported 1 rows to sp500_wiki_intros_full.csv
Unique symbols: 1





In [None]:
# Step 2: Get company Wikipedia URLs
# Build exactly ONE link per table row so that URLs stay aligned with the
# DataFrame rows even if a cell holds several anchors or none at all.
base = "https://en.wikipedia.org"

links = []
for row in table.select("tbody tr"):
    cells = row.find_all("td", recursive=False)
    if len(cells) < 2:
        continue  # rows without a second <td> (e.g. the header row) carry no company link
    anchor = cells[1].find("a", href=True)
    # urljoin resolves relative hrefs against `base` and leaves absolute ones intact;
    # a missing link becomes "" so the row is kept and simply gets an empty intro.
    links.append(urljoin(base, anchor["href"]) if anchor else "")

links = links[:N]              # trim links list first
df = df.iloc[:N].copy()        # then trim df to same length
if len(links) != len(df):
    raise ValueError(
        f"Found {len(links)} company links but {len(df)} table rows; "
        "cannot align Wikipedia URLs with companies."
    )
df["Wikipedia_URL"] = links    # now they match

In [None]:
# Step 3: Fetch the *intro paragraphs* directly from the article HTML
def get_intro(url):
    """Return the lead (intro) text of a Wikipedia article as one string.

    The page is downloaded with the module-level ``headers`` and parsed with
    BeautifulSoup. Every non-empty top-level ``<p>`` inside the first
    ``div.mw-parser-output`` is collected until the first section heading is
    reached; the paragraphs are joined with single spaces.

    Parameters
    ----------
    url : str
        Address of the article to fetch.

    Returns
    -------
    str
        The intro text, or ``""`` when the URL is missing, the request or
        parsing fails, or the content container is not found. Failures are
        reported on stdout rather than raised, so one bad page does not abort
        a batch run.
    """
    if not isinstance(url, str) or not url:
        return ""

    try:
        r = requests.get(url, headers=headers, timeout=15)
        r.raise_for_status()
        page = BeautifulSoup(r.text, "html.parser")

        # Target main content area
        content_div = page.find("div", {"class": "mw-parser-output"})
        if not content_div:
            return ""

        # Gather all <p> tags before the first section heading (usually the intro).
        # Newer MediaWiki markup wraps headings in <div class="mw-heading ...">,
        # so a bare <h2> is no longer a direct child; stop on either form.
        paragraphs = []
        for el in content_div.find_all(["p", "h2", "div"], recursive=False):
            if el.name == "h2":
                break
            if el.name == "div":
                if "mw-heading" in (el.get("class") or []):
                    break
                continue  # other top-level divs (infobox wrappers, notes) are skipped
            text = el.get_text(" ", strip=True)
            if text:
                paragraphs.append(text)

        return " ".join(paragraphs).strip()

    except Exception as exc:
        # Deliberately broad: this is a best-effort scraper. Still, say what
        # went wrong instead of silently returning an empty intro.
        print(f"⚠️ Could not fetch intro for {url}: {exc}")
        return ""

In [None]:
# Step 4: Download the intro text for every company page.
# Pages are requested one at a time, with a short pause after each request
# so Wikipedia is not hit in a tight loop; tqdm shows the progress.
intros = []
progress = tqdm(df["Wikipedia_URL"], desc="Fetching intros")
for link in progress:
    intro_text = get_intro(link)
    intros.append(intro_text)
    time.sleep(0.3)  # polite delay

# Attach the collected texts as a new column (same order as the rows).
df["Intro"] = intros

In [None]:
# Step 5: Clean and deduplicate data
# Order matters: rows with a missing Symbol are dropped BEFORE the cast to
# str, otherwise NaN turns into the literal text "nan" and slips past dropna.
df = (
    df[["Symbol", "Security", "Wikipedia_URL", "Intro"]]
    .dropna(subset=["Symbol"])
    .assign(
        Symbol=lambda d: d["Symbol"].astype(str).str.strip(),
        # A missing company name becomes "" instead of the string "nan".
        Security=lambda d: d["Security"].fillna("").astype(str).str.strip(),
    )
    .loc[lambda d: d["Symbol"] != ""]
)

# Keep the first occurrence of each ticker symbol and renumber the rows.
before = len(df)
df = df.drop_duplicates(subset=["Symbol"], keep="first").reset_index(drop=True)
after = len(df)

print(f"Rows before dedupe: {before}, after dedupe: {after} (unique symbols)")

In [None]:
# Step 6: Add ID and save to CSV
# Drop any ID column left over from a previous run of this cell; without
# this, DataFrame.insert raises ValueError because the column already exists.
if "ID" in df.columns:
    df = df.drop(columns="ID")
df.insert(0, "ID", range(1, len(df) + 1))  # 1-based running number as first column

out_path = "sp500_wiki_intros_full.csv"
# QUOTE_ALL wraps every field in quotes, so commas and quotes inside the
# intro text cannot break the column layout.
df.to_csv(out_path, index=False, encoding="utf-8", quoting=csv.QUOTE_ALL)

print(f"✅ Exported {len(df)} rows to {out_path}")
print(f"Unique symbols: {df['Symbol'].nunique()}")