In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import time
import csv

N = 1  # How many companies to process; set higher for full run

print("üü¢ [Step 1] Starting: Fetching S&P 500 companies table from Wikipedia...")
start_time = time.time()

url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
headers = {"User-Agent": "Mozilla/5.0 (compatible; sp500-fetcher/2.0; +https://example.com)"}

try:
    resp = requests.get(url, headers=headers, timeout=20)
    resp.raise_for_status()
    print("‚úÖ Wikipedia page fetched successfully.")
except Exception as e:
    print(f"‚ùå Error fetching page: {e}")
    raise

soup = BeautifulSoup(resp.text, "html.parser")
table = soup.find("table", {"class": "wikitable"})
df = pd.read_html(str(table))[0]

elapsed = time.time() - start_time
print(f"‚úÖ [Step 1 Complete] Retrieved S&P 500 table with {len(df)} entries in {elapsed:.2f}s.\n")

üü¢ [Step 1] Starting: Fetching S&P 500 companies table from Wikipedia...
‚úÖ Wikipedia page fetched successfully.
‚úÖ [Step 1 Complete] Retrieved S&P 500 table with 503 entries in 0.37s.



  df = pd.read_html(str(table))[0]


In [2]:
print("üü¢ [Step 2] Starting: Extracting company Wikipedia URLs...")
step2_start = time.time()

base = "https://en.wikipedia.org"

# Extract links from the S&P 500 table
links = [base + a["href"] for a in table.select("tbody tr td:nth-of-type(2) a[href]")]

# Limit to first N companies
links = links[:N]
df = df.iloc[:N].copy()
df["Wikipedia_URL"] = links

elapsed = time.time() - step2_start
print(f"‚úÖ [Step 2 Complete] Collected {len(links)} Wikipedia URLs in {elapsed:.2f}s.\n")

üü¢ [Step 2] Starting: Extracting company Wikipedia URLs...
‚úÖ [Step 2 Complete] Collected 1 Wikipedia URLs in 0.05s.



In [3]:
print("üü¢ [Step 3] Starting: Defining function to fetch Wikipedia intro paragraphs...")
step3_start = time.time()

# Step 3: Fetch the *intro paragraphs* directly from the article HTML
def get_intro(url):
    try:
        r = requests.get(url, headers=headers, timeout=15)
        r.raise_for_status()
        soup = BeautifulSoup(r.text, "html.parser")

        # Target main content area
        content_div = soup.find("div", {"class": "mw-parser-output"})
        if not content_div:
            return ""

        # Gather all <p> tags before the first <h2> (usually the intro)
        paragraphs = []
        for el in content_div.find_all(["p", "h2"], recursive=False):
            if el.name == "h2":
                break
            if el.name == "p" and el.get_text(strip=True):
                paragraphs.append(el.get_text(" ", strip=True))

        intro_text = " ".join(paragraphs)
        return intro_text.strip()

    except Exception as e:
        return f""

elapsed = time.time() - step3_start
print(f"‚úÖ [Step 3 Complete] Intro extraction function defined successfully in {elapsed:.2f}s.\n")

üü¢ [Step 3] Starting: Defining function to fetch Wikipedia intro paragraphs...
‚úÖ [Step 3 Complete] Intro extraction function defined successfully in 0.00s.



In [4]:
print("üü¢ [Step 4] Starting: Fetching intros for all companies...")
step4_start = time.time()

# Step 4: Fetch intros for all companies
intros = []
for idx, link in enumerate(df["Wikipedia_URL"], 1):
    intros.append(get_intro(link))
    time.sleep(0.3)  # polite delay

    # Print status every 10 companies
    if idx % 10 == 0 or idx == len(df):
        print(f"   ‚è±Ô∏è  Fetched intros for {idx}/{len(df)} companies...")

df["Intro"] = intros

elapsed = time.time() - step4_start
print(f"‚úÖ [Step 4 Complete] All intros fetched successfully in {elapsed:.2f}s.\n")

üü¢ [Step 4] Starting: Fetching intros for all companies...
   ‚è±Ô∏è  Fetched intros for 1/1 companies...
‚úÖ [Step 4 Complete] All intros fetched successfully in 0.53s.



In [5]:
print("üü¢ [Step 5] Starting: Cleaning and deduplicating data...")
step5_start = time.time()

# Step 5: Clean and deduplicate data
df = df[["Symbol", "Security", "Wikipedia_URL", "Intro"]].copy()
df["Symbol"] = df["Symbol"].astype(str).str.strip()
df["Security"] = df["Security"].astype(str).str.strip()
df = df[df["Symbol"] != ""]
df = df.dropna(subset=["Symbol"])

before = len(df)
df = df.drop_duplicates(subset=["Symbol"], keep="first").reset_index(drop=True)
after = len(df)

elapsed = time.time() - step5_start
print(f"‚úÖ [Step 5 Complete] Cleaned and deduplicated data in {elapsed:.2f}s. Rows before: {before}, after: {after} (unique symbols)\n")

üü¢ [Step 5] Starting: Cleaning and deduplicating data...
‚úÖ [Step 5 Complete] Cleaned and deduplicated data in 0.01s. Rows before: 1, after: 1 (unique symbols)



In [6]:
print("üü¢ [Step 6] Starting: Adding ID and saving to CSV...")
step6_start = time.time()

# Step 6: Add ID and save to CSV
df.insert(0, "ID", range(1, len(df) + 1))
out_path = "sp500_wiki_intros_full.csv"
df.to_csv(out_path, index=False, encoding="utf-8", quoting=csv.QUOTE_ALL)

elapsed = time.time() - step6_start
print(f"‚úÖ [Step 6 Complete] Exported {len(df)} rows to {out_path} in {elapsed:.2f}s.")
print(f"   üîπ Unique symbols: {df['Symbol'].nunique()}\n")

üü¢ [Step 6] Starting: Adding ID and saving to CSV...
‚úÖ [Step 6 Complete] Exported 1 rows to sp500_wiki_intros_full.csv in 0.01s.
   üîπ Unique symbols: 1

