In [1]:
import yfinance as yf
import pandas as pd
from collections import defaultdict

In [9]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Build a table of S&P 500 constituents from Wikipedia and collect the
# Yahoo-style ticker symbols of each GICS sector into `grouped`.

# Step 1: Fetch the Wikipedia page
url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}
# Without a timeout, requests waits indefinitely on a stalled connection,
# which would freeze the notebook cell.
response = requests.get(url, headers=headers, timeout=30)
response.raise_for_status()  # surface HTTP 4xx/5xx instead of parsing an error page

# Step 2: Parse HTML with BeautifulSoup
soup = BeautifulSoup(response.text, "html.parser")

# Step 3: Find the main table
table = soup.find("table", {"id": "constituents"})
if table is None:
    # find() returns None when nothing matches; fail with a clear message
    # rather than an AttributeError on the next line.
    raise RuntimeError(
        f"Could not find a <table id='constituents'> at {url}; "
        "the page layout may have changed."
    )

# Step 4: Extract all rows
rows = table.find_all("tr")

# Step 5: Parse rows into structured data
data = []
for row in rows[1:]:  # skip the header
    cols = row.find_all("td")
    if len(cols) < 8:
        # Rows with fewer than the 8 expected cells are skipped.
        continue

    symbol = cols[0].text.strip().replace(".", "-")  # Yahoo uses "-" instead of "."
    security = cols[1].text.strip()
    sector = cols[2].text.strip()
    sub_industry = cols[3].text.strip()
    headquarters = cols[4].text.strip()
    date_added = cols[5].text.strip()
    cik = cols[6].text.strip()
    founded = cols[7].text.strip()

    data.append({
        "Symbol": symbol,
        "Security": security,
        "GICS Sector": sector,
        "GICS Sub-Industry": sub_industry,
        "Headquarters": headquarters,
        "Date Added": date_added,
        "CIK": cik,
        "Founded": founded
    })

if not data:
    # An empty frame has no "GICS Sector" column, so groupby below would
    # raise a confusing KeyError; report the real problem instead.
    raise RuntimeError(
        "No constituent rows were parsed from the table; "
        "the column layout may have changed."
    )

# Step 6: Convert to DataFrame
df = pd.DataFrame(data)

# Step 7: Group by GICS Sector, collecting each sector's symbols into a list
grouped = df.groupby("GICS Sector")["Symbol"].apply(list).reset_index()

# Step 8: Show or save the results
print(grouped.head())
# grouped.to_csv("sp500_by_sector_industry.csv", index=False)





              GICS Sector                                             Symbol
0  Communication Services  [GOOGL, GOOG, T, CHTR, CMCSA, EA, FOXA, FOX, I...
1  Consumer Discretionary  [ABNB, AMZN, APTV, AZO, BBY, BKNG, KMX, CCL, C...
2        Consumer Staples  [MO, ADM, BF-B, BG, CPB, CHD, CLX, KO, CL, CAG...
3                  Energy  [APA, BKR, CVX, COP, CTRA, DVN, FANG, EOG, EQT...
4              Financials  [AFL, ALL, AXP, AIG, AMP, AON, APO, ACGL, AJG,...


In [10]:
# Preview the sector -> symbols table (at most the first 50 rows).
grouped.iloc[:50]

Unnamed: 0,GICS Sector,Symbol
0,Communication Services,"[GOOGL, GOOG, T, CHTR, CMCSA, EA, FOXA, FOX, I..."
1,Consumer Discretionary,"[ABNB, AMZN, APTV, AZO, BBY, BKNG, KMX, CCL, C..."
2,Consumer Staples,"[MO, ADM, BF-B, BG, CPB, CHD, CLX, KO, CL, CAG..."
3,Energy,"[APA, BKR, CVX, COP, CTRA, DVN, FANG, EOG, EQT..."
4,Financials,"[AFL, ALL, AXP, AIG, AMP, AON, APO, ACGL, AJG,..."
5,Health Care,"[ABT, ABBV, A, ALGN, AMGN, BAX, BDX, TECH, BII..."
6,Industrials,"[MMM, AOS, ALLE, AME, ADP, AXON, BA, BR, BLDR,..."
7,Information Technology,"[ACN, ADBE, AMD, AKAM, APH, ADI, AAPL, AMAT, A..."
8,Materials,"[APD, ALB, AMCR, AVY, BALL, CF, CTVA, DOW, DD,..."
9,Real Estate,"[ARE, AMT, AVB, BXP, CPT, CBRE, CSGP, CCI, DLR..."


In [None]:
# Map each GICS sector name to its list of ticker symbols, using the
# `grouped` frame (columns "GICS Sector" and "Symbol") built earlier.
sector_dict = {
    sector_name: symbols
    for sector_name, symbols in zip(grouped["GICS Sector"], grouped["Symbol"])
}

# Convenience variables: one ticker list per sector, looked up by sector name.
# A sector name missing from `sector_dict` raises KeyError.
(
    tech_tickers,
    healthcare_tickers,
    financials_tickers,
    communications_tickers,
    consumer_discretionary_tickers,
    energy_tickers,
    utilities_tickers,
    consumer_staples_tickers,
    real_estate_tickers,
    industrials_tickers,
    materials_tickers,
) = (
    sector_dict[sector_name]
    for sector_name in (
        "Information Technology",
        "Health Care",
        "Financials",
        "Communication Services",
        "Consumer Discretionary",
        "Energy",
        "Utilities",
        "Consumer Staples",
        "Real Estate",
        "Industrials",
        "Materials",
    )
)
