# Generating dataset for ML classifier

## Imports and Config

In [2]:
import yfinance as yf
import pandas as pd
import numpy as np
from ta.momentum import RSIIndicator
from ta.volatility import AverageTrueRange
import matplotlib.pyplot as plt
import seaborn as sns
import requests
from bs4 import BeautifulSoup

sns.set(style="whitegrid")

# ‚îÄ‚îÄ‚îÄ CONFIG ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
start_date = "2004-01-01"
end_date   = "2025-12-31"
horizon    = 1    # days ahead for the target


In [4]:
def get_sp500_tickers():
    """
    Scrape the current list of S&P 500 tickers from Wikipedia.
    Uses requests + BeautifulSoup to avoid HTTP 403 errors.
    """
    url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"

    # Add browser headers to bypass bot protection
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/120.0 Safari/537.36"
        )
    }

    response = requests.get(url, headers=headers)
    response.raise_for_status()   # will show any real connection error

    # Parse the HTML with BeautifulSoup
    soup = BeautifulSoup(response.text, "lxml")
    table = soup.find("table", {"id": "constituents"})

    if table is None:
        raise ValueError("Could not find the constituents table on the Wikipedia page.")

    # Extract the table using pandas
    df = pd.read_html(str(table))[0]

    tickers = [t.replace(".", "-") for t in df["Symbol"].astype(str).tolist()]
    return tickers

# Usage
sp500_tickers = get_sp500_tickers()
print(f"‚úÖ Fetched {len(sp500_tickers)} tickers, e.g.: {sp500_tickers[:10]}")


‚úÖ Fetched 503 tickers, e.g.: ['MMM', 'AOS', 'ABT', 'ABBV', 'ACN', 'ADBE', 'AMD', 'AES', 'AFL', 'A']


  df = pd.read_html(str(table))[0]


In [5]:
tickers = get_sp500_tickers()

  df = pd.read_html(str(table))[0]


## Step 1: Fetch Price Data for All Tickers

In [6]:
# ‚îÄ‚îÄ Step 1: Fetch price data for the entire S&P 500 ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
print("Fetching S&P 500 ticker list‚Ä¶")
tickers = get_sp500_tickers()
print(f"Got {len(tickers)} tickers. Date range: {start_date} ‚Üí {end_date}")

# batch size to avoid timeouts / throttling
batch_size = 50
chunks = [tickers[i:i+batch_size] for i in range(0, len(tickers), batch_size)]

frames = []
for chunk in chunks:
    print(f"  Downloading tickers {chunk[0]} ‚Ä¶ {chunk[-1]} ‚Ä¶")
    df_chunk = yf.download(
        chunk,
        start=start_date,
        end=end_date,
        group_by="ticker",
        auto_adjust=False,
        threads=True
    )
    frames.append(df_chunk)

# concatenate all batches side-by-side
raw = pd.concat(frames, axis=1)

# drop rows where *all* tickers missed (e.g. holidays)
raw.dropna(how="all", inplace=True)

print("‚úÖ Price download complete. Raw shape:", raw.shape)
display(raw.head())


Fetching S&P 500 ticker list‚Ä¶


  df = pd.read_html(str(table))[0]


Got 503 tickers. Date range: 2004-01-01 ‚Üí 2025-12-31
  Downloading tickers MMM ‚Ä¶ ADSK ‚Ä¶


[*********************100%***********************]  50 of 50 completed


  Downloading tickers ADP ‚Ä¶ CMG ‚Ä¶


[*********************100%***********************]  50 of 50 completed


  Downloading tickers CB ‚Ä¶ D ‚Ä¶


[*********************100%***********************]  50 of 50 completed


  Downloading tickers DPZ ‚Ä¶ FTNT ‚Ä¶


[*********************100%***********************]  50 of 50 completed


  Downloading tickers FTV ‚Ä¶ IBKR ‚Ä¶


[*********************100%***********************]  50 of 50 completed


  Downloading tickers ICE ‚Ä¶ MMC ‚Ä¶


[*********************100%***********************]  50 of 50 completed


  Downloading tickers MLM ‚Ä¶ OXY ‚Ä¶


[*********************100%***********************]  50 of 50 completed


  Downloading tickers ODFL ‚Ä¶ ROK ‚Ä¶


[*********************100%***********************]  50 of 50 completed


  Downloading tickers ROL ‚Ä¶ TT ‚Ä¶


[*********************100%***********************]  50 of 50 completed


  Downloading tickers TDG ‚Ä¶ YUM ‚Ä¶


[*********************100%***********************]  50 of 50 completed


  Downloading tickers ZBRA ‚Ä¶ ZTS ‚Ä¶


[*********************100%***********************]  3 of 3 completed


‚úÖ Price download complete. Raw shape: (5495, 3018)


Ticker,GOOG,GOOG,GOOG,GOOG,GOOG,GOOG,AJG,AJG,AJG,AJG,...,ZBH,ZBH,ZBH,ZBH,ZBRA,ZBRA,ZBRA,ZBRA,ZBRA,ZBRA
Price,Open,High,Low,Close,Adj Close,Volume,Open,High,Low,Close,...,Low,Close,Adj Close,Volume,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2004-01-02,,,,,,,32.400002,32.650002,31.74,31.950001,...,67.864075,67.961166,60.486706,688143,44.126667,44.186668,43.406666,43.586666,43.586666,339900
2004-01-05,,,,,,,31.969999,32.810001,31.969999,32.310001,...,66.990288,67.349518,59.94231,1223846,43.486668,44.18,43.393333,43.666668,43.666668,640950
2004-01-06,,,,,,,32.400002,32.419998,31.450001,31.620001,...,66.252426,67.019417,59.648548,1202216,43.566666,43.886665,43.366669,43.766666,43.766666,311700
2004-01-07,,,,,,,31.620001,31.74,31.379999,31.709999,...,66.912621,67.766991,60.313881,866127,43.633331,44.653332,43.5,44.5,44.5,495150
2004-01-08,,,,,,,31.799999,32.209999,31.65,32.080002,...,67.592232,69.077667,61.480404,1114151,44.939999,45.246666,44.133331,44.599998,44.599998,290850


## Step 2: Build Unified Event Dates Table (Earnings Only)

In [None]:
tickers

['MMM',
 'AOS',
 'ABT',
 'ABBV',
 'ACN',
 'ADBE',
 'AMD',
 'AES',
 'AFL',
 'A',
 'APD',
 'ABNB',
 'AKAM',
 'ALB',
 'ARE',
 'ALGN',
 'ALLE',
 'LNT',
 'ALL',
 'GOOGL',
 'GOOG',
 'MO',
 'AMZN',
 'AMCR',
 'AEE',
 'AEP',
 'AXP',
 'AIG',
 'AMT',
 'AWK',
 'AMP',
 'AME',
 'AMGN',
 'APH',
 'ADI',
 'AON',
 'APA',
 'APO',
 'AAPL',
 'AMAT',
 'APP',
 'APTV',
 'ACGL',
 'ADM',
 'ANET',
 'AJG',
 'AIZ',
 'T',
 'ATO',
 'ADSK',
 'ADP',
 'AZO',
 'AVB',
 'AVY',
 'AXON',
 'BKR',
 'BALL',
 'BAC',
 'BAX',
 'BDX',
 'BRK-B',
 'BBY',
 'TECH',
 'BIIB',
 'BLK',
 'BX',
 'XYZ',
 'BK',
 'BA',
 'BKNG',
 'BSX',
 'BMY',
 'AVGO',
 'BR',
 'BRO',
 'BF-B',
 'BLDR',
 'BG',
 'BXP',
 'CHRW',
 'CDNS',
 'CPT',
 'CPB',
 'COF',
 'CAH',
 'CCL',
 'CARR',
 'CAT',
 'CBOE',
 'CBRE',
 'CDW',
 'COR',
 'CNC',
 'CNP',
 'CF',
 'CRL',
 'SCHW',
 'CHTR',
 'CVX',
 'CMG',
 'CB',
 'CHD',
 'CI',
 'CINF',
 'CTAS',
 'CSCO',
 'C',
 'CFG',
 'CLX',
 'CME',
 'CMS',
 'KO',
 'CTSH',
 'COIN',
 'CL',
 'CMCSA',
 'CAG',
 'COP',
 'ED',
 'STZ',
 'CEG',
 'COO',


In [9]:
bad_tickers = []
events = []

for t in tickers:
    try:
        print(f"Gathering earnings for {t}‚Ä¶", end=" ")

        ticker_obj = yf.Ticker(t)
        ed = ticker_obj.earnings_dates

        # skip if None or empty DataFrame
        if ed is None or ed.empty:
            print("‚ö†Ô∏è  No data ‚Äî skipping")
            bad_tickers.append(t)
            continue

        ed = ed.reset_index()
        ed.columns = ["Date", "Estimate", "Reported", "Surprise_%"]

        # normalize dates
        ed["Date"] = ed["Date"].dt.tz_localize(None).dt.normalize()
        ed["Ticker"] = t
        events.append(ed[["Date", "Ticker"]])
        print("‚úÖ done")

    except Exception as e:
        print(f"‚ùå Error: {e}")
        bad_tickers.append(t)
        continue

# Combine all successful results
if events:
    earnings_dates = pd.concat(events, ignore_index=True)
    earnings_dates = earnings_dates[
        (earnings_dates["Date"] >= pd.to_datetime(start_date))
        & (earnings_dates["Date"] <= pd.to_datetime(end_date))
    ].sort_values(["Date", "Ticker"]).reset_index(drop=True)
else:
    earnings_dates = pd.DataFrame(columns=["Date", "Ticker"])

print(f"\n‚úÖ Total valid earnings events: {len(earnings_dates)}")
print(f"üö´ Skipped {len(bad_tickers)} tickers with no data: {bad_tickers[:10]}‚Ä¶")


Gathering earnings for MMM‚Ä¶ ‚úÖ done
Gathering earnings for AOS‚Ä¶ ‚úÖ done
Gathering earnings for ABT‚Ä¶ ‚úÖ done
Gathering earnings for ABBV‚Ä¶ ‚úÖ done
Gathering earnings for ACN‚Ä¶ ‚úÖ done
Gathering earnings for ADBE‚Ä¶ ‚úÖ done
Gathering earnings for AMD‚Ä¶ ‚úÖ done
Gathering earnings for AES‚Ä¶ ‚úÖ done
Gathering earnings for AFL‚Ä¶ ‚úÖ done
Gathering earnings for A‚Ä¶ ‚úÖ done
Gathering earnings for APD‚Ä¶ ‚úÖ done
Gathering earnings for ABNB‚Ä¶ ‚úÖ done
Gathering earnings for AKAM‚Ä¶ ‚úÖ done
Gathering earnings for ALB‚Ä¶ ‚úÖ done
Gathering earnings for ARE‚Ä¶ ‚úÖ done
Gathering earnings for ALGN‚Ä¶ ‚úÖ done
Gathering earnings for ALLE‚Ä¶ ‚úÖ done
Gathering earnings for LNT‚Ä¶ ‚úÖ done
Gathering earnings for ALL‚Ä¶ ‚úÖ done
Gathering earnings for GOOGL‚Ä¶ 

  df['Earnings Date'] = pd.to_datetime(df['Event Start Date'])


‚úÖ done
Gathering earnings for GOOG‚Ä¶ ‚úÖ done
Gathering earnings for MO‚Ä¶ ‚úÖ done
Gathering earnings for AMZN‚Ä¶ ‚úÖ done
Gathering earnings for AMCR‚Ä¶ ‚úÖ done
Gathering earnings for AEE‚Ä¶ ‚úÖ done
Gathering earnings for AEP‚Ä¶ ‚úÖ done
Gathering earnings for AXP‚Ä¶ ‚úÖ done
Gathering earnings for AIG‚Ä¶ ‚úÖ done
Gathering earnings for AMT‚Ä¶ ‚úÖ done
Gathering earnings for AWK‚Ä¶ ‚úÖ done
Gathering earnings for AMP‚Ä¶ ‚úÖ done
Gathering earnings for AME‚Ä¶ ‚úÖ done
Gathering earnings for AMGN‚Ä¶ ‚úÖ done
Gathering earnings for APH‚Ä¶ ‚úÖ done
Gathering earnings for ADI‚Ä¶ ‚úÖ done
Gathering earnings for AON‚Ä¶ ‚úÖ done
Gathering earnings for APA‚Ä¶ ‚úÖ done
Gathering earnings for APO‚Ä¶ ‚úÖ done
Gathering earnings for AAPL‚Ä¶ ‚úÖ done
Gathering earnings for AMAT‚Ä¶ ‚úÖ done
Gathering earnings for APP‚Ä¶ ‚úÖ done
Gathering earnings for APTV‚Ä¶ ‚úÖ done
Gathering earnings for ACGL‚Ä¶ ‚úÖ done
Gathering earnings for ADM‚Ä¶ ‚úÖ done
Gathering earnings for ANET‚Ä¶ ‚úÖ done
Gatherin

FOX: $FOX: possibly delisted; no earnings dates found


‚úÖ done
Gathering earnings for FOX‚Ä¶ ‚ö†Ô∏è  No data ‚Äî skipping
Gathering earnings for BEN‚Ä¶ ‚úÖ done
Gathering earnings for FCX‚Ä¶ ‚úÖ done
Gathering earnings for GRMN‚Ä¶ ‚úÖ done
Gathering earnings for IT‚Ä¶ ‚úÖ done
Gathering earnings for GE‚Ä¶ ‚úÖ done
Gathering earnings for GEHC‚Ä¶ ‚úÖ done
Gathering earnings for GEV‚Ä¶ ‚úÖ done
Gathering earnings for GEN‚Ä¶ ‚úÖ done
Gathering earnings for GNRC‚Ä¶ ‚úÖ done
Gathering earnings for GD‚Ä¶ ‚úÖ done
Gathering earnings for GIS‚Ä¶ ‚úÖ done
Gathering earnings for GM‚Ä¶ ‚úÖ done
Gathering earnings for GPC‚Ä¶ ‚úÖ done
Gathering earnings for GILD‚Ä¶ ‚úÖ done
Gathering earnings for GPN‚Ä¶ ‚úÖ done
Gathering earnings for GL‚Ä¶ ‚úÖ done
Gathering earnings for GDDY‚Ä¶ ‚úÖ done
Gathering earnings for GS‚Ä¶ ‚úÖ done
Gathering earnings for HAL‚Ä¶ ‚úÖ done
Gathering earnings for HIG‚Ä¶ ‚úÖ done
Gathering earnings for HAS‚Ä¶ ‚úÖ done
Gathering earnings for HCA‚Ä¶ ‚úÖ done
Gathering earnings for DOC‚Ä¶ ‚úÖ done
Gathering earnings for HSIC‚Ä¶ ‚úÖ d

NWS: $NWS: possibly delisted; no earnings dates found


‚úÖ done
Gathering earnings for NWS‚Ä¶ ‚ö†Ô∏è  No data ‚Äî skipping
Gathering earnings for NEE‚Ä¶ ‚úÖ done
Gathering earnings for NKE‚Ä¶ ‚úÖ done
Gathering earnings for NI‚Ä¶ ‚úÖ done
Gathering earnings for NDSN‚Ä¶ ‚úÖ done
Gathering earnings for NSC‚Ä¶ ‚úÖ done
Gathering earnings for NTRS‚Ä¶ ‚úÖ done
Gathering earnings for NOC‚Ä¶ ‚úÖ done
Gathering earnings for NCLH‚Ä¶ ‚úÖ done
Gathering earnings for NRG‚Ä¶ ‚úÖ done
Gathering earnings for NUE‚Ä¶ ‚úÖ done
Gathering earnings for NVDA‚Ä¶ ‚úÖ done
Gathering earnings for NVR‚Ä¶ ‚úÖ done
Gathering earnings for NXPI‚Ä¶ ‚úÖ done
Gathering earnings for ORLY‚Ä¶ ‚úÖ done
Gathering earnings for OXY‚Ä¶ ‚úÖ done
Gathering earnings for ODFL‚Ä¶ ‚úÖ done
Gathering earnings for OMC‚Ä¶ ‚úÖ done
Gathering earnings for ON‚Ä¶ ‚úÖ done
Gathering earnings for OKE‚Ä¶ ‚úÖ done
Gathering earnings for ORCL‚Ä¶ ‚úÖ done
Gathering earnings for OTIS‚Ä¶ ‚úÖ done
Gathering earnings for PCAR‚Ä¶ ‚úÖ done
Gathering earnings for PKG‚Ä¶ ‚úÖ done
Gathering earnings for PLTR

PSKY: $PSKY: possibly delisted; no earnings dates found


‚ö†Ô∏è  No data ‚Äî skipping
Gathering earnings for PH‚Ä¶ ‚úÖ done
Gathering earnings for PAYX‚Ä¶ ‚úÖ done
Gathering earnings for PAYC‚Ä¶ ‚úÖ done
Gathering earnings for PYPL‚Ä¶ ‚úÖ done
Gathering earnings for PNR‚Ä¶ ‚úÖ done
Gathering earnings for PEP‚Ä¶ ‚úÖ done
Gathering earnings for PFE‚Ä¶ ‚úÖ done
Gathering earnings for PCG‚Ä¶ ‚úÖ done
Gathering earnings for PM‚Ä¶ ‚úÖ done
Gathering earnings for PSX‚Ä¶ ‚úÖ done
Gathering earnings for PNW‚Ä¶ ‚úÖ done
Gathering earnings for PNC‚Ä¶ ‚úÖ done
Gathering earnings for POOL‚Ä¶ ‚úÖ done
Gathering earnings for PPG‚Ä¶ ‚úÖ done
Gathering earnings for PPL‚Ä¶ ‚úÖ done
Gathering earnings for PFG‚Ä¶ ‚úÖ done
Gathering earnings for PG‚Ä¶ ‚úÖ done
Gathering earnings for PGR‚Ä¶ ‚úÖ done
Gathering earnings for PLD‚Ä¶ ‚úÖ done
Gathering earnings for PRU‚Ä¶ ‚úÖ done
Gathering earnings for PEG‚Ä¶ ‚úÖ done
Gathering earnings for PTC‚Ä¶ ‚úÖ done
Gathering earnings for PSA‚Ä¶ ‚úÖ done
Gathering earnings for PHM‚Ä¶ ‚úÖ done
Gathering earnings for PWR‚Ä¶ ‚úÖ 

SOLS: $SOLS: possibly delisted; no earnings dates found


‚ö†Ô∏è  No data ‚Äî skipping
Gathering earnings for SOLV‚Ä¶ ‚úÖ done
Gathering earnings for SO‚Ä¶ ‚úÖ done
Gathering earnings for LUV‚Ä¶ ‚úÖ done
Gathering earnings for SWK‚Ä¶ ‚úÖ done
Gathering earnings for SBUX‚Ä¶ ‚úÖ done
Gathering earnings for STT‚Ä¶ ‚úÖ done
Gathering earnings for STLD‚Ä¶ ‚úÖ done
Gathering earnings for STE‚Ä¶ ‚úÖ done
Gathering earnings for SYK‚Ä¶ ‚úÖ done
Gathering earnings for SMCI‚Ä¶ ‚úÖ done
Gathering earnings for SYF‚Ä¶ ‚úÖ done
Gathering earnings for SNPS‚Ä¶ ‚úÖ done
Gathering earnings for SYY‚Ä¶ ‚úÖ done
Gathering earnings for TMUS‚Ä¶ ‚úÖ done
Gathering earnings for TROW‚Ä¶ ‚úÖ done
Gathering earnings for TTWO‚Ä¶ ‚úÖ done
Gathering earnings for TPR‚Ä¶ ‚úÖ done
Gathering earnings for TRGP‚Ä¶ ‚úÖ done
Gathering earnings for TGT‚Ä¶ ‚úÖ done
Gathering earnings for TEL‚Ä¶ ‚úÖ done
Gathering earnings for TDY‚Ä¶ ‚úÖ done
Gathering earnings for TER‚Ä¶ ‚úÖ done
Gathering earnings for TSLA‚Ä¶ ‚úÖ done
Gathering earnings for TXN‚Ä¶ ‚úÖ done
Gathering earnings for TPL

In [10]:
# Step 2 ‚Äî Build unified earnings_dates with Surprise_% included
events = []

# Filter tickers to exclude those with no data (collected earlier)
good_tickers = [t for t in tickers if t not in bad_tickers]
print(f"Processing {len(good_tickers)} valid tickers (skipping {len(bad_tickers)} bad ones)")

for t in good_tickers:
    try:
        ed = yf.Ticker(t).earnings_dates
        if ed is None or ed.empty:
            print(f"‚ö†Ô∏è  No earnings data for {t} ‚Äî skipping")
            bad_tickers.append(t)
            continue

        ed = ed.reset_index()
        ed.columns = ["Date", "Earnings_Estimate", "Reported_Earnings", "Surprise_%"]
        ed["Date"] = ed["Date"].dt.tz_localize(None).dt.normalize()
        ed["Ticker"] = t
        events.append(ed[["Date", "Ticker", "Surprise_%"]])

    except Exception as e:
        print(f"‚ùå {t}: {e}")
        bad_tickers.append(t)
        continue

earnings_dates = (
    pd.concat(events, ignore_index=True)
      .query("@start_date <= Date <= @end_date")
      .sort_values(["Date", "Ticker"])
      .reset_index(drop=True)
)

print(f"‚úÖ Total earnings events (with Surprise_%): {len(earnings_dates)}")
display(earnings_dates.head())

Processing 499 valid tickers (skipping 4 bad ones)


  df['Earnings Date'] = pd.to_datetime(df['Event Start Date'])


‚úÖ Total earnings events (with Surprise_%): 5901


Unnamed: 0,Date,Ticker,Surprise_%
0,2021-11-01,L,
1,2022-02-07,L,
2,2022-05-03,SMCI,38.7
3,2022-06-30,STZ,5.89
4,2022-07-12,PEP,7.11


## Step 3: Feature Engineering Per Ticker

In [None]:
feature_list = []

for t in tickers:
    print(f"  Engineering features for {t}‚Ä¶")
    df_t = raw[t].copy()  # slice out ticker
    
    # 1) flatten MultiIndex
    if isinstance(df_t.columns, pd.MultiIndex):
        df_t.columns = df_t.columns.get_level_values(0)
    # 2) drop stray Price
    df_t.drop(columns=[c for c in ["Price"] if c in df_t], inplace=True)
    
    # 3) coerce core series
    df_t["Close"]  = pd.to_numeric(df_t["Close"],  errors="coerce")
    df_t["Volume"] = pd.to_numeric(df_t["Volume"], errors="coerce")
    df_t.dropna(subset=["Close","Volume"], inplace=True)
    
    # 4) basic features
    df_t["Return"]      = df_t["Close"].pct_change()
    df_t["Volatility"]  = df_t["Return"].rolling(5).std()
    df_t["RSI"]         = RSIIndicator(close=df_t["Close"], window=14).rsi()
    df_t["MA5"]         = df_t["Close"].rolling(5).mean()
    df_t["MA10"]        = df_t["Close"].rolling(10).mean()
    df_t["MA_ratio"]    = df_t["MA5"] / df_t["MA10"] - 1
    df_t["Volume_Avg20"]= df_t["Volume"].rolling(20).mean()
    df_t["Volume_Spike"]= df_t["Volume"] / df_t["Volume_Avg20"] - 1
    
    # 5) new predictive features
    df_t["Momentum3"]   = df_t["Close"].pct_change(3)
    atr = AverageTrueRange(high=df_t["High"], low=df_t["Low"], 
                           close=df_t["Close"], window=14)
    df_t["ATR14"]       = atr.average_true_range()
    df_t["DayOfWeek"]   = df_t.index.dayofweek
    df_t["Month"]       = df_t.index.month

    # ‚Äî‚Äî‚Äî HERE: merge in Surprise_% from your earnings_dates ‚Äî‚Äî‚Äî
    # slice out only this ticker‚Äôs surprises
    ed_t = (
        earnings_dates
        .loc[earnings_dates["Ticker"] == t, ["Date","Surprise_%"]]
        .set_index("Date")
    )
    # align on the same dates, left‚Äêjoin so non-event days get NaN
    df_t = df_t.join(ed_t, how="left")
    # fill non‚Äêevents with zero surprise
    df_t["Surprise_%"] = df_t["Surprise_%"].fillna(0)
    
    # 6) drop NaNs from rolling/pct_change (but keep Surprise_% zeros)
    df_t.dropna(subset=[
        "Return","Volatility","RSI","MA5","MA10","MA_ratio",
        "Volume_Avg20","Volume_Spike","Momentum3","ATR14"
    ], inplace=True)
    
    # 7) select your expanded feature set
    keep = [
        "Close","Volume",
        "Return","Volatility","RSI",
        "MA5","MA10","MA_ratio",
        "Volume_Avg20","Volume_Spike",
        "Momentum3","ATR14",
        "DayOfWeek","Month",
        "Surprise_%"
    ]
    feats = df_t[keep].copy()
    feats["Ticker"] = t
    feature_list.append(feats)

# concatenate all tickers
features_df = pd.concat(feature_list)
features_df.index.name = "Date"
features_df = features_df.sort_index()

print("Features shape (all tickers):", features_df.shape)
display(features_df.head())

## Step 4: Create Event-Only Labels

In [54]:
# Step 4: Label creation (target variable) for multi‚Äêticker MultiIndex

def create_labels(event_dates, price_df, horizon=3):
    """
    event_dates: DataFrame with ['Date','Ticker'] columns of pd.Timestamps
    price_df:    DataFrame with a MultiIndex (Date, Ticker) and at least a 'Close' column
    horizon:     how many trading days ahead to look
    """
    labels = []
    # 1) pre‚Äêshift the Close series within each ticker
    future_close = price_df['Close'].groupby(level='Ticker').shift(-horizon)
    
    for _, ev in event_dates.iterrows():
        dt, tkr = ev['Date'], ev['Ticker']
        key = (dt, tkr)
        # 2) skip if that (Date, Ticker) combo isn't in your features
        if key not in price_df.index:
            continue
        
        past = price_df.at[key, 'Close']
        fut  = future_close.at[key]
        # 3) skip if we ran off the end
        if pd.isna(fut):
            continue
        
        ret = (fut - past) / past
        labels.append({
          'Date':   dt,
          'Ticker': tkr,
          'Target': int(ret > 0)
        })
    
    return pd.DataFrame(labels)


# ‚Äî how to call it ‚Äî
# make sure features_df is a MultiIndexed DF: index names must be ['Date','Ticker']
features_df = features_df.reset_index().set_index(['Date','Ticker'])

labels_df = create_labels(earnings_dates, features_df, horizon=horizon)
print(f"Labeled {len(labels_df)} events:")
display(labels_df)

Labeled 2963 events:


Unnamed: 0,Date,Ticker,Target
0,2022-07-28,KIM,1
1,2022-08-01,SPG,0
2,2022-08-04,FRT,1
3,2022-08-17,AMCR,0
4,2022-10-27,AOS,1
...,...,...,...
2958,2024-12-19,KMX,1
2959,2024-12-19,LW,0
2960,2024-12-19,NKE,0
2961,2024-12-19,PAYX,1


## Step 5: Merge features & labels for multi-ticker dataset

In [55]:
# 1) Ensure the feature and label DataFrames share the same MultiIndex
#    (Date,Ticker) before joining:

# features_df should already be indexed by (Date,Ticker)
# if not, do it explicitly:
features_df = features_df.reset_index().set_index(['Date','Ticker'])

# labels_df just needs to have the same index
labels_df = labels_df.set_index(['Date','Ticker'])

# 2) Join on that MultiIndex, pulling in only the 'Target' column from labels_df
final_df = features_df.join(
    labels_df[['Target']],
    how='inner'
).reset_index()

# 3) Inspect & save
print("Final dataset shape:", final_df.shape)
display(final_df.head())

final_df.to_csv("multi_ticker_earnings_dataset.csv", index=False)
print("‚úÖ Saved to multi_ticker_earnings_dataset.csv")

Final dataset shape: (2963, 17)


Unnamed: 0,Date,Ticker,Close,Volume,Return,Volatility,RSI,MA5,MA10,MA_ratio,Volume_Avg20,Volume_Spike,Momentum3,ATR14,DayOfWeek,Month,Target
0,2022-07-28,KIM,21.860001,4028800.0,0.016744,0.013179,62.512911,21.498,21.18,0.015014,3674420.0,0.096445,0.016272,0.522246,3,7,1
1,2022-08-01,SPG,108.629997,1650100.0,-9.2e-05,0.017185,65.657759,106.286,104.909999,0.013116,1648115.0,0.001204,0.043315,2.614542,0,8,0
2,2022-08-04,FRT,105.040001,797300.0,0.008352,0.009903,59.129102,104.944,103.873,0.010311,644255.0,0.237553,-0.00709,2.419999,3,8,1
3,2022-08-17,AMCR,13.01,8882400.0,-0.021068,0.017779,56.644113,12.954,12.713,0.018957,9221750.0,-0.036799,0.015613,0.295485,2,8,0
4,2022-10-27,AOS,51.900002,1332100.0,-0.000962,0.014431,52.561661,51.376,50.882,0.009709,1311860.0,0.015428,0.021654,1.601343,3,10,1


‚úÖ Saved to multi_ticker_earnings_dataset.csv
