# Masterframes Ingestion (Clean)

Purpose: fetch the OHLCV, dividends, splits, `earnings_dates`, and company-metadata (`meta_kv`) tables for the ticker universe and save each one as a CSV file under analysis/data.

In [9]:
from pathlib import Path
import pandas as pd
import sys

# Paths (analysis/data) -- both are relative to the notebook's working directory.
DATA_DIR = Path("data")
DATA_DIR.mkdir(parents=True, exist_ok=True)

INGEST_DIR = Path("../ingestion/yfinance")
# Register the ingestion helpers on sys.path as an ABSOLUTE path, and only once:
# - a relative entry stops resolving if the working directory changes later;
# - an unguarded append adds a duplicate entry every time this cell is re-run.
_ingest_path = str(INGEST_DIR.resolve())
if _ingest_path not in sys.path:
    sys.path.append(_ingest_path)

# Project-local module: this import must stay below the sys.path setup above.
import yfinance_methods_v2 as ym
print('DATA_DIR ->', DATA_DIR.resolve())
print('INGEST_DIR ->', INGEST_DIR.resolve())


DATA_DIR -> C:\Users\sherv\workingdir\Projects\stock_analysis_tool\git\analysis\data
INGEST_DIR -> C:\Users\sherv\workingdir\Projects\stock_analysis_tool\git\ingestion\yfinance


In [10]:
# Ticker universe to ingest. Order matters: the fetch cell walks this list in
# order, which fixes the row order of the concatenated masterframes.
TICKERS = [
    'AAPL', 'MSFT', 'NVDA', 'AMZN', 'META', 'GOOG', 'TSLA',
    'JPM', 'XOM', 'BRK-B', 'UNH',
    'SPY', 'QQQ', 'DIA', 'IWM',
    'XLK', 'XLF', 'XLV', 'XLE',
    'TLT',
]

# Both values are passed straight through to ym.get_ohlcv_data.
PERIOD = 'max'    # presumably "all available history" -- confirm in yfinance_methods_v2
INTERVAL = '1d'   # presumably daily bars -- confirm in yfinance_methods_v2

print('Tickers ->', len(TICKERS))


Tickers -> 20


## Build masterframes

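This cell fetches each table one ticker at a time and concatenates the per-ticker results. Results that are empty or are not DataFrames are skipped; if nothing usable comes back for a table, an empty frame with the expected columns is produced instead.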

In [11]:
def _cat(frames, cols):
    frames = [f for f in frames if isinstance(f, pd.DataFrame) and not f.empty]
    return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame(columns=cols)

def _collect(fetch, cols, **fetch_kwargs):
    """Call `fetch` once per ticker, in TICKERS order, and stack the results via _cat."""
    per_ticker = [fetch(symbol, **fetch_kwargs) for symbol in TICKERS]
    return _cat(per_ticker, cols)


# One full pass over TICKERS per table, tables built in this order.
ohlcv = _collect(
    ym.get_ohlcv_data,
    ['ticker','date','open','high','low','close','adj_close','volume','dividends','splits'],
    period=PERIOD,
    interval=INTERVAL,
)
dividends = _collect(ym.get_dividends, ['ticker','date','dividends'])
splits = _collect(ym.get_splits, ['ticker','date','splits'])
earnings = _collect(
    ym.get_earnings_dates,
    ['ticker','earnings_date','eps_estimate','reported_eps','surprise'],
)
meta_kv = _collect(ym.get_company_metadata, ['ticker','key','value'])

# Row counts, in the same order the tables were built above.
print('Rows ->', *(len(table) for table in (ohlcv, dividends, splits, earnings, meta_kv)))


DIA: $DIA: possibly delisted; no earnings dates found
IWM: $IWM: possibly delisted; no earnings dates found
TLT: $TLT: possibly delisted; no earnings dates found


Rows -> 153332 2053 47 194 2754


## Save outputs (optional)

Set SAVE = True to persist CSVs to analysis/data.

In [12]:
SAVE = True  # flip to False to skip writing anything to disk

if not SAVE:
    print('SAVE is False; not saving files')
else:
    # (file name, masterframe) pairs; files are written in this order,
    # without the DataFrame index.
    outputs = [
        ('ohlcv.csv', ohlcv),
        ('dividends.csv', dividends),
        ('splits.csv', splits),
        ('earnings_dates.csv', earnings),
        ('meta_kv.csv', meta_kv),
    ]
    for file_name, frame in outputs:
        frame.to_csv(DATA_DIR / file_name, index=False)
    print('Saved CSVs to', DATA_DIR.resolve())


Saved CSVs to C:\Users\sherv\workingdir\Projects\stock_analysis_tool\git\analysis\data


## Coverage and data quality

For each ticker, compute the first date, the last date, and the number of bars (rows with a parseable date) in the OHLCV table.

In [13]:
if ohlcv.empty:
    print('OHLCV empty; cannot compute coverage')
else:
    # Parse dates on a derived frame so the ohlcv masterframe itself is not modified.
    # errors='coerce' turns unparseable dates into NaT, which 'count' then excludes.
    dated = ohlcv.assign(date=pd.to_datetime(ohlcv['date'], errors='coerce'))
    dates_by_ticker = dated.groupby('ticker')['date']

    # One row per ticker: earliest date, latest date, number of non-null dates.
    cov = dates_by_ticker.agg(first='min', last='max', bars='count').reset_index()
    cov = cov.sort_values('ticker')
    display(cov)


Unnamed: 0,ticker,first,last,bars
0,AAPL,1980-12-12 05:00:00+00:00,2025-08-20 04:00:00+00:00,11263
1,AMZN,1997-05-15 04:00:00+00:00,2025-08-21 04:00:00+00:00,7112
2,BRK-B,1996-05-09 04:00:00+00:00,2025-08-21 04:00:00+00:00,7369
3,DIA,1998-01-20 05:00:00+00:00,2025-08-21 04:00:00+00:00,6941
4,GOOG,2004-08-19 04:00:00+00:00,2025-08-21 04:00:00+00:00,5286
5,IWM,2000-05-26 04:00:00+00:00,2025-08-21 04:00:00+00:00,6347
6,JPM,1980-03-17 05:00:00+00:00,2025-08-21 04:00:00+00:00,11452
7,META,2012-05-18 04:00:00+00:00,2025-08-21 04:00:00+00:00,3334
8,MSFT,1986-03-13 05:00:00+00:00,2025-08-21 04:00:00+00:00,9938
9,NVDA,1999-01-22 05:00:00+00:00,2025-08-21 04:00:00+00:00,6687


## Next steps

- eda.ipynb -> overview + teaser
- statistical_financial_analysis.ipynb -> correlation heatmap + CAPM
- indicators_metrics.ipynb -> compute indicators and export CSV
