# 01 — Data Collection

Downloads SPY (Adj Close, High, Low, Volume) and VIX (Close) from Yahoo Finance with smart caching (each download is cached locally as a CSV file).
Saves the combined raw data as a Parquet file under `data/raw/` for the next stage.

In [1]:
import sys
import os
from pathlib import Path

# ── Project configuration ──────────────────────────────────────────────────────
# Name of the sub-folder under config/ whose JSON files drive this run.
PROJECT_FOLDER = "default_run"   # "default_run" is the full-training configuration

# ── Set up paths ───────────────────────────────────────────────────────────────
def find_project_root(start, markers=("config", "src")):
    """Return the nearest directory at or above ``start`` that holds every marker.

    The notebook is normally launched from a sub-folder of the project, but a
    later cell changes the working directory to the project root. Taking
    ``cwd.parent`` unconditionally would then point one level too high when this
    cell is re-run, so the root is located by its contents instead.

    Args:
        start: Directory to begin the search from (usually the current
            working directory).
        markers: Names of sub-directories that must all exist in the root.

    Returns:
        The first matching directory, searching ``start`` and then each of its
        ancestors. If none matches, ``start.parent`` (the original behaviour).
    """
    start = Path(start).absolute()
    for candidate in (start, *start.parents):
        if all((candidate / marker).is_dir() for marker in markers):
            return candidate
    return start.parent


PROJECT_ROOT = find_project_root(Path.cwd())

# Keep the project root as the first sys.path entry, but only once, so that
# re-running this cell does not pile up duplicate entries.
_root_entry = str(PROJECT_ROOT)
sys.path[:] = [entry for entry in sys.path if entry != _root_entry]
sys.path.insert(0, _root_entry)

DATA_CONFIG = PROJECT_ROOT / "config" / PROJECT_FOLDER / "data_config.json"
MODEL_CONFIG = PROJECT_ROOT / "config" / PROJECT_FOLDER / "model_config.json"

print(f"Project root : {PROJECT_ROOT}")
print(f"Project folder: {PROJECT_FOLDER}")
print(f"Data config  : {DATA_CONFIG}")

Project root : /Users/sharannaribole/Documents/github/stock-return-classifier
Project folder: default_run
Data config  : /Users/sharannaribole/Documents/github/stock-return-classifier/config/default_run/data_config.json


In [2]:
from src.utils.config_loader import ConfigLoader

# Build the loader from the data and model config files selected above.
config = ConfigLoader(str(DATA_CONFIG), str(MODEL_CONFIG))

# Loader summary, a blank line, then the key run parameters one per line.
print(config, end="\n\n")
for label, key in (
    ("Ticker     ", "ticker"),
    ("Start date ", "start_date"),
    ("End date   ", "end_date"),
):
    print(f"{label}: {config.get(key)}")

ConfigLoader(ticker=SPY, project=default_run)

Ticker     : SPY
Start date : 2006-01-01
End date   : 2026-02-16


## Collect Data

In [3]:
import os
# Switch the working directory to the project root so relative paths resolve
# from there. This affects every cell executed after this one.
os.chdir(PROJECT_ROOT)  # ensure relative paths work

from src.data.collector import DataCollector

# Build the collector from the loaded config and fetch the combined frame;
# the log printed below reports the SPY and ^VIX downloads and their cache files.
collector = DataCollector(config)
raw_df = collector.collect_data()
# Preview the first rows as the cell's displayed output.
raw_df.head()

Collecting SPY data (2006-01-01 to 2026-02-16)...
  Downloading SPY from 2005-10-09 to 2026-02-16...
  Saved to cache: SPY_20051009_20260216.csv
Collecting VIX data (2006-01-01 to 2026-02-16)...
  Downloading ^VIX from 2005-10-09 to 2026-02-16...
  Saved to cache: VIX_20051009_20260216.csv
Combined data: 5119 rows, 5 columns


Unnamed: 0_level_0,Adj_Close,High,Low,Volume,VIX_Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2005-10-10,81.43998,82.202192,81.233979,52677000,15.55
2005-10-11,81.323257,81.982468,81.247722,75629800,15.63
2005-10-12,80.684631,81.803914,80.622833,100510400,16.219999
2005-10-13,80.636574,81.082915,80.258899,99052900,16.469999
2005-10-14,81.488052,81.584187,80.72584,88651000,14.87


## Data Summary

In [4]:
import pandas as pd

# Span of the index, row count and column names of the collected frame.
first_day = raw_df.index[0].date()
last_day = raw_df.index[-1].date()
print(f"Date range : {first_day} to {last_day}")
print(f"Total rows : {len(raw_df)}")
print(f"Columns    : {list(raw_df.columns)}")
print()

# Per-column count of missing entries.
print("Missing values:")
print(raw_df.isna().sum())
print()

# Per-column dtypes.
print("Data types:")
print(raw_df.dtypes)

Date range : 2005-10-10 to 2026-02-13
Total rows : 5119
Columns    : ['Adj_Close', 'High', 'Low', 'Volume', 'VIX_Close']

Missing values:
Adj_Close    0
High         0
Low          0
Volume       0
VIX_Close    0
dtype: int64

Data types:
Adj_Close    float64
High         float64
Low          float64
Volume         int64
VIX_Close    float64
dtype: object


In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for each column of the raw frame.
raw_df.describe()

Unnamed: 0,Adj_Close,High,Low,Volume,VIX_Close
count,5119.0,5119.0,5119.0,5119.0,5119.0
mean,233.729393,234.986188,232.285224,125242300.0,19.370766
std,160.502399,161.254408,159.607969,90695260.0,8.696475
min,49.944588,51.330511,49.203966,15270000.0,9.14
25%,101.015816,101.451447,100.39004,66577800.0,13.65
50%,175.186035,175.880107,174.312131,94717700.0,16.99
75%,346.946213,350.875336,343.744518,153786800.0,22.360001
max,695.48999,697.840027,693.940002,871026300.0,82.690002


## Save Raw Data

In [6]:
# Target location: <project root>/data/raw/<project folder>_raw.parquet
raw_dir = PROJECT_ROOT.joinpath("data", "raw")
raw_dir.mkdir(parents=True, exist_ok=True)

out_path = raw_dir.joinpath(f"{PROJECT_FOLDER}_raw.parquet")
raw_df.to_parquet(out_path)
print(f"Saved raw data to: {out_path}")

# Report the on-disk size of the file just written, in kilobytes.
size_kb = out_path.stat().st_size / 1024
print(f"File size: {size_kb:.1f} KB")

Saved raw data to: /Users/sharannaribole/Documents/github/stock-return-classifier/data/raw/default_run_raw.parquet
File size: 239.2 KB
