In [1]:
import pandas as pd
import yfinance as yf
from pathlib import Path
from typing import Optional
from datetime import datetime, timedelta


In [2]:
class StockDataFetcher:
    """
    Fetch daily OHLCV history through yfinance and persist it as CSV.

    Both public methods report problems by printing a message and returning
    a failure value (``None`` / ``False``) rather than raising.
    """

    # Source column label -> column name emitted by this class. The insertion
    # order is also the column order of the returned DataFrame (after 'time').
    _COLUMN_MAPPING = {
        'Open': 'open',
        'High': 'high',
        'Low': 'low',
        'Close': 'close',
        'Volume': 'volume',
    }

    def __init__(self) -> None:
        """
        Initialize the StockDataFetcher.

        No connection needed for yfinance - it uses HTTP requests.
        """

    def fetch_daily_data(
        self,
        symbol: str,
        num_bars: int = 5000
    ) -> Optional[pd.DataFrame]:
        """
        Fetch daily historical data for a given symbol.

        The full available history is requested (``period="max"``) and then
        trimmed to the most recent ``num_bars`` rows, oldest first.

        Args:
            symbol: Trading symbol (e.g., 'SPY')
            num_bars: Maximum number of daily bars to return (default: 5000).
                Must be positive.

        Returns:
            DataFrame with columns: time, open, high, low, close, volume
            (a warning is printed if the source lacks any of them, and the
            remaining columns are returned). Returns None if the fetch fails,
            no data is returned, or ``num_bars`` is not positive.
        """
        try:
            # DataFrame.tail() interprets a negative count as "everything
            # except the first n rows", so a non-positive request must be
            # rejected instead of silently returning the wrong slice.
            if num_bars <= 0:
                print(
                    f"✗ Failed to fetch data for {symbol}: "
                    f"num_bars must be positive, got {num_bars}"
                )
                return None

            ticker = yf.Ticker(symbol)

            print(f"Fetching data for {symbol}...")

            # period="max" asks for the whole history; trimming happens locally.
            hist = ticker.history(period="max", interval="1d")

            if hist is None or hist.empty:
                print(f"✗ Failed to fetch data for {symbol}: No data returned")
                return None

            df = self._normalize_history(hist, num_bars)

            # Make partial data visible instead of silently dropping columns.
            missing = [
                name for name in self._COLUMN_MAPPING.values()
                if name not in df.columns
            ]
            if missing:
                print(f"⚠ {symbol}: source data has no {', '.join(missing)} column(s)")

            print(f"✓ Successfully fetched {len(df)} bars for {symbol}")
            print(f"  Date range: {df['time'].min().date()} to {df['time'].max().date()}")

            return df

        except Exception as e:
            # Deliberate best-effort: any failure is reported and mapped to None.
            print(f"✗ Failed to fetch data for {symbol}: {e}")
            return None

    @staticmethod
    def _normalize_history(hist: pd.DataFrame, num_bars: int) -> pd.DataFrame:
        """Reshape a raw history frame into time/open/high/low/close/volume."""
        # The dates arrive as the index; turn them into an ordinary column.
        df = hist.reset_index()

        # Pick the first column that either holds datetimes (any unit,
        # timezone-aware or naive) or is named like a date column.
        time_col = next(
            (
                col for col in df.columns
                if pd.api.types.is_datetime64_any_dtype(df[col])
                or 'date' in str(col).lower()
            ),
            None,
        )
        if time_col is not None:
            df = df.rename(columns={time_col: 'time'})
        else:
            # Nothing recognisable as a date column: fall back to the
            # original index values and let to_datetime() parse them.
            df['time'] = hist.index

        df['time'] = pd.to_datetime(df['time'])

        # Resolve each expected column: exact label first, then a
        # case-insensitive match (str() guards against non-string labels).
        rename_dict = {}
        for source_name, target_name in StockDataFetcher._COLUMN_MAPPING.items():
            if source_name in df.columns:
                rename_dict[source_name] = target_name
                continue
            lowered = source_name.lower()
            match = next(
                (col for col in df.columns if str(col).lower() == lowered),
                None,
            )
            if match is not None:
                rename_dict[match] = target_name

        # rename_dict preserves the mapping order, so this selection already
        # yields the canonical column order.
        df = df[['time', *rename_dict]].rename(columns=rename_dict)

        # Keep the most recent num_bars rows with a fresh 0..n-1 index.
        return df.tail(num_bars).reset_index(drop=True)

    def save_to_csv(
        self,
        df: pd.DataFrame,
        symbol: str,
        output_dir: str = "data/raw"
    ) -> bool:
        """
        Save DataFrame to a timestamped CSV file.

        The file is written to ``<output_dir>/<symbol>_D1_<YYYYmmdd_HHMMSS>.csv``;
        the directory is created if it does not exist.

        Args:
            df: DataFrame containing market data
            symbol: Trading symbol (used for filename)
            output_dir: Directory to save the CSV file

        Returns:
            True if save successful, False otherwise (empty/None input,
            directory creation failure, or write failure).
        """
        if df is None or df.empty:
            print("✗ Cannot save empty DataFrame")
            return False

        output_path = Path(output_dir)

        # Timestamped name so repeated runs never overwrite earlier exports.
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filepath = output_path / f"{symbol}_D1_{timestamp}.csv"

        try:
            # Directory creation is inside the try so that a filesystem error
            # here also yields False, as the return contract promises.
            output_path.mkdir(parents=True, exist_ok=True)
            df.to_csv(filepath, index=False)
            print(f"✓ Data saved to: {filepath}")
            return True
        except Exception as e:
            print(f"✗ Failed to save CSV: {e}")
            return False
    



In [3]:
# Driver cell: download SPY daily bars, persist them, and show a quick overview.

# Initialize fetcher
fetcher = StockDataFetcher()

# Fetch data for SPY
symbol = "SPY"
num_bars = 5000

# Returns None (after printing the reason) when the download fails.
df = fetcher.fetch_daily_data(symbol=symbol, num_bars=num_bars)

if df is not None:
    # Save to CSV (save_to_csv prints its own success/failure message)
    fetcher.save_to_csv(df, symbol=symbol, output_dir="data/raw")

    # Display first few rows
    print("\nFirst 5 rows of fetched data:")
    display(df.head())

    # Display last few rows
    print("\nLast 5 rows of fetched data:")
    display(df.tail())

    # Display data summary.
    # DataFrame.info() writes to stdout itself and returns None, so it must
    # not be wrapped in print() (that would emit a stray "None" line).
    print("\nData Summary:")
    df.info()

    # Display basic statistics
    print("\nBasic Statistics:")
    display(df.describe())
else:
    print("Failed to fetch data.")


Fetching data for SPY...
✓ Successfully fetched 5000 bars for SPY
  Date range: 2006-02-13 to 2025-12-26
✓ Data saved to: data/raw/SPY_D1_20251228_215819.csv

First 5 rows of fetched data:


Unnamed: 0,time,open,high,low,close,volume
0,2006-02-13 00:00:00-05:00,87.394249,87.525411,86.945541,87.263092,52308700
1,2006-02-14 00:00:00-05:00,87.297607,88.381406,87.125028,88.188118,90964400
2,2006-02-15 00:00:00-05:00,88.139786,88.581594,87.836044,88.498749,85471300
3,2006-02-16 00:00:00-05:00,88.59541,89.195994,88.484957,89.161476,61017900
4,2006-02-17 00:00:00-05:00,89.085556,89.161492,88.761106,88.919876,40342600



Last 5 rows of fetched data:


Unnamed: 0,time,open,high,low,close,volume
4995,2025-12-19 00:00:00-05:00,676.590027,681.090027,676.469971,680.590027,103599500
4996,2025-12-22 00:00:00-05:00,683.940002,685.359985,680.590027,684.830017,69556700
4997,2025-12-23 00:00:00-05:00,683.919983,688.200012,683.869995,687.960022,64840000
4998,2025-12-24 00:00:00-05:00,687.950012,690.830017,687.799988,690.380005,39445600
4999,2025-12-26 00:00:00-05:00,690.640015,691.659973,689.27002,690.309998,41588400



Data Summary:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype                           
---  ------  --------------  -----                           
 0   time    5000 non-null   datetime64[ns, America/New_York]
 1   open    5000 non-null   float64                         
 2   high    5000 non-null   float64                         
 3   low     5000 non-null   float64                         
 4   close   5000 non-null   float64                         
 5   volume  5000 non-null   int64                           
dtypes: datetime64[ns, America/New_York](1), float64(4), int64(1)
memory usage: 234.5 KB
None

Basic Statistics:


Unnamed: 0,open,high,low,close,volume
count,5000.0,5000.0,5000.0,5000.0,5000.0
mean,233.230354,234.524174,231.815298,233.266011,126605700.0
std,156.901284,157.666636,156.046177,156.931485,91269330.0
min,49.827258,51.330511,49.203962,49.944588,20270000.0
25%,101.944911,102.423652,101.310176,101.871138,67513600.0
50%,175.593658,176.289509,174.909323,175.698975,95917550.0
75%,346.219929,349.274187,342.90389,345.589256,155600300.0
max,690.640015,691.659973,689.27002,690.380005,871026300.0
