In [5]:
pip install yfinance


Note: you may need to restart the kernel to use updated packages.


In [11]:
import pandas as pd
import numpy as np
import yfinance as yf
from datetime import datetime, timedelta

# --- 0. Setup: Simulate Multi-Stock Intraday Data Download ---
def get_intraday_dummy_data(ticker, start_date, end_date, interval='5m'):
    """Simulates downloading 5-minute intraday data."""
    print(f"Simulating {interval} data for {ticker}...")
    try:
        data = yf.download(ticker, start=start_date, end=end_date, interval='1m', progress=False)
        if not data.empty:
            data_5m = data['Close'].resample('5min').ohlc()
            data_5m = pd.DataFrame(data_5m)
            data_5m.columns = ['Open', 'High', 'Low', 'Close']
            data_5m['Volume'] = data['Volume'].resample('5min').sum()
            data_5m.dropna(inplace=True)
            print(f"  Downloaded {len(data_5m)} actual 5m bars for {ticker} (using 1m resample).")
            return data_5m
    except Exception as e:
        print(f"  Could not fetch actual 1m data for {ticker} ({e}). Generating dummy data.")

    num_days = (pd.to_datetime(end_date) - pd.to_datetime(start_date)).days
    num_intervals_per_day = (9 * 60 + 30) // 5
    total_intervals = num_days * num_intervals_per_day

    timestamps = []
    current_dt = pd.to_datetime(start_date)
    while current_dt < pd.to_datetime(end_date):
        if current_dt.weekday() < 5:
            market_open = current_dt.replace(hour=9, minute=30, second=0, microsecond=0)
            market_close = current_dt.replace(hour=16, minute=0, second=0, microsecond=0)
            temp_time = market_open
            while temp_time <= market_close:
                timestamps.append(temp_time)
                temp_time += timedelta(minutes=5)
        current_dt += timedelta(days=1)

    base_price = 100 + np.random.rand() * 50
    prices = base_price + np.cumsum(np.random.randn(len(timestamps)) * 0.1)
    volumes = np.random.randint(1000, 100000, len(timestamps))

    df = pd.DataFrame({
        'Open': prices * (1 + np.random.uniform(-0.005, 0.005, len(timestamps))),
        'High': prices * (1 + np.random.uniform(0.001, 0.008, len(timestamps))),
        'Low': prices * (1 + np.random.uniform(-0.008, -0.001, len(timestamps))),
        'Close': prices,
        'Volume': volumes
    }, index=pd.to_datetime(timestamps))
    print(f"  Generated {len(df)} dummy 5m bars for {ticker}.")
    return df

tickers = ['AAPL', 'MSFT', 'GOOGL']
start_date = '2023-11-01'
end_date = '2024-01-31'

all_intraday_data = {}
for ticker in tickers:
    df_intraday = get_intraday_dummy_data(ticker, start_date, end_date)
    df_intraday['Ticker'] = ticker
    all_intraday_data[ticker] = df_intraday

combined_intraday_df = pd.concat(all_intraday_data.values())

print("\n--- Combined Intraday Data (First 5 rows across tickers) ---")
print(combined_intraday_df.head())
print(f"Total rows in combined intraday data: {len(combined_intraday_df)}")
print(f"Memory usage of combined intraday data: {combined_intraday_df.memory_usage(deep=True).sum() / (1024**2):.2f} MB")

if not isinstance(combined_intraday_df.index, pd.DatetimeIndex):
    combined_intraday_df.index = pd.to_datetime(combined_intraday_df.index)

print("\n--- Cleaning Intraday Data (handling NaNs) ---")
initial_rows_before_cleaning = len(combined_intraday_df)
combined_intraday_df['Close'] = combined_intraday_df.groupby('Ticker')['Close'].ffill()
combined_intraday_df.dropna(inplace=True)
rows_after_cleaning = len(combined_intraday_df)
print(f"Rows before cleaning: {initial_rows_before_cleaning}, Rows after cleaning: {rows_after_cleaning}")

print("\n--- Resampling Intraday Data to Daily ---")
daily_df = combined_intraday_df.reset_index().rename(columns={'index': 'Datetime'})
daily_df['Date'] = daily_df['Datetime'].dt.date

daily_agg = daily_df.groupby(['Ticker', 'Date']).agg(
    Open=('Open', 'first'),
    High=('High', 'max'),
    Low=('Low', 'min'),
    Close=('Close', 'last'),
    Volume=('Volume', 'sum')
).reset_index()

daily_agg['Date'] = pd.to_datetime(daily_agg['Date'])
# IMPORTANT: Do NOT set index here yet for multi-level operations later
# daily_agg.set_index('Date', inplace=True) # <-- Removed this line for a reason explained below

print(daily_agg.head())
print(f"Total daily rows: {len(daily_agg)}")

print("\n--- Calculating Technical Indicators (Rolling Windows) ---")

def calculate_sma(data_series, window):
    return data_series.rolling(window=window).mean()

def calculate_std(data_series, window):
    return data_series.rolling(window=window).std()

def calculate_rsi(data_series, window=14):
    delta = data_series.diff()
    gain = (delta.where(delta > 0, 0)).rolling(window=window).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=window).mean()
    rs = gain / loss
    rsi = 100 - (100 / (1 + rs))
    return rsi

def add_indicators_to_stock_data(df_group):
    # Ensure the DataFrame is sorted by date within the group before calculating indicators
    df_group = df_group.sort_values(by='Date')
    df_group['SMA_20'] = calculate_sma(df_group['Close'], 20)
    df_group['SMA_50'] = calculate_sma(df_group['Close'], 50)
    df_group['BB_Upper'] = df_group['SMA_20'] + (calculate_std(df_group['Close'], 20) * 2)
    df_group['BB_Lower'] = df_group['SMA_20'] - (calculate_std(df_group['Close'], 20) * 2)
    df_group['RSI'] = calculate_rsi(df_group['Close'], 14)
    return df_group

# *** CRITICAL CHANGE HERE ***
# Ensure 'Ticker' and 'Date' are set as MultiIndex *before* applying the indicator function
# and then use .dropna() which respects the MultiIndex.
# The .groupby().apply() should return a MultiIndex automatically if the group keys are not reset
# However, for explicit control and to prevent issues, we'll set the index here.
daily_agg_with_indicators = daily_agg.set_index(['Ticker', 'Date']) # Set the MultiIndex here
# Now apply the function to the groups, it will respect the index
daily_agg_with_indicators = daily_agg_with_indicators.groupby(level='Ticker', group_keys=False).apply(add_indicators_to_stock_data)


# Drop any NaNs created by rolling window calculations (now respects MultiIndex)
daily_agg_with_indicators.dropna(inplace=True)

print(daily_agg_with_indicators.head(10)) # Show more rows to see different tickers
print("\n--- Index of daily_agg_with_indicators ---")
print(daily_agg_with_indicators.index) # Check the index type explicitly

# --- 5. Data Transformation: Filtering, Sorting, Grouping, Merging ---

# --- FIX THE ERROR LINE HERE ---
# Use pd.IndexSlice for robust multi-index slicing.
# This syntax is clearer and less prone to misinterpretation than direct tuple slicing.
idx = pd.IndexSlice
aapl_daily_data = daily_agg_with_indicators.loc[idx['AAPL', '2023-12-01':'2023-12-31'], :] # All columns

print("\n--- Filtered AAPL Daily Data (Dec 2023) ---")
print(aapl_daily_data.head())

# Example: Sorting the entire DataFrame by Ticker then Date (already done by groupby/apply, but good to show)
sorted_data = daily_agg_with_indicators.sort_values(by=['Ticker', 'Date'], ascending=True)
print("\n--- Sorted Data Head ---")
print(sorted_data.head())

# Example: Using pivot_table to get a wider format for ML input (e.g., all closes as columns)
pivot_close_prices = daily_agg_with_indicators.reset_index().pivot_table(
    index='Date',
    columns='Ticker',
    values='Close'
)
print("\n--- Pivot Table (Close Prices by Ticker) ---")
print(pivot_close_prices.head())

# --- 6. Final Preparation for ML Model (Feature Engineering for XAI) ---
print("\n--- Final Preparation for ML Model (Lag Features) ---")

final_ml_data = daily_agg_with_indicators.copy()

# Create a target variable: 'Next_Day_Close' (what we want to predict)
final_ml_data['Next_Day_Close'] = final_ml_data.groupby(level='Ticker')['Close'].shift(-1)

# Create lag features (e.g., previous day's close, RSI, Volume)
final_ml_data['Prev_Day_Close'] = final_ml_data.groupby(level='Ticker')['Close'].shift(1)
final_ml_data['Prev_RSI'] = final_ml_data.groupby(level='Ticker')['RSI'].shift(1)
final_ml_data['Prev_Volume'] = final_ml_data.groupby(level='Ticker')['Volume'].shift(1)

final_ml_data.dropna(inplace=True)

features = ['Open', 'High', 'Low', 'Close', 'SMA_20', 'SMA_50', 'BB_Upper', 'BB_Lower', 'RSI',
            'Volume', 'Prev_Day_Close', 'Prev_RSI', 'Prev_Volume'] # Added OHLC, Volume to features

X = final_ml_data[features]
y = final_ml_data['Next_Day_Close']

print(f"\nFinal ML Features (X) head:")
print(X.head())
print(f"\nFinal ML Target (y) head:")
print(y.head())

print(f"\nShape of X for ML model: {X.shape}")
print(f"Shape of y for ML model: {y.shape}")
print(f"Memory usage of final ML data: {final_ml_data.memory_usage(deep=True).sum() / (1024**2):.2f} MB")


1 Failed download:
['AAPL']: YFPricesMissingError('possibly delisted; no price data found  (1m 2023-11-01 -> 2024-01-31) (Yahoo error = "1m data not available for startTime=1698811200 and endTime=1706677200. Only 8 days worth of 1m granularity data are allowed to be fetched per request.")')

1 Failed download:
['MSFT']: YFPricesMissingError('possibly delisted; no price data found  (1m 2023-11-01 -> 2024-01-31) (Yahoo error = "1m data not available for startTime=1698811200 and endTime=1706677200. Only 8 days worth of 1m granularity data are allowed to be fetched per request.")')


Simulating 5m data for AAPL...
  Generated 5135 dummy 5m bars for AAPL.
Simulating 5m data for MSFT...



1 Failed download:
['GOOGL']: YFPricesMissingError('possibly delisted; no price data found  (1m 2023-11-01 -> 2024-01-31) (Yahoo error = "1m data not available for startTime=1698811200 and endTime=1706677200. Only 8 days worth of 1m granularity data are allowed to be fetched per request.")')


  Generated 5135 dummy 5m bars for MSFT.
Simulating 5m data for GOOGL...
  Generated 5135 dummy 5m bars for GOOGL.

--- Combined Intraday Data (First 5 rows across tickers) ---
                           Open        High         Low       Close  Volume  \
2023-11-01 09:30:00  149.008410  148.723901  147.800710  148.553671   95460   
2023-11-01 09:35:00  149.324805  149.012614  148.240204  148.589737   38758   
2023-11-01 09:40:00  148.781076  149.664110  148.224788  148.501234   46560   
2023-11-01 09:45:00  149.198465  149.111529  148.062493  148.523395   47220   
2023-11-01 09:50:00  149.048924  149.216944  147.737542  148.495384   92084   

                    Ticker  
2023-11-01 09:30:00   AAPL  
2023-11-01 09:35:00   AAPL  
2023-11-01 09:40:00   AAPL  
2023-11-01 09:45:00   AAPL  
2023-11-01 09:50:00   AAPL  
Total rows in combined intraday data: 15405
Memory usage of combined intraday data: 1.43 MB

--- Cleaning Intraday Data (handling NaNs) ---
Rows before cleaning: 15405, Rows 