In [None]:
import yfinance as yf
import pandas as pd
import os
from datetime import datetime, timedelta

# The 20 most institutionally held stocks, spelled with Yahoo Finance symbols.
tickers = [
    "GOOG", "MSFT", "BRK-B", "AAPL", "AMZN", "JPM", "JNJ",
    "NVDA", "XOM", "V", "PG", "META", "HD", "SPY", "CVX",
    "MRK", "PEP", "ABBV", "UNH", "COST"
]
# The symbols are passed to yfinance unchanged. Rewriting the hyphen to a dot
# turns "BRK-B" into "BRK.B", for which Yahoo returns no price data, and the
# later cells look for a file named BRK-B.csv.

period = "7d"       # The most that is pulled here given the 1m interval
interval = "1m"

# Directory that receives the raw (uncleaned) downloads
output_dir = "yfinance_1m_data"
os.makedirs(output_dir, exist_ok=True)

def download_and_save(ticker, period, interval, output_dir):
    """Download price history for one ticker and store it as a CSV file.

    The data is requested through ``yf.download`` and, when the returned
    frame is not empty, written to ``<output_dir>/<ticker>.csv``.

    Args:
        ticker: Symbol to request; also used as the file stem.
        period: Look-back window forwarded to ``yf.download``.
        interval: Bar size forwarded to ``yf.download``.
        output_dir: Directory the CSV is written into.

    Returns:
        None. Every outcome (saved, no data, error) is reported with
        ``print``; exceptions are caught and logged rather than raised.
    """
    try:
        print(f"Downloading data for {ticker}...")
        frame = yf.download(tickers=ticker, period=period, interval=interval, progress=False)

        if not frame.empty:
            csv_path = os.path.join(output_dir, f"{ticker}.csv")
            frame.to_csv(csv_path)
            print(f"Data for {ticker} saved to {csv_path}")
        else:
            print(f"No data found for {ticker}. It might be due to the period/interval limitations.")

    except Exception as err:
        print(f"Error downloading data for {ticker}: {err}")

# Download each ticker in turn. download_and_save catches and prints its own
# errors, so a failing symbol does not stop the remaining downloads.
for ticker in tickers:
    download_and_save(ticker, period, interval, output_dir)

print("Data download complete.")


Downloading data for GOOG...
Data for GOOG saved to yfinance_1m_data/GOOG.csv
Downloading data for MSFT...
Data for MSFT saved to yfinance_1m_data/MSFT.csv
Downloading data for BRK.B...


ERROR:yfinance:
1 Failed download:
ERROR:yfinance:['BRK.B']: YFPricesMissingError('$%ticker%: possibly delisted; no price data found  (period=7d) (Yahoo error = "No data found, symbol may be delisted")')


No data found for BRK.B. It might be due to the period/interval limitations.
Downloading data for AAPL...
Data for AAPL saved to yfinance_1m_data/AAPL.csv
Downloading data for AMZN...
Data for AMZN saved to yfinance_1m_data/AMZN.csv
Downloading data for JPM...
Data for JPM saved to yfinance_1m_data/JPM.csv
Downloading data for JNJ...
Data for JNJ saved to yfinance_1m_data/JNJ.csv
Downloading data for NVDA...
Data for NVDA saved to yfinance_1m_data/NVDA.csv
Downloading data for XOM...
Data for XOM saved to yfinance_1m_data/XOM.csv
Downloading data for V...
Data for V saved to yfinance_1m_data/V.csv
Downloading data for PG...
Data for PG saved to yfinance_1m_data/PG.csv
Downloading data for META...
Data for META saved to yfinance_1m_data/META.csv
Downloading data for HD...
Data for HD saved to yfinance_1m_data/HD.csv
Downloading data for SPY...
Data for SPY saved to yfinance_1m_data/SPY.csv
Downloading data for CVX...
Data for CVX saved to yfinance_1m_data/CVX.csv
Downloading data for MR

In [None]:
import pandas as pd
import os

input_dir = "yfinance_1m_data"
output_dir = "yfinance_1m_data_cleaned"
os.makedirs(output_dir, exist_ok=True)

# Rewrite every raw CSV into the cleaned directory. The listing is sorted so
# the files are processed (and logged) in the same order on every run;
# os.listdir alone returns them in arbitrary order.
for filename in sorted(os.listdir(input_dir)):
    if not filename.endswith(".csv"):
        continue

    input_path = os.path.join(input_dir, filename)
    output_path = os.path.join(output_dir, filename)

    try:
        # skiprows=[1, 2] drops the second and third physical lines of the
        # file and keeps the first line as the header.
        # NOTE(review): presumably these are the extra header lines written
        # for multi-level columns -- confirm against a raw file.
        df = pd.read_csv(input_path, skiprows=[1, 2])

        # The first column holds the timestamps; give it a stable name.
        df = df.rename(columns={df.columns[0]: 'Datetime'})

        df.to_csv(output_path, index=False)

        print(f"Processed {filename}: removed the two extra header rows and renamed first column to 'Datetime'. Saved to {output_dir}.")

    except Exception as e:
        print(f"Error processing {filename}: {e}")

print("All files processed.")


Processed GOOG.csv: removed second row and renamed first column to 'Datetime'. Saved to yfinance_1m_data_cleaned.
Processed ABBV.csv: removed second row and renamed first column to 'Datetime'. Saved to yfinance_1m_data_cleaned.
Processed JNJ.csv: removed second row and renamed first column to 'Datetime'. Saved to yfinance_1m_data_cleaned.
Processed AMZN.csv: removed second row and renamed first column to 'Datetime'. Saved to yfinance_1m_data_cleaned.
Processed COST.csv: removed second row and renamed first column to 'Datetime'. Saved to yfinance_1m_data_cleaned.
Processed HD.csv: removed second row and renamed first column to 'Datetime'. Saved to yfinance_1m_data_cleaned.
Processed PG.csv: removed second row and renamed first column to 'Datetime'. Saved to yfinance_1m_data_cleaned.
Processed V.csv: removed second row and renamed first column to 'Datetime'. Saved to yfinance_1m_data_cleaned.
Processed SPY.csv: removed second row and renamed first column to 'Datetime'. Saved to yfinance_

In [None]:
import pandas as pd
import os
from datetime import datetime
import pytz

# Cleaned 1-minute CSVs are read from here ...
input_dir = "yfinance_1m_data_cleaned"

# ... and the per-ticker percent-change tables are written here.
output_dir = "percent_changes"
os.makedirs(output_dir, exist_ok=True)

tickers = [
    "GOOG", "MSFT", "BRK-B", "AAPL", "AMZN",
    "JPM", "JNJ", "NVDA", "XOM", "V",
    "PG", "META", "HD", "SPY", "CVX",
    "MRK", "PEP", "ABBV", "UNH", "COST",
]

# Symbols that keep their hyphen when the file name is derived.
exceptions = ["BRK-B"]

# File-name form of each ticker: hyphens become dots unless the ticker is
# listed in `exceptions`.
processed_tickers = [
    symbol if symbol in exceptions else symbol.replace('-', '.')
    for symbol in tickers
]

# Half-hour windows over which the change is observed: consecutive
# ("HH:MM", "HH:MM") start/end pairs from 09:30 through 16:00.
intervals = [
    (
        f"{minute // 60:02d}:{minute % 60:02d}",
        f"{(minute + 30) // 60:02d}:{(minute + 30) % 60:02d}",
    )
    for minute in range(9 * 60 + 30, 16 * 60, 30)
]


# Timezone Clarification
# UTC is applied to timestamps that carry no timezone information; ET is the
# timezone the index is converted to before any time-of-day lookup is made.
UTC = pytz.utc
ET = pytz.timezone('US/Eastern')

def calculate_percent_changes(ticker, input_dir, output_dir, intervals):
    """Compute overnight and intraday percent changes for one ticker.

    Reads ``<input_dir>/<ticker>.csv`` (indexed by its ``Datetime`` column),
    converts the index to the module-level ``ET`` timezone and, for every
    date after the first one present in the file, records:

    * the overnight change from the previous date's last ``Close`` inside the
      15:59-16:00 window to the ``Open`` of the 09:30 bar, and
    * for each ``(start, end)`` pair in ``intervals``, the change from the
      ``Open`` of the bar at ``start`` to the ``Close`` of the bar at ``end``.
      When no bar is stamped exactly ``end`` the bar one minute earlier is
      used instead -- the same tolerance the previous-close lookup applies --
      so an interval ending at 16:00 is not dropped merely because the file
      has no 16:00 bar.

    The first date only acts as the "previous day" and produces no rows of
    its own. A date without a 09:30 bar is skipped entirely.

    The collected rows (``Date``, ``Interval``, ``Percent Change``) are
    written once, after all dates have been processed, to
    ``<output_dir>/<ticker>.csv``.

    Args:
        ticker: Symbol used as the CSV file stem and in log messages.
        input_dir: Directory holding the cleaned per-ticker CSV files.
        output_dir: Directory that receives the percent-change CSV.
        intervals: Iterable of ``("HH:MM", "HH:MM")`` start/end pairs.

    Returns:
        None. Problems are reported with ``print`` and the affected ticker,
        date or interval is skipped.
    """
    input_file = os.path.join(input_dir, f"{ticker}.csv")
    if not os.path.exists(input_file):
        print(f"CSV file for {ticker} not found in '{input_dir}'. Skipping.\n")
        return

    try:
        df = pd.read_csv(
            input_file,
            parse_dates=['Datetime'],
            index_col='Datetime'
        )
        print(f"Successfully read CSV for {ticker}.")
    except Exception as e:
        print(f"Error reading '{input_file}': {e}\n")
        return

    # Timestamps without timezone information are treated as UTC. getattr
    # routes an index that did not parse as datetimes to the error report
    # below instead of raising AttributeError on the attribute access.
    if getattr(df.index, 'tz', None) is None:
        try:
            df.index = df.index.tz_localize(UTC)
            print(f"Localized timezone for {ticker} to UTC.")
        except Exception as e:
            print(f"Error localizing timezone for {ticker}: {e}\n")
            return

    try:
        df.index = df.index.tz_convert(ET)
        print(f"Converted timezone for {ticker} to Eastern Time.")
    except Exception as e:
        print(f"Error converting timezone for {ticker}: {e}\n")
        return

    # Chronological order makes iloc[0] / iloc[-1] the first / last bar of a
    # time window.
    df = df.sort_index()

    # Calendar date in Eastern Time, used to split the data into sessions.
    df['Date'] = df.index.date

    records = []
    unique_dates = sorted(df['Date'].unique())
    one_minute = pd.Timedelta(minutes=1)

    # Walk consecutive (previous, current) date pairs.
    for previous_date, current_date in zip(unique_dates, unique_dates[1:]):
        # Previous session's close: last bar at or just before 16:00 ET.
        prev_day_data = df[df['Date'] == previous_date]
        try:
            prev_close_df = prev_day_data.between_time('15:59', '16:00')
            if prev_close_df.empty:
                print(f"No close data for {ticker} on {previous_date} at 16:00. Skipping overnight change for {current_date}.\n")
                prev_close_price = None
            else:
                prev_close_time = prev_close_df.iloc[-1]
                if 'Close' not in prev_close_time:
                    print(f"'Close' column missing for {ticker} on {previous_date} at 16:00. Skipping overnight change for {current_date}.\n")
                    prev_close_price = None
                else:
                    prev_close_price = prev_close_time['Close']
                    print(f"Previous close for {ticker} on {previous_date} is {prev_close_price}.")
        except Exception as e:
            print(f"Error retrieving previous close for {ticker} on {previous_date}: {e}\n")
            prev_close_price = None

        current_day_data = df[df['Date'] == current_date]

        # Without a 09:30 bar the whole date is skipped.
        try:
            open_price_df = current_day_data.between_time('09:30', '09:30')
            if open_price_df.empty:
                print(f"Open price at 09:30 for {ticker} on {current_date} not found. Skipping overnight change.\n")
                continue
            open_price = open_price_df.iloc[0]['Open']
            print(f"Open price for {ticker} on {current_date} at 09:30 is {open_price}.")
        except Exception as e:
            print(f"Error retrieving open price for {ticker} on {current_date} at 09:30: {e}\n")
            continue

        # Overnight change (previous close to open)
        if prev_close_price is not None:
            overnight_change = ((open_price - prev_close_price) / prev_close_price) * 100
            records.append({
                "Date": current_date,
                "Interval": "Overnight (16:00 Prev Close to 09:30 Open)",
                "Percent Change": overnight_change
            })
            print(f"Overnight change for {ticker} on {current_date}: {overnight_change:.4f}%")

        for start_time, end_time in intervals:
            try:
                start_clock = pd.to_datetime(start_time, format="%H:%M")
                end_clock = pd.to_datetime(end_time, format="%H:%M")

                # The bar stamped exactly at the interval start.
                start_rows = current_day_data.between_time(
                    start_clock.time(), start_clock.time()
                )
                # The bar at the interval end, or the one a minute earlier
                # when the end bar itself is absent.
                end_rows = current_day_data.between_time(
                    (end_clock - one_minute).time(), end_clock.time()
                )

                if start_rows.empty or end_rows.empty:
                    print(f"Missing data for {ticker} on {current_date} during interval {start_time} to {end_time}. Skipping this interval.\n")
                    continue

                start_price = start_rows.iloc[0]['Open']
                end_price = end_rows.iloc[-1]['Close']

                percent_change = ((end_price - start_price) / start_price) * 100
                interval_label = f"{start_time} to {end_time}"

                records.append({
                    "Date": current_date,
                    "Interval": interval_label,
                    "Percent Change": percent_change
                })
                print(f"Percent change for {ticker} on {current_date} during {interval_label}: {percent_change:.4f}%")
            except KeyError as ke:
                print(f"KeyError for {ticker} on {current_date} during interval {start_time} to {end_time}: {ke}. Skipping this interval.\n")
                continue
            except Exception as e:
                print(f"Error processing interval {start_time} to {end_time} for {ticker} on {current_date}: {e}\n")
                continue

    # Write the result once, after every date has been handled. Doing this
    # inside the date loop rewrote the file for each date and printed nothing
    # at all when the file held fewer than two dates.
    if records:
        percent_changes_df = pd.DataFrame(records)
        output_file = os.path.join(output_dir, f"{ticker}.csv")

        try:
            percent_changes_df.to_csv(output_file, index=False)
            print(f"Processed percent changes for {ticker} saved to '{output_file}'.\n")
        except Exception as e:
            print(f"Error saving percent changes for {ticker} to '{output_file}': {e}\n")
    else:
        print(f"No percent change records generated for {ticker}.\n")

def main():
    """Run calculate_percent_changes for every entry of processed_tickers.

    Uses the module-level ``tickers``, ``processed_tickers``, ``input_dir``,
    ``output_dir`` and ``intervals``; progress is reported with ``print``.
    """
    ticker_pairs = zip(tickers, processed_tickers)
    # Only the file-name form of each ticker is needed here.
    for _original, symbol in ticker_pairs:
        print(f"Processing ticker: {symbol}")
        calculate_percent_changes(symbol, input_dir, output_dir, intervals)
    print("All tickers processed.")


if __name__ == "__main__":
    main()


Processing ticker: GOOG
Successfully read CSV for GOOG.
Converted timezone for GOOG to Eastern Time.
Previous close for GOOG on 2025-01-22 is 199.9900054931641.
Open price for GOOG on 2025-01-23 at 09:30 is 199.97000122070312.
Overnight change for GOOG on 2025-01-23: -0.0100%
Percent change for GOOG on 2025-01-23 during 09:30 to 10:00: 0.0450%
Percent change for GOOG on 2025-01-23 during 10:00 to 10:30: 0.2246%
Percent change for GOOG on 2025-01-23 during 10:30 to 11:00: 0.4162%
Percent change for GOOG on 2025-01-23 during 11:00 to 11:30: 0.1806%
Percent change for GOOG on 2025-01-23 during 11:30 to 12:00: -0.2724%
Percent change for GOOG on 2025-01-23 during 12:00 to 12:30: -0.2507%
Percent change for GOOG on 2025-01-23 during 12:30 to 13:00: -0.0498%
Percent change for GOOG on 2025-01-23 during 13:00 to 13:30: -0.1542%
Percent change for GOOG on 2025-01-23 during 13:30 to 14:00: -1.1308%
Percent change for GOOG on 2025-01-23 during 14:00 to 14:30: -0.0677%
Percent change for GOOG on 

In [None]:
import pandas as pd
import os
from datetime import datetime
import statsmodels.api as sm


# Input: per-ticker percent-change tables.
percent_change_dir = "percent_changes"

# Output: regression summary.
regression_results_dir = "regression_results"
os.makedirs(regression_results_dir, exist_ok=True)

# Same ticker universe as in the earlier cells
tickers = [
    "GOOG", "MSFT", "BRK-B", "AAPL", "AMZN",
    "JPM", "JNJ", "NVDA", "XOM", "V",
    "PG", "META", "HD", "SPY", "CVX",
    "MRK", "PEP", "ABBV", "UNH", "COST",
]

# Symbols that keep their hyphen when the file name is derived.
exceptions = ["BRK-B"]

# File-name form of each ticker: hyphens become dots unless the ticker is
# listed in `exceptions`.
processed_tickers = [
    symbol if symbol in exceptions else symbol.replace('-', '.')
    for symbol in tickers
]

def perform_regression_simple(df):
    """Fit an OLS regression of ``r13`` on a constant and ``r1``.

    Args:
        df: Frame with the columns ``r1`` and ``r13``.

    Returns:
        dict with the intercept (``Alpha``), the ``r1`` slope (``Beta_r1``),
        their p-values (``p_Alpha``, ``p_Beta_r1``) and ``R-squared``.
    """
    design = sm.add_constant(df['r1'])
    response = df['r13']

    fit = sm.OLS(response, design).fit()
    coefficients = fit.params
    p_values = fit.pvalues

    return {
        'Alpha': coefficients['const'],
        'Beta_r1': coefficients['r1'],
        'p_Alpha': p_values['const'],
        'p_Beta_r1': p_values['r1'],
        'R-squared': fit.rsquared,
    }

def perform_regression_multiple(df):
    """Fit an OLS regression of ``r13`` on a constant, ``r1`` and ``r2``.

    Args:
        df: Frame with the columns ``r1``, ``r2`` and ``r13``.

    Returns:
        dict with the intercept (``Alpha``), both slopes (``Beta_r1``,
        ``Beta_r2``), the three p-values (``p_Alpha``, ``p_Beta_r1``,
        ``p_Beta_r2``) and ``R-squared``.
    """
    design = sm.add_constant(df[['r1', 'r2']])
    response = df['r13']

    fit = sm.OLS(response, design).fit()
    coefficients = fit.params
    p_values = fit.pvalues

    return {
        'Alpha': coefficients['const'],
        'Beta_r1': coefficients['r1'],
        'Beta_r2': coefficients['r2'],
        'p_Alpha': p_values['const'],
        'p_Beta_r1': p_values['r1'],
        'p_Beta_r2': p_values['r2'],
        'R-squared': fit.rsquared,
    }

def extract_returns(df_pivot, ticker):
    """Select the three interval columns used by the regressions.

    Args:
        df_pivot: Frame whose columns are interval labels.
        ticker: Symbol, used only in the log message.

    Returns:
        A copy of the three required columns renamed to ``r1``, ``r2`` and
        ``r13`` (in that order), or an empty DataFrame when any of the
        required interval labels is not a column of ``df_pivot``.
    """
    label_to_interval = {
        'r1': 'Overnight (16:00 Prev Close to 09:30 Open)',
        'r2': '09:30 to 10:00',
        'r13': '15:00 to 15:30',
    }

    absent = [
        interval_name
        for interval_name in label_to_interval.values()
        if interval_name not in df_pivot.columns
    ]
    if absent:
        print(f"Ticker {ticker} is missing intervals: {absent}. Skipping.")
        return pd.DataFrame()  # Nothing usable for this ticker

    selected = df_pivot[list(label_to_interval.values())].copy()
    selected.columns = list(label_to_interval)

    return selected

def generate_summary_table(regression_summary):
    """Flatten per-ticker regression results into one table.

    Every ticker contributes two rows, ``Simple`` first and ``Multiple``
    second. Statistics of a regression that is absent from the ticker's
    entry are filled with ``'N/A'``. The ``Beta_r2`` and ``p_Beta_r2`` cells
    of a ``Simple`` row are always the empty string.

    Args:
        regression_summary: Mapping ``ticker -> {'simple': {...},
            'multiple': {...}}`` where either inner key may be missing.

    Returns:
        DataFrame with the columns ``Ticker``, ``Regression Type``,
        ``Alpha``, ``Beta_r1``, ``Beta_r2``, ``p_Alpha``, ``p_Beta_r1``,
        ``p_Beta_r2`` and ``R-squared``.
    """
    stat_columns = (
        'Alpha', 'Beta_r1', 'Beta_r2',
        'p_Alpha', 'p_Beta_r1', 'p_Beta_r2',
        'R-squared',
    )
    # Columns that do not exist for a given regression type and stay blank.
    always_blank = {'simple': ('Beta_r2', 'p_Beta_r2'), 'multiple': ()}
    regression_kinds = (('simple', 'Simple'), ('multiple', 'Multiple'))

    rows = []
    for ticker, regressions in regression_summary.items():
        for kind, label in regression_kinds:
            available = kind in regressions
            row = {'Ticker': ticker, 'Regression Type': label}
            for column in stat_columns:
                if column in always_blank[kind]:
                    row[column] = ''
                elif available:
                    row[column] = regressions[kind][column]
                else:
                    row[column] = 'N/A'
            rows.append(row)

    return pd.DataFrame(rows)


def main():
    """
    Run the intraday momentum predictive regressions for every ticker.

    For each entry of ``processed_tickers`` the percent-change CSV is read
    from ``percent_change_dir``, pivoted to one column per interval, reduced
    to the r1/r2/r13 columns and cleared of rows containing NaN. The simple
    and the multiple regression are then fitted; a regression that raises is
    logged and left out of that ticker's entry. The combined summary table is
    written to ``regression_results_dir`` and printed.
    """
    regression_summary = {}

    for _original, symbol in zip(tickers, processed_tickers):
        print(f"Processing ticker: {symbol}")

        csv_path = os.path.join(percent_change_dir, f"{symbol}.csv")
        if not os.path.exists(csv_path):
            print(f"{symbol}.csv does not exist in '{percent_change_dir}'. Skipping.\n")
            continue

        try:
            long_table = pd.read_csv(csv_path)
        except Exception as err:
            print(f"Failed to read {symbol}.csv: {err}\n")
            continue

        # One row per date, one column per interval label.
        try:
            wide_table = long_table.pivot(index='Date', columns='Interval', values='Percent Change')
        except Exception as err:
            print(f"Failed to pivot data for {symbol}: {err}\n")
            continue

        returns = extract_returns(wide_table, symbol)
        if returns.empty:
            print(f"No valid returns extracted for {symbol}. Skipping regression.\n")
            continue

        # Keep only the dates on which all three returns are available.
        complete_returns = returns.dropna()
        if complete_returns.empty:
            print(f"All returns contain NaN for {symbol}. Skipping regression.\n")
            continue

        try:
            simple_fit = perform_regression_simple(complete_returns)
        except Exception as err:
            print(f"Simple regression failed for {symbol}: {err}\n")
            simple_fit = None

        try:
            multiple_fit = perform_regression_multiple(complete_returns)
        except Exception as err:
            print(f"Multiple regression failed for {symbol}: {err}\n")
            multiple_fit = None

        # Only regressions that produced a result are stored.
        fitted = {}
        if simple_fit:
            fitted['simple'] = simple_fit
        if multiple_fit:
            fitted['multiple'] = multiple_fit
        regression_summary[symbol] = fitted

        print(f"Completed regression analysis for {symbol}.\n")

    summary_table = generate_summary_table(regression_summary)

    summary_csv_path = os.path.join(regression_results_dir, "regression_summary_table.csv")
    try:
        summary_table.to_csv(summary_csv_path, index=False)
        print(f"Regression summary table saved to '{summary_csv_path}'.\n")
    except Exception as err:
        print(f"Error saving regression summary table: {err}\n")

    print("Regression Summary Table:")
    print(summary_table)


if __name__ == "__main__":
    main()


Processing ticker: GOOG
Completed regression analysis for GOOG.

Processing ticker: MSFT
Completed regression analysis for MSFT.

Processing ticker: BRK-B
BRK-B.csv does not exist in 'percent_changes'. Skipping.

Processing ticker: AAPL
Completed regression analysis for AAPL.

Processing ticker: AMZN
Completed regression analysis for AMZN.

Processing ticker: JPM
Completed regression analysis for JPM.

Processing ticker: JNJ
Completed regression analysis for JNJ.

Processing ticker: NVDA
Completed regression analysis for NVDA.

Processing ticker: XOM
Completed regression analysis for XOM.

Processing ticker: V
Completed regression analysis for V.

Processing ticker: PG
Completed regression analysis for PG.

Processing ticker: META
Completed regression analysis for META.

Processing ticker: HD
Completed regression analysis for HD.

Processing ticker: SPY
Completed regression analysis for SPY.

Processing ticker: CVX
Completed regression analysis for CVX.

Processing ticker: MRK
Complete