## Load the fama french dataset

In [18]:
import sys
sys.path.append('../src')

from data_loader import load_sector_data, get_seasonal_returns, calculate_sector_ratio
# from returns_calculator import (
#     calculate_annual_returns_from_monthly,
#     calculate_seasonal_returns
# )
# from sharpe_analysis import (
#     calculate_sharpe_ratio,
#     calculate_sharpe_ratio_by_season
#)
from timeseries_analysis import (
    reshape_to_timeseries,
    calculate_sp500_returns_ts,
    calculate_smga_returns_ts,
)

from fama_french_analysis import run_fama_french_regression
from stats_analysis import calculate_statistics, seasonal_ttest
from visualization import plot_seasonal_comparison, plot_monthly_averages

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from datetime import datetime

# Display settings
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', '{:.3f}'.format)

print("✓ Setup complete!")

✓ Setup complete!


## load up fama factors for regression

In [None]:
# loading data and converting into reusable ff factors regression input

# Load the data
file_path = '../data/Seasonal S&P Data as of 11.13.2025 - Abhi.xlsx'
factor_series = {}
factor_skiprows = [(1, "R-Mkt"), (59, "SMB"), (117, "HML"), (175, "rf")]
for skiprows, factor in factor_skiprows:
    df_monthly = load_sector_data(file_path, "SMB_HML", skiprows, numrows=56)
    ts = reshape_to_timeseries(df_monthly).sort_index()
    ts = pd.to_numeric(ts, errors="coerce")  # ensure numeric
    ts.name = factor
    factor_series[factor] = ts
factors_df = pd.concat(factor_series.values(), axis=1)
factors_df.index.name = "Date"

display(factors_df.head())


Unnamed: 0_level_0,rm_rf,smb,hml,rf
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1970-01-01,-8.1,2.91,3.17,0.6
1970-02-01,5.14,-2.39,3.69,0.62
1970-03-01,-1.06,-2.31,4.08,0.57
1970-04-01,-10.99,-6.14,6.14,0.5
1970-05-01,-6.9,-4.61,3.32,0.53


## load up datasets we want to run regressions on

In [20]:
df_sp500 = load_sector_data(file_path, 'S&P500')
df_tbills = load_sector_data(file_path, 'TBills')

print("SP 500 data:")
print(f"  Years: {df_sp500.index.min()} - {df_sp500.index.max()}")
print(f"  Shape: {df_sp500.shape}")

print("\nTbills data:")
print(f"  Years: {df_tbills.index.min()} - {df_tbills.index.max()}")
print(f"  Shape: {df_tbills.shape}")


sp500_excess = calculate_sp500_returns_ts(df_sp500, df_tbills)
print(f"\nS&P 500 excess returns calculated")
print(f"Total observations: {len(sp500_excess)}")
print(f"Date range: {sp500_excess.index.min()} to {sp500_excess.index.max()}")

smga_excess = calculate_smga_returns_ts(df_sp500, df_tbills)

print(f"\nSMGA excess returns calculated")
print(f"Total observations: {len(smga_excess)}")
print(f"Date range: {smga_excess.index.min()} to {smga_excess.index.max()}")



SP 500 data:
  Years: 1970 - 2025
  Shape: (56, 12)

Tbills data:
  Years: 2001 - 2025
  Shape: (25, 12)

S&P 500 excess returns calculated
Total observations: 291
Date range: 2001-08-01 00:00:00 to 2025-10-01 00:00:00

SMGA excess returns calculated
Total observations: 291
Date range: 2001-08-01 00:00:00 to 2025-10-01 00:00:00


## Run fama regression

In [21]:
# sp500 regression
results = run_fama_french_regression(sp500_excess, ff_factors=factors_df)

ValueError: Missing factors: {'HML', 'SMB', 'Mkt-RF'}