In [None]:
import numpy as np
import pandas as pd
import yfinance as yf
from statsmodels.tsa.regime_switching.markov_regression import MarkovRegression
from fredapi import Fred
import sqlite3
import os
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
from matplotlib import gridspec
import matplotlib.dates as mdates
from pandas.tseries.offsets import Day
from decimal import Decimal
import requests
import seaborn as sns
color_pal = sns.color_palette()
from pandas.tseries.offsets import BDay
from multiprocessing import Pool
from tqdm import tqdm


# Parameters
ticker = "^GSPC"
start_date = "1950-01-01"
end_date = "2024-11-15"
split_date = '2015-01-01'

# Download data
data = yf.download(ticker, start=start_date, end=end_date, interval="1wk")

# Use adjusted close price
data = data[['Adj Close']]
data.rename(columns={'Adj Close': 'Adj_Close'}, inplace=True)

# Calculate daily returns
data['Index_Returns'] = data['Adj_Close'].pct_change()
data.dropna(inplace=True)

# Separate train and test sets for 'Index_Returns'
train = data.loc[data.index < split_date, 'Index_Returns']
test = data.loc[data.index >= split_date, 'Index_Returns']

# Fit the Markov Switching Model on the Training Data
train_model = MarkovRegression(train, k_regimes=2, trend='c', switching_variance=True)
train_result = train_model.fit()

[*********************100%***********************]  1 of 1 completed


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.rename(columns={'Adj Close': 'Adj_Close'}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Index_Returns'] = data['Adj_Close'].pct_change()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.dropna(inplace=True)
  self._init_dates(dates, freq)


In [14]:
train_regimes = train_result.smoothed_marginal_probabilities.idxmax(axis=1)
data.loc[train.index, 'Training Regime'] = train_regimes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.loc[train.index, 'Training Regime'] = train_regimes


In [15]:
data

Unnamed: 0_level_0,Adj_Close,Index_Returns,Training Regime
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1950-01-08,16.650000,-0.025746,0.0
1950-01-15,16.940001,0.017417,0.0
1950-01-22,16.900000,-0.002361,0.0
1950-01-29,17.350000,0.026627,0.0
1950-02-05,17.209999,-0.008069,0.0
...,...,...,...
2024-10-20,5808.120117,-0.009642,
2024-10-27,5728.799805,-0.013657,
2024-11-03,5995.540039,0.046561,
2024-11-10,5985.379883,-0.001695,


In [16]:
# Loop over the test dates, including the last date
test_dates = test.index
for i, date in enumerate(tqdm(test_dates, desc='Processing dates')):
    # Expand the training data to include up to the current date
    recursive_train = data.loc[data.index <= date, 'Index_Returns']

    # Fit the Markov Switching Model on the expanded training data
    recursive_model = MarkovRegression(recursive_train, k_regimes=2, trend='c', switching_variance=True)
    try:
        recursive_result = recursive_model.fit(disp=False)
    except Exception as e:
        print(f"Model failed to converge at date {date}: {e}")
        continue  # Skip this date if the model fails to fit

    # Extract smoothed probabilities and last known state probabilities
    smoothed_probs = recursive_result.smoothed_marginal_probabilities
    last_probs = smoothed_probs.iloc[-1].values

    # Extract transition probabilities from the model parameters
    params = recursive_result.params
    p_00 = params['p[0->0]']
    p_10 = params['p[1->0]']
    p_01 = 1 - p_00
    p_11 = 1 - p_10

    # Construct the transition matrix
    transition_matrix = np.array([
        [p_00, p_01],
        [p_10, p_11]
    ])

    # Update state probabilities to predict the next day's regime
    state_probs = np.dot(last_probs, transition_matrix)

    # Determine the most likely regime at t+1
    regime_labels = smoothed_probs.columns.tolist()  # Should be [0, 1]
    most_likely_regime = regime_labels[np.argmax(state_probs)]

    # Get the next date for prediction
    if i + 1 < len(test.index):
        next_date = test.index[i + 1]
    else:
        # Predicting beyond the available data; estimate next date
        next_date = date + pd.Timedelta(days=1)
        # Add next_date to the DataFrame if it doesn't exist
        if next_date not in data.index:
            data.loc[next_date] = np.nan  # Initialize with NaNs

    # Store the predicted regime in 'Recursive_Predictions' at next_date
    data.at[next_date, 'Recursive_Predictions'] = most_likely_regime

    # Determine which regime corresponds to low and high volatility
    variances = [params['sigma2[0]'], params['sigma2[1]']]
    sigma2_0 = variances[0]
    sigma2_1 = variances[1]

    if sigma2_0 < sigma2_1:
        regime_mapping = {0: 'Low Volatility', 1: 'High Volatility'}
    else:
        regime_mapping = {0: 'High Volatility', 1: 'Low Volatility'}

    # Map the predicted regime to labels
    predicted_label = regime_mapping[most_likely_regime]
    data.at[next_date, 'Recursive_Predicted_Regime_Label'] = predicted_label

# Display the recursive predictions
print(data.loc[test.index, ['Index_Returns', 'Recursive_Predictions', 'Recursive_Predicted_Regime_Label']].head())

  self._init_dates(dates, freq)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.at[next_date, 'Recursive_Predictions'] = most_likely_regime
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.at[next_date, 'Recursive_Predicted_Regime_Label'] = predicted_label
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init

            Index_Returns  Recursive_Predictions  \
Date                                               
2024-01-07       0.018434                    NaN   
2024-01-14       0.011702                    0.0   
2024-01-21       0.010571                    0.0   
2024-01-28       0.013829                    0.0   
2024-02-04       0.013714                    0.0   

           Recursive_Predicted_Regime_Label  
Date                                         
2024-01-07                              NaN  
2024-01-14                   Low Volatility  
2024-01-21                   Low Volatility  
2024-01-28                   Low Volatility  
2024-02-04                   Low Volatility  





In [17]:
# # Loop over the test dates, including the last date
# test_dates = test.index
# for i, date in enumerate(test_dates):
#     # Expand the training data to include up to the current date
#     recursive_train = data.loc[data.index <= date, 'Index_Returns']

#     # Fit the Markov Switching Model on the expanded training data
#     recursive_model = MarkovRegression(recursive_train, k_regimes=2, trend='c', switching_variance=True)
#     try:
#         recursive_result = recursive_model.fit(disp=False)
#     except Exception as e:
#         print(f"Model failed to converge at date {date}: {e}")
#         continue  # Skip this date if the model fails to fit

#     # Extract smoothed probabilities and last known state probabilities
#     smoothed_probs = recursive_result.smoothed_marginal_probabilities
#     last_probs = smoothed_probs.iloc[-1].values

#     # Extract transition probabilities from the model parameters
#     params = recursive_result.params
#     p_00 = params['p[0->0]']
#     p_10 = params['p[1->0]']
#     p_01 = 1 - p_00
#     p_11 = 1 - p_10

#     # Construct the transition matrix
#     transition_matrix = np.array([
#         [p_00, p_01],
#         [p_10, p_11]
#     ])

#     # Update state probabilities to predict the next day's regime
#     state_probs = np.dot(last_probs, transition_matrix)

#     # Determine the most likely regime at t+1
#     regime_labels = smoothed_probs.columns.tolist()  # Should be [0, 1]
#     most_likely_regime = regime_labels[np.argmax(state_probs)]

#     # Get the next date for prediction
#     if i + 1 < len(test.index):
#         next_date = test.index[i + 1]
#     else:
#         # Predicting beyond the available data; estimate next date
#         next_date = date + pd.Timedelta(days=1)
#         # Add next_date to the DataFrame if it doesn't exist
#         if next_date not in data.index:
#             data.loc[next_date] = np.nan  # Initialize with NaNs

#     # Store the predicted regime in 'Recursive_Predictions' at next_date
#     data.at[next_date, 'Recursive_Predictions'] = most_likely_regime

#     # Determine which regime corresponds to low and high volatility
#     variances = [params['sigma2[0]'], params['sigma2[1]']]
#     sigma2_0 = variances[0]
#     sigma2_1 = variances[1]

#     if sigma2_0 < sigma2_1:
#         regime_mapping = {0: 'Low Volatility', 1: 'High Volatility'}
#     else:
#         regime_mapping = {0: 'High Volatility', 1: 'Low Volatility'}

#     # Map the predicted regime to labels
#     predicted_label = regime_mapping[most_likely_regime]
#     data.at[next_date, 'Recursive_Predicted_Regime_Label'] = predicted_label

# # Display the recursive predictions
# print(data.loc[test.index, ['Index_Returns', 'Recursive_Predictions', 'Recursive_Predicted_Regime_Label']].head())

In [18]:
data

Unnamed: 0_level_0,Adj_Close,Index_Returns,Training Regime,Recursive_Predictions,Recursive_Predicted_Regime_Label
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1950-01-08,16.650000,-0.025746,0.0,,
1950-01-15,16.940001,0.017417,0.0,,
1950-01-22,16.900000,-0.002361,0.0,,
1950-01-29,17.350000,0.026627,0.0,,
1950-02-05,17.209999,-0.008069,0.0,,
...,...,...,...,...,...
2024-10-27,5728.799805,-0.013657,,0.0,Low Volatility
2024-11-03,5995.540039,0.046561,,0.0,Low Volatility
2024-11-10,5985.379883,-0.001695,,0.0,Low Volatility
2024-11-13,5985.379883,0.000000,,0.0,Low Volatility


In [19]:
# Check for NaNs in 'Index_Returns' and drop them
returns = data['Index_Returns'].dropna()

# Fit Markov Switching Model
model = MarkovRegression(returns, k_regimes=2, trend='c', switching_variance=True)
result = model.fit()
print(result.summary())

# Add regime to the data
data.loc[returns.index, 'Full_Vol_Regime'] = result.smoothed_marginal_probabilities.idxmax(axis=1)

  self._init_dates(dates, freq)


                        Markov Switching Model Results                        
Dep. Variable:          Index_Returns   No. Observations:                 3907
Model:               MarkovRegression   Log Likelihood                9972.114
Date:                Thu, 14 Nov 2024   AIC                         -19932.228
Time:                        08:15:00   BIC                         -19894.605
Sample:                             0   HQIC                        -19918.876
                               - 3907                                         
Covariance Type:               approx                                         
                             Regime 0 parameters                              
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.0029      0.000      9.784      0.000       0.002       0.003
sigma2         0.0002   8.12e-06     26.284      0.0

In [20]:
data

Unnamed: 0_level_0,Adj_Close,Index_Returns,Training Regime,Recursive_Predictions,Recursive_Predicted_Regime_Label,Full_Vol_Regime
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1950-01-08,16.650000,-0.025746,0.0,,,0.0
1950-01-15,16.940001,0.017417,0.0,,,0.0
1950-01-22,16.900000,-0.002361,0.0,,,0.0
1950-01-29,17.350000,0.026627,0.0,,,0.0
1950-02-05,17.209999,-0.008069,0.0,,,0.0
...,...,...,...,...,...,...
2024-10-27,5728.799805,-0.013657,,0.0,Low Volatility,0.0
2024-11-03,5995.540039,0.046561,,0.0,Low Volatility,0.0
2024-11-10,5985.379883,-0.001695,,0.0,Low Volatility,0.0
2024-11-13,5985.379883,0.000000,,0.0,Low Volatility,0.0


In [21]:
# Non-Recursive Non-Updating Parameter Set

# Extract the smoothed probabilities for the training data
smoothed_probs = train_result.smoothed_marginal_probabilities

# Extract the last known state probabilities (at the end of the training data)
last_probs = smoothed_probs.iloc[-1].values  # Convert to NumPy array

# Regime labels (from smoothed probabilities)
regime_labels = smoothed_probs.columns.tolist()  # Should be [0, 1]

# Extract transition probabilities from the model parameters
params = train_result.params

# Transition probabilities
p_00 = params['p[0->0]']  # Probability of staying in Regime 0
p_10 = params['p[1->0]']  # Probability of moving from Regime 1 to Regime 0

# Probabilities of transitioning to Regime 1
p_01 = 1 - p_00           # Probability of moving from Regime 0 to Regime 1
p_11 = 1 - p_10           # Probability of staying in Regime 1

# Construct the transition matrix
transition_matrix = np.array([
    [p_00, p_01],  # Transitions from Regime 0
    [p_10, p_11]   # Transitions from Regime 1
])

# Regime-specific variances
variances = [params['sigma2[0]'], params['sigma2[1]']]

# Initialize columns for state probabilities and predicted regime in the data DataFrame
data['State_Prob_Regime0'] = np.nan
data['State_Prob_Regime1'] = np.nan
data['Predicted_Regime'] = np.nan

# Initialize state probabilities with the last known probabilities from the training data
state_probs = last_probs.copy()

# Loop over each date in the test set within the data DataFrame
for date in test.index:
    # Update state probabilities using the transition matrix
    state_probs = np.dot(state_probs, transition_matrix)
    
    # Store the state probabilities in the DataFrame
    data.at[date, 'State_Prob_Regime0'] = state_probs[0]
    data.at[date, 'State_Prob_Regime1'] = state_probs[1]
    
    # Determine the most likely regime
    most_likely_regime = regime_labels[np.argmax(state_probs)]
    
    # Store the predicted regime in the DataFrame
    data.at[date, 'Predicted_Regime'] = most_likely_regime

# Determine which regime corresponds to low volatility and high volatility
sigma2_0 = variances[0]
sigma2_1 = variances[1]

if sigma2_0 < sigma2_1:
    regime_mapping = {0: 'Low Volatility', 1: 'High Volatility'}
else:
    regime_mapping = {0: 'High Volatility', 1: 'Low Volatility'}

# Map the predicted regimes to labels
data['Predicted_Regime_Label'] = data['Predicted_Regime'].map(regime_mapping)

# # Display the first few rows of the data DataFrame for the test period
# print(data.loc[test.index, ['Index_Returns', 'State_Prob_Regime0', 'State_Prob_Regime1', 'Predicted_Regime', 'Predicted_Regime_Label']].head())
# Create the 'Blended_Test_Training_Regime' by merging 'Training Regime' and 'Predicted_Regime' based on date condition
data['Blended_Test_Training_Regime'] = None  # Initialize the column with None

# Set values from 'Training Regime' for dates before split_date
data.loc[data.index < split_date, 'Blended_Test_Training_Regime'] = data['Training Regime']

# Set values from 'Predicted_Regime' for dates on or after split_date
data.loc[data.index >= split_date, 'Blended_Test_Training_Regime'] = data['Predicted_Regime']

# Display the DataFrame with the new 'Blended_Test_Training_Regime' column
print(data[['Training Regime', 'Predicted_Regime', 'Blended_Test_Training_Regime']])

            Training Regime  Predicted_Regime Blended_Test_Training_Regime
Date                                                                      
1950-01-08              0.0               NaN                          0.0
1950-01-15              0.0               NaN                          0.0
1950-01-22              0.0               NaN                          0.0
1950-01-29              0.0               NaN                          0.0
1950-02-05              0.0               NaN                          0.0
...                     ...               ...                          ...
2024-10-27              NaN               0.0                          0.0
2024-11-03              NaN               0.0                          0.0
2024-11-10              NaN               0.0                          0.0
2024-11-13              NaN               0.0                          0.0
2024-11-14              NaN               NaN                          NaN

[3908 rows x 3 columns]


In [22]:
data

Unnamed: 0_level_0,Adj_Close,Index_Returns,Training Regime,Recursive_Predictions,Recursive_Predicted_Regime_Label,Full_Vol_Regime,State_Prob_Regime0,State_Prob_Regime1,Predicted_Regime,Predicted_Regime_Label,Blended_Test_Training_Regime
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1950-01-08,16.650000,-0.025746,0.0,,,0.0,,,,,0.0
1950-01-15,16.940001,0.017417,0.0,,,0.0,,,,,0.0
1950-01-22,16.900000,-0.002361,0.0,,,0.0,,,,,0.0
1950-01-29,17.350000,0.026627,0.0,,,0.0,,,,,0.0
1950-02-05,17.209999,-0.008069,0.0,,,0.0,,,,,0.0
...,...,...,...,...,...,...,...,...,...,...,...
2024-10-27,5728.799805,-0.013657,,0.0,Low Volatility,0.0,0.764720,0.235280,0.0,Low Volatility,0.0
2024-11-03,5995.540039,0.046561,,0.0,Low Volatility,0.0,0.764385,0.235615,0.0,Low Volatility,0.0
2024-11-10,5985.379883,-0.001695,,0.0,Low Volatility,0.0,0.764079,0.235921,0.0,Low Volatility,0.0
2024-11-13,5985.379883,0.000000,,0.0,Low Volatility,0.0,0.763800,0.236200,0.0,Low Volatility,0.0


In [23]:
# Flatten MultiIndex columns if necessary
if isinstance(data.columns, pd.MultiIndex):
    data.columns = ['_'.join(filter(None, col)) for col in data.columns]

# Specify the database filename
db_filename = 'output/fill_forward_comparison.db'

# Connect to the SQLite database
conn = sqlite3.connect(db_filename)

# Table name for main data export
table_name = 'regime_data'

# Write the flattened DataFrame to the SQLite database
data.to_sql(table_name, conn, if_exists='replace', index=True)

# Verify the table structure after export
cursor = conn.cursor()
cursor.execute(f"PRAGMA table_info({table_name})")
schema = cursor.fetchall()
print("Table schema:", schema)

# Close the connection
conn.close()

Table schema: [(0, 'Date', 'TIMESTAMP', 0, None, 0), (1, 'Adj_Close', 'REAL', 0, None, 0), (2, 'Index_Returns', 'REAL', 0, None, 0), (3, 'Training Regime', 'REAL', 0, None, 0), (4, 'Recursive_Predictions', 'REAL', 0, None, 0), (5, 'Recursive_Predicted_Regime_Label', 'TEXT', 0, None, 0), (6, 'Full_Vol_Regime', 'REAL', 0, None, 0), (7, 'State_Prob_Regime0', 'REAL', 0, None, 0), (8, 'State_Prob_Regime1', 'REAL', 0, None, 0), (9, 'Predicted_Regime', 'REAL', 0, None, 0), (10, 'Predicted_Regime_Label', 'TEXT', 0, None, 0), (11, 'Blended_Test_Training_Regime', 'REAL', 0, None, 0)]


In [24]:
# Filter for rows where 'Recursive_Predictions' and 'Full_Vol_Regime' do not match, excluding NaNs
filtered_data = data[(data['Recursive_Predictions'] != data['Full_Vol_Regime'])].dropna(subset=['Recursive_Predictions', 'Full_Vol_Regime'])

# Flatten MultiIndex columns if necessary
if isinstance(filtered_data.columns, pd.MultiIndex):
    filtered_data.columns = ['_'.join(filter(None, col)) for col in filtered_data.columns]

# Connect to the SQLite database
conn = sqlite3.connect(db_filename)

# Table name for filtered data
new_table_name = 'mismatched_regimes'

# Write the filtered DataFrame to the SQLite database
filtered_data.to_sql(new_table_name, conn, if_exists='replace', index=True)

# Verify the table structure for the filtered data
cursor = conn.cursor()
cursor.execute(f"PRAGMA table_info({new_table_name})")
filtered_schema = cursor.fetchall()
print("Filtered Table schema:", filtered_schema)

# Close the connection
conn.close()

Filtered Table schema: [(0, 'Date', 'TIMESTAMP', 0, None, 0), (1, 'Adj_Close', 'REAL', 0, None, 0), (2, 'Index_Returns', 'REAL', 0, None, 0), (3, 'Training Regime', 'REAL', 0, None, 0), (4, 'Recursive_Predictions', 'REAL', 0, None, 0), (5, 'Recursive_Predicted_Regime_Label', 'TEXT', 0, None, 0), (6, 'Full_Vol_Regime', 'REAL', 0, None, 0), (7, 'State_Prob_Regime0', 'REAL', 0, None, 0), (8, 'State_Prob_Regime1', 'REAL', 0, None, 0), (9, 'Predicted_Regime', 'REAL', 0, None, 0), (10, 'Predicted_Regime_Label', 'TEXT', 0, None, 0), (11, 'Blended_Test_Training_Regime', 'REAL', 0, None, 0)]


In [25]:
# Connect to the SQLite database
conn = sqlite3.connect(db_filename)

# Retrieve the main data table
query = "SELECT * FROM regime_data"
df_main = pd.read_sql_query(query, conn)

# Export the main data to CSV
output_csv_path_main = 'output/regime_data_export.csv'
df_main.to_csv(output_csv_path_main, index=False)
print(f"Main data exported to {output_csv_path_main}")

# Retrieve the filtered data table
query_filtered = "SELECT * FROM mismatched_regimes"
df_filtered = pd.read_sql_query(query_filtered, conn)

# Export the filtered data to CSV
output_csv_path_filtered = 'output/mismatched_regimes_export.csv'
df_filtered.to_csv(output_csv_path_filtered, index=False)
print(f"Filtered data exported to {output_csv_path_filtered}")

# Close the connection
conn.close()

Main data exported to output/regime_data_export.csv
Filtered data exported to output/mismatched_regimes_export.csv
