In [1]:
import numpy as np
import pandas as pd
import yfinance as yf
from statsmodels.tsa.regime_switching.markov_regression import MarkovRegression
from fredapi import Fred
import sqlite3
import os
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
from matplotlib import gridspec
import matplotlib.dates as mdates
from pandas.tseries.offsets import Day
from decimal import Decimal
import requests
import seaborn as sns
color_pal = sns.color_palette()

# Parameters
ticker = "^GSPC"
start_date = "1950-01-01"
end_date = "2024-11-12"
split_date = '2020-01-01'

# Download data
data = yf.download(ticker, start=start_date, end=end_date)

# Use adjusted close price
data = data[['Adj Close']]
data.rename(columns={'Adj Close': 'Adj_Close'}, inplace=True)

# Calculate daily returns
data['Index_Returns'] = data['Adj_Close'].pct_change()
data.dropna(inplace=True)

# Separate train and test sets for 'Index_Returns'
train = data.loc[data.index < split_date, 'Index_Returns']
test = data.loc[data.index >= split_date, 'Index_Returns']

# Fit the Markov Switching Model on the Training Data
train_model = MarkovRegression(train, k_regimes=2, trend='c', switching_variance=True)
train_result = train_model.fit()


[*********************100%***********************]  1 of 1 completed


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.rename(columns={'Adj Close': 'Adj_Close'}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Index_Returns'] = data['Adj_Close'].pct_change()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.dropna(inplace=True)
  self._init_dates(dates, freq)


In [2]:
train_regimes = train_result.smoothed_marginal_probabilities.idxmax(axis=1)
data.loc[train.index, 'Training Regime'] = train_regimes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.loc[train.index, 'Training Regime'] = train_regimes


In [3]:
data

Unnamed: 0_level_0,Adj_Close,Index_Returns,Training Regime
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1950-01-04,16.850000,0.011405,0.0
1950-01-05,16.930000,0.004748,0.0
1950-01-06,16.980000,0.002953,0.0
1950-01-09,17.080000,0.005889,0.0
1950-01-10,17.030001,-0.002927,0.0
...,...,...,...
2024-11-05,5782.759766,0.012266,
2024-11-06,5929.040039,0.025296,
2024-11-07,5973.100098,0.007431,
2024-11-08,5995.540039,0.003757,


In [4]:

# Extract the smoothed probabilities for the training data
smoothed_probs = train_result.smoothed_marginal_probabilities

# Extract the last known state probabilities (at the end of the training data)
last_probs = smoothed_probs.iloc[-1].values  # Convert to NumPy array

# Regime labels (from smoothed probabilities)
regime_labels = smoothed_probs.columns.tolist()  # Should be [0, 1]

# Extract transition probabilities from the model parameters
params = train_result.params

# Transition probabilities
p_00 = params['p[0->0]']  # Probability of staying in Regime 0
p_10 = params['p[1->0]']  # Probability of moving from Regime 1 to Regime 0

# Probabilities of transitioning to Regime 1
p_01 = 1 - p_00           # Probability of moving from Regime 0 to Regime 1
p_11 = 1 - p_10           # Probability of staying in Regime 1

# Construct the transition matrix
transition_matrix = np.array([
    [p_00, p_01],  # Transitions from Regime 0
    [p_10, p_11]   # Transitions from Regime 1
])

# Regime-specific variances
variances = [params['sigma2[0]'], params['sigma2[1]']]

# Initialize columns for state probabilities and predicted regime in the data DataFrame
data['State_Prob_Regime0'] = np.nan
data['State_Prob_Regime1'] = np.nan
data['Predicted_Regime'] = np.nan

# Initialize state probabilities with the last known probabilities from the training data
state_probs = last_probs.copy()

# Loop over each date in the test set within the data DataFrame
for date in test.index:
    # Update state probabilities using the transition matrix
    state_probs = np.dot(state_probs, transition_matrix)
    
    # Store the state probabilities in the DataFrame
    data.at[date, 'State_Prob_Regime0'] = state_probs[0]
    data.at[date, 'State_Prob_Regime1'] = state_probs[1]
    
    # Determine the most likely regime
    most_likely_regime = regime_labels[np.argmax(state_probs)]
    
    # Store the predicted regime in the DataFrame
    data.at[date, 'Predicted_Regime'] = most_likely_regime

# Determine which regime corresponds to low volatility and high volatility
sigma2_0 = variances[0]
sigma2_1 = variances[1]

if sigma2_0 < sigma2_1:
    regime_mapping = {0: 'Low Volatility', 1: 'High Volatility'}
else:
    regime_mapping = {0: 'High Volatility', 1: 'Low Volatility'}

# Map the predicted regimes to labels
data['Predicted_Regime_Label'] = data['Predicted_Regime'].map(regime_mapping)

# Display the first few rows of the data DataFrame for the test period
print(data.loc[test.index, ['Index_Returns', 'State_Prob_Regime0', 'State_Prob_Regime1', 'Predicted_Regime', 'Predicted_Regime_Label']].head())


            Index_Returns  State_Prob_Regime0  State_Prob_Regime1  \
Date                                                                
2020-01-02       0.008379            0.977110            0.022890   
2020-01-03      -0.007060            0.965435            0.034565   
2020-01-06       0.003533            0.954358            0.045642   
2020-01-07      -0.002803            0.943850            0.056150   
2020-01-08       0.004902            0.933881            0.066119   

            Predicted_Regime Predicted_Regime_Label  
Date                                                 
2020-01-02               0.0         Low Volatility  
2020-01-03               0.0         Low Volatility  
2020-01-06               0.0         Low Volatility  
2020-01-07               0.0         Low Volatility  
2020-01-08               0.0         Low Volatility  


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['State_Prob_Regime0'] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['State_Prob_Regime1'] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Predicted_Regime'] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_i

In [5]:
data

Unnamed: 0_level_0,Adj_Close,Index_Returns,Training Regime,State_Prob_Regime0,State_Prob_Regime1,Predicted_Regime,Predicted_Regime_Label
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1950-01-04,16.850000,0.011405,0.0,,,,
1950-01-05,16.930000,0.004748,0.0,,,,
1950-01-06,16.980000,0.002953,0.0,,,,
1950-01-09,17.080000,0.005889,0.0,,,,
1950-01-10,17.030001,-0.002927,0.0,,,,
...,...,...,...,...,...,...,...
2024-11-05,5782.759766,0.012266,,0.749547,0.250453,0.0,Low Volatility
2024-11-06,5929.040039,0.025296,,0.749547,0.250453,0.0,Low Volatility
2024-11-07,5973.100098,0.007431,,0.749547,0.250453,0.0,Low Volatility
2024-11-08,5995.540039,0.003757,,0.749547,0.250453,0.0,Low Volatility


In [6]:
    # Check for NaNs in 'Index_Returns' and drop them
    returns = data['Index_Returns'].dropna()
    
    # Fit Markov Switching Model
    model = MarkovRegression(returns, k_regimes=2, trend='c', switching_variance=True)
    result = model.fit()
    print(result.summary())
    
    # Add regime to the data
    data.loc[returns.index, 'Full_Vol_Regime'] = result.smoothed_marginal_probabilities.idxmax(axis=1)

  self._init_dates(dates, freq)


                        Markov Switching Model Results                        
Dep. Variable:          Index_Returns   No. Observations:                18836
Model:               MarkovRegression   Log Likelihood               63429.516
Date:                Tue, 12 Nov 2024   AIC                        -126847.033
Time:                        16:19:57   BIC                        -126799.972
Sample:                             0   HQIC                       -126831.591
                              - 18836                                         
Covariance Type:               approx                                         
                             Regime 0 parameters                              
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.0007   5.65e-05     11.528      0.000       0.001       0.001
sigma2      3.997e-05   8.05e-07     49.639      0.0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.loc[returns.index, 'Full_Vol_Regime'] = result.smoothed_marginal_probabilities.idxmax(axis=1)
  data.loc[returns.index, 'Full_Vol_Regime'] = result.smoothed_marginal_probabilities.idxmax(axis=1)


In [7]:
data

Unnamed: 0_level_0,Adj_Close,Index_Returns,Training Regime,State_Prob_Regime0,State_Prob_Regime1,Predicted_Regime,Predicted_Regime_Label,Full_Vol_Regime
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1950-01-04,16.850000,0.011405,0.0,,,,,0
1950-01-05,16.930000,0.004748,0.0,,,,,0
1950-01-06,16.980000,0.002953,0.0,,,,,0
1950-01-09,17.080000,0.005889,0.0,,,,,0
1950-01-10,17.030001,-0.002927,0.0,,,,,0
...,...,...,...,...,...,...,...,...
2024-11-05,5782.759766,0.012266,,0.749547,0.250453,0.0,Low Volatility,1
2024-11-06,5929.040039,0.025296,,0.749547,0.250453,0.0,Low Volatility,1
2024-11-07,5973.100098,0.007431,,0.749547,0.250453,0.0,Low Volatility,1
2024-11-08,5995.540039,0.003757,,0.749547,0.250453,0.0,Low Volatility,1


In [8]:
# Create the 'Blended_Test_Training_Regime' by merging 'Training Regime' and 'Predicted_Regime' based on date condition
data['Blended_Test_Training_Regime'] = None  # Initialize the column with None

# Set values from 'Training Regime' for dates before split_date
data.loc[data.index < split_date, 'Blended_Test_Training_Regime'] = data['Training Regime']

# Set values from 'Predicted_Regime' for dates on or after split_date
data.loc[data.index >= split_date, 'Blended_Test_Training_Regime'] = data['Predicted_Regime']

# Display the DataFrame with the new 'Blended_Test_Training_Regime' column
print(data[['Training Regime', 'Predicted_Regime', 'Blended_Test_Training_Regime']])

            Training Regime  Predicted_Regime Blended_Test_Training_Regime
Date                                                                      
1950-01-04              0.0               NaN                          0.0
1950-01-05              0.0               NaN                          0.0
1950-01-06              0.0               NaN                          0.0
1950-01-09              0.0               NaN                          0.0
1950-01-10              0.0               NaN                          0.0
...                     ...               ...                          ...
2024-11-05              NaN               0.0                          0.0
2024-11-06              NaN               0.0                          0.0
2024-11-07              NaN               0.0                          0.0
2024-11-08              NaN               0.0                          0.0
2024-11-11              NaN               0.0                          0.0

[18836 rows x 3 columns]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Blended_Test_Training_Regime'] = None  # Initialize the column with None


In [9]:
data

Unnamed: 0_level_0,Adj_Close,Index_Returns,Training Regime,State_Prob_Regime0,State_Prob_Regime1,Predicted_Regime,Predicted_Regime_Label,Full_Vol_Regime,Blended_Test_Training_Regime
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1950-01-04,16.850000,0.011405,0.0,,,,,0,0.0
1950-01-05,16.930000,0.004748,0.0,,,,,0,0.0
1950-01-06,16.980000,0.002953,0.0,,,,,0,0.0
1950-01-09,17.080000,0.005889,0.0,,,,,0,0.0
1950-01-10,17.030001,-0.002927,0.0,,,,,0,0.0
...,...,...,...,...,...,...,...,...,...
2024-11-05,5782.759766,0.012266,,0.749547,0.250453,0.0,Low Volatility,1,0.0
2024-11-06,5929.040039,0.025296,,0.749547,0.250453,0.0,Low Volatility,1,0.0
2024-11-07,5973.100098,0.007431,,0.749547,0.250453,0.0,Low Volatility,1,0.0
2024-11-08,5995.540039,0.003757,,0.749547,0.250453,0.0,Low Volatility,1,0.0


In [None]:
# Specify the database filename
db_filename = 'output/fill_forward_2020_comparison.db'

# Create a connection to the SQLite database
conn = sqlite3.connect(db_filename)  # Use db_filename as a string

# Specify the table name
table_name = 'regime_data'

# Write the DataFrame to the SQLite database
data.to_sql(table_name, conn, if_exists='replace', index=True)

# Close the connection
conn.close()

In [None]:
# Assuming 'data' is the main DataFrame
# Filter data to create 'filtered_data' with mismatched regimes, excluding rows with NaN in either column
filtered_data = data[(data['Predicted_Regime'] != data['Full_Vol_Regime'])].dropna(subset=['Predicted_Regime', 'Full_Vol_Regime'])

# Specify the database file name
db_filename = 'output/fill_forward_2020_comparison.db'

# Re-establish the connection to the SQLite database
conn = sqlite3.connect(db_filename)

# Specify the new table name
new_table_name = 'mismatched_regimes'

# Write the filtered DataFrame to the new table
filtered_data.to_sql(new_table_name, conn, if_exists='replace', index=True)

# Close the connection
conn.close()

In [None]:
# Connect to the database
db_path = 'output/fill_forward_2020_comparison.db'  # Replace with your SQLite database path
conn = sqlite3.connect(db_path)

# Define the query
query = "SELECT * FROM regime_data"  # Replace 'your_table' with your actual table name

# Read data into a pandas DataFrame
df = pd.read_sql_query(query, conn)

# Export DataFrame to CSV
output_csv_path = 'output/fill_forward_2020_comparison.csv'  # Replace with your desired CSV file path
df.to_csv(output_csv_path, index=False)  # Set index=False to exclude DataFrame index

# Close the connection
conn.close()

print(f"Data exported to {output_csv_path}")

Data exported to fill_forward_2020_comparison.csv


In [13]:
# print("Log-Likelihood:", result.llf)
# print("AIC:", result.aic)
# print("BIC:", result.bic)