In [22]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [23]:
#Read the standardized macro / S&P 500 table from disk into a DataFrame
source_csv = '../Market_Crash_Predictor/cleaned_data/final_standardized_data.csv'
data = pd.read_csv(source_csv)


In [24]:
#Preview the top five rows of the freshly loaded table
data.head(n=5)

Unnamed: 0,Date,Federal Funds Rate,Unemployment Rate,GDP,Inflation Rate,SP500
0,1975-01-01,3.87,8.1,1616.116,9.143147,72.564091
1,1975-02-01,6.76,8.1,1616.116,9.143147,80.096842
2,1975-03-01,6.07,8.6,1616.116,9.143147,83.831999
3,1975-04-01,5.48,8.8,1651.853,9.143147,84.723182
4,1975-05-01,5.55,9.0,1651.853,9.143147,90.09619


In [25]:
#Parse the Date column into datetimes and promote it to the index in one chain
data = (
    data
    .assign(Date=pd.to_datetime(data['Date']))
    .set_index('Date')
)

In [26]:
#Create a month-to-month percentage change in the features.
#pandas is removing the implicit fill_method='pad' default of pct_change, so the
#forward-fill of gaps is done explicitly here and pct_change is told not to fill.
#This yields the same values as the old default without the deprecation warning.
pct_change_sources = {
    'sp500_pct_change': 'SP500',
    'gdp_pct_change': 'GDP',
    'inflation_pct_change': 'Inflation Rate',
    'unemployment_pct_change': 'Unemployment Rate',
    'fed_funds_rate_pct_change': 'Federal Funds Rate',
}

#dict order is insertion order, so the new columns are appended in the same order as before
for new_col, src_col in pct_change_sources.items():
    data[new_col] = data[src_col].ffill().pct_change(fill_method=None)


The default fill_method='pad' in Series.pct_change is deprecated and will be removed in a future version. Either fill in any non-leading NA values prior to calling pct_change or specify 'fill_method=None' to not fill NA values.


The default fill_method='pad' in Series.pct_change is deprecated and will be removed in a future version. Either fill in any non-leading NA values prior to calling pct_change or specify 'fill_method=None' to not fill NA values.


The default fill_method='pad' in Series.pct_change is deprecated and will be removed in a future version. Either fill in any non-leading NA values prior to calling pct_change or specify 'fill_method=None' to not fill NA values.


The default fill_method='pad' in Series.pct_change is deprecated and will be removed in a future version. Either fill in any non-leading NA values prior to calling pct_change or specify 'fill_method=None' to not fill NA values.



In [27]:
#Rolling statistics for the S&P 500 over 3-row and 12-row windows:
#mean of the level, and standard deviation of the month-to-month change
sp500_level = data['SP500']
sp500_returns = data['sp500_pct_change']

data['sp500_quarter'] = sp500_level.rolling(3).mean()
data['sp500_yearly'] = sp500_level.rolling(12).mean()
data['sp500_volatility_quarter'] = sp500_returns.rolling(3).std()
data['sp500_volatility_yearly'] = sp500_returns.rolling(12).std()


In [28]:
#Set crash threshold to a 5% month-over-month drop in the SP500
#(the value below is -0.05, i.e. 5%, not 10%)
crash_threshold = -0.05
#1 when the monthly change is below the threshold, else 0; a NaN change compares False and is labelled 0
data['crash'] = (data['sp500_pct_change'] < crash_threshold).astype(int)


In [29]:
#Preview the top five rows now that the engineered columns and crash label exist
data.head(n=5)

Unnamed: 0_level_0,Federal Funds Rate,Unemployment Rate,GDP,Inflation Rate,SP500,sp500_pct_change,gdp_pct_change,inflation_pct_change,unemployment_pct_change,fed_funds_rate_pct_change,sp500_quarter,sp500_yearly,sp500_volatility_quarter,sp500_volatility_yearly,crash
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1975-01-01,3.87,8.1,1616.116,9.143147,72.564091,,,,,,,,,,0
1975-02-01,6.76,8.1,1616.116,9.143147,80.096842,0.103808,0.0,0.0,0.0,0.74677,,,,,0
1975-03-01,6.07,8.6,1616.116,9.143147,83.831999,0.046633,0.0,0.0,0.061728,-0.102071,78.830977,,,,0
1975-04-01,5.48,8.8,1651.853,9.143147,84.723182,0.010631,0.022113,0.0,0.023256,-0.097199,82.884008,,0.046988,,0
1975-05-01,5.55,9.0,1651.853,9.143147,90.09619,0.063418,0.0,0.0,0.022727,0.012774,86.217124,,0.026971,,0


In [30]:
#Discard the first 12 rows by position; the 12-row rolling columns are NaN there
data = data.iloc[12:]

In [31]:
#Preview the top five rows after the warm-up rows were removed
data.head(n=5)

Unnamed: 0_level_0,Federal Funds Rate,Unemployment Rate,GDP,Inflation Rate,SP500,sp500_pct_change,gdp_pct_change,inflation_pct_change,unemployment_pct_change,fed_funds_rate_pct_change,sp500_quarter,sp500_yearly,sp500_volatility_quarter,sp500_volatility_yearly,crash
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1976-01-01,5.37,7.9,1820.487,5.744813,96.859524,0.091985,0.033293,-0.371681,-0.036585,0.011299,91.875081,88.184867,0.054999,0.049154,0
1976-02-01,4.84,7.7,1820.487,5.744813,100.639474,0.039025,0.0,0.0,-0.025316,-0.098696,95.399817,89.896753,0.05357,0.042928,0
1976-03-01,5.21,7.6,1820.487,5.744813,101.084348,0.00442,0.0,0.0,-0.012987,0.076446,99.527782,91.334449,0.044101,0.042277,0
1976-04-01,4.83,7.7,1852.332,5.744813,101.929524,0.008361,0.017493,0.0,0.013158,-0.072937,101.217782,92.768311,0.018944,0.04231,0
1976-05-01,5.1,7.4,1852.332,5.744813,101.161999,-0.00753,0.0,0.0,-0.038961,0.055901,101.391957,93.690462,0.008275,0.04003,0


In [32]:
#Write the feature-engineered table (Date index included) to a csv file
output_csv = '../Market_Crash_Predictor/cleaned_data/1987crashmodel.csv'
data.to_csv(output_csv)

In [33]:
#Columns used as model inputs: month-to-month changes first, then the rolling S&P 500 statistics
#NOTE(review): the 'crash' target is a threshold on 'sp500_pct_change', and that same
#column is listed as a feature, so the label can be read straight off the inputs -
#confirm this is intended.
features = [
    'sp500_pct_change',
    'gdp_pct_change',
    'inflation_pct_change',
    'unemployment_pct_change',
    'fed_funds_rate_pct_change',
    'sp500_quarter',
    'sp500_yearly',
    'sp500_volatility_quarter',
    'sp500_volatility_yearly',
]

#Feature matrix and binary crash target
X, y = data[features], data['crash']

In [34]:
#Standardize the features
#NOTE(review): this fits the scaler on every row of X, and x_scaled is not referenced
#by any later cell visible here; the same scaler object is re-fit on the training
#rows only in the train/test split cell. Confirm whether this fit is still needed.
scaler = StandardScaler()
x_scaled = scaler.fit_transform(X)

In [35]:
#Split the data chronologically: train on 1976-01 through 1986-12,
#test on 1987-01 through 1989-12 (both ends inclusive)
train_start_date = '1976-01-01'
train_end_date = '1986-12-01'
test_start_date = '1987-01-01'
test_end_date = '1989-12-01'

in_train = (data.index >= train_start_date) & (data.index <= train_end_date)
in_test = (data.index >= test_start_date) & (data.index <= test_end_date)
train_data = data[in_train]
test_data = data[in_test]

#Crash labels for each split
y_train = train_data['crash']
y_test = test_data['crash']

#Scaling statistics are learned from the training rows and reused unchanged on the test rows
X_train = scaler.fit_transform(train_data[features])
X_test = scaler.transform(test_data[features])

In [36]:
#Logistic regression classifier with the library's default settings
model = LogisticRegression()

#Learn the weights from the scaled training features and their crash labels
model.fit(X=X_train, y=y_train)

In [37]:
#Predict crash labels for the held-out rows
y_pred = model.predict(X_test)

#Build the per-class precision / recall / F1 summary, then print it
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        34
           1       1.00      1.00      1.00         2

    accuracy                           1.00        36
   macro avg       1.00      1.00      1.00        36
weighted avg       1.00      1.00      1.00        36



In [38]:
#Tabulate one fitted weight per feature, keeping only its magnitude
#(the sign of each coefficient is discarded by the abs() step)
coefficients = (
    pd.DataFrame({'Variable': features, 'Coefficient': model.coef_[0]})
    .assign(Coefficient=lambda frame: frame['Coefficient'].abs())
)

In [39]:
#Print the coefficient table; the values are absolute magnitudes, so direction of effect is not shown
print(coefficients)

                    Variable  Coefficient
0           sp500_pct_change     2.189463
1             gdp_pct_change     0.652324
2       inflation_pct_change     0.453517
3    unemployment_pct_change     0.113738
4  fed_funds_rate_pct_change     0.125411
5              sp500_quarter     0.147003
6               sp500_yearly     0.054403
7   sp500_volatility_quarter     0.699829
8    sp500_volatility_yearly     0.287417


In [40]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

# Define the date to analyze (a date string; it is parsed with pd.to_datetime when the window is built)
target_date = '1987-09-01'  # centre of the window selected below

# Function to get the rows within a symmetric window of years around the specified date
def get_5yr_window_single(data, target_date, years=2):
    """Return the rows of ``data`` whose index falls within ``years`` years of ``target_date``.

    Despite the function name, the default window runs from 2 years before to
    2 years after ``target_date`` (both ends inclusive), i.e. it spans 4 years,
    not 5. The name is kept so existing callers continue to work.

    Parameters
    ----------
    data : pandas.DataFrame or pandas.Series
        Object whose index can be compared against timestamps.
    target_date : str or datetime-like
        Centre of the window; anything ``pd.to_datetime`` accepts.
    years : int, default 2
        Number of whole years to include on each side of ``target_date``.

    Returns
    -------
    Same type as ``data``
        The subset of rows with ``start <= index <= end``.
    """
    center = pd.to_datetime(target_date)
    half_width = pd.DateOffset(years=years)
    start_date = center - half_width
    end_date = center + half_width
    in_window = (data.index >= start_date) & (data.index <= end_date)
    return data[in_window]

# Select the rows surrounding the target date
crash_data = get_5yr_window_single(data, target_date)

# Inputs and crash labels restricted to that window
X, y = crash_data[features], crash_data['crash']

# Scale the inputs using statistics of the window itself
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Fit a fresh logistic regression on the windowed rows
model = LogisticRegression().fit(X_scaled, y)

# Second probability column (the crash == 1 class), evaluated on the same rows the model was fitted on
y_prob = model.predict_proba(X_scaled)[:, 1]

# Build a new frame carrying the probabilities and a plain Date column for plotting;
# assign() returns a copy, so `data` itself is left untouched
crash_data = crash_data.assign(crash_probability=y_prob, Date=crash_data.index)

# Rows in the window that are labelled as crashes (computed once, used for both axes)
actual_crashes = crash_data[crash_data['crash'] == 1]

# Scatter of the monthly S&P 500 change, coloured by the fitted crash probability
fig = px.scatter(
    crash_data,
    x='Date',
    y='sp500_pct_change',
    color='crash_probability',
    title='Market Crash Probability around Selected Date',
    labels={'crash_probability': 'Crash Probability', 'sp500_pct_change': 'S&P 500 % Change'},
    color_continuous_scale='Viridis',
)

# Overlay red crosses on the rows labelled as crashes
crash_markers = go.Scatter(
    x=actual_crashes['Date'],
    y=actual_crashes['sp500_pct_change'],
    mode='markers',
    name='Actual Crash',
    marker={'symbol': 'x', 'color': 'red', 'size': 10},
)
fig.add_trace(crash_markers)

# Axis titles, colour-bar title and dark theme
fig.update_layout(
    coloraxis_colorbar={'title': 'Crash Probability'},
    xaxis_title='Date',
    yaxis_title='S&P 500 % Change',
    template='plotly_dark',
)

# Render the interactive figure
fig.show()
