In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")

# Fix the path construction - use forward slashes or raw strings
import os

# Method 1: Use forward slashes (recommended)
path = os.path.abspath(os.path.join("..", "04_S&P500_quant_analysis", "01_data", "S&P500_D_1789-05-01_2025-10-10.csv"))

# Alternative Method 2: Use raw string
# path = os.path.abspath(os.path.join("..", r"04_S&P500_quant_analysis\01_data", "S&P500_D_1789-05-01_2025-10-10.csv"))

# Alternative Method 3: Use pathlib (modern approach)
# from pathlib import Path
# path = Path("..") / "04_S&P500_quant_analysis" / "01_data" / "S&P500_D_1789-05-01_2025-10-10.csv"

print(f"Path exists: {os.path.exists(path)}")

# Read the CSV file
df = pd.read_csv(path)

# lower case column names for easier access
df.columns = [col.lower() for col in df.columns]

# Set the 'date' column as the index and convert it to datetime
df['date'] = pd.to_datetime(df['date'])
df.set_index('date', inplace=True)

# Show the first few rows of the dataframe
df.head()

Path exists: True


Unnamed: 0_level_0,open,high,low,close,volume
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1789-05-01,0.51,0.51,0.51,0.51,0.0
1789-06-01,0.51,0.51,0.51,0.51,0.0
1789-07-01,0.5,0.5,0.5,0.5,0.0
1789-08-01,0.5,0.51,0.5,0.51,0.0
1789-09-01,0.51,0.51,0.5,0.51,0.0


In [15]:
# filter DatetimeIndex higher than year 1957
df = df[df.index > '1957-01-01']

df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 17311 entries, 1957-01-02 to 2025-10-10
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   open    17311 non-null  float64
 1   high    17311 non-null  float64
 2   low     17311 non-null  float64
 3   close   17311 non-null  float64
 4   volume  17311 non-null  float64
dtypes: float64(5)
memory usage: 811.5 KB


In [16]:
df.shape

(17311, 5)

In [17]:
# # plot close prices over time
# plt.figure(figsize=(14, 7))
# plt.plot(df.index, df['close'], label='Close Price')
# plt.title('S&P 500 Close Prices Over Time')
# plt.xlabel('Date')
# plt.ylabel('Close Price')
# plt.legend()
# plt.grid()
# plt.show()

In [18]:
# keep close column only
df = df[['close']]

# calculate daily returns
df['daily_return'] = df['close'].pct_change()

# fill NaN values
df.fillna(0, inplace=True)

df.head()

Unnamed: 0_level_0,close,daily_return
date,Unnamed: 1_level_1,Unnamed: 2_level_1
1957-01-02,46.2,0.0
1957-01-03,46.6,0.008658
1957-01-04,46.66,0.001288
1957-01-07,46.42,-0.005144
1957-01-08,46.25,-0.003662


In [19]:
"""
Corrections analysis:

1) Number of negative daily returns between minus 1% and minus 2% per year and month
2) Number of negative daily returns between minus 2% and minus 3% per year and month
3) Number of negative daily returns between minus 3% and minus 5% per year and month
4) Number of negative daily returns between minus 5% and minus 10% per year and month
5) Number of negative daily returns between minus 10% and minus 15% per year and month
6) Number of negative daily returns between minus 15% and minus 20% per year and month
7) Number of negative daily returns between minus 20% and minus 30% per year and month
8) Number of negative daily returns greater than minus 30% per year and month

For each of the above categories, will be created a column in the dataframe with signal 1 when the condition is met, 
0 otherwise.

Then, will be created a pivot table with years as index, months as columns, and sum of occurrences as values.
""";

# define the bins and labels
bins = [-np.inf, -0.30, -0.20, -0.15, -0.10, -0.05, -0.03, -0.02, -0.01, 0]
labels = ['over_-30%', '-20%_to_-30%', '-15%_to_-20%', '-10%_to_-15%', '-5%_to_-10%', '-3%_to_-5%', '-2%_to_-3%', '-1%_to_-2%', '0%_to_-1%']

# create a new column 'correction_category' based on the bins
df['correction_category'] = pd.cut(df['daily_return'], bins=bins, labels=labels)

df.head()

Unnamed: 0_level_0,close,daily_return,correction_category
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1957-01-02,46.2,0.0,0%_to_-1%
1957-01-03,46.6,0.008658,
1957-01-04,46.66,0.001288,
1957-01-07,46.42,-0.005144,0%_to_-1%
1957-01-08,46.25,-0.003662,0%_to_-1%


In [21]:
# group by year, month and correction_category, then count occurrences
correction_counts = df.groupby([df.index.year, df.index.month, 'correction_category']).size().unstack(fill_value=0)

correction_counts.index.names = ['Year', 'Month']

correction_counts.head()

Unnamed: 0_level_0,correction_category,over_-30%,-20%_to_-30%,-15%_to_-20%,-10%_to_-15%,-5%_to_-10%,-3%_to_-5%,-2%_to_-3%,-1%_to_-2%,0%_to_-1%
Year,Month,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1957,1,0,0,0,0,0,0,0,2,11
1957,2,0,0,0,0,0,0,0,2,12
1957,3,0,0,0,0,0,0,0,0,9
1957,4,0,0,0,0,0,0,0,0,7
1957,5,0,0,0,0,0,0,0,0,8


In [22]:
correction_counts.tail(12)

Unnamed: 0_level_0,correction_category,over_-30%,-20%_to_-30%,-15%_to_-20%,-10%_to_-15%,-5%_to_-10%,-3%_to_-5%,-2%_to_-3%,-1%_to_-2%,0%_to_-1%
Year,Month,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2025,1,0,0,0,0,0,0,0,3,5
2025,2,0,0,0,0,0,0,0,2,7
2025,3,0,0,0,0,0,0,1,7,3
2025,4,0,0,0,0,1,2,2,1,2
2025,5,0,0,0,0,0,0,0,1,8
2025,6,0,0,0,0,0,0,0,1,6
2025,7,0,0,0,0,0,0,0,0,9
2025,8,0,0,0,0,0,0,0,1,10
2025,9,0,0,0,0,0,0,0,0,8
2025,10,0,0,0,0,0,0,1,0,2


In [None]:
# # download the dataframe as csv
# correction_counts.to_csv('correction_counts_per_year_month.csv')