In this notebook we obtain the following from our existing data:
- std dev (past 12 months)
- maximum drawdown (over past 12 months)
- Label: [shifted by 12 months] realized maxuimum drawdown

Our data is monthly!

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

In [None]:
save_path = Path('../data/features_with_labels.csv')

DRAWDOWN_LABEL = "realized maximum drawdown (1yr, forward)"

In [None]:
path = Path('../data/ret_sample_processed.csv')
df = pd.read_csv(path)
df['ret_eom'] = pd.to_datetime(df['ret_eom'], format='%Y%m%d')
df = df.sort_values(['id', 'ret_eom'])


In [None]:
print("Data shape:", df.shape)

print()
print()
print("Stock return distribution:")
print(df["stock_ret"].describe())

print()
print()
print("Check for extreme returns:")
print(f"Returns > 100%: {(df["stock_ret"] > 1.0).sum()}")
print(f"Returns < -100%: {(df["stock_ret"] < -1.0).sum()}")

print()
print()
print("Sample of extreme returns:")
print(df[np.abs(df["stock_ret"]) > 1.0][["id", "ret_eom", "stock_ret"]].head(10))

print()
print()
print("Check grouping - sample company:")
sample_id = df["id"].iloc[0]
sample_company = df[df["id"] == sample_id].head(20)
print(f"\nSample company {sample_id}:")
print(sample_company[["id", "ret_eom", "stock_ret"]])

Data shape: (1110679, 7)


Stock return distribution:
count    1.110679e+06
mean     2.263510e-02
std      1.043494e+01
min     -9.999000e-01
25%     -6.849300e-02
50%      1.056000e-03
75%      6.718200e-02
max      9.499000e+03
Name: stock_ret, dtype: float64


Check for extreme returns:
Returns > 100%: 3339
Returns < -100%: 0


Sample of extreme returns:
                     id    ret_eom  stock_ret
853668   comp_007183_01 2020-12-31   1.337469
862350   comp_007183_01 2021-02-28   1.025000
89438    comp_009551_02 2006-07-31   1.300000
360952   comp_009551_02 2011-03-31   1.320000
1106124  comp_009619_01 2025-05-31   3.378987
31761    comp_009728_02 2005-08-31   2.717949
434556   comp_011738_01 2012-08-31   1.833333
505645   comp_012717_01 2014-01-31   1.088947
251678   comp_012785_01 2009-03-31   1.093023
256430   comp_012785_01 2009-04-30   1.222222


Check grouping - sample company:

Sample company comp_001004_01:
                     id    ret_eom  stock_ret
1098015  comp_001004_

In [None]:
# Clean extreme returns (likely data errors)
# Winsorize at 99th percentile (cap extreme values)
# Remove observations with returns > 200% or < -95%

print("Before cleaning:")
print(f"Total observations: {len(df)}")
print(f"Returns > 200%: {(df["stock_ret"] > 2.0).sum()}")
print(f"Returns < -95%: {(df["stock_ret"] < -0.95).sum()}")

# Apply winsorization: cap returns at -95% and +200%
df["stock_ret_clean"] = df["stock_ret"].clip(lower=-0.95, upper=2.0)

print()
print("After winsorization (cap at -95% and +200%):")
print(df["stock_ret_clean"].describe())

# Use cleaned returns for all calculations
df["stock_ret"] = df["stock_ret_clean"]
df = df.drop(columns=["stock_ret_clean"])

Before cleaning:
Total observations: 1110679
Returns > 200%: 755
Returns < -95%: 30

After winsorization (cap at -95% and +200%):
count    1.110679e+06
mean     6.415685e-03
std      1.764156e-01
min     -9.500000e-01
25%     -6.849300e-02
50%      1.056000e-03
75%      6.718200e-02
max      2.000000e+00
Name: stock_ret_clean, dtype: float64


In [None]:
LOOKBACK_MONTHS = 12   # 1 year for historical vol
FORWARD_MONTHS = 12    # 1 year for maximum drawdown

def compute_historical_vol(group):
    """Compute rolling 12-month standard deviation (annualized)"""
    group["hist_vol"] = group["stock_ret"].rolling(
        window=LOOKBACK_MONTHS, 
        min_periods=6  # Need at least 6 months for stable estimate
    ).std() * np.sqrt(12)  # Annualize monthly volatility
    return group

def compute_forward_drawdown(group):
    """Compute MAXIMUM drawdown over next N months"""
    returns = group['stock_ret'].values
    drawdowns = []
    
    for i in range(len(returns)):
        forward_rets = returns[i:i+FORWARD_MONTHS]
        if len(forward_rets) < 6:  # Need at least 6 months of data
            drawdowns.append(np.nan)
            continue
        
        cum_wealth = np.cumprod(1 + forward_rets)
        
        running_max = np.maximum.accumulate(cum_wealth)
        
        drawdown = (cum_wealth - running_max) / running_max
        
        # Most negative value
        max_drawdown = drawdown.min()
        
        drawdowns.append(max_drawdown)
    
    group[DRAWDOWN_LABEL] = drawdowns
    return group

print("Computing historical volatility...")
df = df.groupby("id", group_keys=False).apply(compute_historical_vol)

print("Computing forward drawdowns...")
df = df.groupby("id", group_keys=False).apply(compute_forward_drawdown)

print()
print("Summary Statistics:")
print(f"Total rows: {len(df)}")
print(f"Non-null hist_vol: {df['hist_vol'].notna().sum()}")
print(f"Non-null forward drawdown: {df[DRAWDOWN_LABEL].notna().sum()}")

print()
print(f"Drawdown distribution:")
print(df[DRAWDOWN_LABEL].describe())

print()
print(f"std dev distribution:")
print(df["hist_vol"].describe())

df.to_csv(save_path, index=False)
print()
print("Saved to features_with_labels.csv")

Computing historical volatility...


  df = df.groupby('id', group_keys=False).apply(compute_historical_vol)


Computing forward drawdowns...


  df = df.groupby('id', group_keys=False).apply(compute_forward_drawdown)



Summary Statistics:
Total rows: 1110679
Non-null hist_vol: 1040508
Non-null forward drawdown: 1040508

Drawdown distribution:
count    1.040508e+06
mean    -3.098852e-01
std      2.264232e-01
min     -9.999945e-01
25%     -4.422664e-01
50%     -2.507933e-01
75%     -1.311746e-01
max      0.000000e+00
Name: realized maximum drawdown (1yr, forward), dtype: float64

std dev distribution:
count    1.040508e+06
mean     4.785393e-01
std      3.538528e-01
min      0.000000e+00
25%      2.535512e-01
50%      3.854242e-01
75%      5.936429e-01
max      3.903332e+00
Name: hist_vol, dtype: float64

Saved to features_with_labels.csv

Saved to features_with_labels.csv


In [7]:
df

Unnamed: 0,id,date,ret_eom,gvkey,excntry,stock_ret,prc,hist_vol,"realized maximum drawdown (1yr, forward)"
1098015,comp_001004_01,20250228,2025-02-28,1004.0,USA,-0.040437,67.760000,,
1100560,comp_001004_01,20250331,2025-03-31,1004.0,USA,-0.138880,65.020000,,
1103100,comp_001004_01,20250430,2025-04-30,1004.0,USA,-0.045187,55.990000,,
1105635,comp_001004_01,20250530,2025-05-31,1004.0,USA,0.148709,53.460000,,
1108158,comp_001004_01,20250630,2025-06-30,1004.0,USA,0.120176,61.410000,,
...,...,...,...,...,...,...,...,...,...
1079170,crsp_93436,20240830,2024-08-31,184996.0,USA,-0.077390,232.070007,0.482097,
1083900,crsp_93436,20240930,2024-09-30,184996.0,USA,0.221942,214.110001,0.531748,
1088615,crsp_93436,20241031,2024-10-31,184996.0,USA,-0.045025,261.630005,0.485219,
1093327,crsp_93436,20241129,2024-11-30,184996.0,USA,0.381469,249.850006,0.581559,
