In [2]:
%load_ext autoreload
%autoreload 2

In [58]:
import sys
sys.path.append("..") 
from src import load_realized_vol, build_har_features,load_earnings,load_implied_vol  # etc.
from src.features import fit_har_regression
from src.signals import compute_signal_matrix
from src.forecast import create_forecast
from src.signals import size_positions


import pandas as pd
import pandas_market_calendars as mcal
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import numpy as np



In [48]:
def filter_earnings(earnings, min_date, max_date):
    """Keep the earnings rows whose ``'date'`` lies in ``[min_date, max_date]``.

    Both bounds are inclusive. The returned frame is a boolean-mask selection
    of ``earnings``, so the original row labels are preserved.
    """
    announcement_dates = earnings['date']
    on_or_after_start = announcement_dates >= min_date
    on_or_before_end = announcement_dates <= max_date
    return earnings[on_or_after_start & on_or_before_end]


def filter_earnings_by_ticker(earnings, tickers):
    """Keep the earnings rows whose ``'act_symbol'`` is one of ``tickers``.

    ``tickers`` may be any collection accepted by ``Series.isin`` (list,
    Index, set, ...). Original row labels are preserved.
    """
    is_requested_symbol = earnings['act_symbol'].isin(tickers)
    return earnings[is_requested_symbol]


def create_earnings_mask(realized, earnings_subset):
    """Build a boolean frame flagging earnings days, shaped like ``realized``.

    Parameters
    ----------
    realized : DataFrame
        Frame whose index and columns define the shape of the mask.
    earnings_subset : DataFrame
        Must provide a ``'date'`` and an ``'act_symbol'`` column; each row is
        one (date, ticker) earnings event.

    Returns
    -------
    DataFrame of bool
        Same index and columns as ``realized``; ``True`` exactly at the
        (date, ticker) cells named in ``earnings_subset``. Events whose date
        is not in the index or whose ticker is not a column are ignored.
    """
    # Start from an all-False mask.
    earnings_mask = pd.DataFrame(False, index=realized.index, columns=realized.columns)

    # Iterate only over the two columns that are needed; iterrows() would
    # build a full Series object for every row just to read two fields.
    event_pairs = zip(earnings_subset['date'], earnings_subset['act_symbol'])
    for date, ticker in event_pairs:
        if date in earnings_mask.index and ticker in earnings_mask.columns:
            earnings_mask.at[date, ticker] = True

    return earnings_mask

In [56]:
# Load the realized-vol frame and the earnings calendar.
realized = load_realized_vol()
earnings = load_earnings()

# The date bounds have to be derived from `realized` BEFORE they are used to
# filter the earnings calendar; otherwise this cell raises NameError on a
# fresh kernel (Restart & Run All).
min_date, max_date = realized.index.min(), realized.index.max()

# Restrict earnings events to the date range and the tickers covered by
# `realized`, then turn them into a boolean mask shaped like `realized`.
earnings_subset = filter_earnings(earnings, min_date, max_date)
earnings_subset = filter_earnings_by_ticker(earnings_subset, realized.columns)
earnings_mask = create_earnings_mask(realized, earnings_subset)

implied = load_implied_vol("../output/features_data.csv", earnings_subset)


In [59]:
# Build the three HAR feature frames from the realized-vol frame.
har_1d, har_1w, har_1m = build_har_features(realized)

# The regression is fitted against features lagged by one row (shift(1)),
# with the earnings mask passed through to the fitting routine.
har_factors = fit_har_regression(
    realized,
    har_1d.shift(1),
    har_1w.shift(1),
    har_1m.shift(1),
    earnings_mask=earnings_mask,
)

# The forecast itself uses the unlagged features.
forecast = create_forecast(har_1d, har_1w, har_1m, har_factors)

# Both forecast and implied are as of today.
signal_matrix = compute_signal_matrix(forecast, implied)

positions = size_positions(
    signal_matrix,
    implied.loc[forecast.index],
    method='unit',
    threshold=0.0,
)


In [85]:
# Row t of realized.shift(-1) holds the realized value of row t+1, so it is
# paired here with the implied value of row t. Only labels present in both
# frames are kept (inner join on rows and columns).
aligned_realized, aligned_implied = realized.shift(-1).align(
    implied, join='inner', axis=None
)

# NOTE(review): no `join` is given here, so pandas' default (outer) applies,
# unlike the two inner joins in this cell - confirm that is intended.
aligned_realized_temp, aligned_signal = aligned_realized.align(
    signal_matrix, axis=None
)

# `aligned_realized_temp` is a throwaway and is overwritten by this call.
aligned_realized_temp, aligned_positions = realized.shift(-1).align(
    positions, join='inner', axis=None
)




In [86]:
signal_matrix

ticker,A,AAL,AAP,AAPL,ABBV,ABC,ABT,ACGL,ACN,ADBE,...,WYNN,XEL,XOM,XRAY,XYL,YUM,ZBH,ZBRA,ZION,ZTS
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-02-02,,,,,,,,,,,...,,,,,,,,,,
2022-02-03,,,,,,,,,,,...,,,,,,,,,,
2022-02-04,,,,,,,,,,,...,,,,,,,,,,
2022-02-07,,,,,,,,,,,...,,,,,,,,,,
2022-02-08,0.010242,0.025575,,0.112631,0.106175,0.100186,0.037706,,0.045943,0.122558,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-12-22,0.046970,0.014160,0.043598,0.060370,0.079356,,0.058489,0.079675,0.086728,0.093639,...,,,,,,,,,,
2023-12-26,0.015854,-0.057754,-0.033882,0.016594,0.040757,,0.024993,0.034852,0.045394,0.041227,...,,,,,,,,,,
2023-12-27,0.019148,-0.056962,-0.018798,0.025917,0.049290,,0.015758,0.038187,0.047491,0.028310,...,,,,,,,,,,
2023-12-28,-0.030632,-0.104475,-0.095711,-0.012518,0.003682,,-0.014560,,0.005561,0.002692,...,,,,,,,,,,


In [93]:
#def align_multiple(*dfs):
#    idx = dfs[0].index
#    for df in dfs[1:]:
#        idx = idx.intersection(df.index)
#    return [df.loc[idx] for df in dfs]

def align_multiple(*dfs, align_columns=False):
    """Restrict several frames to the labels they all share.

    Parameters
    ----------
    *dfs : DataFrame
        One or more frames. At least one is required.
    align_columns : bool, default False
        When True, columns are also restricted to those present in every
        frame; when False, only rows are restricted.

    Returns
    -------
    list of DataFrame
        One entry per input, in input order, each selected with ``.loc`` on
        the shared row labels (and shared column labels if requested).
    """
    # Row labels present in every frame.
    shared_rows = dfs[0].index
    for frame in dfs[1:]:
        shared_rows = shared_rows.intersection(frame.index)

    if not align_columns:
        # Rows only: every frame keeps its own columns.
        return [frame.loc[shared_rows] for frame in dfs]

    # Column labels present in every frame.
    shared_cols = dfs[0].columns
    for frame in dfs[1:]:
        shared_cols = shared_cols.intersection(frame.columns)
    return [frame.loc[shared_rows, shared_cols] for frame in dfs]


# Usage: restrict all five frames to the dates and tickers they share.
# NOTE(review): realized.shift(1) places the PREVIOUS row's realized value on
# each date, whereas the earlier pairwise-alignment cell uses
# realized.shift(-1) (the NEXT row's value). Confirm which direction the
# downstream realized-vs-implied comparison is supposed to use.
(
    aligned_realized,
    aligned_implied,
    aligned_forecast,
    aligned_signal,
    aligned_positions,
) = align_multiple(
    realized.shift(1),
    implied,
    forecast,
    signal_matrix,
    positions,
    align_columns=True,
)


#aligned_realized, aligned_implied,aligned_forecast, aligned_signal,aligned_positions= align_multiple(realized.shift(1), implied, forecast, signal_matrix, positions)

In [None]:
# Feature: one signal value per (date, ticker) pair. Missing signals are
# dropped explicitly rather than relying on stack()'s version-dependent
# NaN handling.
X = aligned_signal.stack().dropna().rename("signal")

# Label: 1 when realized exceeded implied, else 0. A comparison involving NaN
# evaluates to False, so pairs where either side is missing are removed
# first; otherwise they would silently be labelled 0.
both_observed = (aligned_realized.notna() & aligned_implied.notna()).stack()
y = (aligned_realized > aligned_implied).stack()[both_observed].astype(int)

# Keep only the (date, ticker) pairs that have both a feature and a label.
aligned_x, aligned_y = X.align(y, join='inner', axis=0)

In [144]:
# Feature: one signal value per (date, ticker) pair. Missing signals are
# dropped explicitly rather than relying on stack()'s version-dependent
# NaN handling.
X = aligned_signal.stack().dropna().rename("signal")

# Label: 1 when realized exceeded implied, else 0. A comparison involving NaN
# evaluates to False, so pairs where either side is missing are removed
# first; otherwise they would silently be labelled 0 and bias the classes.
both_observed = (aligned_realized.notna() & aligned_implied.notna()).stack()
y = (aligned_realized > aligned_implied).stack()[both_observed].astype(int)

# Intersection of the two (date, ticker) multi-indices.
common_idx = X.index.intersection(y.index)

X_aligned = X.loc[common_idx]
y_aligned = y.loc[common_idx]

# Ready for modeling: positional index, single-column feature frame.
X_df = X_aligned.reset_index(drop=True).to_frame()
y_series = y_aligned.reset_index(drop=True)

In [146]:

# RandomForestClassifier is already imported in the notebook's import cell,
# so it is not re-imported here.

# Fixed seed so the fitted forest, and the numbers printed below, are
# reproducible from one run to the next.
RANDOM_STATE = 42
clf = RandomForestClassifier(max_depth=3, random_state=RANDOM_STATE).fit(X_df, y_series)

# NOTE: the score is computed on the same rows the model was fitted on, so it
# is an in-sample figure, not an out-of-sample estimate.
print("Accuracy:", clf.score(X_df, y_series))
print("Feature Importance:", clf.feature_importances_)


Accuracy: 0.7485357128817957
Feature Importance: [1.]


In [147]:
# Not used right now: compute a daily close-to-close realized vol, because the
# implied vol includes the close-to-open moves. So while the HAR variables are
# based purely on intraday data, the implied vol is based on the daily
# close-to-close move.

df = pd.read_csv("../data/all_vols.csv", parse_dates=["date"])
df_prices = df.pivot(index="date", columns="ticker", values="close")

# pct_change()'s implicit forward-fill default is deprecated in pandas, so the
# fill is done explicitly and pct_change's own filling is switched off. The
# values are the same as with the old default.
# NOTE(review): forward-filling turns gaps in a ticker's prices into zero
# returns - confirm that is wanted.
# shift(-1) moves each return up one row, so a row holds the return into the
# following row's close; dropna() removes every date with any missing ticker.
df_returns = df_prices.ffill().pct_change(fill_method=None).shift(-1).dropna()

# Two-observation rolling standard deviation, scaled by sqrt(252)
# (presumably trading days per year - annualization).
df_close_realized_vol = df_returns.rolling(2, min_periods=2).std().dropna() * np.sqrt(252)

  df_returns= df_prices.pct_change().shift(-1).dropna()


In [113]:

df_close_realized_vol

ticker,A,AAL,AAP,AAPL,ABBV,ABC,ABT,ACGL,ACN,ADBE,...,WYNN,XEL,XOM,XRAY,XYL,YUM,ZBH,ZBRA,ZION,ZTS
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2023-01-05,0.359476,0.165326,0.028599,0.521215,0.217626,0.161013,0.195404,0.128275,0.527702,0.573690,...,0.217234,0.567414,0.118721,0.341127,0.715687,0.314882,0.173245,0.682834,0.628695,0.432038
2023-01-06,0.313231,0.203917,0.184057,0.341500,0.536285,0.224966,0.175015,0.368752,0.079522,0.166433,...,0.177981,0.217461,0.341895,0.259652,0.342489,0.275048,0.067765,0.320132,0.484635,0.211588
2023-01-09,0.602707,0.088839,0.096693,0.011526,0.189392,0.206642,0.196160,0.279037,0.131417,0.421359,...,0.281219,0.096266,0.373825,0.263427,0.062996,0.018833,0.082909,0.057218,0.059156,0.606573
2023-01-10,0.378612,0.341926,0.118113,0.186089,0.039001,0.286019,0.256007,0.108119,0.190364,0.246481,...,0.517574,0.085573,0.038150,0.480419,0.069337,0.051971,0.055677,0.204295,0.054967,0.314746
2023-01-11,0.328235,0.983254,0.175824,0.242361,0.171456,0.093547,0.094835,0.078446,0.129668,0.089084,...,0.420336,0.235360,0.055974,0.308286,0.053885,0.092667,0.004416,0.765560,0.035325,0.106361
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-12-21,0.005152,0.308514,0.134308,0.057090,0.108861,0.000000,0.058051,0.133838,0.120100,0.105851,...,0.307161,0.057335,0.027799,0.391049,0.124476,0.077451,0.119992,0.004324,0.183464,0.020766
2023-12-22,0.032422,0.141130,0.162186,0.037887,0.201146,0.000000,0.034251,0.002818,0.230427,0.016436,...,0.104161,0.058875,0.006570,0.105189,0.012263,0.015537,0.004383,0.089555,0.111227,0.009168
2023-12-26,0.020911,0.069239,0.178402,0.037143,0.051490,0.000000,0.035923,0.051719,0.051309,0.033018,...,0.113924,0.166533,0.080277,0.078365,0.022621,0.030126,0.109664,0.247753,0.298003,0.052158
2023-12-27,0.002408,0.067400,0.125219,0.013938,0.034107,0.000000,0.004818,0.071738,0.082170,0.032413,...,0.186845,0.197120,0.109540,0.044165,0.090770,0.070571,0.078693,0.028490,0.115318,0.066149
