In [112]:
import numpy as np
import pandas as pd
from typing import  List

## Data Preprocessing

In [162]:
import pickle
# NOTE: unpickling can execute arbitrary code, so only load files from a trusted source.
with open('./Data/stocks.pkl', mode='rb') as fh:
    stocks = pickle.load(fh)

In [163]:
# Preview the raw frame loaded from ./Data/stocks.pkl.
stocks

Unnamed: 0,ticker,PERMNO,HdrCUSIP,PERMCO,DlyCalDt,DlyVol,DlyClose,DlyLow,DlyHigh,DlyOpen
0,JJSF,10026,46603210,7976,2013-01-02,92378.0,64.3200,63.610,64.8400,64.76
1,JJSF,10026,46603210,7976,2013-01-03,51264.0,64.6800,63.360,64.7750,64.21
2,JJSF,10026,46603210,7976,2013-01-04,55188.0,64.7700,64.520,65.7700,64.96
3,JJSF,10026,46603210,7976,2013-01-07,34634.0,63.4007,63.390,64.7600,64.31
4,JJSF,10026,46603210,7976,2013-01-08,83955.0,63.4300,63.000,64.1800,63.65
...,...,...,...,...,...,...,...,...,...,...
6663731,CBOE,93429,12503M10,53447,2023-12-22,420217.0,175.5300,174.785,176.3700,175.85
6663732,CBOE,93429,12503M10,53447,2023-12-26,433297.0,173.6900,173.500,175.8600,175.86
6663733,CBOE,93429,12503M10,53447,2023-12-27,475587.0,175.6100,173.640,175.6300,174.11
6663734,CBOE,93429,12503M10,53447,2023-12-28,682563.0,177.8400,176.040,178.6386,176.04


In [164]:
# Remove the HdrCUSIP and PERMCO columns.
# errors='ignore' keeps this cell re-runnable: on a second execution the columns
# are already gone and drop() would otherwise raise a KeyError.
stocks = stocks.drop(columns=['HdrCUSIP', 'PERMCO'], errors='ignore')

In [165]:
# Number of missing values in each column.
stocks.isna().sum()

ticker          0
PERMNO          0
DlyCalDt        0
DlyVol        738
DlyClose    84650
DlyLow      84650
DlyHigh     84650
DlyOpen     84655
dtype: int64

In [166]:
# Rename the calendar-date column to "Date" and convert it with pd.to_datetime.
stocks = (
    stocks
    .rename(columns={"DlyCalDt": "Date"})
    .assign(Date=lambda frame: pd.to_datetime(frame["Date"]))
)

In [167]:
# Drop every row that has at least one missing value; .copy() gives `data`
# its own storage, independent of `stocks`.
data = stocks.dropna(how="any").copy()

In [168]:
# Inspect the frame after rows containing any NaN were dropped.
data

Unnamed: 0,ticker,PERMNO,Date,DlyVol,DlyClose,DlyLow,DlyHigh,DlyOpen
0,JJSF,10026,2013-01-02,92378.0,64.3200,63.610,64.8400,64.76
1,JJSF,10026,2013-01-03,51264.0,64.6800,63.360,64.7750,64.21
2,JJSF,10026,2013-01-04,55188.0,64.7700,64.520,65.7700,64.96
3,JJSF,10026,2013-01-07,34634.0,63.4007,63.390,64.7600,64.31
4,JJSF,10026,2013-01-08,83955.0,63.4300,63.000,64.1800,63.65
...,...,...,...,...,...,...,...,...
6663731,CBOE,93429,2023-12-22,420217.0,175.5300,174.785,176.3700,175.85
6663732,CBOE,93429,2023-12-26,433297.0,173.6900,173.500,175.8600,175.86
6663733,CBOE,93429,2023-12-27,475587.0,175.6100,173.640,175.6300,174.11
6663734,CBOE,93429,2023-12-28,682563.0,177.8400,176.040,178.6386,176.04


In [193]:
# Drop stocks with fewer than 2000 rows, so that only stocks with values
# across the analysed time window are kept.
MIN_ROWS_PER_PERMNO = 2000

counts = data['PERMNO'].value_counts()
valid_permnos = counts.loc[counts.ge(MIN_ROWS_PER_PERMNO)].index

# Keep only the rows whose PERMNO passed the threshold.
data = data.loc[data['PERMNO'].isin(valid_permnos)]

In [195]:
# Index by (Date, PERMNO) while keeping both as regular columns (drop=False),
# then order the rows by that index.
data = data.set_index(["Date", "PERMNO"], drop=False).sort_index(level="Date")

In [196]:
# Summary of the index, column dtypes and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 5091824 entries, (Timestamp('2013-01-02 00:00:00'), np.int64(10026)) to (Timestamp('2023-12-29 00:00:00'), np.int64(93429))
Data columns (total 8 columns):
 #   Column    Dtype         
---  ------    -----         
 0   ticker    object        
 1   PERMNO    int64         
 2   Date      datetime64[ns]
 3   DlyVol    float64       
 4   DlyClose  float64       
 5   DlyLow    float64       
 6   DlyHigh   float64       
 7   DlyOpen   float64       
dtypes: datetime64[ns](1), float64(5), int64(1), object(1)
memory usage: 330.4+ MB


In [197]:
# Count occurrences of each (Date, PERMNO) index pair; a count above 1 would
# indicate a duplicated key.
data.index.value_counts()

Date        PERMNO
2013-01-02  10026     1
2020-04-17  10355     1
            10661     1
            10629     1
            10547     1
                     ..
2016-10-13  79906     1
            79841     1
            79839     1
            79824     1
2023-12-29  93429     1
Name: count, Length: 5091824, dtype: int64

In [198]:
# Remove rows that are exact duplicates across all columns, keeping the first
# occurrence. Reassigning instead of using inplace=True avoids mutating the
# frame that earlier cells already displayed.
data = data.drop_duplicates(subset=None, keep="first")

In [200]:
# (rows, columns) of the frame after de-duplication.
data.shape

(5091824, 8)

In [201]:
# Independent deep copy of the cleaned panel, used by the labeling section.
data_final = data.copy(deep=True)

## Event-based labeling

In [202]:
from Labeling import *

In [204]:
# Distinct PERMNO identifiers, in order of first appearance.
permno_list = pd.unique(data_final["PERMNO"])
permno_list

array([10026, 10065, 10145, ..., 90423, 79057, 78840], shape=(1937,))

In [205]:
from ta.momentum import RSIIndicator
from ta.trend import SMAIndicator, EMAIndicator, MACD

def compute_features(data: pd.DataFrame, windows=(5, 10, 20, 50, 100)):
    """Build price-based features and the next-day return for one price series.

    All shifts, percentage changes and indicators operate on row order, so
    ``data`` is expected to hold the rows of a single security sorted by date.

    Parameters
    ----------
    data : pd.DataFrame
        Must contain the columns ``DlyClose``, ``DlyOpen``, ``DlyHigh`` and
        ``DlyLow``. The input frame is not modified.
    windows : tuple of int, optional
        Currently unused; kept so existing callers that pass it keep working.

    Returns
    -------
    pd.DataFrame
        A new frame with the input columns plus ``SMA_20``, ``SMA_50``,
        ``EMA_20``, ``EMA_50``, ``ret_{1,5,10,20}d``, ``MACD``,
        ``MACD_Signal``, ``HL_range``, ``Gap_OC``, ``Gap_CC``, ``RSI``,
        ``Return``, ``Lag_Return_1..3`` and ``Tomorrow_Return``. Rows with a
        NaN in any column are dropped, which removes the indicator warm-up
        rows at the start and the last row (its ``Tomorrow_Return`` is
        undefined).

    Raises
    ------
    ValueError
        If one of the required price columns is missing.
    """
    # Ensure the dataset has the required columns
    required_column = ["DlyClose", "DlyOpen", "DlyHigh", "DlyLow"]
    if any(col not in data.columns for col in required_column):
        raise ValueError(f"Dataset must contain the following columns: {required_column}")

    # Work on a copy so the caller's frame (often a slice of a larger panel)
    # is never mutated and no SettingWithCopyWarning is triggered.
    data = data.copy()
    close = data["DlyClose"]
    prev_close = close.shift(1)

    # 1. TREND / LEVEL FEATURES
    # Simple Moving Average (SMA)
    data['SMA_20'] = SMAIndicator(close=close, window=20).sma_indicator()
    data['SMA_50'] = SMAIndicator(close=close, window=50).sma_indicator()

    # Exponential Moving Average (EMA)
    data['EMA_20'] = EMAIndicator(close=close, window=20).ema_indicator()
    data['EMA_50'] = EMAIndicator(close=close, window=50).ema_indicator()

    # 2. MOMENTUM
    # Simple returns over several horizons (in rows).
    for w in (1, 5, 10, 20):
        data[f"ret_{w}d"] = close.pct_change(w)

    # Moving Average Convergence Divergence (MACD)
    macd_indicator = MACD(close=close, window_slow=26, window_fast=12, window_sign=9)
    data['MACD'] = macd_indicator.macd()
    data['MACD_Signal'] = macd_indicator.macd_signal()

    # 3. RANGE / GAP FEATURES AND OSCILLATORS
    # High-low range relative to the close.
    data["HL_range"] = (data["DlyHigh"] - data["DlyLow"]) / close
    # Open relative to the previous close.
    data["Gap_OC"] = (data["DlyOpen"] - prev_close) / prev_close
    # Close-to-close simple return (same values as ret_1d; kept for column compatibility).
    data["Gap_CC"] = close.pct_change()

    # Relative Strength Index (RSI)
    data['RSI'] = RSIIndicator(close=close, window=14).rsi()

    # 4. Returns and lagged returns
    # Log return is the first difference of the log price. Taking pct_change()
    # of the log price would divide by the log price level, which is not a return.
    data['Return'] = np.log(close).diff()
    data['Lag_Return_1'] = data['Return'].shift(1)
    data['Lag_Return_2'] = data['Return'].shift(2)
    data['Lag_Return_3'] = data['Return'].shift(3)

    # 5. Tomorrow's return (forward-looking): this is next row's Return, so it
    # must only be used as a target, never as a model input.
    data['Tomorrow_Return'] = data['Return'].shift(-1)

    # Drop rows with NaN in any column (warm-up rows and the final row).
    data = data.dropna()

    return data


In [206]:
# Preview the cleaned panel that feeds the labeling loop below.
data_final

Unnamed: 0_level_0,Unnamed: 1_level_0,ticker,PERMNO,Date,DlyVol,DlyClose,DlyLow,DlyHigh,DlyOpen
Date,PERMNO,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2013-01-02,10026,JJSF,10026,2013-01-02,92378.0,64.32,63.6100,64.84,64.76
2013-01-02,10065,ADX,10065,2013-01-02,322400.0,10.85,10.7200,10.85,10.81
2013-01-02,10145,HON,10145,2013-01-02,5208500.0,64.75,64.1600,65.09,64.67
2013-01-02,10158,AMRC,10158,2013-01-02,90000.0,9.98,9.8600,10.13,10.13
2013-01-02,10201,AT,10201,2013-01-02,1253800.0,12.05,11.5900,12.05,11.63
...,...,...,...,...,...,...,...,...,...
2023-12-29,93380,JKS,93380,2023-12-29,474616.0,36.94,36.6278,37.37,37.37
2023-12-29,93415,CEM,93415,2023-12-29,24273.0,40.09,39.7800,40.25,39.78
2023-12-29,93419,HPP,93419,2023-12-29,2504836.0,9.31,9.2800,9.48,9.40
2023-12-29,93427,FN,93427,2023-12-29,296974.0,190.33,189.5100,196.05,195.18


In [207]:
# Multiplier applied to the volatility series to form the CUSUM filter threshold.
# Hoisted out of the loop because it is the same for every stock.
filter_threshold = 1.5

# Maps each PERMNO to its feature frame joined with the event labels.
events_by_permno = {}

# The loop variable is `permno` rather than `id` so the builtin id() is not
# shadowed for the rest of the kernel session.
for permno in permno_list:

    # Rows of one stock, ordered by Date (xs removes the PERMNO index level).
    sub_data = data_final.xs(permno, level="PERMNO").sort_index()

    sub_data = compute_features(sub_data)
    prices = sub_data["DlyClose"]

    # The helpers below come from `from Labeling import *`; their exact semantics
    # are not visible in this notebook. NOTE(review): 30 and 20 look like a
    # volatility window and a vertical-barrier horizon; confirm against Labeling.
    volatility = daily_volatility_with_log_returns(prices, 30)
    molecules = cusum_filter_events_dynamic_threshold(np.log(prices), filter_threshold * volatility)
    vertical_barriers = vertical_barrier(prices, molecules, 20)

    # The second return value (`tt`) is not used in this cell.
    triple_barrier_events, tt = meta_events(prices, vertical_barriers.index, [1, 1], volatility, 0, 1, vertical_barriers)
    labels = meta_labeling(triple_barrier_events, prices)

    # Inner join on the index: only rows that received a label are kept.
    t_events = pd.merge(sub_data, labels, left_index=True, right_index=True)

    events_by_permno[permno] = t_events


In [211]:
# Stack the per-stock frames into one table with a fresh 0..n-1 row index.
data_labeled = pd.concat(list(events_by_permno.values()), ignore_index=True)

In [214]:
# Persist the labeled dataset. pickle.dump returns None, so its result is not
# assigned: binding it to `stocks` would overwrite the raw frame with None and
# break re-running the earlier cells.
with open('./Data/stocks_labeled.pkl', 'wb') as file:
    pickle.dump(data_labeled, file)

In [215]:
# Columns available in the labeled dataset.
data_labeled.columns

Index(['ticker', 'PERMNO', 'Date', 'DlyVol', 'DlyClose', 'DlyLow', 'DlyHigh',
       'DlyOpen', 'SMA_20', 'SMA_50', 'EMA_20', 'EMA_50', 'ret_1d', 'ret_5d',
       'ret_10d', 'ret_20d', 'MACD', 'MACD_Signal', 'HL_range', 'Gap_OC',
       'Gap_CC', 'RSI', 'Return', 'Lag_Return_1', 'Lag_Return_2',
       'Lag_Return_3', 'Tomorrow_Return', 'End Time', 'Return of Label',
       'Label'],
      dtype='object')