In [2]:
import pandas as pd

In [3]:
# Load the price history CSV from a local, machine-specific absolute path.
# NOTE(review): the filename ends in '.csv.csv' — confirm this matches the file on disk.
data_path = '/Users/sidrana/Projects/data/AAPL_2020-01-01_2024-12-23.csv.csv'
data = pd.read_csv(data_path)

In [None]:
def calculate_overnight_delta(df, date_col='date', open_col='open', close_col='close'):
    """
    Compute the overnight price change (previous close -> current open) per row.

    The input is copied, its date column is converted to datetime, and the
    rows are sorted by date with a fresh 0..n-1 index before the change is
    computed, so "previous" always means the preceding row in date order.

    Parameters:
        1) df (pd.DataFrame): DataFrame containing the date, open and close columns named below.
        2) date_col (str): Name of the column containing dates. Default is 'date'.
        3) open_col (str): Name of the column containing opening prices. Default is 'open'.
        4) close_col (str): Name of the column containing closing prices. Default is 'close'.

    Returns:
        1) pd.DataFrame: date-sorted copy of df with two additional columns:
            - 'overnight_delta': open minus the previous row's close
            - 'overnight_delta_pct': that change as a percentage of the previous close
           Both are NaN on the first row, which has no previous close.
    """

    # work on a date-ordered copy so the caller's DataFrame is left untouched
    ordered = df.copy()
    ordered[date_col] = pd.to_datetime(ordered[date_col])
    ordered = ordered.sort_values(by=date_col).reset_index(drop=True)

    # the previous row's close is the reference for both the absolute and relative change
    prev_close = ordered[close_col].shift(1)
    gap = ordered[open_col] - prev_close

    ordered['overnight_delta'] = gap
    ordered['overnight_delta_pct'] = (gap / prev_close) * 100

    return ordered


def identify_abnormal_delta(df, threshold):
    """
    Flag rows whose overnight percentage change is unusually far from the mean.

    Parameters:
        1) df (pd.DataFrame): DataFrame containing an 'overnight_delta_pct' column.
        2) threshold (float): Absolute z-score above which a row is labelled abnormal.

    Returns:
        1) pd.DataFrame: copy of df (all rows kept) with two additional columns:
            - 'z_score': (overnight_delta_pct - mean) / standard deviation
            - 'abnormal': True where |z_score| > threshold
           Rows with a NaN z-score compare as False and are therefore not flagged.
    """

    flagged = df.copy()  # leave the caller's DataFrame untouched

    pct_change = flagged['overnight_delta_pct']
    centered = pct_change - pct_change.mean()

    # standardise, then label anything beyond the threshold in either direction
    flagged['z_score'] = centered / pct_change.std()
    flagged['abnormal'] = flagged['z_score'].abs() > threshold

    return flagged


def calculate_technical_indicators(df):
    """
    Calculate technical indicators for stock data. Adds columns for
        - Late day momentum (where the close sits inside the day's high/low range)
        - Volume spike (volume vs. its 20-row rolling mean)
        - RSI (14-row simple-moving-average variant)
        - ATR (14-row average true range, absolute and as % of close)
        - 52 week high/low proximity (252-row rolling window)
        - Intraday momentum (open -> close return)

    Rolling windows are row-based, so the rows are assumed to be one trading
    day each, in chronological order (e.g. the output of
    calculate_overnight_delta). Windowed columns are NaN until the window
    is full, and their boolean flags are False on those rows.

    Parameters:
        1) df (pd.DataFrame): DataFrame containing 'open', 'high', 'low', 'close' and 'volume' columns.

    Returns:
        1) df_mod (pd.DataFrame): copy of df with the indicator columns added.
    """

    df_mod = df.copy()  # copy the original DataFrame to avoid direct modification

    # 1) late day momentum
    df_mod['intraday_delta'] = df_mod['close'] - df_mod['open']
    # position of the close within the day's range: 0 = closed at the low, 1 = closed at the high.
    # A zero range is masked to NaN first so the division yields NaN (not +/-inf),
    # which the fillna below then maps to the neutral midpoint.
    day_range = df_mod['high'] - df_mod['low']
    df_mod['close_position'] = (df_mod['close'] - df_mod['low']) / day_range.where(day_range != 0)
    df_mod['close_position'] = df_mod['close_position'].fillna(0.5)  # handle division by zero

    # 2) volume spike
    df_mod['avg_volume_20'] = df_mod['volume'].rolling(window=20).mean()
    df_mod['volume_ratio'] = df_mod['volume'] / df_mod['avg_volume_20']
    df_mod['volume_spike'] = df_mod['volume_ratio'] > 2.0  # threshold for volume spike

    # 3) RSI (Relative Strength Index)
    delta = df_mod['close'].diff()
    gain = (delta.where(delta > 0, 0)).rolling(window=14).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=14).mean()
    rs = gain / loss  # loss == 0 gives inf, which maps to an RSI of 100 below

    df_mod['rsi'] = 100 - (100 / (1 + rs))
    df_mod['rsi_oversold'] = df_mod['rsi'] < 30
    df_mod['rsi_overbought'] = df_mod['rsi'] > 70

    # 4) ATR (Average True Range) - volatility
    # pandas methods are used instead of numpy so the function only depends on `pd`
    prev_close = df_mod['close'].shift()
    high_low = df_mod['high'] - df_mod['low']
    high_close = (df_mod['high'] - prev_close).abs()
    low_close = (df_mod['low'] - prev_close).abs()

    ranges = pd.concat([high_low, high_close, low_close], axis=1)
    true_ranges = ranges.max(axis=1)  # NaNs are skipped, so the first row falls back to high - low
    df_mod['atr'] = true_ranges.rolling(window=14).mean()
    df_mod['atr_pct'] = df_mod['atr'] / df_mod['close'] * 100

    # 5) 52 week high/low proximity (252 rows ~ one year of trading days)
    df_mod['52_week_high'] = df_mod['high'].rolling(window=252).max()
    df_mod['52_week_low'] = df_mod['low'].rolling(window=252).min()
    df_mod['prox_to_52_week_high'] = ((df_mod['52_week_high'] - df_mod['close']) / df_mod['52_week_high']) * 100
    df_mod['prox_to_52_week_low'] = ((df_mod['close'] - df_mod['52_week_low']) / df_mod['52_week_low']) * 100
    df_mod['near_52_week_high'] = df_mod['prox_to_52_week_high'] < 5  # within 5% of 52 week high
    df_mod['near_52_week_low'] = df_mod['prox_to_52_week_low'] < 5    # within 5% of 52 week low

    # 6) intraday momentum
    df_mod['intraday_return'] = (df_mod['intraday_delta'] / df_mod['open']) * 100
    df_mod['strong_up_day'] = df_mod['intraday_return'] > 2.0  # more than 2% gain
    df_mod['strong_down_day'] = df_mod['intraday_return'] < -2.0  # more than 2% loss

    return df_mod


def add_calendar_signals(df):
    """
    Placeholder for calendar-based signals; not implemented yet.

    Parameters:
        1) df (pd.DataFrame): currently unused.

    Returns:
        1) None: no DataFrame is produced until the function is implemented.
    """

    return None

In [5]:
# add the overnight_delta / overnight_delta_pct columns (rows end up sorted by date)
data = calculate_overnight_delta(data)
# add z_score and flag rows whose |z_score| exceeds 2 as abnormal
data = identify_abnormal_delta(data, threshold=2)

In [6]:
# preview the first rows, including the new overnight_delta, z_score and abnormal columns
data.head()

Unnamed: 0,volume,vw,open,close,high,low,date,n,overnight_delta,overnight_delta_pct,z_score,abnormal
0,69007830.0,129.9522,131.38,130.03,131.41,128.72,2022-12-27 05:00:00,608331,,,,False
1,85438391.0,127.5918,129.67,126.04,131.0275,125.87,2022-12-28 05:00:00,762246,-0.36,-0.276859,-0.280927,False
2,75501810.0,129.4879,127.99,129.61,130.4814,127.73,2022-12-29 05:00:00,598204,1.95,1.547128,1.637825,False
3,76204209.0,128.8013,128.41,129.93,129.95,127.43,2022-12-30 05:00:00,592655,-1.2,-0.925854,-0.96364,False
4,112117471.0,125.725,130.28,125.07,130.9,124.17,2023-01-03 05:00:00,1021065,0.35,0.269376,0.293688,False
