# Label Stock Market Data

| **Approach**     | **Label Type**       | **Use Case**                                   |
| ------------ | ---------------- | ------------------------------------------ |
| Binary Class | 0 = Down, 1 = Up | Basic price direction prediction           |
| Multi-Class  | -1, 0, 1         | Classify as Down / Neutral / Up            |
| Regression   | Next-day return  | Predict return %, not class                |
| Thresholded  | Return > x%      | Capture only “strong” signals              |
| Multi-day    | Future window    | Predict if price increases within `n` days |


In [4]:
# Load the Data with all the Features
import pandas as pd

stock_data = pd.read_csv('../data/processed_data/Stock_Technical_features.csv',parse_dates=['Date'],index_col=[0])
stock_data

Unnamed: 0,Date,Close,High,Low,Open,Volume,SMA_20,SMA_50,EMA_12,EMA_26,...,BB_Upper,BB_Lower,OBV,OBV_pct_change,Volume_rolling_mean_10,Volume_pct_change,Volume_spike,Volume_zscore,Daily_Return,Log_Return
0,2023-05-22,280.647339,285.992144,280.196195,281.039613,1189700,275.318704,283.637429,280.647339,280.647339,...,291.619286,259.018123,0,0.546295,1426960.0,0.818358,False,-1.023136,-0.041968,-0.042874
1,2023-05-23,268.869110,279.794109,266.819448,278.862460,2163300,275.318704,283.637429,278.835304,279.774877,...,291.619286,259.018123,-2163300,0.546295,1426960.0,0.818358,False,-1.023136,-0.041968,-0.042874
2,2023-05-24,267.299957,269.830184,264.995303,267.731468,1181800,275.318704,283.637429,277.060635,278.850809,...,291.619286,259.018123,-3345100,0.546295,1426960.0,-0.453705,False,-1.023136,-0.005836,-0.005853
3,2023-05-25,267.143066,268.829874,262.592627,267.790333,1159100,275.318704,283.637429,275.534855,277.983569,...,291.619286,259.018123,-4504200,0.346507,1426960.0,-0.019208,False,-1.023136,-0.000587,-0.000587
4,2023-05-26,267.319611,270.526524,266.093734,267.819778,1091700,275.318704,283.637429,274.270971,277.193646,...,291.619286,259.018123,-3412500,-0.242374,1426960.0,-0.058149,False,-1.023136,0.000661,0.000661
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
494,2025-05-12,388.079987,392.420013,386.500000,389.070007,1359300,366.112997,367.611379,376.550053,370.467516,...,396.473461,335.752534,25229000,0.056947,1664680.0,0.791381,False,-0.354221,0.018797,0.018622
495,2025-05-13,387.529999,390.459991,385.779999,388.670013,862600,367.818497,367.482579,378.239275,371.731404,...,398.997946,336.639047,24366400,-0.034191,1640110.0,-0.365409,False,-0.880548,-0.001417,-0.001418
496,2025-05-14,386.320007,388.019989,384.760010,386.959991,1107700,369.794498,367.428551,379.482465,372.812041,...,400.368609,339.220386,23258700,-0.045460,1525710.0,0.284141,False,-0.481058,-0.003122,-0.003127
497,2025-05-15,392.200012,392.630005,384.579987,388.500000,1124400,372.070998,367.376989,381.439011,374.248187,...,402.171894,341.970102,24383100,0.048343,1474720.0,0.015076,False,-0.399551,0.015221,0.015106


In [6]:
# Binary Classification Target: Next-Day Movement
stock_data['Target_Binary'] = (stock_data['Close'].shift(-1) > stock_data['Close']).astype(int)

In [7]:
stock_data

Unnamed: 0,Date,Close,High,Low,Open,Volume,SMA_20,SMA_50,EMA_12,EMA_26,...,BB_Lower,OBV,OBV_pct_change,Volume_rolling_mean_10,Volume_pct_change,Volume_spike,Volume_zscore,Daily_Return,Log_Return,Target_Binary
0,2023-05-22,280.647339,285.992144,280.196195,281.039613,1189700,275.318704,283.637429,280.647339,280.647339,...,259.018123,0,0.546295,1426960.0,0.818358,False,-1.023136,-0.041968,-0.042874,0
1,2023-05-23,268.869110,279.794109,266.819448,278.862460,2163300,275.318704,283.637429,278.835304,279.774877,...,259.018123,-2163300,0.546295,1426960.0,0.818358,False,-1.023136,-0.041968,-0.042874,0
2,2023-05-24,267.299957,269.830184,264.995303,267.731468,1181800,275.318704,283.637429,277.060635,278.850809,...,259.018123,-3345100,0.546295,1426960.0,-0.453705,False,-1.023136,-0.005836,-0.005853,0
3,2023-05-25,267.143066,268.829874,262.592627,267.790333,1159100,275.318704,283.637429,275.534855,277.983569,...,259.018123,-4504200,0.346507,1426960.0,-0.019208,False,-1.023136,-0.000587,-0.000587,1
4,2023-05-26,267.319611,270.526524,266.093734,267.819778,1091700,275.318704,283.637429,274.270971,277.193646,...,259.018123,-3412500,-0.242374,1426960.0,-0.058149,False,-1.023136,0.000661,0.000661,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
494,2025-05-12,388.079987,392.420013,386.500000,389.070007,1359300,366.112997,367.611379,376.550053,370.467516,...,335.752534,25229000,0.056947,1664680.0,0.791381,False,-0.354221,0.018797,0.018622,0
495,2025-05-13,387.529999,390.459991,385.779999,388.670013,862600,367.818497,367.482579,378.239275,371.731404,...,336.639047,24366400,-0.034191,1640110.0,-0.365409,False,-0.880548,-0.001417,-0.001418,0
496,2025-05-14,386.320007,388.019989,384.760010,386.959991,1107700,369.794498,367.428551,379.482465,372.812041,...,339.220386,23258700,-0.045460,1525710.0,0.284141,False,-0.481058,-0.003122,-0.003127,1
497,2025-05-15,392.200012,392.630005,384.579987,388.500000,1124400,372.070998,367.376989,381.439011,374.248187,...,341.970102,24383100,0.048343,1474720.0,0.015076,False,-0.399551,0.015221,0.015106,1


In [8]:
# Regression Target: Predict Raw Return
stock_data['Next_Return'] = (stock_data['Close'].shift(-1) - stock_data['Close']) / stock_data['Close']
stock_data['Target_Regression'] = stock_data['Next_Return']

In [9]:
stock_data

Unnamed: 0,Date,Close,High,Low,Open,Volume,SMA_20,SMA_50,EMA_12,EMA_26,...,OBV_pct_change,Volume_rolling_mean_10,Volume_pct_change,Volume_spike,Volume_zscore,Daily_Return,Log_Return,Target_Binary,Next_Return,Target_Regression
0,2023-05-22,280.647339,285.992144,280.196195,281.039613,1189700,275.318704,283.637429,280.647339,280.647339,...,0.546295,1426960.0,0.818358,False,-1.023136,-0.041968,-0.042874,0,-0.041968,-0.041968
1,2023-05-23,268.869110,279.794109,266.819448,278.862460,2163300,275.318704,283.637429,278.835304,279.774877,...,0.546295,1426960.0,0.818358,False,-1.023136,-0.041968,-0.042874,0,-0.005836,-0.005836
2,2023-05-24,267.299957,269.830184,264.995303,267.731468,1181800,275.318704,283.637429,277.060635,278.850809,...,0.546295,1426960.0,-0.453705,False,-1.023136,-0.005836,-0.005853,0,-0.000587,-0.000587
3,2023-05-25,267.143066,268.829874,262.592627,267.790333,1159100,275.318704,283.637429,275.534855,277.983569,...,0.346507,1426960.0,-0.019208,False,-1.023136,-0.000587,-0.000587,1,0.000661,0.000661
4,2023-05-26,267.319611,270.526524,266.093734,267.819778,1091700,275.318704,283.637429,274.270971,277.193646,...,-0.242374,1426960.0,-0.058149,False,-1.023136,0.000661,0.000661,0,-0.003999,-0.003999
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
494,2025-05-12,388.079987,392.420013,386.500000,389.070007,1359300,366.112997,367.611379,376.550053,370.467516,...,0.056947,1664680.0,0.791381,False,-0.354221,0.018797,0.018622,0,-0.001417,-0.001417
495,2025-05-13,387.529999,390.459991,385.779999,388.670013,862600,367.818497,367.482579,378.239275,371.731404,...,-0.034191,1640110.0,-0.365409,False,-0.880548,-0.001417,-0.001418,0,-0.003122,-0.003122
496,2025-05-14,386.320007,388.019989,384.760010,386.959991,1107700,369.794498,367.428551,379.482465,372.812041,...,-0.045460,1525710.0,0.284141,False,-0.481058,-0.003122,-0.003127,1,0.015221,0.015221
497,2025-05-15,392.200012,392.630005,384.579987,388.500000,1124400,372.070998,367.376989,381.439011,374.248187,...,0.048343,1474720.0,0.015076,False,-0.399551,0.015221,0.015106,1,0.005125,0.005125


In [11]:
# Multi-Class Target: Direction + Threshold
def label_multi(x):
    if x > 0.01:
        return 1
    elif x < -0.01:
        return -1
    else:
        return 0

stock_data['Target_MultiClass'] = stock_data['Next_Return'].apply(label_multi)

In [12]:
# Rolling Future Gain Label (2–3 Days Horizon)
stock_data['Max_3D_Close'] = stock_data['Close'].rolling(window=3).max().shift(-2)
stock_data['Target_3D'] = (stock_data['Max_3D_Close'] > stock_data['Close']).astype(int)

In [13]:
stock_data

Unnamed: 0,Date,Close,High,Low,Open,Volume,SMA_20,SMA_50,EMA_12,EMA_26,...,Volume_spike,Volume_zscore,Daily_Return,Log_Return,Target_Binary,Next_Return,Target_Regression,Target_MultiClass,Max_3D_Close,Target_3D
0,2023-05-22,280.647339,285.992144,280.196195,281.039613,1189700,275.318704,283.637429,280.647339,280.647339,...,False,-1.023136,-0.041968,-0.042874,0,-0.041968,-0.041968,-1,280.647339,0
1,2023-05-23,268.869110,279.794109,266.819448,278.862460,2163300,275.318704,283.637429,278.835304,279.774877,...,False,-1.023136,-0.041968,-0.042874,0,-0.005836,-0.005836,0,268.869110,0
2,2023-05-24,267.299957,269.830184,264.995303,267.731468,1181800,275.318704,283.637429,277.060635,278.850809,...,False,-1.023136,-0.005836,-0.005853,0,-0.000587,-0.000587,0,267.319611,1
3,2023-05-25,267.143066,268.829874,262.592627,267.790333,1159100,275.318704,283.637429,275.534855,277.983569,...,False,-1.023136,-0.000587,-0.000587,1,0.000661,0.000661,0,267.319611,1
4,2023-05-26,267.319611,270.526524,266.093734,267.819778,1091700,275.318704,283.637429,274.270971,277.193646,...,False,-1.023136,0.000661,0.000661,0,-0.003999,-0.003999,0,270.261780,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
494,2025-05-12,388.079987,392.420013,386.500000,389.070007,1359300,366.112997,367.611379,376.550053,370.467516,...,False,-0.354221,0.018797,0.018622,0,-0.001417,-0.001417,0,388.079987,0
495,2025-05-13,387.529999,390.459991,385.779999,388.670013,862600,367.818497,367.482579,378.239275,371.731404,...,False,-0.880548,-0.001417,-0.001418,0,-0.003122,-0.003122,0,392.200012,1
496,2025-05-14,386.320007,388.019989,384.760010,386.959991,1107700,369.794498,367.428551,379.482465,372.812041,...,False,-0.481058,-0.003122,-0.003127,1,0.015221,0.015221,1,394.209991,1
497,2025-05-15,392.200012,392.630005,384.579987,388.500000,1124400,372.070998,367.376989,381.439011,374.248187,...,False,-0.399551,0.015221,0.015106,1,0.005125,0.005125,0,,0


## Create Stock Labels

In [55]:
import pandas as pd
import numpy as np
import os

def add_prediction_labels(stock_data: pd.DataFrame,
                          close_col: str = 'Close',
                          return_threshold: float = 0.01,
                          future_window: int = 3,
                          stock_name: str = 'Stock',
                          save: bool = True) -> pd.DataFrame:
    """
    Adds multiple prediction label columns for classification and regression tasks.

    Parameters:
        stock_data (pd.DataFrame): Input stock dataframe (must include 'Close').
        close_col (str): Name of the close price column.
        return_threshold (float): % threshold for multi-class labeling.
        future_window (int): Number of days to check for future price increase.
        stock_name(str): name of stock.
        save(bool): Save the results.

    Returns:
        pd.DataFrame: Updated DataFrame with new label columns.
    """

    stock_data = stock_data.copy()

    # === Calculate next-day return
    stock_data['Next_Return'] = (stock_data[close_col].shift(-1) - stock_data[close_col]) / stock_data[close_col]

    # === Binary classification (1 = up, 0 = down)
    stock_data['Target_Binary'] = (stock_data[close_col].shift(-1) > stock_data[close_col]).astype(int)

    # === Multi-class: -1 (down), 0 (neutral), 1 (up)
    def multi_class_label(x):
        if x > return_threshold:
            return 1
        elif x < -return_threshold:
            return -1
        else:
            return 0

    stock_data['Target_MultiClass'] = stock_data['Next_Return'].apply(multi_class_label)

    # === Regression label: raw next-day return
    stock_data['Target_Regression'] = stock_data['Next_Return']

    # === Rolling future window label (1 if price increases in next n days)
    stock_data[f'Max_{future_window}D_Close'] = stock_data[close_col].rolling(window=future_window).max().shift(-(future_window - 1))
    stock_data[f'Target_{future_window}D'] = (stock_data[f'Max_{future_window}D_Close'] > stock_data[close_col]).astype(int)

    # === Drop rows with NaNs in any label columns
    label_cols = ['Target_Binary', 'Target_MultiClass', 'Target_Regression', f'Target_{future_window}D']
    stock_data.dropna(subset=label_cols, inplace=True)
    
    # === Step 6: Save if needed
    if save:
        save_path = f'../data/ML_data/{stock_name}_ML_Ready_dataset.csv'
        os.makedirs(os.path.dirname(save_path), exist_ok=True)
        stock_data.to_csv(save_path, index=False)
        print(f"✅ ML Ready dataset saved to: {save_path}")
    
    return stock_data

In [56]:
stock_data = pd.read_csv('../data/processed_data/Stock_Technical_features.csv',parse_dates=['Date'],index_col=[0])
labeled_data = add_prediction_labels(stock_data, close_col='Close', return_threshold=0.01, future_window=3,stock_name="Stryker")

✅ ML Ready dataset saved to: ../data/ML_data/Stryker_ML_Ready_dataset.csv


In [57]:
labeled_data

Unnamed: 0,Date,Close,High,Low,Open,Volume,SMA_20,SMA_50,EMA_12,EMA_26,...,Volume_spike,Volume_zscore,Daily_Return,Log_Return,Next_Return,Target_Binary,Target_MultiClass,Target_Regression,Max_3D_Close,Target_3D
0,2023-05-22,280.647339,285.992144,280.196195,281.039613,1189700,275.318704,283.637429,280.647339,280.647339,...,False,-1.023136,-0.041968,-0.042874,-0.041968,0,-1,-0.041968,280.647339,0
1,2023-05-23,268.869110,279.794109,266.819448,278.862460,2163300,275.318704,283.637429,278.835304,279.774877,...,False,-1.023136,-0.041968,-0.042874,-0.005836,0,0,-0.005836,268.869110,0
2,2023-05-24,267.299957,269.830184,264.995303,267.731468,1181800,275.318704,283.637429,277.060635,278.850809,...,False,-1.023136,-0.005836,-0.005853,-0.000587,0,0,-0.000587,267.319611,1
3,2023-05-25,267.143066,268.829874,262.592627,267.790333,1159100,275.318704,283.637429,275.534855,277.983569,...,False,-1.023136,-0.000587,-0.000587,0.000661,1,0,0.000661,267.319611,1
4,2023-05-26,267.319611,270.526524,266.093734,267.819778,1091700,275.318704,283.637429,274.270971,277.193646,...,False,-1.023136,0.000661,0.000661,-0.003999,0,0,-0.003999,270.261780,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
493,2025-05-09,380.920013,384.549988,380.279999,382.200012,758800,364.222998,367.555779,374.453701,369.058519,...,False,-1.027358,-0.004027,-0.004035,0.018797,1,1,0.018797,388.079987,1
494,2025-05-12,388.079987,392.420013,386.500000,389.070007,1359300,366.112997,367.611379,376.550053,370.467516,...,False,-0.354221,0.018797,0.018622,-0.001417,0,0,-0.001417,388.079987,0
495,2025-05-13,387.529999,390.459991,385.779999,388.670013,862600,367.818497,367.482579,378.239275,371.731404,...,False,-0.880548,-0.001417,-0.001418,-0.003122,0,0,-0.003122,392.200012,1
496,2025-05-14,386.320007,388.019989,384.760010,386.959991,1107700,369.794498,367.428551,379.482465,372.812041,...,False,-0.481058,-0.003122,-0.003127,0.015221,1,1,0.015221,394.209991,1
