In [None]:
import sys
import os

# Add the utils directory to the Python path
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..', 'utils')))

import pandas as pd
import modin.pandas as mpd
from utils import clickhouse_data
import numpy as np
import talib
from sklearn.decomposition import PCA

# Define parameters
start_date = '2017-01-01'
end_date = '2025-12-31'

# Load and preprocess data
large_cap = clickhouse_data.clickhouse_largecap(start_date, end_date)
# mid_cap = clickhouse_data.clickhouse_midcap(start_date, end_date)
# small_cap = clickhouse_data.clickhouse_smallcap(start_date, end_date)

# # Fill missing dates
# large_cap = clickhouse_data.fill_missing_dates_modin_optimized(large_cap)
# mid_cap = clickhouse_data.fill_missing_dates_modin_optimized(mid_cap)
# small_cap = clickhouse_data.fill_missing_dates_modin_optimized(small_cap)

# Stack dataframes
all_cap = mpd.concat([large_cap])



    SELECT 
        financialinstrumentid,
        date,
        open,
        high,
        low,
        close,
        lastprice,
        previouscloseprice,
        volume,
        totaltradingvolume,
        totaltradevalue,
        totalnumberoftradesexecuted,
        tickersymbol,
        securityseries,
        settlementprice,
        financialinstrumentname
    FROM stock_data.tickers
    WHERE financialinstrumentid IN ('500002',) AND date BETWEEN '2017-01-01' AND '2025-12-31'
    ORDER BY financialinstrumentid, date ASC
    


In addition, using fork() with Python in general is a recipe for mysterious
deadlocks and crashes.

The most likely reason you are seeing this error is because you are using the
multiprocessing module on Linux, which uses fork() by default. This will be
fixed in Python 3.14. Until then, you want to use the "spawn" context instead.

See https://docs.pola.rs/user-guide/misc/multiprocessing/ for details.

or by setting POLARS_ALLOW_FORKING_THREAD=1.

2025-02-26 23:33:10,149	INFO worker.py:1821 -- Started a local Ray instance.


# Feature Engineering functions with validations and importance. 


In [None]:

import sys
import os

# Add the utils directory to the Python path
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..', 'utils')))

import pandas as pd
import modin.pandas as mpd
from utils import clickhouse_data
import numpy as np
import talib
from sklearn.decomposition import PCA

# Define parameters
start_date = '2017-01-01'
end_date = '2025-12-31'

# Load and preprocess data
large_cap = clickhouse_data.clickhouse_largecap(start_date, end_date)
# mid_cap = clickhouse_data.clickhouse_midcap(start_date, end_date)
# small_cap = clickhouse_data.clickhouse_smallcap(start_date, end_date)

# # Fill missing dates
# large_cap = clickhouse_data.fill_missing_dates_modin_optimized(large_cap)
# mid_cap = clickhouse_data.fill_missing_dates_modin_optimized(mid_cap)
# small_cap = clickhouse_data.fill_missing_dates_modin_optimized(small_cap)

# Stack dataframes
all_cap = mpd.concat([large_cap])



import numpy as np
import pandas as pd
import talib
from prophet import Prophet

def extract_all_features(df):
    """
    Master function that:
    1. Retains raw OHLCV & additional trading data.
    2. Computes trend-based, volatility, liquidity, and microstructure features.
    3. Computes advanced technical indicators.
    4. Extracts seasonal components using Prophet.
    5. Ensures all feature columns are complete and normalized.
    """
    
    df = df.copy()

    # === Retain Raw Data for Reference ===
    raw_cols = [
        'date', 'open', 'high', 'low', 'close', 'lastprice', 'previouscloseprice',
        'volume', 'totaltradingvolume', 'totaltradevalue', 'totalnumberoftradesexecuted'
    ]
    df = df[raw_cols]  # Ensure we only work with the relevant columns

    # Ensure proper datetime format
    df['date'] = pd.to_datetime(df['date'])
    df.set_index('date', inplace=True)

    # === Compute Derived Features ===
    
    ## 🔹 Trend-Based Features
    df = clickhouse_data.compute_trend_features(df)

    ## 🔹 Volatility Features
    df = clickhouse_data.compute_volatility_features(df)

    ## 🔹 Liquidity Features (Includes Total Trading Volume & Trade Value)
    df = clickhouse_data.compute_liquidity_features(df)

    ## 🔹 Market Microstructure Features (VWAP, etc.)
    df = clickhouse_data.compute_microstructure_features(df)

    # === Derived Features from Additional Trading Data ===

    # 🔹 Previous Close Return (Gap Indicator)
    df['prev_close_return'] = np.log(df['close'] / df['previouscloseprice'])

    # 🔹 Trading Intensity (Total Trades / Volume)
    df['trading_intensity'] = df['totalnumberoftradesexecuted'] / df['totaltradingvolume']

    # 🔹 Turnover Ratio (Liquidity Proxy)
    df['turnover_ratio'] = df['totaltradevalue'] / df['totaltradingvolume']

    # 🔹 VWAP Ratio (Price Positioning)
    df['vwap_ratio'] = df['close'] / df['vwap']
    
    # 🔹 High-Low & Close-Open Ratios
    df['high_low_ratio'] = df['high'] / df['low']
    df['close_open_ratio'] = df['close'] / df['open']

    # === Advanced Technical Indicators ===
    df['cci_20'] = talib.CCI(df['high'].values, df['low'].values, df['close'].values, timeperiod=20)
    df['williams_r'] = talib.WILLR(df['high'].values, df['low'].values, df['close'].values, timeperiod=14)
    df['stoch_k'], df['stoch_d'] = talib.STOCH(
        df['high'].values, df['low'].values, df['close'].values, 
        fastk_period=14, slowk_period=3, slowd_period=3
    )
    df['chande_momentum'] = talib.CMO(df['close'].values, timeperiod=14)
    df['ulcer_index'] = np.sqrt((df['close'].rolling(14).max().values - df['close'].values) ** 2 / 14)
    df['obv'] = talib.OBV(df['close'].astype(float).values, df['volume'].astype(float).values)
    df['chaikin_money_flow'] = talib.ADOSC(
        df['high'].astype(float).values, df['low'].astype(float).values, df['close'].astype(float).values, df['volume'].astype(float).values, 
        fastperiod=3, slowperiod=10
    )
    
    # === Extract Seasonality Features Using Prophet ===
    df_prophet = df[['close']].reset_index()
    df_prophet.columns = ['ds', 'y']

    prophet = Prophet(yearly_seasonality=True, weekly_seasonality=True, daily_seasonality=False)
    prophet.fit(df_prophet)

    future = prophet.make_future_dataframe(periods=0)
    forecast = prophet.predict(future)

    df['seasonal_weekly'] = forecast['weekly']
    df['seasonal_yearly'] = forecast['yearly']
    
    # Fill missing values (if any)
    df = df.ffill()
    df.fillna(0, inplace=True)

    return df

featured_df = extract_all_features(all_cap)


import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, BatchNormalization, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np

# 🚀 Enable Mixed Precision for Faster Training on M2 GPU
tf.keras.mixed_precision.set_global_policy('mixed_float16')

# 🚀 Input Data Validation: Ensure there are no NaNs
if featured_df.isnull().values.any():
    raise ValueError("🚨 Warning: Input Data Contains NaN Values! Please handle missing values.")

# 🚀 Load and scale the data
X = featured_df.dropna().astype(np.float32)  # Ensure correct dtype
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
input_dim = X_scaled.shape[1]

# ✅ Define a Simple Autoencoder Model
encoding_dim = min(10, input_dim // 2)  # Default to 10 or half of input features, whichever is smaller

# ✅ Encoder
input_layer = Input(shape=(input_dim,))
x = Dense(encoding_dim * 4, activation='relu', kernel_initializer='he_normal')(input_layer)
x = BatchNormalization()(x)
x = Dropout(0.2)(x)
x = Dense(encoding_dim * 2, activation='relu', kernel_initializer='he_normal')(x)
x = BatchNormalization()(x)
encoded = Dense(encoding_dim, activation='relu', kernel_initializer='he_normal', name="bottleneck")(x)  # Latent space

# ✅ Decoder
x = Dense(encoding_dim * 2, activation='relu', kernel_initializer='he_normal')(encoded)
x = BatchNormalization()(x)
x = Dense(encoding_dim * 4, activation='relu', kernel_initializer='he_normal')(x)
x = BatchNormalization()(x)
decoded = Dense(input_dim, activation='linear')(x)  # Reconstruct input

# ✅ Build Model
autoencoder = Model(input_layer, decoded)
autoencoder.compile(optimizer=Adam(learning_rate=0.001), loss=tf.keras.losses.Huber(delta=1.0))

# ✅ Manually Split the Data
split_ratio = 0.9
split_index = int(X_scaled.shape[0] * split_ratio)
X_train, X_val = X_scaled[:split_index], X_scaled[split_index:]

# ✅ Train Autoencoder Using NumPy Arrays
autoencoder.fit(X_train, X_train, 
                validation_data=(X_val, X_val),  # ✅ Explicitly define validation data
                epochs=50, 
                batch_size=32)

# ✅ Extract Latent Features (Encoder Model)
encoder = Model(inputs=autoencoder.input, outputs=autoencoder.get_layer("bottleneck").output)
X_encoded = encoder.predict(X_scaled)

# ✅ Convert Encoded Features to DataFrame
encoded_features_df = pd.DataFrame(X_encoded, index=featured_df.index)

print("✅ Simple Feature Extraction Complete!")
encoded_features_df.head()


Please refer to https://modin.readthedocs.io/en/stable/supported_apis/defaulting_to_pandas.html for explanation.
23:33:18 - cmdstanpy - INFO - Chain [1] start processing
23:33:19 - cmdstanpy - INFO - Chain [1] done processing


In [4]:
featured_df


Unnamed: 0_level_0,open,high,low,close,lastprice,previouscloseprice,volume,totaltradingvolume,totaltradevalue,totalnumberoftradesexecuted,...,cci_20,williams_r,stoch_k,stoch_d,chande_momentum,ulcer_index,obv,chaikin_money_flow,seasonal_weekly,seasonal_yearly
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-06-23,1500.35,1503.00,1430.20,1440.55,1440.55,1486.30,10963,10963,15950701,1256,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,10963.0,0.000000,0.0,0.0
2017-06-27,1445.00,1457.00,1428.85,1442.95,1442.95,1440.55,9015,9015,13001118,1135,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,19978.0,0.000000,0.0,0.0
2017-06-28,1443.00,1458.00,1425.80,1452.40,1452.40,1442.95,5662,5662,8171340,892,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,25640.0,0.000000,0.0,0.0
2017-06-29,1458.00,1479.00,1435.00,1439.80,1439.80,1452.40,8416,8416,12287156,804,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,17224.0,0.000000,0.0,0.0
2017-06-30,1419.25,1460.00,1411.25,1451.10,1451.10,1439.80,4764,4764,6900488,529,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,21988.0,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-12-10,7585.00,7710.00,7572.25,7694.10,7684.50,7564.55,5208,5208,39940871,1771,...,129.803746,-2.316602,92.911017,92.425667,18.316990,0.000000,-9664466.0,6180.854158,0.0,0.0
2024-12-12,7799.95,7799.95,7635.05,7654.95,7660.15,7725.80,3840,3840,29509077,1195,...,126.512055,-12.146597,90.529267,91.351614,15.650372,10.463278,-9668306.0,4842.355629,0.0,0.0
2024-12-16,7726.30,7945.00,7709.00,7893.40,7888.00,7697.45,10909,10909,85970063,3429,...,136.401314,-3.854198,93.894201,92.444829,26.513863,0.000000,-9657397.0,5807.772359,0.0,0.0
2025-01-01,6932.70,6947.45,6845.10,6929.05,6924.95,6913.95,3430,3430,23701429,1022,...,-56.894064,-75.885121,69.371361,84.598277,-18.949662,257.733379,-9660827.0,6373.699490,0.0,0.0


2025-02-26 23:33:28.261641: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M2
2025-02-26 23:33:28.261676: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 8.00 GB
2025-02-26 23:33:28.261687: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 2.67 GB
2025-02-26 23:33:28.261704: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2025-02-26 23:33:28.261720: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


Epoch 1/50


2025-02-26 23:33:29.915052: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.
2025-02-26 23:33:29.928494: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:966] PluggableGraphOptimizer failed: INVALID_ARGUMENT: Failed to deserialize the `graph_buf`.


[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 62ms/step - loss: 0.4514 - val_loss: 1.0322
Epoch 2/50
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 34ms/step - loss: 0.2636 - val_loss: 1.0447
Epoch 3/50
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 34ms/step - loss: 0.2040 - val_loss: 0.8902
Epoch 4/50
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 38ms/step - loss: 0.1724 - val_loss: 0.7049
Epoch 5/50
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 40ms/step - loss: 0.1525 - val_loss: 0.7295
Epoch 6/50
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 43ms/step - loss: 0.1406 - val_loss: 0.6275
Epoch 7/50
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 37ms/step - loss: 0.1393 - val_loss: 0.5604
Epoch 8/50
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 36ms/step - loss: 0.1303 - val_loss: 0.5484
Epoch 9/50
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m

2025-02-26 23:35:26.330075: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:966] PluggableGraphOptimizer failed: INVALID_ARGUMENT: Failed to deserialize the `graph_buf`.


[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step
✅ Simple Feature Extraction Complete!


Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2017-06-23,0.0,0.0,1.875,0.0,0.0,1.289062,0.907227,4.367188,2.462891,0.0
2017-06-27,0.0,0.0,1.533203,0.0,0.0,0.468994,2.083984,4.46875,1.798828,0.0
2017-06-28,0.0,0.0,1.525391,0.0,0.0,0.206543,2.484375,4.484375,1.638672,0.0
2017-06-29,0.0,0.0,1.65625,0.0,0.0,0.807617,1.604492,4.492188,2.076172,0.0
2017-06-30,0.0,0.122314,1.556641,0.0,0.0,0.074097,2.75,4.621094,1.527344,0.0
