In [None]:
import logging
import pandas as pd
from pathlib import Path
from src.features.stable_feature_selector import StableFeatureSelector
import psutil
import multiprocessing

# --- Configuration ---
# Every tunable for the stable-feature-selection run is defined here; the rest
# of this cell reads these names when logging and when calling the selector.

# Paths
# NOTE(review): absolute, user-specific locations — the cell only runs as-is on
# a machine that has this home directory.
panel_data_path = '/home/siddharth.johri/DECOHERE/data/features/fundamentals/enhanced_features'
results_base_dir = '/home/siddharth.johri/DECOHERE/results/stable_features'

# Selection Parameters
target_date_str = '2024-08-30'
target_column_name = 'PE_RATIO_RATIO_SIGNED_LOG'
k_features = 50
compulsory_features = ['sector_1','sector_2']

# LightGBM/Optuna/CV Parameters - Optimized for e2-standard-16
lookback_days = 20
n_splits = 3
n_trials = 5  # Currently 5 trials; the leftover "50" hint suggests 50 is the fuller setting — TODO confirm before a production run
optuna_n_jobs = 15  # Use 15 cores (leave 1 for system)
random_seed = 42

# LightGBM specific parameters
early_stopping_rounds = 50
# NOTE(review): num_boost_round is logged further down in this cell but is not
# passed to StableFeatureSelector(...) there — confirm it actually takes effect.
num_boost_round = 1000
rmse_threshold = 2.0  # Early stopping threshold for RMSE

# Pre-filtering Parameters
# Passed unchanged to tune_and_select_stable_features(); their exact semantics
# are defined by the selector, not in this notebook.
missing_threshold = 0.50
variance_threshold_value = 1e-4
univariate_corr_threshold = 0.01

# Stability Calculation Parameters
# NOTE(review): the previous comment claimed this was "increased from 250", yet
# the value is 250 — confirm the intended pool size.
stability_candidate_pool = 250

# --- System Info Logging ---
# Build the three status lines up front, then emit them in a single print call
# (one line each), so the report reads as one unit.
system_summary = (
    f"CPU Cores available: {multiprocessing.cpu_count()}",
    f"Memory available: {psutil.virtual_memory().total / (1024**3):.1f} GB",
    f"Using {optuna_n_jobs} cores for parallel processing",
)
print(*system_summary, sep="\n")

# --- Setup Logging ---
# One log file per target date, stored under the results directory.
log_file = Path(results_base_dir) / f"stable_selection_{target_date_str}.log"
log_file.parent.mkdir(parents=True, exist_ok=True)

# Clear previous handlers.
# basicConfig() does nothing when the root logger already has handlers, so
# anything left from an earlier run of this cell has to be detached first.
# Each handler is also closed: removeHandler() alone would leave the previous
# FileHandler's file open, leaking one descriptor per re-run of the cell.
for handler in logging.root.handlers[:]:  # iterate over a copy; the list is mutated below
    logging.root.removeHandler(handler)
    handler.close()

# Setup new handlers with more detailed format: messages go both to the log
# file and to the notebook output stream.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler(log_file),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)

# Log system configuration
# The report is assembled as an ordered tuple of lines and then written to the
# logger one entry at a time, so the layout of the summary is visible at a glance.
config_report = (
    f"System Configuration:",
    f"- CPU Cores: {multiprocessing.cpu_count()}",
    f"- Available Memory: {psutil.virtual_memory().total / (1024**3):.1f} GB",
    f"- Optuna Jobs: {optuna_n_jobs}",
    f"- Number of Trials: {n_trials}",
    f"- Stability Parameters:",
    f"  - Candidate Pool Size: {stability_candidate_pool}",
    f"- LightGBM Parameters:",
    f"  - Early Stopping Rounds: {early_stopping_rounds}",
    f"  - Number of Boost Rounds: {num_boost_round}",
    f"  - RMSE Threshold: {rmse_threshold}",
)
for report_line in config_report:
    logger.info(report_line)

# --- Instantiate and Run ---
def _process_rss_gb():
    """Return the resident set size of this Python process, in units of 1024**3 bytes."""
    return psutil.Process().memory_info().rss / (1024**3)


logger.info("--- Starting Stable Feature Selection Run ---")

# Constructor settings gathered in one mapping so they can be inspected before
# the selector is built.
# NOTE(review): num_boost_round is configured and logged earlier in this cell
# but is not among these arguments — confirm whether it should be forwarded.
selector_settings = dict(
    k_features=k_features,
    compulsory_features=compulsory_features,
    lookback_days=lookback_days,
    n_splits=n_splits,
    n_trials=n_trials,
    optuna_n_jobs=optuna_n_jobs,
    random_seed=random_seed,
    results_base_dir=results_base_dir,
    early_stopping_rounds=early_stopping_rounds,
    rmse_threshold=rmse_threshold,
    stability_candidate_pool=stability_candidate_pool,
)
selector = StableFeatureSelector(**selector_settings)

# Run with memory monitoring: snapshot memory before the run ...
initial_memory = _process_rss_gb()
logger.info(f"Initial memory usage: {initial_memory:.1f} GB")

# Arguments for the selection call: data location, target, and pre-filter thresholds.
selection_arguments = dict(
    panel_data_path=panel_data_path,
    target_date_str=target_date_str,
    target_column_name=target_column_name,
    missing_threshold=missing_threshold,
    variance_threshold_value=variance_threshold_value,
    univariate_corr_threshold=univariate_corr_threshold,
)
results = selector.tune_and_select_stable_features(**selection_arguments)

# ... and again afterwards, then report the difference.
final_memory = _process_rss_gb()
logger.info(f"Final memory usage: {final_memory:.1f} GB")
logger.info(f"Memory delta: {final_memory - initial_memory:.1f} GB")

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
from src.features.stable_feature_selector import StableFeatureSelector
from datetime import datetime, timedelta
import logging
import os
import json
from sklearn.model_selection import TimeSeriesSplit

# Configure logging
# Note: basicConfig() has no effect if the root logger already has handlers
# (for example, when an earlier cell configured logging in this kernel).
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)
logger = logging.getLogger(__name__)

# Target date is pinned to a fixed day (not "today"); to use the current date
# instead, assign datetime.now().strftime('%Y-%m-%d').
target_date = '2024-08-30'
target_date_dt = pd.to_datetime(target_date)
# Partition keys used in the directory layout below: year=YYYY / month=MM.
year, month = target_date_dt.year, f"{target_date_dt.month:02d}"

# Hyperparameter search space: one entry per LightGBM parameter giving the
# bounds, the value type, and (where present) a log-scale flag.
# NOTE(review): LightGBM documents min_child_samples as an alias of
# min_data_in_leaf — confirm how optimize_lightgbm_params treats having both.
HYPERPARAM_SEARCH_SPACE = dict(
    learning_rate=dict(min=0.005, max=0.1, type='float', log=True),
    feature_fraction=dict(min=0.7, max=1.0, type='float'),
    bagging_fraction=dict(min=0.7, max=1.0, type='float'),
    bagging_freq=dict(min=1, max=10, type='int'),
    min_child_samples=dict(min=20, max=100, type='int'),
    lambda_l1=dict(min=1e-8, max=10.0, type='float', log=True),
    lambda_l2=dict(min=1e-8, max=10.0, type='float', log=True),
    min_data_in_leaf=dict(min=20, max=100, type='int'),
)

# Location of the metadata written by a previous stable-feature-selection run.
base_dir = "/home/siddharth.johri/DECOHERE/data/features/fundamentals/stable_features"
partitioned_dir = os.path.join(base_dir, f"year={year}", f"month={month}")
metadata_file = os.path.join(partitioned_dir, f"metadata_{target_date}.json")

try:
    # Read the previously selected feature names from the metadata JSON.
    logger.info(f"Loading stable features from: {metadata_file}")
    with open(metadata_file) as metadata_fh:
        metadata = json.load(metadata_fh)

    stable_features = metadata['stable_feature_set']
    logger.info(f"Loaded {len(stable_features)} stable features")
    logger.info(f"Features: {stable_features}")

    # Selector sized to exactly the loaded feature set.
    feature_selector = StableFeatureSelector(
        lookback_days=20,
        k_features=len(stable_features),
    )

    # Tune LightGBM on the loaded features for the pinned date.
    logger.info(f"Starting LightGBM hyperparameter optimization for date: {target_date}")
    optimized_params = feature_selector.optimize_lightgbm_params(
        stable_feature_set=stable_features,
        date_str=target_date,
        search_space=HYPERPARAM_SEARCH_SPACE,
        n_trials=5,
        n_folds=5,
        num_threads=16,
    )

    logger.info(f"Hyperparameter optimization completed successfully!")
    logger.info("Optimized Parameters:")
    for param, value in optimized_params.items():
        logger.info(f"  {param}: {value}")

except FileNotFoundError:
    # The metadata only exists once feature selection has been run for this date.
    logger.error(f"Metadata file not found: {metadata_file}")
    logger.error("Please ensure the stable feature selection has been run for this date")
except Exception as e:
    # Best-effort cell: log the failure with its traceback instead of raising.
    logger.exception(f"Error during hyperparameter optimization: {str(e)}")

In [1]:
# Import required libraries
import pandas as pd
import numpy as np
from src.features.stable_feature_selector import StableFeatureSelector
from datetime import datetime, timedelta
import logging
import os
import json
from sklearn.model_selection import TimeSeriesSplit

# Configure logging
# Note: basicConfig() has no effect if the root logger already has handlers.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)


def merge_features_preserving_order(primary, extra):
    """Concatenate two feature-name sequences and drop duplicates, keeping first-seen order.

    Args:
        primary: Feature names that come first in the result.
        extra: Feature names appended after ``primary`` when not already present.

    Returns:
        A new list containing each distinct name once, in order of first appearance.
    """
    # dict keys keep insertion order, so this de-duplicates deterministically;
    # list(set(...)) would yield an arbitrary order that can differ between
    # kernel sessions because string hashing is randomized per process.
    return list(dict.fromkeys([*primary, *extra]))


# Target date is pinned to a fixed day (not "today").
target_date = '2024-08-30'
target_date_dt = pd.to_datetime(target_date)
# Partition keys used in the directory layout below: year=YYYY / month=MM.
year = target_date_dt.year
month = f"{target_date_dt.month:02d}"

# System configuration
SYSTEM_CONFIG = {
    'cpu_cores': 15,  # 15 vCPUs
    'memory_gb': 64,  # 64GB RAM
    'num_threads': 14  # Leave 1 vCPU for system processes
}

# Define compulsory features
COMPULSORY_FEATURES = ['sector_1', 'sector_2']

# Hyperparameter search space: one entry per LightGBM parameter giving the
# bounds, the value type, and (where present) a log-scale flag.
# NOTE(review): LightGBM documents min_child_samples as an alias of
# min_data_in_leaf — confirm how optimize_lightgbm_params treats having both.
HYPERPARAM_SEARCH_SPACE = {
    'learning_rate': {'min': 0.005, 'max': 0.1, 'type': 'float', 'log': True},
    'feature_fraction': {'min': 0.7, 'max': 1.0, 'type': 'float'},
    'bagging_fraction': {'min': 0.7, 'max': 1.0, 'type': 'float'},
    'bagging_freq': {'min': 1, 'max': 10, 'type': 'int'},
    'min_child_samples': {'min': 20, 'max': 100, 'type': 'int'},
    'lambda_l1': {'min': 1e-8, 'max': 10.0, 'type': 'float', 'log': True},
    'lambda_l2': {'min': 1e-8, 'max': 10.0, 'type': 'float', 'log': True},
    'min_data_in_leaf': {'min': 20, 'max': 100, 'type': 'int'}
}

# Path to load previously selected features
base_dir = "/home/siddharth.johri/DECOHERE/data/features/fundamentals/stable_features"
partitioned_dir = os.path.join(base_dir, f"year={year}", f"month={month}")
metadata_file = os.path.join(partitioned_dir, f"metadata_{target_date}.json")

try:
    # Load previously selected features
    logger.info(f"Loading stable features from: {metadata_file}")
    with open(metadata_file, 'r') as f:
        metadata = json.load(f)

    stable_features = metadata['stable_feature_set']
    logger.info(f"Loaded {len(stable_features)} stable features")
    logger.info(f"Features: {stable_features}")

    # Ensure compulsory features are included, in a reproducible order:
    # stable features first, then any compulsory feature not already present.
    all_features = merge_features_preserving_order(stable_features, COMPULSORY_FEATURES)
    logger.info(f"Total features for optimization: {len(all_features)} (including {len(COMPULSORY_FEATURES)} compulsory features)")
    logger.info(f"Compulsory features: {COMPULSORY_FEATURES}")

    # Initialize the feature selector with system configuration and compulsory features
    # NOTE(review): k_features counts only stable_features while all_features is
    # what gets optimized — confirm whether compulsory features should be counted.
    feature_selector = StableFeatureSelector(
        k_features=len(stable_features),
        lookback_days=20,
        compulsory_features=COMPULSORY_FEATURES,
        system_config=SYSTEM_CONFIG
    )

    # Run hyperparameter optimization
    logger.info(f"Starting LightGBM hyperparameter optimization for date: {target_date}")
    logger.info(f"System configuration: {SYSTEM_CONFIG}")
    logger.info("Expected runtime: ~25-30 minutes (5 trials × ~5 minutes)")

    optimized_params = feature_selector.optimize_lightgbm_params(
        stable_feature_set=all_features,  # Use all features including compulsory ones
        date_str=target_date,
        search_space=HYPERPARAM_SEARCH_SPACE,
        n_trials=5,
        n_folds=5,
        num_threads=SYSTEM_CONFIG['num_threads']
    )

    logger.info("Hyperparameter optimization completed successfully!")
    logger.info("Optimized Parameters:")
    for param, value in optimized_params.items():
        logger.info(f"  {param}: {value}")

except FileNotFoundError:
    # The metadata only exists once feature selection has been run for this date.
    logger.error(f"Metadata file not found: {metadata_file}")
    logger.error("Please ensure the stable feature selection has been run for this date")
except Exception as e:
    # Best-effort cell: log the failure with its traceback instead of raising.
    logger.error(f"Error during hyperparameter optimization: {str(e)}", exc_info=True)

2025-04-15 10:23:01,249 - INFO - Loading stable features from: /home/siddharth.johri/DECOHERE/data/features/fundamentals/stable_features/year=2024/month=08/metadata_2024-08-30.json
2025-04-15 10:23:01,250 - INFO - Loaded 50 stable features
2025-04-15 10:23:01,251 - INFO - Features: ['rank_as_is_PE_RATIO_RATIO_SIGNED_LOG_period_2', 'rank_level_first_fwd_PE_RATIO_RATIO_SIGNED_LOG', 'rank_as_is_PE_RATIO_RATIO_SIGNED_LOG_period_-5', 'rank_as_is_PREV_PE_RATIO_RATIO_SIGNED_LOG_period_1', 'rank_level_first_fwd_PREV_PE_RATIO_RATIO_SIGNED_LOG', 'rank_ratio_fwd_accel_PE_RATIO_RATIO_SIGNED_LOG', 'rank_ratio_fwd_slope_RETURN_COM_EQY_RATIO_SIGNED_LOG', 'rank_as_is_PE_RATIO_RATIO_SIGNED_LOG_period_3', 'rank_ratio_fwd_slope_INVENTORY_TURNOVER_RATIO_SIGNED_LOG', 'rank_ratio_hist_vol_RETURN_ON_ASSETS_RATIO_SIGNED_LOG', 'rank_as_is_INVENTORY_TURNOVER_RATIO_SIGNED_LOG_period_-5', 'rank_as_is_PE_RATIO_RATIO_SIGNED_LOG_period_-4', 'rank_ratio_fwd_slope_RETURN_ON_ASSETS_RATIO_SIGNED_LOG', 'rank_ratio_combin

In [1]:
import pandas as pd
# Read the stable-features parquet file for 2024-08-30 into `a`
# (the name is kept because the next cell displays it).
stable_features_parquet = '/home/siddharth.johri/DECOHERE/data/features/fundamentals/stable_features/year=2024/month=08/data_2024-08-30.pq'
a = pd.read_parquet(stable_features_parquet)

In [2]:
# Show the loaded frame as this cell's output.
a

Unnamed: 0,feature_name,importance_score,is_selected,is_compulsory,target_date
0,rank_as_is_PE_RATIO_RATIO_SIGNED_LOG_period_2,8447.481915,True,False,2024-08-30
1,rank_level_first_fwd_PE_RATIO_RATIO_SIGNED_LOG,986.747684,True,False,2024-08-30
2,rank_as_is_PE_RATIO_RATIO_SIGNED_LOG_period_-5,796.039374,True,False,2024-08-30
3,rank_as_is_PREV_PE_RATIO_RATIO_SIGNED_LOG_peri...,661.432873,True,False,2024-08-30
4,rank_level_first_fwd_PREV_PE_RATIO_RATIO_SIGNE...,244.948917,True,False,2024-08-30
...,...,...,...,...,...
195,rank_scaled_slope_divergence_NET_INCOME_CSTAT_...,0.418237,False,False,2024-08-30
196,rank_ratio_hist_r2_OPERATING_MARGIN_RATIO_SIGN...,0.417889,False,False,2024-08-30
197,rank_as_is_PREV_PX_TO_BOOK_RATIO_RATIO_SIGNED_...,0.417809,False,False,2024-08-30
198,rank_ratio_combined_r2_SALES_COEFF_OF_VAR_RATI...,0.415066,False,False,2024-08-30


bucket
(-8.037, 845.117]       198
(845.117, 1689.824]       1
(1689.824, 2534.532]      0
(2534.532, 3379.239]      0
(3379.239, 4223.946]      0
(4223.946, 5068.653]      0
(5068.653, 5913.36]       0
(5913.36, 6758.068]       0
(6758.068, 7602.775]      0
(7602.775, 8447.482]      1
Name: count, dtype: int64
