In [1]:
# --- Cell 1: Setup and Configuration ---
import json
import logging
import os
import sys
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Tuple, Optional, Any, Union

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pytz
import seaborn as sns
import yaml

#import datetime


# Set up project root path. The default is the original absolute DECOHERE
# checkout; set the DECOHERE_ROOT environment variable to run the notebook
# from a different location without editing this cell.
project_root = os.environ.get('DECOHERE_ROOT', '/home/siddharth.johri/DECOHERE')
# Make the project's `src` package importable (added once, even if the cell is re-run).
if project_root not in sys.path:
    sys.path.append(project_root)

# Import project modules
from src.data.data_processor import DataProcessor
from src.data.efficient_data_storage import EfficientDataStorage, DataType, DataStage
from src.features.feature_selector import FeatureSelector
from src.data.feature_generator import FeatureGenerator
from src.analysis.performance_analyzer import analyze_performance_metrics

# Set up logging with more detailed format.
# logging.FileHandler opens its target file as soon as it is constructed, so the
# logs directory has to exist beforehand; create it if it is missing.
log_dir = os.path.join(project_root, 'logs')
os.makedirs(log_dir, exist_ok=True)
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        # Persist the run log to <project_root>/logs/pipeline.log ...
        logging.FileHandler(os.path.join(log_dir, 'pipeline.log')),
        # ... and echo the same records to the notebook output.
        logging.StreamHandler()
    ]
)

# Load configuration: parse <project_root>/config/config.yaml into `config`,
# failing early with an explicit message when the file is absent.
config_path = os.path.join(project_root, 'config', 'config.yaml')
if os.path.exists(config_path):
    with open(config_path, 'r') as f:
        config = yaml.safe_load(f)
else:
    raise FileNotFoundError(f"Config file not found at {config_path}. Please ensure the config directory exists and contains config.yaml")

# Initialize classes with the correct config structure.
# Each component receives the full parsed config plus its own named logger, so
# its log records can be told apart in the shared log output.
data_processor = DataProcessor(config, logging.getLogger('DataProcessor'))
storage = EfficientDataStorage(config, logging.getLogger('Storage'))
feature_generator = FeatureGenerator(config, logging.getLogger('FeatureGenerator'))

# Generate run ID: the current wall-clock time in the Australia/Sydney timezone,
# formatted as YYYYMMDD_HHMMSS. It is passed to every FeatureSelector created in
# this notebook.
sydney_tz = pytz.timezone('Australia/Sydney')
current_time = datetime.now(sydney_tz)
run_id = current_time.strftime('%Y%m%d_%H%M%S')

# Initialize feature selector using the new from_config method.
# NOTE(review): the captured output shows the run configuration being written to
# disk at this point, so re-running this cell presumably creates a new run
# directory each time - confirm against FeatureSelector.from_config.
feature_selector = FeatureSelector.from_config(config, run_id=run_id)

# Set default target date and constants.
# `target_date` is captured as the default `date` argument of
# run_performance_analysis when that function is defined.
# NOTE(review): TARGET_COL is not referenced in the visible cells; they repeat
# the literal 'PE_RATIO_RATIO_SIGNED_LOG' instead.
target_date = '2024-09-02'
TARGET_COL = 'PE_RATIO_RATIO_SIGNED_LOG'

def get_latest_run_id(results_dir: str) -> str:
    """
    Get the latest run ID from the results directory.

    Run directories are the sub-directories of ``results_dir`` whose name starts
    with ``run_``. The "latest" one is the lexicographically greatest name, which
    matches chronological order for the ``YYYYMMDD_HHMMSS`` run IDs generated in
    this notebook.

    Args:
        results_dir: Path to the results directory

    Returns:
        Latest run ID, i.e. the directory name without its leading ``run_``

    Raises:
        FileNotFoundError: If ``results_dir`` contains no run directories
            (``os.listdir`` also raises it when ``results_dir`` itself is missing)
    """
    prefix = 'run_'
    run_dirs = [
        entry for entry in os.listdir(results_dir)
        # Ignore plain files that merely share the prefix (logs, exports, ...).
        if entry.startswith(prefix) and os.path.isdir(os.path.join(results_dir, entry))
    ]
    if not run_dirs:
        raise FileNotFoundError(f"No run directories found in {results_dir}")
    # Strip only the leading prefix: str.replace would also remove any later
    # 'run_' occurring inside a custom run ID such as 'exp_run_2'.
    return max(run_dirs)[len(prefix):]

def get_metrics_file_path(config: Dict[str, Any], run_id: str, date: str) -> str:
    """
    Build the location of the performance metrics JSON file for one run and date.

    The file lives at
    ``<results_dir>/run_<run_id>/performance_metrics_<YYYY-MM-DD HH:MM:SS>.json``
    where ``results_dir`` is read from ``config['output']['results_dir']``.

    Args:
        config: Configuration dictionary
        run_id: Run identifier
        date: Date in YYYY-MM-DD format

    Returns:
        Path to the performance metrics file
    """
    # The file name carries a full timestamp, so a bare date is rendered with a
    # midnight time component.
    timestamp_label = pd.to_datetime(date).strftime("%Y-%m-%d %H:%M:%S")
    run_folder = f"run_{run_id}"
    metrics_name = f"performance_metrics_{timestamp_label}.json"
    results_root = config['output']['results_dir']
    return os.path.join(results_root, run_folder, metrics_name)

# Helper function to run performance analysis
def run_performance_analysis(date: str = target_date, run_id: str = None):
    """
    Analyse the stored performance metrics of one run for one date.

    The results directory is taken from the global ``feature_selector``'s
    config (``config['output']['results_dir']``); the analysis output is written
    into that run's ``run_<run_id>`` sub-directory.

    Note: the default for ``date`` is the value ``target_date`` had when this
    function was defined; reassigning ``target_date`` later does not change it.

    Args:
        date: Target date for analysis (default: target_date)
        run_id: Run identifier (default: latest available run)
    """
    selector_config = feature_selector.config
    results_root = selector_config['output']['results_dir']

    # Fall back to the most recent run directory when no run was named.
    resolved_run_id = get_latest_run_id(results_root) if run_id is None else run_id

    analyze_performance_metrics(
        metrics_file=get_metrics_file_path(selector_config, resolved_run_id, date),
        target_date=date,
        run_id=resolved_run_id,
        output_dir=os.path.join(results_root, f"run_{resolved_run_id}"),
    )

2025-04-12 06:31:24,010 - src.data.data_processor - INFO - Validating data structure
2025-04-12 06:31:24,012 - src.data.data_processor - INFO - Data structure validation completed successfully
2025-04-12 06:31:24,013 - Storage - INFO - Validating data structure
2025-04-12 06:31:24,014 - Storage - INFO - Data structure validation completed successfully
2025-04-12 06:31:24,028 - FeatureSelector - INFO - Initializing FeatureSelector
2025-04-12 06:31:24,028 - FeatureSelector - INFO - Initializing FeatureSelector
2025-04-12 06:31:24,030 - FeatureSelector - INFO - Saved run configuration to /home/siddharth.johri/DECOHERE/data/results/feature_selection/run_20250412_163124/run_config.json
2025-04-12 06:31:24,030 - FeatureSelector - INFO - Saved run configuration to /home/siddharth.johri/DECOHERE/data/results/feature_selection/run_20250412_163124/run_config.json


In [2]:
# --- Cell 2: Processing Function (with Enhanced Features) ---
def run_pipeline_for_date(date_str: str, processor: DataProcessor, storage: EfficientDataStorage,
                         feature_generator: Optional[FeatureGenerator], config: dict):
    """Runs the full data pipeline for a single date, including enhanced features if available.

    Steps: load raw data -> transform and fill missing values -> store processed
    data -> build and store the pre-feature set -> (optionally) generate and
    store enhanced features.

    Args:
        date_str: Date to process; also used as a prefix in every log message.
        processor: Object providing load_raw_data, transform_raw_data and
            fill_missing_values.
        storage: Object providing store_data and processed_data_feat_gen.
        feature_generator: Generator for enhanced features, or None to skip
            that step.
        config: Configuration dictionary; the optional 'features' section
            supplies the enhanced-feature parameters (defaults are used for
            missing keys).

    Returns:
        Tuple ``(success, processed_file_path, pre_feature_file_path,
        enhanced_feature_file_path)``. A path is None when the corresponding
        artefact was not written. A date with no raw data counts as success
        with all paths None. Exceptions are logged and reported as
        ``success=False`` rather than propagated.
    """

    if not processor or not storage:
        logging.error(f"[{date_str}] Processor or Storage not initialized. Aborting.")
        return False, None, None, None

    logging.info(f"--- Starting Pipeline for Date: {date_str} ---")
    processed_file_path = None
    pre_feature_file_path = None
    enhanced_feature_file_path = None
    success = False

    try:
        # 1. Load and process raw data
        raw_data = processor.load_raw_data(date_str)
        if raw_data.empty:
            logging.warning(f"[{date_str}] No raw data found. Skipping remaining steps.")
            # Record the outcome before returning so that the summary line
            # emitted in `finally` agrees with the returned status.
            success = True
            return success, None, None, None

        # 2. Process data
        transformed_data = processor.transform_raw_data(raw_data)
        filled_data = processor.fill_missing_values(transformed_data)

        # 3. Store processed data
        processed_file_path = storage.store_data(
            df=filled_data,
            data_type=DataType.FUNDAMENTALS,
            stage=DataStage.PROCESSED,
            date=date_str
        )

        # 4. Generate and store pre-feature data
        pre_feature_df = storage.processed_data_feat_gen(filled_data)
        if pre_feature_df.empty:
            logging.warning(f"[{date_str}] Pre-feature set is empty. Skipping feature storage.")
        else:
            pre_feature_file_path = storage.store_data(
                df=pre_feature_df,
                data_type=DataType.FUNDAMENTALS,
                stage=DataStage.FEATURES,
                date=date_str,
                sub_type='pre_feature_set'
            )

            # 5. Generate enhanced features if generator available
            if feature_generator:
                # Get parameters from config
                feature_cfg = config.get('features', {})
                hist_w = feature_cfg.get('hist_window', 6)
                fwd_w = feature_cfg.get('fwd_window', 6)
                target_m = feature_cfg.get('target_metric', 'PE_RATIO_RATIO_SIGNED_LOG')
                # The sector mapping path in the config is relative to the project root.
                sector_map_rel_path = feature_cfg.get('sector_mapping_path', None)
                sector_map_abs_path = os.path.join(project_root, sector_map_rel_path) if sector_map_rel_path else None
                sector_levels = feature_cfg.get('sector_levels_to_include', ['sector_1'])
                include_sectors = feature_cfg.get('include_sector_features', True)

                enhanced_feature_df = feature_generator.generate_enhanced_features(
                    df=pre_feature_df,
                    hist_window=hist_w,
                    fwd_window=fwd_w,
                    target_metric=target_m,
                    sector_mapping_path=sector_map_abs_path,
                    sector_levels_to_include=sector_levels,
                    include_sector_features=include_sectors
                )

                if enhanced_feature_df.empty:
                    logging.warning(f"[{date_str}] Enhanced feature set is empty. Nothing stored.")
                else:
                    enhanced_feature_file_path = storage.store_data(
                        df=enhanced_feature_df,
                        data_type=DataType.FUNDAMENTALS,
                        stage=DataStage.FEATURES,
                        date=date_str,
                        sub_type='enhanced_features'
                    )
            else:
                logging.info(f"[{date_str}] FeatureGenerator not available. Skipping enhanced features.")

        success = True

    except Exception as e:
        # Best effort per date: log with traceback and report failure to the caller.
        logging.error(f"[{date_str}] Pipeline error: {e}", exc_info=True)
        success = False

    finally:
        logging.info(f"--- Finished Pipeline for Date: {date_str} (Success: {success}) ---")

    return success, processed_file_path, pre_feature_file_path, enhanced_feature_file_path

print("Pipeline function defined.")

Pipeline function defined.


In [3]:
# %%
# --- new Cell 3: Execution Loop and Verification ---
dates_to_process = ['2024-09-02']#, '2024-09-03', '2024-09-04','2024-09-05', '2024-09-06', '2024-09-09', '2024-09-10','2024-09-11', '2024-09-12', '2024-09-13']
results = {}

# Run the pipeline once per date and keep each outcome tuple
# (success, processed path, pre-feature path, enhanced-feature path) keyed by date.
for date_str in dates_to_process:
    logging.info(f"--- Starting Pipeline Run for Date: {date_str} ---")
    pipeline_outcome = run_pipeline_for_date(
        date_str=date_str,
        processor=data_processor,
        storage=storage,
        feature_generator=feature_generator,
        config=config,
    )
    # Unpack as well, so the individual values stay inspectable in the notebook namespace.
    success, proc_path, pre_feat_path, enh_feat_path = pipeline_outcome
    results[date_str] = (success, proc_path, pre_feat_path, enh_feat_path)
    logging.info(f"--- Completed Pipeline Run for Date: {date_str} ---")

# Verify results
def verify_pipeline_output(date_str: str, result: tuple, expect_files: bool = True) -> bool:
    """Check one date's pipeline outcome.

    Args:
        date_str: Date the outcome belongs to (used in log messages).
        result: Tuple ``(success, processed_path, pre_feature_path,
            enhanced_feature_path)`` as returned by the pipeline.
        expect_files: When True, every path that was reported (i.e. is not
            empty/None) must exist on disk. Paths that were not reported are
            not checked.

    Returns:
        False if the run itself failed or a reported file is missing
        (checking stops at the first missing file); True otherwise.
    """
    success, proc_path, pre_feat_path, enh_feat_path = result

    # Guard: a failed run can never verify.
    if not success:
        logging.error(f"[{date_str}] Pipeline run failed")
        return False

    # Guard: nothing else to check when files are not expected.
    if not expect_files:
        return True

    labels = ("processed data", "pre-feature data", "enhanced features")
    reported_paths = (proc_path, pre_feat_path, enh_feat_path)
    for file_type, path in zip(labels, reported_paths):
        if not path:
            # Artefact was not produced for this date; skip it.
            continue
        if os.path.exists(path):
            logging.info(f"[{date_str}] Verified {file_type} file: {path}")
            continue
        logging.error(f"[{date_str}] Missing {file_type} file: {path}")
        return False

    return True

# Verify all results: check every processed date and log an overall verdict.
all_success = True
for date_str, result in results.items():
    verified = verify_pipeline_output(date_str, result)
    if verified:
        logging.info(f"[{date_str}] Verification successful")
    else:
        # One failing date is enough to fail the whole batch.
        all_success = False
        logging.error(f"[{date_str}] Verification failed")

if not all_success:
    logging.error("Some pipeline runs failed verification")
else:
    logging.info("All pipeline runs completed and verified successfully")

print("Pipeline execution and verification complete.")

2025-04-12 06:31:30,766 - root - INFO - --- Starting Pipeline Run for Date: 2024-09-02 ---
2025-04-12 06:31:30,767 - root - INFO - --- Starting Pipeline for Date: 2024-09-02 ---
2025-04-12 06:31:30,767 - src.data.data_processor - INFO - Loading raw data from /home/siddharth.johri/DECOHERE/data/raw/fundamentals/financials_2024_09.pq
2025-04-12 06:31:30,809 - src.data.data_processor - INFO - Filtering data for date: 2024-09-02
2025-04-12 06:31:30,818 - src.data.data_processor - INFO - Loaded raw data with shape: (5223, 34)
2025-04-12 06:31:30,819 - src.data.data_processor - INFO - Calculating periods for each ticker using COHERE logic
2025-04-12 06:31:30,834 - src.data.data_processor - INFO - Number of unique fiscal months per ID: fiscal_month
1    457
2     43
Name: count, dtype: int64
2025-04-12 06:31:30,834 - src.data.data_processor - INFO - Calculating periods by ID using id column: 'ID'
2025-04-12 06:31:32,678 - src.data.data_processor - INFO - Period counts: {-6: np.int64(1), -5: n

Processing groups:   0%|          | 0/500 [00:00<?, ?it/s]

2025-04-12 06:31:38,457 - FeatureGenerator - INFO - First group columns: ['ID', 'PIT_DATE', 'PERIOD', 'NET_INCOME_COEFF_OF_VAR', 'EBIT_COEFF_OF_VAR', 'EBITDA_COEFF_OF_VAR', 'SALES_COEFF_OF_VAR', 'NET_INCOME_RAW_SIGNED_LOG', 'NET_INCOME_RAW_SCALED_SALES_SIGNED_LOG', 'EBIT_RAW_SIGNED_LOG', 'EBIT_RAW_SCALED_SALES_SIGNED_LOG', 'EBITDA_RAW_SIGNED_LOG', 'EBITDA_RAW_SCALED_SALES_SIGNED_LOG', 'SALES_RAW_SIGNED_LOG', 'NET_OPERATING_ASSETS_RAW_SIGNED_LOG', 'NET_OPERATING_ASSETS_RAW_SCALED_SALES_SIGNED_LOG', 'INVENTORIES_RAW_SIGNED_LOG', 'INVENTORIES_RAW_SCALED_SALES_SIGNED_LOG', 'FREE_CASH_FLOW_RAW_SIGNED_LOG', 'FREE_CASH_FLOW_RAW_SCALED_SALES_SIGNED_LOG', 'DIVIDEND_RAW_SIGNED_LOG', 'DIVIDEND_RAW_SCALED_SALES_SIGNED_LOG', 'CAPEX_RAW_SIGNED_LOG', 'CAPEX_RAW_SCALED_SALES_SIGNED_LOG', 'DEPRECIATION_RAW_SIGNED_LOG', 'DEPRECIATION_RAW_SCALED_SALES_SIGNED_LOG', 'NET_INCOME_CSTAT_STD_RAW_SIGNED_LOG', 'NET_INCOME_CSTAT_STD_RAW_SCALED_SALES_SIGNED_LOG', 'EBIT_CSTAT_STD_RAW_SIGNED_LOG', 'EBIT_CSTAT_STD_RA

Pipeline execution and verification complete.


In [7]:
# #Comparison block
# Ad-hoc inspection of stored pipeline artefacts. Paths are built from
# `project_root` (defined in the setup cell) instead of repeating the absolute
# checkout location.

# # s =pd.read_parquet('/home/siddharth.johri/DECOHERE/data/raw/fundamentals/financials_2024_09.pq')
# a = pd.read_parquet('/home/siddharth.johri/DECOHERE/data/processed/fundamentals/year=2024/month=09/data_2024-09-04.pq')
# b = pd.read_parquet('/home/siddharth.johri/DECOHERE/data/features/fundamentals/pre_feature_set/year=2024/month=09/data_2024-09-04.pq')
c = pd.read_parquet(os.path.join(
    project_root, 'data', 'features', 'fundamentals', 'enhanced_features',
    'year=2024', 'month=09', 'data_2024-09-02.pq'))
# Only the last expression of a cell is rendered, so a bare `list(c.columns)`
# here would be silently discarded; print it explicitly.
print(list(c.columns))
# # # a.query('ID == "INFO IB Equity" & PERIOD_END_DATE == "2024-03-31"')['PE_RATIO_RATIO']
# # # c.query('ID == "INFO IB Equity"')['PE_RATIO_RATIO_SIGNED_LOG']
s = pd.read_parquet(os.path.join(project_root, 'data', 'raw', 'sector', 'sector_mappings.pq'))
s.head()

Unnamed: 0,ID,sector_1,sector_2,sector_3,sector_4
0,360ONE IB Equity,Financials,Financial Services,Asset Management,Wealth Management
1,3M IB Equity,Materials,Materials,Chemicals,Specialty Chemicals
2,AACL IB Equity,Materials,Materials,Chemicals,Basic & Diversified Chemicals
3,AAVAS IB Equity,Financials,Financial Services,Specialty Finance,Consumer Finance
4,ABB IB Equity,Industrials,Industrial Products,Electrical Equipment,Electrical Power Equipment


In [None]:
# --- Cell 4.1: Feature Tuning (Run every 5th day) ---
# Purpose: either run feature-stability tuning for the window ending at
# `tuning_target_date` and save the resulting feature list as JSON, or (when
# `run_tuning` is False) load a previously saved list. Either way the cell
# leaves `current_stable_features` defined (a list, or None when unavailable).

# --- Configuration ---
tuning_target_date = '2024-09-02' # Set the date for the end of the 20-day tuning window
run_tuning = True # Set to True on the 5th day, False otherwise
k_features_target = 50 # Desired number of stable features
tuning_n_trials = 50 # Number of Optuna trials (adjust based on time/resources)
tuning_n_splits = 5 # Number of TimeSeriesSplit folds
tuning_lookback_days = 20 # Max days of history for tuning
# The saved feature list is keyed by the tuning date, one JSON file per date.
stable_features_filepath = os.path.join(config['output']['results_dir'], f'stable_features_{tuning_target_date}.json')

# --- Run Tuning ---
if run_tuning:
    print(f"--- Running Feature Stability Tuning for period ending {tuning_target_date} ---")
    
    # Initialize FeatureSelector only if an earlier cell has not already created one.
    # At notebook (module) level locals() is the global namespace, so this checks
    # whether the name `feature_selector` exists at all. `config` and `run_id`
    # must have been defined by the setup cell.
    if 'feature_selector' not in locals():
         feature_selection_config = { # Adapt based on your actual config structure
            'data': { 'base_dir': config['data']['base_dir'] },
            'feature_selection': config['feature_selection'],
            'output': { 'results_dir': config['output']['results_dir'] }
         }
         feature_selector = FeatureSelector(feature_selection_config, run_id=run_id, logger=logging.getLogger('FeatureSelectorTune'))
         print("Initialized FeatureSelector for tuning.")
         
    # Returns the list of stable features and the best hyperparameters found.
    # A falsy `stable_features` is treated as a failed tuning run below.
    stable_features, best_params = feature_selector.tune_for_stable_features(
        target_date_str=tuning_target_date,
        k_features=k_features_target,
        n_trials=tuning_n_trials,
        n_splits=tuning_n_splits,
        max_days_lookback=tuning_lookback_days,
        target_col='PE_RATIO_RATIO_SIGNED_LOG' # Ensure this is the correct target
    )

    if stable_features:
        print(f"\n--- Tuning Complete ---")
        print(f"Identified {len(stable_features)} stable features.")
        print("Best Hyperparameters found:")
        print(json.dumps(best_params, indent=4))
        
        # Save the stable features list (creating the results directory if needed)
        print(f"Saving stable features to: {stable_features_filepath}")
        os.makedirs(os.path.dirname(stable_features_filepath), exist_ok=True)
        with open(stable_features_filepath, 'w') as f:
            json.dump(stable_features, f, indent=4)
            
        # Store features in a variable for immediate use if needed later in the notebook
        current_stable_features = stable_features 
    else:
        print(f"\n--- Tuning Failed ---")
        print("Falling back to potentially loading previously saved features if they exist.")
        # NOTE(review): despite the message above, no fallback is performed - the
        # loader below is commented out, so the feature set is simply None here.
        current_stable_features = None
        # Optionally load the last known good feature set here if tuning fails
        # if os.path.exists(stable_features_filepath):
        #     try:
        #         with open(stable_features_filepath, 'r') as f:
        #             current_stable_features = json.load(f)
        #         print(f"Loaded previously saved stable features from: {stable_features_filepath}")
        #     except Exception as e:
        #         print(f"Error loading previous features: {e}")
        #         current_stable_features = None
        
else:
    print("--- Skipping Feature Tuning ---")
    # Load the stable features file for `tuning_target_date` (the path is keyed by
    # that date, so it is not necessarily the most recent file on disk).
    print(f"Attempting to load stable features from: {stable_features_filepath}")
    if os.path.exists(stable_features_filepath):
        try:
            with open(stable_features_filepath, 'r') as f:
                current_stable_features = json.load(f)
            print(f"Successfully loaded {len(current_stable_features)} stable features.")
        except Exception as e:
            # Unreadable or invalid JSON: report it and continue without a feature set.
            print(f"Error loading stable features file: {e}")
            current_stable_features = None
    else:
        print("Stable features file not found.")
        current_stable_features = None

# Ensure the variable is defined even if loading fails.
# (Every branch above already assigns it, so this is purely defensive.)
if 'current_stable_features' not in locals():
     current_stable_features = None
     
# None and an empty list are both treated as "no feature set available".
if not current_stable_features:
     print("\nWARNING: No stable feature set is available. Daily processing might fail or use all features.")


In [None]:
# --- Cell 4.2: Daily Processing (Run every day) ---
# Purpose: run the daily feature-selection step for `daily_target_date`, using
# the stable feature subset from the tuning cell when one is available, and
# print a summary of the selected features, importance scores and SHAP values.

# --- Configuration ---
daily_target_date = '2024-09-02' # Set the specific date for daily processing

# --- Load or Use Tuned Features ---
# 'current_stable_features' is produced by the feature tuning cell (Cell 4.1).
if 'current_stable_features' not in locals():
    print("ERROR: Stable feature set ('current_stable_features') not found from tuning cell.")
    # The tuning cell was never run in this kernel: stop rather than guess.
    raise ValueError("Stable features must be loaded or generated before daily processing.")
elif not current_stable_features:
    print("WARNING: Proceeding without a pre-defined stable feature set. Full feature selection might run if supported, or it might fail.")
    # Passing None will trigger the original selection logic inside select_features_daily
    feature_subset_to_use = None
else:
    print(f"Using {len(current_stable_features)} pre-tuned stable features for date {daily_target_date}.")
    feature_subset_to_use = current_stable_features


# --- Initialize FeatureSelector (if needed) ---
# Reuse the selector created earlier in the notebook; build one only when the
# name does not exist yet (at notebook level locals() is the global namespace).
if 'feature_selector' not in locals():
    feature_selection_config = { # Adapt based on your actual config structure
        'data': { 'base_dir': config['data']['base_dir'] },
        'feature_selection': config['feature_selection'],
        'output': { 'results_dir': config['output']['results_dir'] }
    }
    feature_selector = FeatureSelector(feature_selection_config, run_id=run_id, logger=logging.getLogger('FeatureSelectorDaily'))
    print("Initialized FeatureSelector for daily processing.")

# --- Run Daily Process using the Subset ---
print(f"\nRunning daily processing for {daily_target_date}...")
daily_results = feature_selector.select_features_daily(
    target_date_str=daily_target_date,
    target_col='PE_RATIO_RATIO_SIGNED_LOG', # Ensure this is correct
    feature_subset=feature_subset_to_use # Pass the loaded stable features
)

# --- Display Daily Results ---
if daily_results:
    print(f"\nDaily Processing Results for {daily_target_date}:")

    selected_features = daily_results.get('selected_features', [])
    print(f"\nFeatures used/selected ({len(selected_features)}):")
    # Print only the first 10 if the list is long
    for feature in selected_features[:10]:
        print(f"- {feature}")
    if len(selected_features) > 10:
        print(f"... and {len(selected_features) - 10} more.")

    # Importance scores might be None if feature_subset was used and importance wasn't recalculated
    importance_df = daily_results.get('importance_scores')
    if importance_df is not None and not importance_df.empty:
        print("\nFeature Importance Scores (for the subset used):")
        # sort_values needs a column *label*. Sort by 'mean_importance' when that
        # column exists, otherwise by the first column. (getattr on the frame
        # would hand back the column's data, which is not a valid sort key.)
        sort_column = 'mean_importance' if 'mean_importance' in importance_df.columns else importance_df.columns[0]
        importance_df = importance_df.sort_values(sort_column, ascending=False)
        print(importance_df.head(10))
    else:
        print("\nFeature Importance Scores: Not calculated or available for pre-selected subset.")

    # SHAP values might also be None (or an empty container)
    selected_shap_values = daily_results.get('selected_shap_values')
    if (
        selected_shap_values is not None
        and len(selected_shap_values) > 0
        and selected_shap_values[0] is not None
    ):
        print("\nSHAP Values Summary (for the subset used):")
        # Only label the columns when their count matches the feature list
        shap_columns = selected_features if len(selected_features) == selected_shap_values[0].shape[1] else None
        if shap_columns:
            shap_df = pd.DataFrame(selected_shap_values[0], columns=shap_columns)
            print(shap_df.describe())
        else:
            print("SHAP columns mismatch or features unavailable.")
    else:
        print("\nSHAP Values: Not calculated or available for pre-selected subset.")

    # Visualize feature importance if available
    if importance_df is not None and not importance_df.empty:
        try:
            feature_selector.visualize_feature_importance(daily_results)
        except Exception as e:
            # Plotting is optional; report the problem and carry on.
            print(f"Could not generate importance visualization: {e}")

    # Create summary report if needed
    # feature_selector.create_summary(daily_results, 'PE_RATIO_RATIO_SIGNED_LOG') # Adapt if needed

    # --- ADD YOUR DAILY REGRESSION LOGIC HERE ---
    # Use the 'daily_results' (specifically the data implicitly processed within
    # select_features_daily, or X_processed if returned) and the 'selected_features'
    # list to run your daily cross-sectional regression.
    print("\nPlaceholder for daily regression using the selected features...")
    # Example:
    # if 'X_processed' in daily_results and 'y_processed' in daily_results:
    #    X_daily_final = daily_results['X_processed'][selected_features]
    #    y_daily_final = daily_results['y_processed']
    #    # ... train/predict daily model ...
    #    residuals = ...
    # else:
    #    print("Need processed data returned from select_features_daily to run regression.")

else:
    print(f"Daily processing failed for {daily_target_date}")


In [None]:
# --- Cell 4: Feature Selection ---
target_date = '2024-09-02'

# Assemble the nested configuration handed to FeatureSelector: the root data
# directory, the fundamentals feature directory below it, the feature-selection
# settings from the main config, and a results directory under the data root.
feature_selection_config = dict(
    data=dict(
        base_dir=config['data']['base_dir'],  # This should be the root data directory
        features=dict(
            fundamentals=dict(
                base_dir=os.path.join(config['data']['base_dir'], 'features', 'fundamentals'),
            ),
        ),
    ),
    feature_selection=config['feature_selection'],
    output=dict(
        results_dir=os.path.join(config['data']['base_dir'], 'results', 'feature_selection'),
    ),
)

# Replace the notebook-wide selector with one built from the config above
feature_selector = FeatureSelector(
    feature_selection_config,
    run_id=run_id,
    logger=logging.getLogger('FeatureSelector'),
)

# Run the full daily selection (no pre-tuned subset) for the target date
results = feature_selector.select_features_daily(target_date, target_col='PE_RATIO_RATIO_SIGNED_LOG')

if not results:
    print("No results returned from feature selection")
else:
    # Selected feature names, one per line
    print(f"\nSelected {len(results['selected_features'])} features:")
    for feature in results['selected_features']:
        print(f"- {feature}")

    # Ten highest mean importance scores
    print("\nFeature Importance Scores:")
    importance_df = results['importance_scores'].sort_values('mean_importance', ascending=False)
    print(importance_df.head(10))

    # Descriptive statistics of the first SHAP value array, one column per selected feature
    print("\nSHAP Values Summary:")
    shap_df = pd.DataFrame(results['selected_shap_values'][0], columns=results['selected_features'])
    print(shap_df.describe())

    # Plot and summarise
    feature_selector.visualize_feature_importance(results)
    feature_selector.create_summary(results, 'PE_RATIO_RATIO_SIGNED_LOG')

In [None]:
# Analyse performance metrics using the function's defaults: the default target
# date and, because no run_id is given, the latest run directory found on disk.
run_performance_analysis()

In [7]:
# # Example: Run feature selection for a specific date
# date = "2024-03-15"
# results = run_feature_selection(date)

# # Example: Run feature selection with custom run ID
# custom_run_id = "my_experiment_1"
# results = run_feature_selection(date, run_id=custom_run_id)

# # Example: Analyze feature stability
# start_date = "2024-03-01"
# end_date = "2024-03-15"
# stability = analyze_feature_stability(start_date, end_date, custom_run_id)

In [None]:
# --- Cell 4: Feature Selection ---
# Same flow as the plain feature-selection cell, plus a per-fold and average
# cross-validation report.
# numpy, pandas, os, logging and datetime are imported once in the notebook's
# first cell rather than repeated here.

target_date = '2024-09-02'

# Initialize FeatureSelector with correct config structure
feature_selection_config = {
    'data': {
        'base_dir': config['data']['base_dir'],  # This should be the root data directory
        'features': {
            'fundamentals': {
                'base_dir': os.path.join(config['data']['base_dir'], 'features', 'fundamentals')
            }
        }
    },
    'feature_selection': config['feature_selection'],
    'output': {
        'results_dir': os.path.join(config['data']['base_dir'], 'results', 'feature_selection')
    }
}

# Initialize feature selector
feature_selector = FeatureSelector(feature_selection_config, run_id=run_id, logger=logging.getLogger('FeatureSelector'))

# Select features for target date
results = feature_selector.select_features_daily(target_date, target_col='PE_RATIO_RATIO_SIGNED_LOG')

# Display results
if results:
    print(f"\nSelected {len(results['selected_features'])} features:")
    for feature in results['selected_features']:
        print(f"- {feature}")

    # Display feature importance scores
    print("\nFeature Importance Scores:")
    importance_df = results['importance_scores']  # This is already a DataFrame
    importance_df = importance_df.sort_values('mean_importance', ascending=False)
    print(importance_df.head(10))

    # Display SHAP values summary
    print("\nSHAP Values Summary:")
    shap_df = pd.DataFrame(results['selected_shap_values'][0], columns=results['selected_features'])
    print(shap_df.describe())

    # Display cross-validation performance.
    # Treat a missing or empty 'cv_scores' entry as "no folds" instead of failing
    # with a KeyError or averaging an empty list (which yields NaN and a warning).
    cv_scores = results.get('cv_scores') or []
    print("\nCross-validation Performance:")
    print("-" * 30)
    for i, score in enumerate(cv_scores):
        print(f"Fold {i+1}:")
        print(f"  RMSE: {score['rmse']:.4f}")
        print(f"  R2: {score['r2']:.4f}")
        print(f"  Trees: {score['n_trees']}")

    print("\nAverage Performance:")
    print("-" * 30)
    if cv_scores:
        avg_rmse = np.mean([score['rmse'] for score in cv_scores])
        avg_r2 = np.mean([score['r2'] for score in cv_scores])
        print(f"Average RMSE: {avg_rmse:.4f}")
        print(f"Average R2: {avg_r2:.4f}")
    else:
        print("No cross-validation scores available.")

    # Visualize feature importance
    feature_selector.visualize_feature_importance(results)

    # Create summary
    feature_selector.create_summary(results, 'PE_RATIO_RATIO_SIGNED_LOG')
else:
    print("No results returned from feature selection")