# DECOHERE Quantitative Trading Pipeline - Updated Version

This notebook implements the updated pipeline structure with efficient data storage and processing.

## Pipeline Components
1. Data Loading and Storage
2. Data Processing
3. Feature Generation
4. Feature Selection
5. Model Training and Evaluation

## Pipeline Modes
- Day Mode: Process a single day of data
- Week Mode: Process one week of data
- Year Mode: Process one year of data

In [1]:
import os
import sys
import pandas as pd
import numpy as np
import yaml
import logging
from datetime import datetime, timedelta
from pathlib import Path

# Add project root to Python path
project_root = str(Path.cwd().parent)
if project_root not in sys.path:
    sys.path.append(project_root)

# Import project modules
from src.data.efficient_data_storage import EfficientDataStorage, DataType, DataStage
from src.data.data_processor import DataProcessor
from src.features.feature_generator import FeatureGenerator
from src.features.feature_selector import FeatureSelector
from src.models.model_trainer import ModelTrainer
from src.utils.logging_config import setup_logging
from src.utils.config_loader import load_config, load_mode_config

## 1. Setup and Configuration

In [3]:
def setup_pipeline():
    """Build the shared pipeline objects.

    Loads the configuration, sets up logging from it, and then constructs
    the storage layer and the data processor, both of which receive the
    configuration and the logger.

    Returns:
        tuple: ``(config, logger, storage, processor)`` in that order.
    """
    pipeline_config = load_config()

    # The logger is created first so the remaining setup steps can be traced.
    pipeline_logger = setup_logging(pipeline_config)
    pipeline_logger.info("Pipeline initialization started")

    # Storage is constructed before the processor, both from the same
    # configuration and logger.
    data_storage = EfficientDataStorage(pipeline_config, pipeline_logger)
    data_processor = DataProcessor(pipeline_config, pipeline_logger)

    pipeline_logger.info("Pipeline initialization completed")
    return pipeline_config, pipeline_logger, data_storage, data_processor

# Initialize pipeline
# The four objects are bound at module level; the functions defined in the
# later cells read config, logger, storage and processor as globals.
config, logger, storage, processor = setup_pipeline()

FileNotFoundError: Configuration file not found: config.yaml

## 2. Data Loading and Processing

In [None]:
def load_and_process_data(date_str: str, mode: str = 'day'):
    """Load raw data for a date, process the fundamentals, and store the results.

    Raw fundamentals and returns are loaded from storage. The fundamentals
    are run through ``processor.process_fundamentals`` (stored at the
    INTERMEDIATE stage) and then ``processor.prepare_for_features`` (stored
    at the PROCESSED stage).

    Args:
        date_str: Date to load, passed through to the storage layer as-is.
        mode: Pipeline mode forwarded to ``storage.load_data``.

    Returns:
        A 3-tuple ``(processed_fundamentals, intermediate_data, returns_df)``.
        When no fundamentals are found or any step raises, the tuple is
        ``(None, None, None)`` so that callers can always unpack the result
        and test the first element for ``None``.
    """
    logger.info(f"Loading and processing data for date: {date_str} in {mode} mode")

    # Returned on every failure path; keeps the arity equal to the success
    # path so tuple-unpacking callers do not raise TypeError.
    failure = (None, None, None)

    try:
        # Load fundamentals data
        fundamentals_df = storage.load_data(
            data_type=DataType.FUNDAMENTALS,
            stage=DataStage.RAW,
            date=date_str,
            mode=mode
        )

        if fundamentals_df.empty:
            logger.error(f"No fundamentals data found for date: {date_str}")
            return failure

        # Load returns data
        returns_df = storage.load_data(
            data_type=DataType.RETURNS,
            stage=DataStage.RAW,
            date=date_str,
            mode=mode
        )

        # Process fundamentals data - store intermediate results
        intermediate_data = processor.process_fundamentals(fundamentals_df)

        # NOTE(review): mode is forwarded to load_data but not to store_data
        # in either call below - confirm this is intended.
        storage.store_data(
            df=intermediate_data,
            data_type=DataType.FUNDAMENTALS,
            stage=DataStage.INTERMEDIATE,
            date=date_str
        )

        # Further process for final processed data
        processed_fundamentals = processor.prepare_for_features(intermediate_data)

        # Store processed data
        storage.store_data(
            df=processed_fundamentals,
            data_type=DataType.FUNDAMENTALS,
            stage=DataStage.PROCESSED,
            date=date_str
        )

        return processed_fundamentals, intermediate_data, returns_df

    except Exception as e:
        # Best-effort boundary: log the failure and report it through the
        # return value rather than propagating.
        logger.error(f"Error in data loading and processing: {e}")
        return failure

# Example usage
date_str = datetime.now().strftime('%Y-%m-%d')

# A failed load may come back as a bare None rather than a 3-tuple; normalise
# it so the unpacking below cannot raise TypeError.
load_result = load_and_process_data(date_str, mode='day')
if load_result is None:
    load_result = (None, None, None)
processed_data, intermediate_data, returns_data = load_result

if processed_data is not None:
    print(f"Processed data shape: {processed_data.shape}")
    print(f"Intermediate data shape: {intermediate_data.shape}")
    # Guard against a missing returns frame before touching .empty / .shape.
    has_returns = returns_data is not None and not returns_data.empty
    print(f"Returns data shape: {returns_data.shape if has_returns else 'No returns data'}")

## 3. Feature Generation

In [None]:
def generate_features(intermediate_data: pd.DataFrame, processed_data: pd.DataFrame, date_str: str):
    """Build the feature frame for one date and persist it.

    A ``FeatureGenerator`` is created from the module-level ``config`` and
    ``logger``, fed both the intermediate and the processed data, and its
    output is written to storage at the FEATURES stage.

    Args:
        intermediate_data: Intermediate-stage data passed to the generator.
        processed_data: Processed-stage data passed to the generator.
        date_str: Date under which the features are stored.

    Returns:
        The generated features, or ``None`` if any step raised.
    """
    logger.info(f"Generating features for date: {date_str}")

    try:
        generator = FeatureGenerator(config, logger)
        generated = generator.generate_features(intermediate_data, processed_data)

        # Persist before returning so a storage failure is reported as None.
        storage.store_data(
            df=generated,
            data_type=DataType.FUNDAMENTALS,
            stage=DataStage.FEATURES,
            date=date_str
        )
    except Exception as exc:
        logger.error(f"Error in feature generation: {exc}")
        return None

    return generated

# Generate features
# Bind features_df up front so the later cells can test it for None even when
# this step is skipped; otherwise they would hit a NameError.
features_df = None
if processed_data is not None and intermediate_data is not None:
    features_df = generate_features(intermediate_data, processed_data, date_str)
    if features_df is not None:
        print(f"Generated features shape: {features_df.shape}")

## 4. Feature Selection

In [None]:
def select_features(features_df: pd.DataFrame, returns_df: pd.DataFrame):
    """Run feature selection over the generated features.

    Delegates to ``FeatureSelector.select_features``, built from the
    module-level ``config`` and ``logger``.

    Args:
        features_df: Generated features.
        returns_df: Returns data handed to the selector alongside the features.

    Returns:
        Whatever the selector returns, or ``None`` if it raised.
    """
    logger.info("Starting feature selection")

    try:
        feature_selector = FeatureSelector(config, logger)
        return feature_selector.select_features(features_df, returns_df)
    except Exception as exc:
        logger.error(f"Error in feature selection: {exc}")
        return None

# Select features
# The returns frame loaded in the data-loading cell is bound to returns_data
# (there is no module-level returns_df). selected_features is pre-bound so the
# training cell can test it even when selection is skipped.
selected_features = None
if features_df is not None and returns_data is not None and not returns_data.empty:
    selected_features = select_features(features_df, returns_data)
    if selected_features is not None:
        print(f"Selected {len(selected_features)} features")

## 5. Model Training and Evaluation

In [None]:
def train_and_evaluate_model(features_df: pd.DataFrame, returns_df: pd.DataFrame, selected_features: list):
    """Fit a model on the selected features and evaluate it.

    A ``ModelTrainer`` built from the module-level ``config`` and ``logger``
    trains on the features, returns, and selected feature list, then
    evaluates the fitted model against the same features and returns.

    Args:
        features_df: Generated features.
        returns_df: Returns data used for training and evaluation.
        selected_features: Features chosen by the selection step.

    Returns:
        ``(model, evaluation_results)``, or ``(None, None)`` if any step raised.
    """
    logger.info("Starting model training and evaluation")

    try:
        model_trainer = ModelTrainer(config, logger)
        fitted_model = model_trainer.train_model(features_df, returns_df, selected_features)
        metrics = model_trainer.evaluate_model(fitted_model, features_df, returns_df)
    except Exception as exc:
        logger.error(f"Error in model training and evaluation: {exc}")
        return None, None

    return fitted_model, metrics

# Train and evaluate model
# Pass returns_data, the name the data-loading cell binds; returns_df is not
# defined at module level. model/results are pre-bound so they exist even
# when training is skipped.
model, results = None, None
if selected_features is not None:
    model, results = train_and_evaluate_model(features_df, returns_data, selected_features)
    if model is not None and results is not None:
        print("Model training and evaluation completed successfully")

## 6. Run Complete Pipeline

In [None]:
def run_pipeline(date_str: str, mode: str = 'day'):
    """Run the complete pipeline for a specific date and mode.

    Chains the four stages - load/process, feature generation, feature
    selection, and model training/evaluation - stopping at the first stage
    that reports failure.

    Args:
        date_str: Date to run the pipeline for.
        mode: Pipeline mode forwarded to ``load_and_process_data``.

    Returns:
        ``True`` if every stage succeeded, ``False`` otherwise (including
        when an unexpected exception is raised).
    """
    logger.info(f"Starting pipeline run for date: {date_str} in {mode} mode")

    try:
        # 1. Load and process data
        # The loader yields three values (processed, intermediate, returns);
        # a failed load may instead come back as a bare None, so check before
        # unpacking.
        loaded = load_and_process_data(date_str, mode)
        if loaded is None:
            return False
        processed_data, intermediate_data, returns_data = loaded
        if processed_data is None:
            return False

        # 2. Generate features
        # generate_features takes (intermediate_data, processed_data, date_str).
        features_df = generate_features(intermediate_data, processed_data, date_str)
        if features_df is None:
            return False

        # 3. Select features
        selected_features = select_features(features_df, returns_data)
        if selected_features is None:
            return False

        # 4. Train and evaluate model
        model, results = train_and_evaluate_model(features_df, returns_data, selected_features)
        if model is None or results is None:
            return False

        logger.info("Pipeline run completed successfully")
        return True

    except Exception as e:
        # Top-level boundary: report failure through the return value.
        logger.error(f"Error in pipeline run: {e}")
        return False

# Run pipeline
# Report the outcome of the full run for the current date in day mode.
success = run_pipeline(date_str, mode='day')
print("Pipeline completed successfully" if success else "Pipeline failed")