## Trial-runs for data processing step

In [25]:
import os 

In [26]:
# Change the working directory so that relative paths used later in this
# notebook resolve from this folder.
# NOTE(review): presumably the project root; the absolute path is specific
# to one machine and must be adjusted when run elsewhere.
os.chdir(r"C:\Users\Junior\OneDrive\Desktop\Heart-Attack-Prediction-Model")

## Trial-runs for entity_config 

In [27]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataProcessingConfig:
    """Immutable settings for the data processing step.

    Instances are built by ``ConfigurationManager.get_data_processing_config``
    and consumed by ``DataProcessing``.
    """

    # Output directory of this step; created when the config is built.
    root_dir: Path
    # Path handed to ``pd.read_csv`` as the input dataset (despite the
    # ``_dir`` suffix it is read as a CSV file).
    unzip_data_dir: Path
    # CSV file the selected columns are written to and later re-read from.
    selected_data_file: Path 
    # Text file the column-validation report is written to.
    validation_report: Path
    # Mapping whose keys are the names of the columns to keep.
    all_schema: dict
    # Target column taken from ``TARGET_COLUMN`` in the selected schema.
    target_column: str

## Trial-runs for configuration file

In [28]:
# Import the project path constants and the helpers needed to load the configuration
from heartAttack.constants import  *
from heartAttack.utils.common import read_yaml, create_directories

In [29]:
# Configuration manager: reads the YAML files and builds the data-processing config
class ConfigurationManager:
    """Read the project YAML files and build per-step configuration objects.

    The constructor loads the main config, the params file and the selected
    schema with ``read_yaml`` and creates the artifacts root directory.
    """

    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH,
        selected_schema_filepath=SELECTED_SCHEMA_FILE_PATH,
    ):
        """Load the three YAML files and create ``config.artifacts_root``.

        Args:
            config_filepath: Path of the main configuration YAML.
            params_filepath: Path of the parameters YAML.
            selected_schema_filepath: Path of the selected-schema YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.selected_schema = read_yaml(selected_schema_filepath)

        create_directories([self.config.artifacts_root])

    def get_data_processing_config(self) -> DataProcessingConfig:
        """Build the ``DataProcessingConfig`` for the data processing step.

        Creates ``data_processing.root_dir`` as a side effect.

        Returns:
            A ``DataProcessingConfig`` populated from the ``data_processing``
            and ``data_validation`` sections of the config and from the
            ``COLUMNS`` / ``TARGET_COLUMN`` entries of the selected schema.

        Raises:
            ValueError: If the selected schema has no ``TARGET_COLUMN``, or
                if ``TARGET_COLUMN`` is a mapping without a ``name`` entry.
        """
        config = self.config.data_processing
        data_validation_config = self.config.data_validation
        selected_schema = self.selected_schema.COLUMNS

        # Get target column from selected_schema
        target_column = getattr(self.selected_schema, 'TARGET_COLUMN', None)
        if target_column is None:
            raise ValueError("Target column not specified in selected schema")

        # The schema may declare the target as a mapping such as
        # ``{'name': 'heart_attack'}``; DataProcessingConfig.target_column is
        # declared as ``str``, so hand over the column name itself.
        if isinstance(target_column, dict):
            if not target_column.get("name"):
                raise ValueError(
                    "TARGET_COLUMN in selected schema has no 'name' entry"
                )
            target_column = target_column["name"]

        create_directories([config.root_dir])

        return DataProcessingConfig(
            root_dir=Path(config.root_dir),
            unzip_data_dir=Path(data_validation_config.unzip_data_dir),
            selected_data_file=Path(config.selected_data_file),
            validation_report=Path(config.validation_report),
            all_schema=selected_schema,
            target_column=target_column,
        )

## Trial-runs for data_processing_component

In [30]:
import os 
from heartAttack import logger
import pandas as pd 

In [31]:
class DataProcessing:
    """Select the schema columns from the input dataset and validate the result."""

    def __init__(self, config: DataProcessingConfig):
        """Store the configuration used by both processing methods."""
        self.config = config

    def extract_and_save_features(self) -> pd.DataFrame:
        """Load the input CSV, keep only the schema columns, and save them.

        The input is read from ``config.unzip_data_dir``; the columns named by
        the keys of ``config.all_schema`` are written (without the index) to
        ``config.selected_data_file``, whose parent directory is created if
        needed.

        Returns:
            The DataFrame containing only the selected columns.

        Raises:
            ValueError: If any schema column is absent from the input data.
            Exception: Any other error is logged and re-raised unchanged.
        """
        try:
            df = pd.read_csv(self.config.unzip_data_dir)
            selected_columns = list(self.config.all_schema.keys())

            # Fail early with the full list of absent columns rather than
            # letting pandas raise a less specific KeyError on selection.
            if missing := [col for col in selected_columns if col not in df.columns]:
                raise ValueError(f"Missing columns: {missing}")

            selected_df = df[selected_columns].copy()
            self.config.selected_data_file.parent.mkdir(parents=True, exist_ok=True)
            selected_df.to_csv(self.config.selected_data_file, index=False)

            # Log results
            logger.info(f"Saved {len(selected_columns)} features to {self.config.selected_data_file}")
            logger.info(f"Target column: {self.config.target_column}")

            return selected_df

        except Exception as e:
            logger.error(f"Feature extraction failed: {e}")
            raise

    def validate_all_columns(self) -> bool:
        """Check that the saved file's columns match the schema exactly.

        Validation passes only when every schema column is present in
        ``config.selected_data_file`` and the file has no column outside the
        schema. A text report listing the status, both column sets and their
        differences is written to ``config.validation_report``.

        Returns:
            ``True`` if the column sets are identical, ``False`` otherwise.

        Raises:
            Exception: Any error while reading or writing is logged and
                re-raised unchanged.
        """
        try:
            # Only the header is needed, so skip parsing the data rows.
            header = pd.read_csv(self.config.selected_data_file, nrows=0)
            schema_cols = set(self.config.all_schema.keys())
            data_cols = set(header.columns)

            # A subset test alone would accept a file that lacks schema
            # columns, so check both directions.
            missing_cols = schema_cols - data_cols
            unexpected_cols = data_cols - schema_cols
            validation_status = not missing_cols and not unexpected_cols

            report_content = (
                f"Validation status: {validation_status}\n"
                f"Data columns: {sorted(data_cols)}\n"
                f"Schema columns: {sorted(schema_cols)}\n"
                f"Missing columns: {sorted(missing_cols)}\n"
                f"Unexpected columns: {sorted(unexpected_cols)}"
            )

            self.config.validation_report.parent.mkdir(exist_ok=True, parents=True)
            # Explicit encoding keeps the report writable when column names
            # contain characters outside the platform's default code page.
            self.config.validation_report.write_text(report_content, encoding="utf-8")

            logger.info(f"Validation {'passed' if validation_status else 'failed'}")
            return validation_status

        except Exception as e:
            logger.error(f"Validation failed: {e}")
            raise

## Trial-runs for data_processing_pipeline

In [32]:
# Run the data processing step end to end: build the configuration, extract
# and save the selected features, then validate the saved file's columns.
try:
    config = ConfigurationManager()
    data_processing_config = config.get_data_processing_config()
    data_processing = DataProcessing(config=data_processing_config)
    data_processing.extract_and_save_features()
    data_processing.validate_all_columns()
except Exception as e:
    # Record the failure with its traceback, then re-raise with a bare
    # ``raise`` so the original traceback is propagated unchanged.
    logger.exception(e)
    raise

[2025-07-29 22:04:55,994: INFO: common: yaml_file: config\config.yaml loaded successfully]
[2025-07-29 22:04:55,998: INFO: common: yaml_file: params.yaml loaded successfully]
[2025-07-29 22:04:56,004: INFO: common: yaml_file: selected_schema.yaml loaded successfully]
[2025-07-29 22:04:56,006: INFO: common: Created directory at: artifacts]
[2025-07-29 22:04:56,007: INFO: common: Created directory at: artifacts/data_processing]
[2025-07-29 22:04:56,025: INFO: 404485137: Saved 10 features to artifacts\data_processing\selected_features.csv]
[2025-07-29 22:04:56,030: INFO: 404485137: Target column: {'name': 'heart_attack'}]
[2025-07-29 22:04:56,063: INFO: 404485137: Validation passed]
