## Data Processing Step Trial-runs 

In [1]:
import os 

In [2]:
# Switch the working directory so the relative paths used by the cells below
# (e.g. config\config.yaml and artifacts/, as seen in the logged output) resolve.
# NOTE(review): this is an absolute, machine-specific Windows path -- presumably
# the local checkout of the repository; adjust it before running elsewhere.
os.chdir(r"C:\Users\USER\Desktop\MLDefaults\Rising-Village-Prediction-Model")

## Trial-runs for entity_config file 

In [3]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataProcessingConfig:
    """Immutable settings for the data-processing step.

    Instances are built by ``ConfigurationManager.get_data_processing_config``
    and consumed by ``DataProcessing``.
    """

    # Directory for this step's artifacts; created when the config is built.
    root_dir: Path
    # Handed directly to ``pd.read_csv`` as the raw input dataset.
    # NOTE(review): despite the "_dir" suffix this is read as a CSV file path.
    unzip_data_dir: Path
    # CSV that receives the selected columns and is re-read for validation.
    selected_data_file: Path
    # Text report written by validation. Declared as ``Path`` (not ``str``)
    # because consumers call ``.parent.mkdir`` and ``.write_text`` on it.
    validation_report: Path
    # Schema mapping; its keys are the column names to keep.
    all_schema: dict
    # Target entry taken from the schema's ``TARGET`` key.
    # NOTE(review): the recorded run logs a ``{column: dtype}`` mapping here,
    # not a plain string -- confirm the intended type with downstream users.
    target_column: str

## Trial-runs for ConfigurationManager file 

In [4]:
#importing all project paths and modules necessary for project configurations 
from raisingVillage.constants import  *
from raisingVillage.utils.common import read_yaml, create_directories

In [5]:
#Updating the configuration file 
class ConfigurationManager:
    """Load the project's YAML files and build per-step configuration objects.

    The constructor reads the main config, the params file and the
    selected-schema file with ``read_yaml`` and makes sure the top-level
    artifacts directory exists.
    """

    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH,
        selected_schema_filepath=SELECTED_SCHEMA_FILE_PATH,
    ):
        """Read the three YAML files and create ``artifacts_root``.

        Args:
            config_filepath: Location of the main configuration YAML.
            params_filepath: Location of the parameters YAML.
            selected_schema_filepath: Location of the selected-schema YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.selected_schema = read_yaml(selected_schema_filepath)

        create_directories([self.config.artifacts_root])

    def get_data_processing_config(self) -> DataProcessingConfig:
        """Assemble the settings for the data-processing step.

        The input dataset location is taken from the ``data_validation``
        section, everything else from ``data_processing``; the schema's
        ``COLUMNS`` and ``TARGET`` entries come from the selected-schema file.
        The step's ``root_dir`` is created as a side effect.

        Returns:
            A populated ``DataProcessingConfig``.

        Raises:
            ValueError: If the schema has no ``TARGET`` entry.
        """
        step_config = self.config.data_processing
        validation_config = self.config.data_validation
        schema_columns = self.selected_schema.COLUMNS

        # getattr with a default turns a missing TARGET key into the explicit
        # ValueError below instead of an attribute lookup failure.
        target_column = getattr(self.selected_schema, 'TARGET', None)
        if target_column is None:
            raise ValueError("Target column not specified in schema")

        create_directories([step_config.root_dir])

        # Every path-like field is wrapped in Path so the values agree with
        # the types DataProcessingConfig declares (previously only two were).
        return DataProcessingConfig(
            root_dir=Path(step_config.root_dir),
            unzip_data_dir=Path(validation_config.unzip_data_dir),
            selected_data_file=Path(step_config.selected_data_file),
            validation_report=Path(step_config.validation_report),
            all_schema=schema_columns,
            target_column=target_column,
        )

## Trial-runs for components file

In [6]:
import os 
from raisingVillage import logger
import pandas as pd 

In [7]:
class DataProcessing:
    """Select the schema's columns from the input dataset and validate them.

    All paths and the schema mapping are read from the ``DataProcessingConfig``
    passed to the constructor.
    """

    def __init__(self, config: DataProcessingConfig):
        """Store the step configuration."""
        self.config = config

    def extract_and_save_features(self) -> pd.DataFrame:
        """Load the input CSV, keep the schema columns, and save them.

        Returns:
            A copy of the input restricted to the schema's columns, in schema
            order.

        Raises:
            ValueError: If any schema column is absent from the input CSV.
            Exception: Any other failure is logged and re-raised unchanged.
        """
        try:
            # NOTE(review): the recorded run shows a pandas warning pointing
            # at this call; if it is a mixed-dtype warning, consider passing
            # explicit dtypes -- confirm against the real dataset.
            df = pd.read_csv(self.config.unzip_data_dir)
            selected_columns = list(self.config.all_schema.keys())

            # Fail early with the full list of absent columns.
            if missing := [col for col in selected_columns if col not in df.columns]:
                raise ValueError(f"Missing columns: {missing}")

            selected_df = df[selected_columns].copy()

            # Path() keeps this working when the config carries plain strings.
            output_path = Path(self.config.selected_data_file)
            output_path.parent.mkdir(parents=True, exist_ok=True)
            selected_df.to_csv(output_path, index=False)

            logger.info(f"Saved {len(selected_columns)} features to {self.config.selected_data_file}")
            if hasattr(self.config, 'target_column'):
                logger.info(f"Target column: {self.config.target_column}")

            return selected_df

        except Exception as e:
            logger.error(f"Feature extraction failed: {e}")
            raise

    def validate_all_columns(self) -> bool:
        """Check that the saved file's columns match the schema exactly.

        A text report listing the status, both column sets, and any missing or
        unexpected columns is written to ``config.validation_report``.

        Returns:
            True when the saved file has every schema column and no others.

        Raises:
            Exception: Any failure while reading or writing is logged and
                re-raised unchanged.
        """
        try:
            # Only the header row is needed to compare column names.
            data = pd.read_csv(self.config.selected_data_file, nrows=0)
            schema_cols = set(self.config.all_schema.keys())
            data_cols = set(data.columns)

            # A subset test alone would pass a file that lacks schema columns,
            # so differences are checked in both directions.
            missing = sorted(schema_cols - data_cols)
            unexpected = sorted(data_cols - schema_cols)
            validation_status = not missing and not unexpected

            report_content = (
                f"Validation status: {validation_status}\n"
                f"Data columns: {sorted(data_cols)}\n"
                f"Schema columns: {sorted(schema_cols)}\n"
                f"Missing columns: {missing}\n"
                f"Unexpected columns: {unexpected}"
            )

            report_path = Path(self.config.validation_report)
            report_path.parent.mkdir(exist_ok=True, parents=True)
            # Explicit encoding: column names may contain characters the
            # platform's default codec cannot represent.
            report_path.write_text(report_content, encoding="utf-8")

            logger.info(f"Validation {'passed' if validation_status else 'failed'}")
            return validation_status

        except Exception as e:
            logger.error(f"Validation failed: {e}")
            raise

## Trial-runs for pipeline

In [8]:
# End-to-end trial run: build the config, extract the selected features,
# then validate the saved file's columns against the schema.
try:
    config = ConfigurationManager()
    data_processing_config = config.get_data_processing_config()
    data_processing = DataProcessing(config=data_processing_config)
    data_processing.extract_and_save_features()
    data_processing.validate_all_columns()
except Exception:
    # Record the traceback, then re-raise with a bare ``raise`` so the
    # original exception and traceback propagate unchanged.
    logger.exception("Data processing trial run failed")
    raise

[2025-05-28 18:35:02,466: INFO: common: yaml_file: config\config.yaml loaded successfully]
[2025-05-28 18:35:02,532: INFO: common: yaml_file: params.yaml loaded successfully]
[2025-05-28 18:35:02,559: INFO: common: yaml_file: selected_schema.yaml loaded successfully]
[2025-05-28 18:35:02,561: INFO: common: Created directory at: artifacts]
[2025-05-28 18:35:02,561: INFO: common: Created directory at: artifacts/data_processing]


  df = pd.read_csv(self.config.unzip_data_dir)


[2025-05-28 18:35:05,119: INFO: 404485137: Saved 5 features to artifacts\data_processing\selected_features.csv]
[2025-05-28 18:35:05,119: INFO: 404485137: Target column: {'HH Income + Production/Day (USD)': 'float64'}]
[2025-05-28 18:35:05,304: INFO: 404485137: Validation passed]
