## Data Transformation Trial-runs 

In [17]:
import os 

In [18]:
# Switch the working directory to the project checkout before anything else runs.
# The run log further down shows relative locations (config\config.yaml, params.yaml,
# artifacts/...), so the later cells presumably depend on this directory being current.
# NOTE(review): this is an absolute, machine-specific Windows path — change it when
# running the notebook anywhere else.
os.chdir(r"C:\Users\USER\Desktop\MLDefaults\Rising-Village-Prediction-Model")

## Trial-runs for entity_config file 

In [19]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable path settings for the data-transformation stage.

    Values that come out of a YAML file are plain strings, so every field is
    converted to ``Path`` on construction. This makes the ``Path`` annotations
    true at runtime regardless of whether the caller passed ``str`` or ``Path``.
    """

    root_dir: Path            # working directory of the transformation stage
    data_path: Path           # CSV file that the transformation stage reads
    processed_data_dir: Path  # directory that receives the processed CSV

    def __post_init__(self) -> None:
        """Coerce every field to ``Path``."""
        # frozen=True blocks ordinary attribute assignment, so the values are
        # written through object.__setattr__ during initialisation only.
        for field_name in ("root_dir", "data_path", "processed_data_dir"):
            object.__setattr__(self, field_name, Path(getattr(self, field_name)))

## Trial-runs for configuration_file

In [20]:
# Import the project path constants and the YAML / directory helpers used to build the configuration
from raisingVillage.constants import  *
from raisingVillage.utils.common import read_yaml, create_directories

In [21]:
# Configuration manager: loads the YAML files and builds the data-transformation config
class ConfigurationManager:
    """Read the project's YAML files and build per-stage configuration objects."""

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        selected_schema_filepath = SELECTED_SCHEMA_FILE_PATH):
        """
        Load the three YAML files and create the top-level artifacts directory.
        Args:
            config_filepath: Location of the main configuration YAML
            params_filepath: Location of the parameters YAML
            selected_schema_filepath: Location of the selected-schema YAML
        """
        # Loaded one after another, in the same order as the parameters.
        yaml_sources = (
            ("config", config_filepath),
            ("params", params_filepath),
            ("selected_schema", selected_schema_filepath),
        )
        for attribute, filepath in yaml_sources:
            setattr(self, attribute, read_yaml(filepath))

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """
        Build the configuration for the data-transformation stage.
        The stage's root directory is created as a side effect.
        Returns:
            DataTransformationConfig filled from the 'data_transformation' section
        """
        section = self.config.data_transformation
        create_directories([section.root_dir])

        return DataTransformationConfig(
            root_dir=section.root_dir,
            data_path=section.data_path,
            processed_data_dir=section.processed_data_dir,
        )

## Trial-runs for components 

In [22]:
import pandas as pd
import os
from pathlib import Path
from raisingVillage import logger

In [23]:
class DataTransformation:
    """
    Turn the raw dataset into a model-ready CSV: add a binary target,
    impute missing values with the column mode, and write the result to disk.
    """

    def __init__(self, config: DataTransformationConfig):
        """
        Read the input CSV eagerly so later steps work on an in-memory frame.
        Args:
            config: Paths for this stage (input CSV, output directory)
        """
        self.config = config
        self.df = pd.read_csv(self.config.data_path)
        self.processed_df = None  # populated by process_and_store_data()

    def create_binary_classes(self, df: pd.DataFrame, target_col: str = "HH Income + Production/Day (USD)") -> pd.DataFrame:
        """
        Convert continuous target to binary classes using median split
        (1 when strictly above the median, otherwise 0).

        The 'target_binary' column is added to ``df`` in place and the same
        object is returned. Rows whose target is missing compare as "not above
        the median" and therefore land in class 0; a warning is logged when
        that happens so it does not go unnoticed.
        Args:
            df: Input DataFrame
            target_col: Name of column to binarize
        Returns:
            DataFrame with new 'target_binary' column
        """
        target = df[target_col]
        median_val = target.median()

        missing_count = int(target.isna().sum())
        if missing_count:
            logger.warning(
                f"{missing_count} rows have no value in '{target_col}' and are assigned class 0"
            )

        df['target_binary'] = (target > median_val).astype(int)
        logger.info(f"Created binary classes (median threshold: {median_val:.2f})")
        return df

    def fill_missing_with_mode(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Fill all missing values using mode (most frequent value) for each column.
        Columns are updated in place on ``df``; a column with no non-missing
        value at all is filled with the string "Unknown".
        Args:
            df: Input DataFrame with missing values
        Returns:
            DataFrame with missing values filled
        """
        for col in df.columns:
            column = df[col]
            if not column.isnull().any():
                continue

            # Compute the mode once per column; it is empty only when the
            # column holds no non-missing value.
            modes = column.mode()
            mode_val = modes.iloc[0] if not modes.empty else "Unknown"

            df[col] = column.fillna(mode_val)
            logger.info(f"Filled missing values in {col} with mode: {mode_val}")
        return df

    def process_and_store_data(self) -> None:
        """
        Execute full processing pipeline and store results.

        Works on a copy of the frame loaded in ``__init__``, keeps the result
        in ``self.processed_df`` and writes it to
        ``<processed_data_dir>/processed_df.csv`` (without the index).
        """
        df = self.df.copy()  # leave the raw frame untouched
        # Step 1: binarize first, so the median threshold is computed from the
        # observed target values rather than from imputed ones.
        df = self.create_binary_classes(df)
        # Step 2: handle missing values in every column.
        df = self.fill_missing_with_mode(df)
        self.processed_df = df

        # Save to artifacts
        os.makedirs(self.config.processed_data_dir, exist_ok=True)
        save_path = Path(self.config.processed_data_dir) / "processed_df.csv"
        df.to_csv(save_path, index=False)
        logger.info(f"Processed data stored at: {save_path}")

## Trial-runs for pipeline

In [24]:
# Run the data-transformation stage end to end: build the configuration,
# load the data, process it and write the result.
try:
    config = ConfigurationManager()
    data_transformation_config = config.get_data_transformation_config()
    data_transformation = DataTransformation(config=data_transformation_config)
    data_transformation.process_and_store_data()
except Exception:
    # Record the failure (with traceback) through the project logger, then
    # re-raise the original exception unchanged.
    logger.exception("Data transformation stage failed")
    raise

[2025-05-29 08:08:56,527: INFO: common: yaml_file: config\config.yaml loaded successfully]
[2025-05-29 08:08:56,542: INFO: common: yaml_file: params.yaml loaded successfully]
[2025-05-29 08:08:56,549: INFO: common: yaml_file: selected_schema.yaml loaded successfully]
[2025-05-29 08:08:56,549: INFO: common: Created directory at: artifacts]
[2025-05-29 08:08:56,565: INFO: common: Created directory at: artifacts/data_transformation]
[2025-05-29 08:08:56,604: INFO: 590413929: Created binary classes (median threshold: 2.26)]
[2025-05-29 08:08:56,612: INFO: 590413929: Filled missing values in most_recommend_rtv_program with mode: 1.0]
[2025-05-29 08:08:56,620: INFO: 590413929: Filled missing values in least_recommend_rtv_program with mode: 99.0]
[2025-05-29 08:08:56,628: INFO: 590413929: Filled missing values in most_recommend_rtv_program_reason with mode: It has greatly influencedÂ  our community in improving agricultural skills and other methods of farming,Â  I highly recommend it to move 