## Model Training Trial-runs

In [150]:
import os

In [151]:
# Switch the working directory so that the relative paths used further down
# (e.g. the config YAML and the "artifacts" folder) resolve against it.
# NOTE(review): this is a machine-specific absolute Windows path, presumably
# the project root -- adjust it before running the notebook elsewhere.
os.chdir(r"C:\Users\USER\Desktop\MLDefaults\Rising-Village-Prediction-Model")

## Trial-runs for entity_config file

In [152]:
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, Any

@dataclass(frozen=True)
class ModelTrainingConfig:
    """Immutable bundle of settings consumed by the model-training stage.

    Instances are frozen: assigning to a field after construction raises.
    Field order is part of the constructor signature, so it must not change.
    """

    # Directory the trained model file is written into.
    root_dir: Path
    # CSV file holding the training split.
    train_set_path: Path
    # CSV file holding the test split.
    test_set_path: Path
    # File name of the persisted model, joined onto ``root_dir``.
    model_name: str
    # Keyword arguments forwarded to ``TfidfVectorizer``.
    tfidf_params: Dict[str, Any]
    # Keyword arguments forwarded to ``GradientBoostingClassifier``.
    gb_params: Dict[str, Any]
    # Value read from the ``TARGET`` entry of the selected schema.
    target_column: str

## Trial-runs for the ConfigurationManager

In [153]:
#importing all project paths and modules necessary for project configurations 
from raisingVillage.constants import  *
from raisingVillage.utils.common import read_yaml, create_directories


In [154]:
#Updating the configuration file 
class ConfigurationManager:
    """Read the project YAML files and build per-stage configuration objects.

    Constructing a manager loads the config, params and selected-schema YAML
    files through ``read_yaml`` and creates the artifacts root directory.
    """

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        selected_schema_filepath = SELECTED_SCHEMA_FILE_PATH):
        """Load the three YAML files and create the artifacts root.

        Args:
            config_filepath: Path of the main configuration YAML.
            params_filepath: Path of the hyper-parameter YAML.
            selected_schema_filepath: Path of the selected-schema YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.selected_schema = read_yaml(selected_schema_filepath)

        create_directories([self.config.artifacts_root])

    def get_model_training_config(self) -> ModelTrainingConfig:
        """Assemble the ``ModelTrainingConfig`` for the training stage.

        Combines the ``model_training`` and ``data_splitting`` sections of the
        config file, the ``model_training`` section of the params file and the
        ``TARGET`` entry of the selected schema. The stage's root directory is
        created as a side effect.

        Returns:
            A frozen ``ModelTrainingConfig`` whose filesystem fields are
            ``Path`` objects, matching the dataclass annotations.
        """
        stage = self.config.model_training
        splits = self.config.data_splitting
        hyperparams = self.params.model_training
        tfidf_params = hyperparams.TfidfVectorizer
        gb_params = hyperparams.GradientBoostingClassifier

        create_directories([stage.root_dir])

        return ModelTrainingConfig(
            # The YAML values are wrapped so the fields really are ``Path``s,
            # as the dataclass declares.
            root_dir=Path(stage.root_dir),
            train_set_path=Path(splits.train_set_path),
            test_set_path=Path(splits.test_set_path),
            model_name=stage.model_name,
            tfidf_params=tfidf_params,
            gb_params=gb_params,
            target_column=self.selected_schema.TARGET,
        )

## Trial-runs for components 

In [155]:
import pandas as pd 
import os 
import joblib
from raisingVillage import logger
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import difflib

In [156]:
class ModelTrainer:
    """Fit the text + numeric gradient-boosting pipeline and save it to disk."""

    # NOTE(review): the label column is hard-coded; ``config.target_column``
    # is not consulted here -- confirm which of the two should be authoritative.
    _TARGET_COLUMN = 'target_binary'
    # Column removed from the features alongside the target.
    # NOTE(review): presumably the raw value the binary target is derived
    # from -- confirm.
    _EXCLUDED_FEATURE = 'HH_Income_+_Production/Day_(USD)'

    def __init__(self, config: "ModelTrainingConfig"):
        """Store the training configuration.

        Args:
            config: Settings providing the data paths, output location,
                model file name and estimator parameters.
        """
        self.config = config

    def train(self):
        """Train the pipeline on the training CSV and persist it with joblib.

        Column names are normalised (surrounding whitespace stripped, spaces
        replaced by underscores) before the target column is looked up.

        Returns:
            The fitted ``Pipeline``.

        Raises:
            ValueError: If the target column is missing from the training
                data, or the TF-IDF parameters are malformed.
            Exception: Any other failure is logged and re-raised unchanged.
        """
        try:
            # Only the training split is needed to fit the model; the test
            # split is not read here because nothing in this method uses it.
            train_data = pd.read_csv(self.config.train_set_path)
            train_data.columns = train_data.columns.str.strip().str.replace(' ', '_')

            target_column = self._TARGET_COLUMN
            if target_column not in train_data.columns:
                raise ValueError(
                    f"Target column '{target_column}' not found. Available columns: {list(train_data.columns)}"
                )

            train_x = train_data.drop(columns=[target_column, self._EXCLUDED_FEATURE])
            train_y = train_data[target_column]

            pipeline = self._build_pipeline()
            pipeline.fit(train_x, train_y)

            os.makedirs(self.config.root_dir, exist_ok=True)
            joblib.dump(pipeline, os.path.join(self.config.root_dir, self.config.model_name))

            return pipeline

        except Exception as e:
            # Record the failure (with traceback) in the project log, then let
            # the original exception propagate to the caller.
            logger.exception("Error during training: %s", e)
            raise

    def _build_pipeline(self):
        """Create the unfitted preprocessing + classifier pipeline."""
        tfidf_params = self._validate_tfidf_params(self.config.tfidf_params)

        # Each free-text column gets its own TF-IDF vocabulary; the two
        # numeric columns are standardised; every other column is passed
        # through untouched.
        preprocessor = ColumnTransformer(
            transformers=[
                ('text1', TfidfVectorizer(**tfidf_params), 'most_recommend_rtv_program_reason'),
                ('text2', TfidfVectorizer(**tfidf_params), 'least_recommend_rtv_program_reason'),
                ('num', StandardScaler(), ['most_recommend_rtv_program', 'least_recommend_rtv_program'])
            ],
            remainder='passthrough'
        )

        return Pipeline([
            ('preprocessor', preprocessor),
            ('classifier', GradientBoostingClassifier(**self.config.gb_params))
        ])

    def _validate_tfidf_params(self, params):
        """Return a copy of ``params`` with ``ngram_range`` normalised to a tuple.

        ``ngram_range`` may arrive as a tuple, as a list (the form a YAML
        sequence is loaded as), or as a string literal such as ``"(1, 2)"``.
        The input mapping is never modified.

        Args:
            params: Mapping of ``TfidfVectorizer`` keyword arguments.

        Returns:
            A plain ``dict`` ready to be unpacked into ``TfidfVectorizer``.

        Raises:
            ValueError: If ``ngram_range`` is present but cannot be turned
                into a tuple.
        """
        # Local import keeps this block self-contained.
        import ast

        validated_params = dict(params) if params else {}
        if 'ngram_range' not in validated_params:
            return validated_params

        ngram_range = validated_params['ngram_range']

        if isinstance(ngram_range, str):
            # literal_eval only accepts Python literals, so a config string
            # can never execute arbitrary code (unlike eval).
            try:
                ngram_range = ast.literal_eval(ngram_range)
            except (ValueError, SyntaxError) as err:
                raise ValueError(
                    f"ngram_range string {ngram_range!r} is not a valid literal, e.g. \"(1, 2)\""
                ) from err

        if isinstance(ngram_range, list):
            ngram_range = tuple(ngram_range)

        if not isinstance(ngram_range, tuple):
            raise ValueError("ngram_range must be a tuple, e.g. (1, 2)")

        validated_params['ngram_range'] = ngram_range
        return validated_params

## Trial-runs for pipeline

In [157]:
# Trial run of the model-training stage: load the configuration, build the
# trainer, then fit and save the model.
try:
    config = ConfigurationManager()
    model_training_config = config.get_model_training_config()
    model_training = ModelTrainer(config=model_training_config)
    model_training.train()
except Exception:
    # Record the failure in the project log; the bare ``raise`` re-raises the
    # active exception with its original traceback.
    logger.exception("Model training trial run failed")
    raise

[2025-05-29 16:37:21,743: INFO: common: yaml_file: config\config.yaml loaded successfully]
[2025-05-29 16:37:21,764: INFO: common: yaml_file: params.yaml loaded successfully]
[2025-05-29 16:37:21,774: INFO: common: yaml_file: selected_schema.yaml loaded successfully]
[2025-05-29 16:37:21,781: INFO: common: Created directory at: artifacts]
[2025-05-29 16:37:21,783: INFO: common: Created directory at: artifacts/model_training]
