In [1]:
import os

In [2]:
%pwd

'd:\\SAMITH\\Github\\Air-Quality-Health-Alert-System\\research'

In [3]:
os.chdir("../")

In [4]:
%pwd

'd:\\SAMITH\\Github\\Air-Quality-Health-Alert-System'

In [5]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class ModelTrainerConfig:
    """Model Trainer Configuration"""
    root_dir: Path
    train_data_path: Path
    test_data_path: Path
    model_name: str
    target_column: str
    
    # XGBoost hyperparameters
    n_estimators: int
    max_depth: int
    learning_rate: float
    subsample: float
    colsample_bytree: float
    random_state: int

In [6]:
from Air_Quality_Health_Alert_System.constants import *
from Air_Quality_Health_Alert_System.utils.common import read_yaml, create_directories

In [7]:
class ConfigurationManager:
    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):

        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        create_directories([self.config.artifacts_root])
    def get_model_trainer_config(self) -> ModelTrainerConfig:
        config = self.config.model_trainer
        params = self.params.XGBOOST
        schema = self.schema.TARGET_COLUMN

        create_directories([config.root_dir])

        model_trainer_config = ModelTrainerConfig(
            root_dir=config.root_dir,
            train_data_path=config.train_data_path,
            test_data_path=config.test_data_path,
            model_name=config.model_name,
            target_column=schema.name,
            
            n_estimators=params.n_estimators,
            max_depth=params.max_depth,
            learning_rate=params.learning_rate,
            subsample=params.subsample,
            colsample_bytree=params.colsample_bytree,
            random_state=params.random_state
        )
        return model_trainer_config

In [19]:
import pandas as pd
import os
from Air_Quality_Health_Alert_System import logger
from sklearn.model_selection import RandomizedSearchCV, TimeSeriesSplit
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score
import joblib

In [None]:
class ModelTrainer:
    def __init__(self, config):
        self.config = config
        self.scaler = None
        self.feature_columns = None

    def load_scaler(self):
        """Load the fitted scaler from data transformation stage"""
        scaler_path = os.path.join(self.config.data_transformation_dir, "scaler.joblib")
        if os.path.exists(scaler_path):
            self.scaler = joblib.load(scaler_path)
            logger.info(f"Scaler loaded from: {scaler_path}")
        else:
            logger.warning(f"No scaler found at: {scaler_path}")
            self.scaler = None

    def add_lag_features(self, df, target_col, lags=[1, 2, 3], rolling_windows=[3, 7]):
        """Add lag and rolling window features for time series"""
        df = df.copy()
        initial_rows = len(df)
        
        # Create lag features
        for lag in lags:
            df[f"{target_col}_lag{lag}"] = df[target_col].shift(lag)
        
        # Create rolling window features
        for window in rolling_windows:
            df[f"{target_col}_rolling{window}"] = df[target_col].shift(1).rolling(window=window).mean()
            df[f"{target_col}_rolling{window}_std"] = df[target_col].shift(1).rolling(window=window).std()
        
        # Drop rows with NaN values
        df = df.dropna()
        final_rows = len(df)
        
        logger.info(f"Added lag features. Rows: {initial_rows} -> {final_rows} (removed {initial_rows - final_rows} rows)")
        return df

    def prepare_features(self, train_data, test_data):
        """Prepare features for training and testing"""
        
        # Columns to drop from features
        columns_to_drop = ['date', 'city', 'AQI_Category']
        
        # Add lag features for time series modeling
        train_data = self.add_lag_features(train_data, self.config.target_column)
        test_data = self.add_lag_features(test_data, self.config.target_column)

        # Split features and target
        train_x = train_data.drop(columns=columns_to_drop + [self.config.target_column], errors='ignore')
        test_x = test_data.drop(columns=columns_to_drop + [self.config.target_column], errors='ignore')
        train_y = train_data[self.config.target_column]
        test_y = test_data[self.config.target_column]

        # Handle categorical columns
        cat_cols = train_x.select_dtypes(include=['object']).columns.tolist()
        if cat_cols:
            logger.info(f"Encoding categorical columns: {cat_cols}")
            train_x = pd.get_dummies(train_x, columns=cat_cols, drop_first=True)
            test_x = pd.get_dummies(test_x, columns=cat_cols, drop_first=True)
            
            # Align test set columns with training set
            test_x = test_x.reindex(columns=train_x.columns, fill_value=0)

        # Store feature columns for later use
        self.feature_columns = train_x.columns.tolist()
        
        logger.info(f"Feature preparation completed:")
        logger.info(f"  Training features shape: {train_x.shape}")
        logger.info(f"  Test features shape: {test_x.shape}")
        logger.info(f"  Total features: {len(self.feature_columns)}")

        return train_x, test_x, train_y, test_y

    def get_hyperparameter_grid(self):
        """Get hyperparameter grid for model tuning"""
        param_grid = {
            'n_estimators': [100, 200, 300, 500],
            'max_depth': [3, 5, 7, 9, None],
            'learning_rate': [0.01, 0.05, 0.1, 0.2],
            'subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
            'colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1.0],
            'reg_alpha': [0, 0.01, 0.1, 1],
            'reg_lambda': [0, 0.01, 0.1, 1],
            'min_child_weight': [1, 3, 5, 7],
            'gamma': [0, 0.1, 0.2, 0.3]
        }
        return param_grid

    def evaluate_model(self, model, test_x, test_y):
        """Evaluate model performance with comprehensive metrics"""
        predictions = model.predict(test_x)
        
        # Calculate metrics
        mse = mean_squared_error(test_y, predictions)
        rmse = np.sqrt(mse)
        mae = mean_absolute_error(test_y, predictions)
        r2 = r2_score(test_y, predictions)
        
        # Additional metrics
        mape = np.mean(np.abs((test_y - predictions) / test_y)) * 100
        
        metrics = {
            'mse': mse,
            'rmse': rmse,
            'mae': mae,
            'r2': r2,
            'mape': mape
        }
        
        logger.info("=== Model Evaluation Metrics ===")
        logger.info(f"Mean Squared Error (MSE): {mse:.4f}")
        logger.info(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
        logger.info(f"Mean Absolute Error (MAE): {mae:.4f}")
        logger.info(f"R-squared Score: {r2:.4f}")
        logger.info(f"Mean Absolute Percentage Error (MAPE): {mape:.2f}%")
        
        return metrics, predictions

    def save_model_artifacts(self, model, metrics=None):
        """Save model, scaler, feature info, and metrics"""
        os.makedirs(self.config.root_dir, exist_ok=True)
        
        # Prepare model artifacts
        model_artifacts = {
            'model': model,
            'scaler': self.scaler,
            'feature_columns': self.feature_columns,
            'target_column': self.config.target_column,
            'model_type': 'XGBRegressor',
            'timestamp': datetime.now().isoformat(),
            'metrics': metrics
        }
        
        # Save complete model package
        model_path = os.path.join(self.config.root_dir, self.config.model_name)
        joblib.dump(model_artifacts, model_path)
        logger.info(f"Model artifacts saved at: {model_path}")
        
        # Save feature importance if available
        if hasattr(model, 'feature_importances_'):
            feature_importance = pd.DataFrame({
                'feature': self.feature_columns,
                'importance': model.feature_importances_
            }).sort_values('importance', ascending=False)
            
            importance_path = os.path.join(self.config.root_dir, "feature_importance.csv")
            feature_importance.to_csv(importance_path, index=False)
            logger.info(f"Feature importance saved at: {importance_path}")
            
            # Log top 10 important features
            logger.info("=== Top 10 Important Features ===")
            for idx, row in feature_importance.head(10).iterrows():
                logger.info(f"{row['feature']}: {row['importance']:.4f}")

    def train(self):
        """Main training pipeline"""
        logger.info("Starting model training pipeline...")
        
        try:
            # Load scaler from data transformation stage
            self.load_scaler()
            
            # Load data
            logger.info("Loading training and test data...")
            train_data = pd.read_csv(self.config.train_data_path, parse_dates=['date'])
            test_data = pd.read_csv(self.config.test_data_path, parse_dates=['date'])
            
            logger.info(f"Train data shape: {train_data.shape}")
            logger.info(f"Test data shape: {test_data.shape}")

            # Prepare features
            train_x, test_x, train_y, test_y = self.prepare_features(train_data, test_data)

            # Initialize model
            xgb_model = XGBRegressor(
                tree_method="hist", 
                random_state=self.config.random_state,
                n_jobs=-1
            )

            # Hyperparameter tuning
            logger.info("Starting hyperparameter tuning...")
            param_grid = self.get_hyperparameter_grid()
            
            # Time series cross-validation
            tscv = TimeSeriesSplit(n_splits=5)

            # Randomized search for efficiency
            random_search = RandomizedSearchCV(
                estimator=xgb_model,
                param_distributions=param_grid,
                n_iter=getattr(self.config, 'n_iter', 50),
                scoring='neg_mean_squared_error',
                cv=tscv,
                verbose=1,
                random_state=self.config.random_state,
                n_jobs=-1
            )

            # Fit model with early stopping
            logger.info("Training model with hyperparameter optimization...")
            random_search.fit(
                train_x, 
                train_y,
                eval_set=[(test_x, test_y)],
                verbose=False
            )
            
            best_model = random_search.best_estimator_
            
            logger.info("=== Best Hyperparameters ===")
            for param, value in random_search.best_params_.items():
                logger.info(f"{param}: {value}")

            # Model evaluation
            metrics, predictions = self.evaluate_model(best_model, test_x, test_y)

            # Save model artifacts
            self.save_model_artifacts(best_model, metrics)

            logger.info("Model training completed successfully!")
            
            return {
                'model': best_model,
                'metrics': metrics,
                'predictions': predictions,
                'feature_columns': self.feature_columns
            }
            
        except Exception as e:
            logger.error(f"Error in model training: {str(e)}")
            raise e

In [23]:
try:
    config = ConfigurationManager()
    model_trainer_config = config.get_model_trainer_config()
    model_trainer = ModelTrainer(config=model_trainer_config)  
    model_trainer.train()
except Exception as e:
    raise e

[2025-08-20 18:26:41,081: INFO: common: yaml file: config\config.yaml loaded successfully]
[2025-08-20 18:26:41,085: INFO: common: yaml file: params.yaml loaded successfully]
[2025-08-20 18:26:41,088: INFO: common: yaml file: schema.yaml loaded successfully]
[2025-08-20 18:26:41,090: INFO: common: created directory at: artifacts]
[2025-08-20 18:26:41,091: INFO: common: created directory at: artifacts/model_trainer]
Fitting 5 folds for each of 30 candidates, totalling 150 fits
[0]	validation_0-rmse:50.81160
[1]	validation_0-rmse:50.38528
[2]	validation_0-rmse:49.98442
[3]	validation_0-rmse:49.56856
[4]	validation_0-rmse:49.17602
[5]	validation_0-rmse:48.78915
[6]	validation_0-rmse:48.38676
[7]	validation_0-rmse:47.99000
[8]	validation_0-rmse:47.59736
[9]	validation_0-rmse:47.20931
[10]	validation_0-rmse:46.84348
[11]	validation_0-rmse:46.48331
[12]	validation_0-rmse:46.10878
[13]	validation_0-rmse:45.75705
[14]	validation_0-rmse:45.39138
[15]	validation_0-rmse:45.02980
[16]	validation_0