In [105]:
import os

In [106]:
%pwd

'c:\\Users\\RICH-FILES\\Desktop\\ml\\client-subscription-prediction'

In [107]:
os.chdir("../")

In [108]:
%pwd

'c:\\Users\\RICH-FILES\\Desktop\\ml'

In [109]:
proj_link = 'c:\\Users\\RICH-FILES\\Desktop\\ml\\client-subscription-prediction'
os.chdir(proj_link)


In [110]:
# create root entity

from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class ModelTrainerConfig:
    root_dir: Path
    train_data_path: Path
    test_data_path: Path
    model_name: str
    preprocessor_name: str
    sm_model_name: str
    sm_processor_name: str
    rf_model_name: str
    rf_preprocessor_name: str
    rf_smote_model_name: str
    rf_smote_preprocessor_name: str
    xgb_model_name: str
    xgb_preprocessor_name: str
    target_column: str
    max_iter: int
    penalty: int
    C: float
    solver: str
    random_state: int
    model_type: str
    n_estimators: int
    max_depth: int
    

In [111]:
proj_link = 'c:\\Users\\RICH-FILES\\Desktop\\ml\\client-subscription-prediction'
os.chdir(proj_link)

In [112]:
#create configuration manager 
#from clientClassifier.constants import *
from clientClassifier.constants import CONFIG_FILE_PATH, PARAMS_FILE_PATH, SCHEMA_FILE_PATH
from clientClassifier.utils.common import read_yaml, create_directories

In [113]:
class ConfigurationManager:
    def __init__(
        self,
        config_file_path = CONFIG_FILE_PATH,
        params_file_path = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):
            
        self.config = read_yaml(config_file_path)
        self.params = read_yaml(params_file_path)
        self.schema = read_yaml(schema_filepath)
        
        create_directories([self.config.artifacts_root])
        
        
    def get_model_trainer_config(self) -> ModelTrainerConfig:
        config = self.config.model_trainer
        params = self.params.Logistic_Regression, self.params.random_forest, self.params.XGBClassifier
        schema = self.schema.TARGET_COLUMN
        
        
        create_directories([config["root_dir"]])
        
        model_trainer_config = ModelTrainerConfig(
            root_dir=config.root_dir,
            train_data_path=config.train_data_path,
            test_data_path=config.test_data_path,
            model_name=config.model_name,
            preprocessor_name=config.preprocessor_name,
            sm_model_name=config.sm_model_name,
            sm_processor_name=config.sm_processor_name,
            rf_model_name = config.rf_model_name,
            rf_preprocessor_name = config.rf_preprocessor_name,
            rf_smote_model_name = config.rf_smote_model_name,
            rf_smote_preprocessor_name = config.rf_smote_preprocessor_name,
            xgb_model_name = config.xgb_model_name,
            xgb_preprocessor_name = config.xgb_preprocessor_name,
            target_column=schema.name,
            max_iter=params["max_iter"],
            penalty=params["penalty"],
            C=params["C"],
            solver=params["solver"],
            n_estimators=params.n_estimators,
            max_depth=params.max_depth,
            random_state=params.random_state            
        )
        
        return model_trainer_config

In [114]:
#define model trainer components
import os
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import joblib
from clientClassifier import logger
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE    
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier





In [None]:
class ModelTrainer:
    def __init__(self, config: ModelTrainerConfig):
        self.config = config
        
    def train(self):
        logger.info("Training model")
        
        # load train and test data
        train_data = pd.read_csv(self.config.train_data_path)
        test_data = pd.read_csv(self.config.test_data_path)
        
        # separate features and target variable
        X_train = train_data.drop(columns=[self.config.target_column], axis=1)
        y_train = train_data[self.config.target_column]
        
        X_test = test_data.drop(columns=[self.config.target_column], axis=1)
        y_test = test_data[self.config.target_column]
        
        # create preprocessing pipeline for numerical and categorical features
        numerical_features = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()       
        categorical_features = X_train.select_dtypes(include=['object']).columns.tolist()   

        numerical_transformer = StandardScaler()
        categorical_transformer = OneHotEncoder(handle_unknown='ignore')                
        
        preprocessor = ColumnTransformer(   
            transformers=[
                ('num', numerical_transformer, numerical_features),
                ('cat', categorical_transformer, categorical_features)
            ]
        )   
        
        # create a pipeline with preprocessing and model
        pipeline = Pipeline([   
            ('preprocessor', preprocessor),
            ('classifier', LogisticRegression(
                class_weight=self.config.class_weight,
                max_iter=self.config.max_iter,
                penalty=self.config.penalty,
                C=self.config.C,
                solver=self.config.solver,
                random_state=self.config.random_state
            ))
        ])      
        
        # fit the pipeline on the training data
        pipeline.fit(X_train, y_train)      
        
     
        # make predictions on the test data 
        y_pred = pipeline.predict(X_test)      
         
        # evaluate the model
        report = classification_report(y_test, y_pred)
        cm = confusion_matrix(y_test, y_pred)   
        accuracy = accuracy_score(y_test, y_pred)   

        logger.info(f"Classification Report:\n{report}")
        logger.info(f"Confusion Matrix:\n{cm}") 
        logger.info(f"Accuracy: {accuracy}")   
         
        # save the model    
        
        
        model = pipeline.named_steps['classifier']
        preprocessor = pipeline.named_steps['preprocessor'] 
        model_path = os.path.join(self.config.root_dir, self.config.model_name)                     
        preprocessor_path = os.path.join(self.config.root_dir, self.config.preprocessor_name)
        joblib.dump(preprocessor, preprocessor_path)    
        joblib.dump(model, model_path)  
        
        logger.info(f"Model saved at: {model_path}")
        logger.info(f"Preprocessor saved at: {preprocessor_path}")      
        
        # save the model
        joblib.dump(model, os.path.join(self.config.root_dir, self.config.model_name))
        
  
    def train_with_SMOTE(self):
        logger.info("Training model with SMOTE")

        # Load data
        train_data = pd.read_csv(self.config.train_data_path)
        test_data = pd.read_csv(self.config.test_data_path)

        # Separate features and target
        X_train = train_data.drop(columns=[self.config.target_column])
        y_train = train_data[self.config.target_column]

        X_test = test_data.drop(columns=[self.config.target_column])
        y_test = test_data[self.config.target_column]

        # Define numerical and categorical features
        numerical_features = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()
        categorical_features = X_train.select_dtypes(include=['object']).columns.tolist()

        # Transformers
        numerical_transformer = StandardScaler()
        categorical_transformer = OneHotEncoder(handle_unknown='ignore')

        sm_preprocessor = ColumnTransformer(
            transformers=[
                ('num', numerical_transformer, numerical_features),
                ('cat', categorical_transformer, categorical_features)
            ]
        )

        #  Preprocess X_train using fit_transform (fit only on training data)
        X_train_processed = sm_preprocessor.fit_transform(X_train)

        # Applying SMOTE to resample training data
        smote = SMOTE(random_state=self.config.random_state)
        X_resampled, y_resampled = smote.fit_resample(X_train_processed, y_train)

        logger.info(f"After SMOTE: {X_resampled.shape}, Target distribution: {dict(pd.Series(y_resampled).value_counts())}")

        #  Fitting model on resampled data
        model = LogisticRegression(
            class_weight=self.config.class_weight,
            max_iter=self.config.max_iter,
            penalty=self.config.penalty,
            C=self.config.C,
            solver=self.config.solver,
            random_state=self.config.random_state
        )

        model.fit(X_resampled, y_resampled)
            
        # make predictions on the test data 
        X_test_processed = sm_preprocessor.transform(X_test)
        y_pred_resampled = model.predict(X_test_processed)

        # evaluate the model
        resampled_report = classification_report(y_test, y_pred_resampled)
        resampled_cm = confusion_matrix(y_test, y_pred_resampled)   
        resampled_accuracy = accuracy_score(y_test, y_pred_resampled)   

        logger.info(f"Classification Report:\n{resampled_report}")
        logger.info(f"Confusion Matrix:\n{resampled_cm}") 
        logger.info(f"Accuracy: {resampled_accuracy}")   
            
        # save the model
        joblib.dump(model, os.path.join(self.config.root_dir, self.config.sm_model_name))
        joblib.dump(sm_preprocessor, os.path.join(self.config.root_dir, self.config.sm_processor_name))
        
        
        
    def train_RandomForest(self):
        logger.info("Training Random Forest model")

        # Load data
        train_data = pd.read_csv(self.config.train_data_path)
        test_data = pd.read_csv(self.config.test_data_path)

        # Separate features and target
        X_train = train_data.drop(columns=[self.config.target_column])
        y_train = train_data[self.config.target_column]

        X_test = test_data.drop(columns=[self.config.target_column])
        y_test = test_data[self.config.target_column]

        # Define numerical and categorical features
        numerical_features = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()
        categorical_features = X_train.select_dtypes(include=['object']).columns.tolist()

        # Transformers
        numerical_transformer = StandardScaler()
        categorical_transformer = OneHotEncoder(handle_unknown='ignore')

        rf_preprocessor = ColumnTransformer(
            transformers=[
                ('num', numerical_transformer, numerical_features),
                ('cat', categorical_transformer, categorical_features)
            ]
        )

        # Preprocess X_train using fit_transform (fit only on training data)
        X_train_processed = rf_preprocessor.fit_transform(X_train)

        # Fitting Random Forest model on the processed data
        rf_model = RandomForestClassifier(random_state=self.config.random_state,
                                          n_estimators=self.config.n_estimators,
                                          max_depth=self.config.max_depth,
                                          class_weight="balanced")  
        rf_model.fit(X_train_processed, y_train)

        # make predictions on the test data 
        X_test_processed = rf_preprocessor.transform(X_test)
        y_pred_rf = rf_model.predict(X_test_processed)

        # evaluate the model
        rf_report = classification_report(y_test, y_pred_rf)
        rf_cm = confusion_matrix(y_test, y_pred_rf)   
        rf_accuracy = accuracy_score(y_test, y_pred_rf)   

        logger.info(f"Random Forest Classification Report:\n{rf_report}")
        logger.info(f"Random Forest Confusion Matrix:\n{rf_cm}") 
        logger.info(f"Random Forest Accuracy: {rf_accuracy}")
        
        # save the model
        rf_model_path = os.path.join(self.config.root_dir, self.config.rf_model_name_model_name)   
        rf_preprocessor_path = os.path.join(self.config.root_dir, self.config.rf_preprocessor_name)
        joblib.dump(rf_model, rf_model_path)
        joblib.dump(rf_preprocessor, rf_preprocessor_path)
        return rf_report, rf_cm, rf_accuracy    
        
    def train_RandomForest_with_SMOTE(self):
        logger.info("Training Random Forest model with SMOTE")

        # Load data
        train_data = pd.read_csv(self.config.train_data_path)
        test_data = pd.read_csv(self.config.test_data_path)

        # Separate features and target
        X_train = train_data.drop(columns=[self.config.target_column])
        y_train = train_data[self.config.target_column]

        X_test = test_data.drop(columns=[self.config.target_column])
        y_test = test_data[self.config.target_column]

        # Define numerical and categorical features
        numerical_features = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()
        categorical_features = X_train.select_dtypes(include=['object']).columns.tolist()

        # Transformers
        numerical_transformer = StandardScaler()
        categorical_transformer = OneHotEncoder(handle_unknown='ignore')

        rf_smote_preprocessor = ColumnTransformer(
            transformers=[
                ('num', numerical_transformer, numerical_features),
                ('cat', categorical_transformer, categorical_features)
            ]
        )

        # Preprocess X_train using fit_transform (fit only on training data)
        X_train_processed = rf_smote_preprocessor.fit_transform(X_train)

        # Applying SMOTE to resample training data
        smote = SMOTE(random_state=self.config.random_state)
        X_resampled, y_resampled = smote.fit_resample(X_train_processed, y_train)

        logger.info(f"After SMOTE: {X_resampled.shape}, Target distribution: {dict(pd.Series(y_resampled).value_counts())}")

        # Fitting Random Forest model on the processed data
        rf_model = RandomForestClassifier(random_state=self.config.random_state,
                                          n_estimators=self.config.n_estimators,
                                          max_depth=self.config.max_depth,
                                          class_weight= "balanced")  
        rf_model.fit(X_resampled, y_resampled)

        # make predictions on the test data 
        X_test_processed = rf_smote_preprocessor.transform(X_test)
        y_pred_rf_smote = rf_model.predict(X_test_processed)

        # evaluate the model
        rf_smote_report = classification_report(y_test, y_pred_rf_smote)
        rf_smote_cm = confusion_matrix(y_test, y_pred_rf_smote)   
        rf_smote_accuracy = accuracy_score(y_test, y_pred_rf_smote)   

        logger.info(f"Random Forest with SMOTE Classification Report:\n{rf_smote_report}") 
        logger.info(f"Random Forest with SMOTE Confusion Matrix:\n{rf_smote_cm}")
        logger.info(f"Random Forest with SMOTE Accuracy: {rf_smote_accuracy}")
        
        # save the model
        rf_smote_model_path = os.path.join(self.config.root_dir, self.config.rf_smote_model_name)
        rf_smote_preprocessor_path = os.path.join(self.config.root_dir, self.config.rf_smote_preprocessor_name)
        joblib.dump(rf_model, rf_smote_model_path)
        joblib.dump(rf_smote_preprocessor, rf_smote_preprocessor_path)  
        return rf_smote_report, rf_smote_cm, rf_smote_accuracy  
    
    
    def train_XGBOOST(self):
        logger.info("Training XGBoost model")

        # Load data
        train_data = pd.read_csv(self.config.train_data_path)
        test_data = pd.read_csv(self.config.test_data_path)

        # Separate features and target
        X_train = train_data.drop(columns=[self.config.target_column])
        y_train = train_data[self.config.target_column]

        X_test = test_data.drop(columns=[self.config.target_column])
        y_test = test_data[self.config.target_column]

        # Define numerical and categorical features
        numerical_features = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()
        categorical_features = X_train.select_dtypes(include=['object']).columns.tolist()

        # Transformers
        numerical_transformer = StandardScaler()
        categorical_transformer = OneHotEncoder(handle_unknown='ignore')

        xgb_preprocessor = ColumnTransformer(
            transformers=[
                ('num', numerical_transformer, numerical_features),
                ('cat', categorical_transformer, categorical_features)
            ]
        )

        # Preprocess X_train using fit_transform (fit only on training data)
        X_train_processed = xgb_preprocessor.fit_transform(X_train)

        # Fitting XGBoost model on the processed data
        xgb_model = XGBClassifier(random_state=self.config.random_state,
                                   n_estimators=self.config.n_estimators,
                                   max_depth=self.config.max_depth,
                                   class_weight= "balanced")
        
          
        xgb_model.fit(X_train_processed, y_train)

        # make predictions on the test data 
        X_test_processed = xgb_preprocessor.transform(X_test)
        y_pred_xgb = xgb_model.predict(X_test_processed)

        # evaluate the model
        xgb_report = classification_report(y_test, y_pred_xgb)
        xgb_cm = confusion_matrix(y_test, y_pred_xgb)   
        xgb_accuracy = accuracy_score(y_test, y_pred_xgb)   

        logger.info(f"XGBoost Classification Report:\n{xgb_report}")
        logger.info(f"XGBoost Confusion Matrix:\n{xgb_cm}") 
        logger.info(f"XGBoost Accuracy: {xgb_accuracy}")
        
        # save the model
        xgb_model_path = os.path.join(self.config.root_dir, self.config.xgb_model_name)   
        xgb_preprocessor_path = os.path.join(self.config.root_dir, self.config.xgb_preprocessor_name)
        joblib.dump
            
        
        
        
            

In [116]:
try:
    config = ConfigurationManager()
    model_trainer_config = config.get_model_trainer_config()
    model_trainer = ModelTrainer(model_trainer_config)
    model_trainer.train()
    model_trainer.train_with_SMOTE()
    model_trainer.train_RandomForest()
    model_trainer.train_RandomForest_with_SMOTE()
    model_trainer.train_XGBoost()       
except Exception as e:
    logger.exception(e)
    raise e 



[2025-04-12 20:27:40,274: INFO: common: yaml file: config\config.yaml loaded successfully]
[2025-04-12 20:27:40,281: INFO: common: yaml file: params.yaml loaded successfully]
[2025-04-12 20:27:40,288: INFO: common: yaml file: schema.yaml loaded successfully]
[2025-04-12 20:27:40,290: INFO: common: created directory at: artifacts]
[2025-04-12 20:27:40,292: INFO: common: created directory at: artifacts/model_trainer]
[2025-04-12 20:27:40,293: ERROR: 1466305328: tuple indices must be integers or slices, not str]
Traceback (most recent call last):
  File "C:\Users\RICH-FILES\AppData\Local\Temp\ipykernel_6200\1466305328.py", line 3, in <module>
    model_trainer_config = config.get_model_trainer_config()
  File "C:\Users\RICH-FILES\AppData\Local\Temp\ipykernel_6200\3828299950.py", line 38, in get_model_trainer_config
    max_iter=params["max_iter"],
TypeError: tuple indices must be integers or slices, not str


TypeError: tuple indices must be integers or slices, not str