In [249]:
import os

In [250]:
%pwd

'c:\\Users\\RICH-FILES\\Desktop\\ml\\client-subscription-prediction'

In [251]:
# Step one directory up from the current working directory. The path is
# relative, so running this cell again moves up one more level each time.
os.chdir("../")

In [252]:
%pwd

'c:\\Users\\RICH-FILES\\Desktop\\ml'

In [253]:
# Switch the working directory to an absolute, machine-specific Windows path.
# NOTE(review): presumably this is the project root, so that relative paths used
# by later cells resolve from there — confirm, and adjust the path per machine.
proj_link = 'c:\\Users\\RICH-FILES\\Desktop\\ml\\client-subscription-prediction'
os.chdir(proj_link)


In [None]:
# create root entity

from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class ModelTrainerConfig:
    """Immutable bundle of settings for the model-training stage.

    Instances are frozen: assigning to a field after construction raises
    ``dataclasses.FrozenInstanceError``. The annotations are documentation
    only; the dataclass does not validate or convert the values it is given.
    """

    # --- artifact and data locations ---------------------------------------
    root_dir: Path            # directory the fitted artifacts are written into
    train_data_path: Path     # CSV read as the training set
    test_data_path: Path      # CSV read as the evaluation set
    model_name: str           # file name (under root_dir) for the plain model
    preprocessor_name: str    # file name (under root_dir) for its preprocessor
    sm_model_name: str        # file name (under root_dir) for the SMOTE model
    sm_processor_name: str    # file name (under root_dir) for the SMOTE preprocessor

    # --- dataset schema -----------------------------------------------------
    target_column: str        # name of the label column in both CSVs

    # --- LogisticRegression hyper-parameters (forwarded as keyword args) ----
    class_weight: str
    max_iter: int
    # Regularisation kind, e.g. "l1" / "l2"; this is a name, not a number.
    penalty: str
    C: float
    solver: str
    random_state: int         # also used to seed SMOTE
    

In [255]:
# Same working-directory switch as in the earlier cell: an absolute,
# machine-specific Windows path.
# NOTE(review): presumably repeated so the import cell below works when run
# on its own — confirm; one of the two cells is likely redundant.
proj_link = 'c:\\Users\\RICH-FILES\\Desktop\\ml\\client-subscription-prediction'
os.chdir(proj_link)

In [256]:
#create configuration manager 
#from clientClassifier.constants import *
from clientClassifier.constants import CONFIG_FILE_PATH, PARAMS_FILE_PATH, SCHEMA_FILE_PATH
from clientClassifier.utils.common import read_yaml, create_directories

In [None]:
class ConfigurationManager:
    """Reads the project's YAML files and builds stage-specific config objects."""

    def __init__(
        self,
        config_file_path = CONFIG_FILE_PATH,
        params_file_path = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):
        """Load the three YAML documents and create the artifacts root.

        Args:
            config_file_path: YAML file with paths and artifact names.
            params_file_path: YAML file with model hyper-parameters.
            schema_filepath: YAML file describing the dataset schema.
        """
        self.config = read_yaml(config_file_path)
        self.params = read_yaml(params_file_path)
        self.schema = read_yaml(schema_filepath)

        # The top-level artifacts directory must exist before any stage
        # creates its own sub-directory.
        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_model_trainer_config(self) -> ModelTrainerConfig:
        """Assemble the settings for the model-training stage.

        Creates the stage's ``root_dir`` as a side effect.

        Returns:
            ModelTrainerConfig: paths from the ``model_trainer`` config section,
            the target column name from the schema, and the
            ``Logistic_Regression`` hyper-parameters.
        """
        trainer_section = self.config.model_trainer
        lr_params = self.params.Logistic_Regression
        target_schema = self.schema.TARGET_COLUMN

        create_directories([trainer_section["root_dir"]])

        return ModelTrainerConfig(
            # locations and artifact file names
            root_dir=trainer_section.root_dir,
            train_data_path=trainer_section.train_data_path,
            test_data_path=trainer_section.test_data_path,
            model_name=trainer_section.model_name,
            preprocessor_name=trainer_section.preprocessor_name,
            sm_model_name=trainer_section.sm_model_name,
            sm_processor_name=trainer_section.sm_processor_name,
            # dataset schema
            target_column=target_schema.name,
            # classifier hyper-parameters
            class_weight=lr_params.class_weight,
            max_iter=lr_params["max_iter"],
            penalty=lr_params["penalty"],
            C=lr_params["C"],
            solver=lr_params["solver"],
            random_state=lr_params["random_state"],
        )

In [258]:
#define model trainer components
import os
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import joblib
from clientClassifier import logger
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE





In [None]:
class ModelTrainer:
    """Fits a logistic-regression classifier and persists it with its preprocessor.

    Two training variants are offered:

    * ``train`` fits on the training data as-is.
    * ``train_with_SMOTE`` oversamples the preprocessed training data with
      SMOTE before fitting.

    Both evaluate on the test CSV, log the metrics, and write the fitted
    classifier and preprocessor as separate joblib files under
    ``config.root_dir``.
    """

    def __init__(self, config: ModelTrainerConfig):
        """Keep the training settings for use by the training methods."""
        self.config = config

    # ------------------------------------------------------------------ helpers

    def _load_features_and_target(self):
        """Read the train/test CSVs and split each into features and target.

        Returns:
            tuple: ``(X_train, y_train, X_test, y_test)``.
        """
        cfg = self.config
        train_data = pd.read_csv(cfg.train_data_path)
        test_data = pd.read_csv(cfg.test_data_path)

        X_train = train_data.drop(columns=[cfg.target_column])
        y_train = train_data[cfg.target_column]
        X_test = test_data.drop(columns=[cfg.target_column])
        y_test = test_data[cfg.target_column]
        return X_train, y_train, X_test, y_test

    @staticmethod
    def _build_preprocessor(X_train):
        """Build an unfitted transformer: scale numeric columns, one-hot encode object columns.

        Column lists are derived from the dtypes of ``X_train``. Columns whose
        dtype is neither int64/float64 nor object appear in neither list, so
        they are not passed on (ColumnTransformer's default remainder is 'drop').
        """
        numerical_features = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()
        categorical_features = X_train.select_dtypes(include=['object']).columns.tolist()

        return ColumnTransformer(
            transformers=[
                ('num', StandardScaler(), numerical_features),
                # Categories unseen during fit are ignored at transform time
                # rather than raising.
                ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
            ]
        )

    def _build_classifier(self):
        """Create an unfitted LogisticRegression from the configured hyper-parameters."""
        cfg = self.config
        return LogisticRegression(
            class_weight=cfg.class_weight,
            max_iter=cfg.max_iter,
            penalty=cfg.penalty,
            C=cfg.C,
            solver=cfg.solver,
            random_state=cfg.random_state,
        )

    @staticmethod
    def _log_evaluation(y_test, y_pred):
        """Log the classification report, confusion matrix and accuracy."""
        report = classification_report(y_test, y_pred)
        cm = confusion_matrix(y_test, y_pred)
        accuracy = accuracy_score(y_test, y_pred)

        logger.info(f"Classification Report:\n{report}")
        logger.info(f"Confusion Matrix:\n{cm}")
        logger.info(f"Accuracy: {accuracy}")

    def _save_artifacts(self, model, preprocessor, model_name, preprocessor_name):
        """Write the model and preprocessor under ``config.root_dir`` and log the paths."""
        model_path = os.path.join(self.config.root_dir, model_name)
        preprocessor_path = os.path.join(self.config.root_dir, preprocessor_name)

        joblib.dump(preprocessor, preprocessor_path)
        joblib.dump(model, model_path)

        logger.info(f"Model saved at: {model_path}")
        logger.info(f"Preprocessor saved at: {preprocessor_path}")

    # ----------------------------------------------------------- entry points

    def train(self):
        """Fit preprocessing + classifier on the raw training data, evaluate, and save.

        The classifier and preprocessor are saved as two separate files
        (``config.model_name`` and ``config.preprocessor_name``), not as one
        pipeline object.
        """
        logger.info("Training model")

        X_train, y_train, X_test, y_test = self._load_features_and_target()

        pipeline = Pipeline([
            ('preprocessor', self._build_preprocessor(X_train)),
            ('classifier', self._build_classifier()),
        ])
        pipeline.fit(X_train, y_train)

        self._log_evaluation(y_test, pipeline.predict(X_test))

        # Each artifact is written exactly once.
        self._save_artifacts(
            model=pipeline.named_steps['classifier'],
            preprocessor=pipeline.named_steps['preprocessor'],
            model_name=self.config.model_name,
            preprocessor_name=self.config.preprocessor_name,
        )

    def train_with_SMOTE(self):
        """Fit the classifier on SMOTE-oversampled training data, evaluate, and save.

        The preprocessor is fitted on the training features only; SMOTE is
        applied to the preprocessed training set, and the test set is only
        transformed, never resampled. Artifacts are saved under
        ``config.sm_model_name`` and ``config.sm_processor_name``.
        """
        logger.info("Training model with SMOTE")

        X_train, y_train, X_test, y_test = self._load_features_and_target()

        sm_preprocessor = self._build_preprocessor(X_train)
        X_train_processed = sm_preprocessor.fit_transform(X_train)

        smote = SMOTE(random_state=self.config.random_state)
        X_resampled, y_resampled = smote.fit_resample(X_train_processed, y_train)

        logger.info(f"After SMOTE: {X_resampled.shape}, Target distribution: {dict(pd.Series(y_resampled).value_counts())}")

        # NOTE(review): the configured class_weight is still applied on top of
        # the resampled data — confirm that both corrections are intended.
        model = self._build_classifier()
        model.fit(X_resampled, y_resampled)

        X_test_processed = sm_preprocessor.transform(X_test)
        self._log_evaluation(y_test, model.predict(X_test_processed))

        self._save_artifacts(
            model=model,
            preprocessor=sm_preprocessor,
            model_name=self.config.sm_model_name,
            preprocessor_name=self.config.sm_processor_name,
        )
            

In [260]:
# Run both training variants end to end. Any failure is logged with its
# traceback and then re-raised so the cell still fails visibly.
try:
    config = ConfigurationManager()
    model_trainer_config = config.get_model_trainer_config()
    model_trainer = ModelTrainer(model_trainer_config)
    model_trainer.train()
    model_trainer.train_with_SMOTE()
except Exception as e:
    logger.exception(e)
    # Bare `raise` re-raises the active exception unchanged.
    raise



[2025-04-12 14:21:18,901: INFO: common: yaml file: config\config.yaml loaded successfully]
[2025-04-12 14:21:18,907: INFO: common: yaml file: params.yaml loaded successfully]
[2025-04-12 14:21:18,913: INFO: common: yaml file: schema.yaml loaded successfully]
[2025-04-12 14:21:18,915: INFO: common: created directory at: artifacts]
[2025-04-12 14:21:18,917: INFO: common: created directory at: artifacts/model_trainer]
[2025-04-12 14:21:18,918: INFO: 2133987789: Training model]
[2025-04-12 14:21:19,050: INFO: 2133987789: Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.74      0.82       807
           1       0.20      0.54      0.29        98

    accuracy                           0.72       905
   macro avg       0.56      0.64      0.56       905
weighted avg       0.85      0.72      0.76       905
]
[2025-04-12 14:21:19,051: INFO: 2133987789: Confusion Matrix:
[[595 212]
 [ 45  53]]]
[2025-04-12 14:21:19,053: INFO: 21339877

AttributeError: 'ModelTrainerConfig' object has no attribute 'model_path_resampled'