### Model Build


In [1]:
import os
import warnings

warnings.filterwarnings("ignore")

In [2]:
%pwd

'd:\\GitHub\\Ineuron_adult_census_income_prediction\\research'

In [3]:
os.chdir("../")

In [4]:
%pwd

'd:\\GitHub\\Ineuron_adult_census_income_prediction'

In [5]:
import joblib
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (
    RandomForestClassifier,
    AdaBoostClassifier,
    GradientBoostingClassifier,
)
from xgboost import XGBClassifier

In [6]:
from dataclasses import dataclass
from pathlib import Path


@dataclass
class ModelBuildConfig:
    root_dir: Path
    preprocessed_data_file: Path
    preprocessor_file: Path
    model_file: Path
    model_results: Path
    best_params: Path

In [7]:
from adult_census.constants import *
from adult_census.utils.common import read_yaml, create_directories

In [8]:
class ConfigurationManager:
    def __init__(self, config=CONFIG_FILE_PATH, params=PARAMS_FILE_PATH):
        self.config = read_yaml(config)
        self.params = read_yaml(params)

    def get_model_build_config(self) -> ModelBuildConfig:
        config = self.config.model_build

        model_build_config = ModelBuildConfig(
            root_dir=config.root_dir,
            preprocessed_data_file=config.preprocessed_data_file,
            preprocessor_file=config.preprocessor_file,
            model_file=config.model_file,
            model_results=config.model_results,
            best_params=config.best_params,
        )

        return model_build_config, self.params

In [9]:
from adult_census.logging import logger

In [10]:
# helper functions for model training and evaluation


def fit_model(model, x, y, parameters = None):
    try:
        if parameters != None:
            gcv = GridSearchCV(estimator=model, param_grid=parameters, n_jobs=-1)
            gcv.fit(X=x, y=y)
            best_params = gcv.best_params_
            model.set_params(**gcv.best_params_)
            pred = model.predict(x)
            result = classification_report(y, pred, output_dict=True)
            return model, result, best_params
        else:
            model.fit(x, y)
            pred = model.predict(x)
            result = classification_report(y, pred, output_dict=True)
            return model, result
    except Exception as e:
        raise e


def get_models():
    models_dict = {
        "LogisticRegression": LogisticRegression(),
        "SVC": SVC(),
        "KNeighborsClassifier": KNeighborsClassifier(),
        "GaussianNB": GaussianNB(),
        "DecisionTreeClassifier": DecisionTreeClassifier(),
        "RandomForestClassifier": RandomForestClassifier(),
        "AdaBoostClassifier": AdaBoostClassifier(),
        "GradientBoostingClassifier": GradientBoostingClassifier(),
        "XGBClassifier": XGBClassifier(),
    }

    return models_dict


def get_train_test_split(df: pd.DataFrame):
    X = df.drop("income", axis=1)
    y = df["income"]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    return [X_train, X_test, y_train, y_test]


def get_report(result: dict):
    cr = {}
    cr["models"] = []
    cr["accuracy"] = []
    cr["macro avg precision"] = []
    cr["macro avg recall"] = []
    cr["macro avg f1-score"] = []
    cr["weighted avg precision"] = []
    cr["weighted avg recall"] = []
    cr["weighted avg f1-score"] = []
    for i, model in enumerate(result.keys()):
        cr["models"].append(model)
        cr["accuracy"].append(round(result[model]["accuracy"], 2))
        cr["macro avg precision"].append(
            round(result[model]["macro avg"]["precision"], 2)
        )
        cr["macro avg recall"].append(round(result[model]["macro avg"]["recall"], 2))
        cr["macro avg f1-score"].append(
            round(result[model]["macro avg"]["f1-score"], 2)
        )
        cr["weighted avg precision"].append(
            round(result[model]["weighted avg"]["precision"], 2)
        )
        cr["weighted avg recall"].append(
            round(result[model]["weighted avg"]["recall"], 2)
        )
        cr["weighted avg f1-score"].append(
            round(result[model]["weighted avg"]["f1-score"], 2)
        )
        target_class = result[model].keys()
        for each in target_class:
            if each not in ["accuracy", "macro avg", "weighted avg"]:
                if i == 0:
                    cr[each + " " + "precision"] = []
                    cr[each + " " + "recall"] = []
                    cr[each + " " + "f1-score"] = []

                cr[each + " " + "precision"].append(
                    round(result[model][each]["precision"], 2)
                )
                cr[each + " " + "recall"].append(
                    round(result[model][each]["recall"], 2)
                )
                cr[each + " " + "f1-score"].append(
                    round(result[model][each]["f1-score"], 2)
                )

    return pd.DataFrame(cr)

In [11]:
class ModelBuild:
    def __init__(self, config: ModelBuildConfig, params):
        self.config = config
        self.params = params

        create_directories([self.config.root_dir])

    def get_data_preprocessor(self):
        logger.info("Loading transformed data and preprocessor.")

        df = pd.read_csv(self.config.preprocessed_data_file)

        with open(self.config.preprocessor_file, "rb") as f:
            preprocessor = joblib.load(f)

        logger.info("Transformed data and Preprocessor loading complete.")

        return df, preprocessor

    def train_model(self, models_dict, X_train, y_train, parameters: bool = False):
        try:
            tr_models = {}
            tr_results = {}

            for model in models_dict.keys():
                if (parameters == True) and (model in self.params.keys()):
                    logger.info(f"Hyperparameter tuning for {model} started.")
                    parameter_of_model = self.params[model]
                    tr_model, result, best_params = fit_model(
                        models_dict[model],
                        X_train,
                        y_train,
                        parameters=parameter_of_model,
                    )

                    logger.info(f"Hyperparameter tuning for {model} completed.")

                    if (best_params != None) and (
                        not os.path.exists(self.config.best_params)
                    ):
                        with open(self.config.best_params, "w") as f:
                            f.write(f"Best Params for {model}: \n {best_params}")
                    else:
                        with open(self.config.best_params, "a") as f:
                            f.write(f"\n\nBest Params for {model}: \n {best_params}")
                else:
                    logger.info(f"Model training for {model} started.")
                    tr_model, result = fit_model(models_dict[model], X_train, y_train)
                    logger.info(f"Model training for {model} completed.")

                tr_models[model] = tr_model
                tr_results[model] = result

            return tr_models, tr_results
        except Exception as e:
            raise e

    def get_best_model(self, models_dict: dict, models_results: pd.DataFrame):
        best_model = models_results.sort_values("accuracy", ascending=False)[
            "models"
        ].to_list()[0]
        best_model = models_dict[best_model]
        return best_model

    def save_results(self, results: dict):
        try:
            for result in results.keys():
                res = results[result]
                if not os.path.exists(self.config.model_results):
                    with open(self.config.model_results, "w") as f:
                        f.write(
                            f"Results for {result}: \n{res.sort_values('accuracy', ascending=False).to_string()}"
                        )
                else:
                    with open(self.config.model_results, "a") as f:
                        f.write(
                            f"\nResults for {result}: \n{res.sort_values('accuracy', ascending=False).to_string()}"
                        )

            logger.info(f"Model results saved to {self.config.model_results}")
        except Exception as e:
            raise e

    def save_model(self, model):
        try:
            with open(self.config.model_file, "wb") as f:
                joblib.dump(model, f)
                logger.info(f"Model saved to {self.config.model_file}")
        except Exception as e:
            raise e

In [12]:
try:
    config = ConfigurationManager()
    model_build_config, params = config.get_model_build_config()
    model_build = ModelBuild(config=model_build_config, params=params)
    df, preprocessor = model_build.get_data_preprocessor()
    train_test_set = get_train_test_split(df=df)
    models = get_models()
    tr_models, tr_results = model_build.train_model(
        models_dict=models, X_train=train_test_set[0], y_train=train_test_set[2]
    )
    tr_results = get_report(result=tr_results)
    hy_models, hy_results = model_build.train_model(
        models_dict=models,
        X_train=train_test_set[0],
        y_train=train_test_set[2],
        parameters=True,
    )
    hy_results = get_report(result=hy_results)
    best_model = model_build.get_best_model(hy_models, hy_results)
    model_build.save_results(
        {"Trained Models": tr_results, "Tunned Models": hy_results}
    )
    model_build.save_model(best_model)
except Exception as e:
    raise e

[2024-02-04 23:55:49,279]: INFO common yaml file: config\config.yaml loads successfully
[2024-02-04 23:55:49,285]: INFO common yaml file: params.yaml loads successfully
[2024-02-04 23:55:49,287]: INFO common created directory at : artifacts/model_results
[2024-02-04 23:55:49,288]: INFO 233220522 Loading transformed data and preprocessor.
[2024-02-04 23:55:49,429]: INFO 233220522 Transformed data and Preprocessor loading complete.
[2024-02-04 23:55:49,442]: INFO 233220522 Model training for LogisticRegression started.
[2024-02-04 23:55:49,562]: INFO 233220522 Model training for LogisticRegression completed.
[2024-02-04 23:55:49,563]: INFO 233220522 Model training for SVC started.
[2024-02-04 23:56:28,405]: INFO 233220522 Model training for SVC completed.
[2024-02-04 23:56:28,406]: INFO 233220522 Model training for KNeighborsClassifier started.
[2024-02-04 23:56:29,994]: INFO 233220522 Model training for KNeighborsClassifier completed.
[2024-02-04 23:56:29,995]: INFO 233220522 Model trai

In [36]:
tr_results.sort_values("accuracy", ascending=False)

Unnamed: 0,models,accuracy,macro avg precision,macro avg recall,macro avg f1-score,weighted avg precision,weighted avg recall,weighted avg f1-score,0 precision,0 recall,0 f1-score,1 precision,1 recall,1 f1-score
4,DecisionTreeClassifier,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
5,RandomForestClassifier,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
8,XGBClassifier,0.89,0.87,0.83,0.85,0.89,0.89,0.89,0.91,0.95,0.93,0.82,0.72,0.77
2,KNeighborsClassifier,0.88,0.85,0.82,0.83,0.88,0.88,0.88,0.91,0.94,0.92,0.78,0.7,0.74
1,SVC,0.86,0.82,0.76,0.79,0.85,0.86,0.85,0.88,0.94,0.91,0.76,0.59,0.66
6,AdaBoostClassifier,0.86,0.82,0.77,0.79,0.85,0.86,0.85,0.88,0.94,0.91,0.76,0.6,0.67
7,GradientBoostingClassifier,0.86,0.83,0.78,0.8,0.86,0.86,0.86,0.88,0.94,0.91,0.78,0.61,0.68
0,LogisticRegression,0.85,0.8,0.76,0.78,0.84,0.85,0.84,0.88,0.93,0.9,0.73,0.59,0.65
3,GaussianNB,0.64,0.68,0.74,0.62,0.83,0.64,0.66,0.97,0.54,0.69,0.39,0.94,0.56


In [14]:
hy_results.sort_values("accuracy", ascending=False)

Unnamed: 0,models,accuracy,macro avg precision,macro avg recall,macro avg f1-score,weighted avg precision,weighted avg recall,weighted avg f1-score,0 precision,0 recall,0 f1-score,1 precision,1 recall,1 f1-score
4,DecisionTreeClassifier,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
5,RandomForestClassifier,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
8,XGBClassifier,0.89,0.87,0.83,0.85,0.89,0.89,0.89,0.91,0.95,0.93,0.82,0.72,0.77
2,KNeighborsClassifier,0.88,0.85,0.82,0.83,0.88,0.88,0.88,0.91,0.94,0.92,0.78,0.7,0.74
1,SVC,0.86,0.82,0.76,0.79,0.85,0.86,0.85,0.88,0.94,0.91,0.76,0.59,0.66
6,AdaBoostClassifier,0.86,0.82,0.77,0.79,0.85,0.86,0.85,0.88,0.94,0.91,0.76,0.6,0.67
7,GradientBoostingClassifier,0.86,0.83,0.78,0.8,0.86,0.86,0.86,0.88,0.94,0.91,0.78,0.61,0.68
0,LogisticRegression,0.85,0.8,0.76,0.78,0.84,0.85,0.84,0.88,0.93,0.9,0.73,0.59,0.65
3,GaussianNB,0.64,0.68,0.74,0.62,0.83,0.64,0.66,0.97,0.54,0.69,0.39,0.94,0.56
