### Model Build


In [1]:
import os
import warnings

warnings.filterwarnings("ignore")

In [2]:
%pwd

'd:\\GitHub\\Ineuron_adult_census_income_prediction\\research'

In [3]:
os.chdir("../")

In [4]:
%pwd

'd:\\GitHub\\Ineuron_adult_census_income_prediction'

In [5]:
import joblib
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (
    RandomForestClassifier,
    AdaBoostClassifier,
    GradientBoostingClassifier,
)
from xgboost import XGBClassifier

In [6]:
from dataclasses import dataclass
from pathlib import Path


@dataclass
class ModelBuildConfig:
    root_dir: Path
    preprocessed_data_file: Path
    preprocessor_file: Path
    model_file: Path
    model_results: Path
    best_params: Path

In [7]:
from adult_census.constants import *
from adult_census.utils.common import read_yaml, create_directories

In [8]:
class ConfigurationManager:
    def __init__(self, config=CONFIG_FILE_PATH, params=PARAMS_FILE_PATH):
        self.config = read_yaml(config)
        self.params = read_yaml(params)

    def get_model_build_config(self) -> ModelBuildConfig:
        config = self.config.model_build

        model_build_config = ModelBuildConfig(
            root_dir=config.root_dir,
            preprocessed_data_file=config.preprocessed_data_file,
            preprocessor_file=config.preprocessor_file,
            model_file=config.model_file,
            model_results=config.model_results,
            best_params=config.best_params,
        )

        return model_build_config, self.params

In [9]:
from adult_census.logging import logger

In [10]:
# helper functions for model training and evaluation


def fit_model(model, x, y, parameters = None):
    try:
        if parameters != None:
            gcv = GridSearchCV(estimator=model, param_grid=parameters, n_jobs=-1)
            gcv.fit(X=x, y=y)
            best_params = gcv.best_params_
            model.set_params(**gcv.best_params_)
            pred = model.predict(x)
            result = classification_report(y, pred, output_dict=True)
            return model, result, best_params
        else:
            model.fit(x, y)
            pred = model.predict(x)
            result = classification_report(y, pred, output_dict=True)
            return model, result
    except Exception as e:
        raise e


def get_models():
    models_dict = {
        "LogisticRegression": LogisticRegression(),
        "KNeighborsClassifier": KNeighborsClassifier(),
        "GaussianNB": GaussianNB(),
        "DecisionTreeClassifier": DecisionTreeClassifier(),
        "RandomForestClassifier": RandomForestClassifier(),
        "AdaBoostClassifier": AdaBoostClassifier(),
        "GradientBoostingClassifier": GradientBoostingClassifier(),
        "XGBClassifier": XGBClassifier(),
    }

    return models_dict


def get_train_test_split(df: pd.DataFrame):
    X = df.drop("income", axis=1)
    y = df["income"]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    return [X_train, X_test, y_train, y_test]


def get_report(result: dict):
    cr = {}
    cr["models"] = []
    cr["accuracy"] = []
    cr["macro avg precision"] = []
    cr["macro avg recall"] = []
    cr["macro avg f1-score"] = []
    cr["weighted avg precision"] = []
    cr["weighted avg recall"] = []
    cr["weighted avg f1-score"] = []
    for i, model in enumerate(result.keys()):
        cr["models"].append(model)
        cr["accuracy"].append(round(result[model]["accuracy"], 2))
        cr["macro avg precision"].append(
            round(result[model]["macro avg"]["precision"], 2)
        )
        cr["macro avg recall"].append(round(result[model]["macro avg"]["recall"], 2))
        cr["macro avg f1-score"].append(
            round(result[model]["macro avg"]["f1-score"], 2)
        )
        cr["weighted avg precision"].append(
            round(result[model]["weighted avg"]["precision"], 2)
        )
        cr["weighted avg recall"].append(
            round(result[model]["weighted avg"]["recall"], 2)
        )
        cr["weighted avg f1-score"].append(
            round(result[model]["weighted avg"]["f1-score"], 2)
        )
        target_class = result[model].keys()
        for each in target_class:
            if each not in ["accuracy", "macro avg", "weighted avg"]:
                if i == 0:
                    cr[each + " " + "precision"] = []
                    cr[each + " " + "recall"] = []
                    cr[each + " " + "f1-score"] = []

                cr[each + " " + "precision"].append(
                    round(result[model][each]["precision"], 2)
                )
                cr[each + " " + "recall"].append(
                    round(result[model][each]["recall"], 2)
                )
                cr[each + " " + "f1-score"].append(
                    round(result[model][each]["f1-score"], 2)
                )

    return pd.DataFrame(cr)

In [11]:
class ModelBuild:
    def __init__(self, config: ModelBuildConfig, params):
        self.config = config
        self.params = params

        create_directories([self.config.root_dir])

    def get_data_preprocessor(self):
        logger.info("Loading transformed data and preprocessor.")

        df = pd.read_csv(self.config.preprocessed_data_file)

        with open(self.config.preprocessor_file, "rb") as f:
            preprocessor = joblib.load(f)

        logger.info("Transformed data and Preprocessor loading complete.")

        return df, preprocessor

    def train_model(self, models_dict, X_train, y_train, parameters: bool = False):
        try:
            tr_models = {}
            tr_results = {}

            for model in models_dict.keys():
                if (parameters == True) and (model in self.params.keys()):
                    logger.info(f"Hyperparameter tuning for {model} started.")
                    parameter_of_model = self.params[model]
                    tr_model, result, best_params = fit_model(
                        models_dict[model],
                        X_train,
                        y_train,
                        parameters=parameter_of_model,
                    )

                    logger.info(f"Hyperparameter tuning for {model} completed.")

                    if (best_params != None) and (
                        not os.path.exists(self.config.best_params)
                    ):
                        with open(self.config.best_params, "w") as f:
                            f.write(f"Best Params for {model}: \n {best_params}")
                    else:
                        with open(self.config.best_params, "a") as f:
                            f.write(f"\n\nBest Params for {model}: \n {best_params}")
                else:
                    logger.info(f"Model training for {model} started.")
                    tr_model, result = fit_model(models_dict[model], X_train, y_train)
                    logger.info(f"Model training for {model} completed.")

                tr_models[model] = tr_model
                tr_results[model] = result

            return tr_models, tr_results
        except Exception as e:
            raise e

    def evaluate_model(self, models_dict, X_test, y_test):
        try:
            logger.info("Models evaluation started.")

            ts_results = {}

            for model in models_dict.keys():
                pred = models_dict[model].predict(X_test)
                result = classification_report(y_test, pred, output_dict=True)
                ts_results[model] = result

            logger.info("Models evaluation completed.")

            return ts_results

        except Exception as e:
            raise e

    def get_best_model(self, models_dict: dict, models_results: pd.DataFrame):
        best_model = models_results.sort_values("accuracy", ascending=False)[
            "models"
        ].to_list()[0]
        best_model = models_dict[best_model]
        return best_model

    def save_results(self, results: dict):
        try:
            for result in results.keys():
                res = results[result]
                if not os.path.exists(self.config.model_results):
                    with open(self.config.model_results, "w") as f:
                        f.write(
                            f"Results for {result}: \n{res.sort_values('accuracy', ascending=False).to_string()}"
                        )
                else:
                    with open(self.config.model_results, "a") as f:
                        f.write(
                            f"\nResults for {result}: \n{res.sort_values('accuracy', ascending=False).to_string()}"
                        )

            logger.info(f"Model results saved to {self.config.model_results}")
        except Exception as e:
            raise e

    def save_model(self, model):
        try:
            with open(self.config.model_file, "wb") as f:
                joblib.dump(model, f)
                logger.info(f"Model saved to {self.config.model_file}")
        except Exception as e:
            raise e

In [12]:
try:
    config = ConfigurationManager()
    model_build_config, params = config.get_model_build_config()
    model_build = ModelBuild(config=model_build_config, params=params)
    df, preprocessor = model_build.get_data_preprocessor()
    train_test_set = get_train_test_split(df=df)
    models = get_models()
    tr_models, tr_results = model_build.train_model(
        models_dict=models, X_train=train_test_set[0], y_train=train_test_set[2]
    )
    tr_results = get_report(result=tr_results)
    ts_results = model_build.evaluate_model(
        tr_models, X_test=train_test_set[1], y_test=train_test_set[3]
    )
    ts_results = get_report(result=ts_results)
    hy_models, hy_results = model_build.train_model(
        models_dict=models,
        X_train=train_test_set[0],
        y_train=train_test_set[2],
        parameters=True,
    )
    hy_results = get_report(result=hy_results)
    hy_ts_results = model_build.evaluate_model(
        hy_models, X_test=train_test_set[1], y_test=train_test_set[3]
    )
    hy_ts_results = get_report(hy_ts_results)
    best_model = model_build.get_best_model(hy_models, hy_ts_results)
    model_build.save_results(
        {
            "Non-Tunning Train Report": tr_results,
            "Non-Tunning Test Report": ts_results,
            "Hyper-Tunning Train Report": hy_results,
            "Hyper-Tunning Test Report": hy_ts_results,
        }
    )
    model_build.save_model(best_model)
except Exception as e:
    raise e

[2024-02-08 00:08:25,774]: INFO common yaml file: config\config.yaml loads successfully
[2024-02-08 00:08:25,779]: INFO common yaml file: params.yaml loads successfully
[2024-02-08 00:08:25,782]: INFO common created directory at : artifacts/model_results
[2024-02-08 00:08:25,784]: INFO 1254238484 Loading transformed data and preprocessor.
[2024-02-08 00:08:25,900]: INFO 1254238484 Transformed data and Preprocessor loading complete.
[2024-02-08 00:08:25,918]: INFO 1254238484 Model training for LogisticRegression started.
[2024-02-08 00:08:26,099]: INFO 1254238484 Model training for LogisticRegression completed.
[2024-02-08 00:08:26,100]: INFO 1254238484 Model training for KNeighborsClassifier started.
[2024-02-08 00:08:27,859]: INFO 1254238484 Model training for KNeighborsClassifier completed.
[2024-02-08 00:08:27,860]: INFO 1254238484 Model training for GaussianNB started.
[2024-02-08 00:08:27,919]: INFO 1254238484 Model training for GaussianNB completed.
[2024-02-08 00:08:27,920]: INF

In [13]:
tr_results.sort_values("accuracy", ascending=False)

Unnamed: 0,models,accuracy,macro avg precision,macro avg recall,macro avg f1-score,weighted avg precision,weighted avg recall,weighted avg f1-score,0 precision,0 recall,0 f1-score,1 precision,1 recall,1 f1-score
3,DecisionTreeClassifier,0.94,0.93,0.9,0.91,0.94,0.94,0.94,0.94,0.98,0.96,0.92,0.81,0.86
4,RandomForestClassifier,0.94,0.92,0.91,0.92,0.94,0.94,0.94,0.96,0.96,0.96,0.88,0.86,0.87
1,KNeighborsClassifier,0.87,0.82,0.8,0.81,0.86,0.87,0.86,0.9,0.93,0.91,0.74,0.68,0.71
7,XGBClassifier,0.86,0.82,0.79,0.81,0.86,0.86,0.86,0.89,0.93,0.91,0.75,0.66,0.7
5,AdaBoostClassifier,0.84,0.78,0.75,0.76,0.83,0.84,0.83,0.87,0.92,0.9,0.7,0.57,0.63
6,GradientBoostingClassifier,0.84,0.8,0.75,0.77,0.83,0.84,0.84,0.87,0.93,0.9,0.72,0.57,0.64
0,LogisticRegression,0.83,0.78,0.73,0.75,0.82,0.83,0.82,0.86,0.92,0.89,0.69,0.55,0.61
2,GaussianNB,0.49,0.63,0.64,0.49,0.79,0.49,0.5,0.94,0.35,0.51,0.32,0.93,0.47


In [14]:
ts_results.sort_values("accuracy", ascending=False)

Unnamed: 0,models,accuracy,macro avg precision,macro avg recall,macro avg f1-score,weighted avg precision,weighted avg recall,weighted avg f1-score,0 precision,0 recall,0 f1-score,1 precision,1 recall,1 f1-score
6,GradientBoostingClassifier,0.84,0.79,0.73,0.75,0.83,0.84,0.83,0.87,0.93,0.9,0.7,0.54,0.61
0,LogisticRegression,0.83,0.77,0.72,0.74,0.82,0.83,0.82,0.86,0.92,0.89,0.68,0.52,0.59
5,AdaBoostClassifier,0.83,0.78,0.73,0.75,0.82,0.83,0.83,0.87,0.93,0.89,0.69,0.53,0.6
7,XGBClassifier,0.83,0.77,0.74,0.75,0.82,0.83,0.82,0.87,0.91,0.89,0.67,0.56,0.61
1,KNeighborsClassifier,0.82,0.74,0.73,0.73,0.81,0.82,0.81,0.87,0.89,0.88,0.62,0.56,0.59
4,RandomForestClassifier,0.81,0.73,0.72,0.72,0.8,0.81,0.8,0.86,0.89,0.88,0.6,0.54,0.57
3,DecisionTreeClassifier,0.79,0.71,0.69,0.7,0.79,0.79,0.79,0.85,0.88,0.87,0.57,0.51,0.54
2,GaussianNB,0.49,0.63,0.64,0.49,0.79,0.49,0.5,0.94,0.35,0.52,0.31,0.93,0.46


In [15]:
hy_results.sort_values("accuracy", ascending=False)

Unnamed: 0,models,accuracy,macro avg precision,macro avg recall,macro avg f1-score,weighted avg precision,weighted avg recall,weighted avg f1-score,0 precision,0 recall,0 f1-score,1 precision,1 recall,1 f1-score
3,DecisionTreeClassifier,0.94,0.93,0.9,0.91,0.94,0.94,0.94,0.94,0.98,0.96,0.92,0.81,0.86
4,RandomForestClassifier,0.94,0.92,0.91,0.92,0.94,0.94,0.94,0.96,0.96,0.96,0.88,0.86,0.87
1,KNeighborsClassifier,0.87,0.82,0.8,0.81,0.86,0.87,0.86,0.9,0.93,0.91,0.74,0.68,0.71
7,XGBClassifier,0.86,0.82,0.79,0.81,0.86,0.86,0.86,0.89,0.93,0.91,0.75,0.66,0.7
6,GradientBoostingClassifier,0.84,0.8,0.75,0.77,0.83,0.84,0.84,0.87,0.93,0.9,0.72,0.57,0.64
0,LogisticRegression,0.83,0.78,0.73,0.75,0.82,0.83,0.82,0.86,0.92,0.89,0.69,0.55,0.61
2,GaussianNB,0.49,0.63,0.64,0.49,0.79,0.49,0.5,0.94,0.35,0.51,0.32,0.93,0.47
5,AdaBoostClassifier,0.41,0.57,0.57,0.41,0.72,0.41,0.41,0.86,0.27,0.41,0.27,0.86,0.42


In [16]:
hy_ts_results.sort_values("accuracy", ascending=False)

Unnamed: 0,models,accuracy,macro avg precision,macro avg recall,macro avg f1-score,weighted avg precision,weighted avg recall,weighted avg f1-score,0 precision,0 recall,0 f1-score,1 precision,1 recall,1 f1-score
6,GradientBoostingClassifier,0.84,0.79,0.73,0.75,0.83,0.84,0.83,0.87,0.93,0.9,0.7,0.54,0.61
0,LogisticRegression,0.83,0.77,0.72,0.74,0.82,0.83,0.82,0.86,0.92,0.89,0.68,0.52,0.59
7,XGBClassifier,0.83,0.77,0.74,0.75,0.82,0.83,0.82,0.87,0.91,0.89,0.67,0.56,0.61
1,KNeighborsClassifier,0.82,0.74,0.73,0.73,0.81,0.82,0.81,0.87,0.89,0.88,0.62,0.56,0.59
4,RandomForestClassifier,0.81,0.73,0.72,0.72,0.8,0.81,0.8,0.86,0.89,0.88,0.6,0.54,0.57
3,DecisionTreeClassifier,0.79,0.71,0.69,0.7,0.79,0.79,0.79,0.85,0.88,0.87,0.57,0.51,0.54
2,GaussianNB,0.49,0.63,0.64,0.49,0.79,0.49,0.5,0.94,0.35,0.52,0.31,0.93,0.46
5,AdaBoostClassifier,0.41,0.56,0.56,0.41,0.72,0.41,0.41,0.86,0.27,0.41,0.27,0.86,0.41
