In [3]:
from collections import namedtuple
import os
os.chdir("../")

In [2]:
from phishing_domain_detector.constants import *
from phishing_domain_detector.utils import read_yaml, create_directories

In [4]:
class ConfigurationManager:
    def __init__(self, config_filepath = CONFIG_FILE_PATH, params_filepath = PARAMS_FILE_PATH, schema_filepath = SCHEMA_FILE_PATH):
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)
        create_directories([self.config.artifacts_root])
    
    def get_data_ingestion_config(self):
        config = self.config.data_ingestion

        create_directories([config.root_dir])

        data_ingestion_config = DataIngestionConfig(
            root_dir = config.root_dir,
            source_URL= config.source_URL,
            local_data_file = config.local_data_file,
            unzip_dir = config.unzip_dir
        )

        return data_ingestion_config

    def get_data_validation_config(self):
        config =self.config.data_validation
        
        create_directories([config.root_dir])

        data_validation_config = DataValidationConfig(
            root_dir = config.root_dir,
            data_dir = config.data_dir,
            train_file_name = config.train_file_name,
            test_file_name = config.test_file_name,
            report_file_path = config.report_file_path,
            report_page_file_path = config.report_page_file_path
        )

        return data_validation_config

    def get_data_transformation_config(self):
        config =self.config.data_transformation
        
        create_directories([config.root_dir])

        data_transformation_config = DataTransformationConfig(
            root_dir = config.root_dir,
            data_dir = config.data_dir,
            train_file_name = config.train_file_name,
            test_file_name = config.test_file_name,
            train_trans = config.train_trans,
            test_trans = config.test_trans,
            rand_state = self.params.RANDOM_STATE
        )

        return data_transformation_config

    def get_model_trainer_config(self):
        config =self.config.model_trainer
        
        create_directories([config.trained_model_dir])

        model_trainer_config = ModelTrainerConfig(
            trained_model_dir = config.trained_model_dir,
            data_dir= config.data_dir,
            train_file= config.train_file,
            test_file= config.test_file,
            model_file_name= config.model_file_name,
            best_model_name= config.best_model_name
        )

        return model_trainer_config
    

In [5]:
from phishing_domain_detector.constants import *
from phishing_domain_detector.entity import *
from phishing_domain_detector.config import *
from phishing_domain_detector.utils import read_yaml, create_directories

In [6]:
import os
import pandas  as pd
import numpy as np
from sklearn.metrics import accuracy_score,confusion_matrix,roc_auc_score,classification_report
from sklearn.metrics import precision_recall_fscore_support

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier

from phishing_domain_detector.entity import ModelTrainerConfig
from phishing_domain_detector import logger
from phishing_domain_detector.utils import *


class ModelTrainer:
    def __init__(self,config = ModelTrainerConfig):
        self.config = config
        self.models = []

    def get_train_test_file(self):
        logger.info("Fetching dataset")
        train_file = os.path.join(self.config.data_dir, self.config.train_file)
        test_file = os.path.join(self.config.data_dir, self.config.test_file)
        train_df = pd.read_csv(train_file)
        test_df = pd.read_csv(test_file)
        return train_df, test_df

    def train_model(self):
        train_df, _ = self.get_train_test_file()
        
        X_train = train_df.iloc[:,:-1]
        y_train = train_df.iloc[:,-1]

        logger.info("Training data on Logistic Regression")
        
        lr_model = LogisticRegression()
        lr_model.fit(X_train,y_train)
        self.models.append(lr_model)

        logger.info("Training data on Random Forest Classifier") 

        rf_model = RandomForestClassifier(random_state=42)
        rf_model.fit(X_train,y_train)
        self.models.append(rf_model)
        
        logger.info("Training data on Gradient Boosting Classifier")

        gb_model = GradientBoostingClassifier(random_state=42)
        gb_model.fit(X_train, y_train)
        self.models.append(rf_model)

        logger.info("Training data on XG-Boost Classifier")

        xgb_model = XGBClassifier(random_state=42)
        xgb_model.fit(X_train, y_train)
        self.models.append(xgb_model)

        logger.info("Model trained on all models")

    def evaluater(self, actual, predicted):
        print(confusion_matrix(actual, predicted))
        print()
        print(classification_report(actual, predicted))
        print()
        print('roc_auc_score: ', roc_auc_score(actual, predicted))
        print()
        print("test set accuracy score :",  accuracy_score(actual, predicted))
        print()
        p, r, f, _ = precision_recall_fscore_support(actual, predicted, average='binary')
        print('test set precision: ', p)
        print()
        print('test set recall: ', r)
        print()
        print('test set f1-score: ', f)
        print()
        print()
    
    def model_eval(self):
        _, test_df = self.get_train_test_file()
        
        X_test = test_df.iloc[:,:-1]
        y_test = test_df.iloc[:,-1]

        best_score = 0.0

        for model in self.models:
            score = model.score(X_test, y_test)
            if score > best_score:
                best_score = score
                best_model = model
            best_score = max(score, best_score)
            print(f"{type(model).__name__} test accuracy: {score:.3f}")
            self.evaluater(y_test, model.predict(X_test))
            logger.info(self.evaluater(y_test, model.predict(X_test)))
        
        print(f"The best model is {type(best_model).__name__} with test accuracy: {best_score:.3f}")


In [7]:
config = ConfigurationManager()
model_trainer_config = config.get_model_trainer_config()
model_trainer = ModelTrainer(config=model_trainer_config)
model_trainer.train_model()
model_trainer.model_eval()

[2023-02-20 09:26:57,401: INFO: common]: yaml file: configs\config.yaml loaded successfully
[2023-02-20 09:26:57,407: INFO: common]: yaml file: params.yaml loaded successfully
[2023-02-20 09:26:57,436: INFO: common]: yaml file: configs\schema.yaml loaded successfully
[2023-02-20 09:26:57,439: INFO: common]: created directory at: artifacts
[2023-02-20 09:26:57,448: INFO: common]: created directory at: trained_model
[2023-02-20 09:26:57,449: INFO: 2375524105]: Fetching dataset
[2023-02-20 09:26:57,990: INFO: 2375524105]: Training data on Logistic Regression
[2023-02-20 09:26:59,531: INFO: 2375524105]: Training data on Random Forest Classifier


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[2023-02-20 09:27:14,342: INFO: 2375524105]: Training data on Gradient Boosting Classifier
[2023-02-20 09:27:38,325: INFO: 2375524105]: Training data on XG-Boost Classifier
[2023-02-20 09:27:48,425: INFO: 2375524105]: Model trained on all models
[2023-02-20 09:27:48,432: INFO: 2375524105]: Fetching dataset
LogisticRegression test accuracy: 0.831
[[25264  2734]
 [ 7177 23470]]

              precision    recall  f1-score   support

           0       0.78      0.90      0.84     27998
           1       0.90      0.77      0.83     30647

    accuracy                           0.83     58645
   macro avg       0.84      0.83      0.83     58645
weighted avg       0.84      0.83      0.83     58645


roc_auc_score:  0.8340836883656693

test set accuracy score : 0.8310000852587603

test set precision:  0.8956647840024424

test set recall:  0.7658172088622051

test set f1-score:  0.8256670946861092


[[25264  2734]
 [ 7177 23470]]

              precision    recall  f1-score   support

   