# Spambase Model Helper

In [1]:
# Install the notebook-import helpers into the environment of the running
# kernel. %pip targets the kernel's own interpreter, whereas !pip runs
# whichever pip executable happens to be first on the shell PATH.

# ipynb: importing an ipynb file from another ipynb file
%pip install ipynb

# nbimporter: importing functions from another jupyter notebook
%pip install nbimporter

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


In [2]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

import statsmodels.api as sm

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

import nbimporter
import spambase_dataset_loader_nb

class SpambaseModelHelper():
    '''
    Fits logistic regression models on a DataFrame and reports their scores.

    Two back ends are available: statsmodels (run_statsmodels_logit) and
    scikit-learn (run_sklearn_log_reg). Both hold out 20% of the rows with a
    fixed random state and return a one-row DataFrame holding Accuracy,
    Precision, Recall and F1-Score, where the positive class is the larger of
    the two labels (1 for 0/1 labels).
    '''

    # Hold-out settings shared by both runners so their scores are comparable.
    _TEST_SIZE = 0.2
    _RANDOM_STATE = 101

    # Row and column labels of the score table returned by both runners.
    _SCORE_INDEX = ['Score']
    _SCORE_COLUMNS = ['Accuracy', 'Precision', 'Recall', 'F1-Score']

    def __init__(self):
        print('init Model Helper notebook')

    @staticmethod
    def _prepare_features(data, feature_selection_type=None):
        '''
        Splits the input into the feature matrix X and the target vector y.

        Without feature selection the first column of data is the target and
        every other column is a feature. With feature selection the work is
        delegated to SpambaseDatasetLoader.perform_feature_selection, whose
        result is expected to hold X at index 0 and y at index 1.
        '''
        if feature_selection_type is None:
            return data.iloc[:, 1:].values, data.iloc[:, 0].values

        loader = spambase_dataset_loader_nb.SpambaseDatasetLoader()
        result = loader.perform_feature_selection(data, feature_selection_type)
        return result[0], result[1]

    @staticmethod
    def _confusion_counts(cf_matrix):
        '''
        Returns (TN, FP, FN, TP) from a 2x2 confusion matrix.

        The matrix must follow the scikit-learn layout: rows are the true
        labels, columns are the predicted labels, both in sorted label order,
        so the positive class sits in the second row and second column.

        Raises
        ----------
        ValueError : If the matrix is not 2x2 (e.g. only one class was seen).
        '''
        cells = np.asarray(cf_matrix)
        if cells.shape != (2, 2):
            raise ValueError(
                'Expected a 2x2 confusion matrix but got shape {}'.format(cells.shape)
            )
        tn, fp, fn, tp = (int(cell) for cell in cells.ravel())
        return tn, fp, fn, tp

    @staticmethod
    def _safe_ratio(numerator, denominator):
        '''Returns numerator / denominator, or 0.0 when the denominator is zero.'''
        if denominator == 0:
            return 0.0
        return float(numerator) / float(denominator)

    @classmethod
    def _build_scores_df(cls, acc, precision, recall, f1):
        '''Packs the four scores into the one-row DataFrame both runners return.'''
        return pd.DataFrame([[acc, precision, recall, f1]], cls._SCORE_INDEX, cls._SCORE_COLUMNS)

    @classmethod
    def _scores_from_confusion_matrix(cls, cf_matrix):
        '''
        Computes accuracy, precision, recall and F1 for the positive class.

        Undefined ratios (no predicted positives, no actual positives) are
        reported as 0.0, which is what the sklearn.metrics score functions
        return by default, so both runners behave the same way.
        '''
        tn, fp, fn, tp = cls._confusion_counts(cf_matrix)

        acc = cls._safe_ratio(tp + tn, tn + fp + fn + tp)
        precision = cls._safe_ratio(tp, tp + fp)
        recall = cls._safe_ratio(tp, tp + fn)
        f1 = cls._safe_ratio(2 * precision * recall, precision + recall)

        return cls._build_scores_df(acc, precision, recall, f1)

    @staticmethod
    def _save_confusion_heatmap(cf_matrix, path):
        '''Draws the confusion matrix as a heatmap and saves it to path.'''
        import os

        # savefig does not create missing directories, so make sure it exists.
        directory = os.path.dirname(path)
        if directory:
            os.makedirs(directory, exist_ok=True)

        sns.set(rc={"figure.figsize":(6, 5)})
        sns.heatmap(cf_matrix, annot=True,  fmt='d', cmap='Blues')
        plt.savefig(path)

    def run_statsmodels_logit(self, data, feature_selection_type=None, verbose=False):
        '''

        This function runs logistic regression based on stats models.

        (DataFrame, str or None, boolean) --> DataFrame

        Parameters
        ----------
        data: Dataframe that will be used in running the model. When no feature
              selection is requested, the first column is the target and the
              remaining columns are the features.

        feature_selection_type: Type of Feature Selection to use. Available options are stepwise_back, pearson and chi2.
                                None (the default) uses every feature column.
        verbose: True to display statsmodels summary, confusion matrix and its heatmap otherwise false.
                 The heatmap is also saved under ./images.

        Returns
        ----------
        DataFrame : The DataFrame of statsmodels scores (Accuracy, Precision,
                    Recall and F1-Score of the positive class).

        Raises
        ----------
        ValueError : If the confusion matrix of the test split is not 2x2.
        '''
        X, y = self._prepare_features(data, feature_selection_type)

        # add the constant column so the fitted model has an intercept term
        X = sm.add_constant(X)

        # split the dataset
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=self._TEST_SIZE, random_state=self._RANDOM_STATE
        )

        # building the model and fit the model using the training data
        statsmodels_lr = sm.Logit(y_train, X_train).fit()

        if verbose:
            print(statsmodels_lr.summary2())

        # performing predictions on the test dataset; predict() yields
        # probabilities, which are rounded to the 0/1 class labels
        yhat = statsmodels_lr.predict(X_test)
        y_pred_statsmodels = list(map(round, yhat))

        cf_matrix = confusion_matrix(y_test, y_pred_statsmodels)

        # rows are true labels and columns are predictions, so [0][0] is TN
        # and [1][1] is TP
        TN, FP, FN, TP = self._confusion_counts(cf_matrix)

        if verbose:
            print('Confusion Matrix')
            print(cf_matrix)
            path = ('./images/cf_matrix_TP{}_TN{}_FP{}_FN{}.png'.format(TP, TN, FP, FN))
            self._save_confusion_heatmap(cf_matrix, path)

        return self._scores_from_confusion_matrix(cf_matrix)


    def run_sklearn_log_reg(self, data, feature_selection_type=None):
        '''

        This function runs logistic regression based on sklearn models.

        (DataFrame, str or None) --> DataFrame

        Parameters
        ----------
        data: Dataframe that will be used in running the model. When no feature
              selection is requested, the first column is the target and the
              remaining columns are the features.

        feature_selection_type: Type of Feature Selection to use. Available options are stepwise_back, pearson and chi2.
                                None (the default) uses every feature column.

        Returns
        ----------
        DataFrame : The DataFrame of sklearn scores (Accuracy, Precision,
                    Recall and F1-Score).
        '''
        X, y = self._prepare_features(data, feature_selection_type)

        # split the dataset
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=self._TEST_SIZE, random_state=self._RANDOM_STATE
        )

        # instantiate the model
        sklearn_lr = LogisticRegression(max_iter=10000)

        # fit the model using the training data
        sklearn_lr.fit(X_train, y_train)

        # use model to make predictions on test data
        y_pred_sklearn = sklearn_lr.predict(X_test)

        # calculate the accuracy, precision, recall and f1-score
        acc = accuracy_score(y_test, y_pred_sklearn)
        precision = precision_score(y_test, y_pred_sklearn)
        recall = recall_score(y_test, y_pred_sklearn)
        f1 = f1_score(y_test, y_pred_sklearn)

        return self._build_scores_df(acc, precision, recall, f1)



In [3]:
# Usage example: import the helper module through nbimporter and create one
# instance. The constructor prints 'init Model Helper notebook'.
# NOTE(review): spambase_model_helper_nb looks like this notebook itself — confirm.
import nbimporter
import spambase_model_helper_nb

model_helper = spambase_model_helper_nb.SpambaseModelHelper()

init Model Helper notebook


In [4]:
# Show the docstring (parameters and return value) of the statsmodels runner.
help(model_helper.run_statsmodels_logit)

Help on method run_statsmodels_logit in module spambase_model_helper_nb:

run_statsmodels_logit(data, feature_selection_type=None, verbose=False) method of spambase_model_helper_nb.SpambaseModelHelper instance
    This function runs logistic regression based on stats models.
    
    (DataFrame, boolean, boolean) --> DataFrame
    
    Parameters
    ----------
    data: Dataframe that will be used in running the model.
    
    feature_selection_type: Type of Feature Selection to use. Available options are stepwise_back, pearson and chi2.
    verbose: True to display statsmodels summary, confusion matrix and its heatmap otherwise false.
    
    Returns
    ----------        
    DataFrame : The DataFrame of statsmodels scores.



In [5]:
# Show the docstring (parameters and return value) of the sklearn runner.
help(model_helper.run_sklearn_log_reg)

Help on method run_sklearn_log_reg in module spambase_model_helper_nb:

run_sklearn_log_reg(data, feature_selection_type=None) method of spambase_model_helper_nb.SpambaseModelHelper instance
    This function runs logistic regression based on sklearn models.
    
    (DataFrame, boolean, boolean) --> DataFrame
    
    Parameters
    ----------
    data: Dataframe that will be used in running the model.
    
    feature_selection_type: Type of Feature Selection to use. Available options are stepwise_back, pearson and chi2.
    
    Returns
    ----------        
    DataFrame : The DataFrame of statsmodels scores.

