#  Spambase Dataset Loader

In [1]:
# Importing an ipynb file from another ipynb file
!pip install ipynb

# Importing functions from another jupyter notebook
!pip install nbimporter



In [2]:
## import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import statsmodels.api as sm
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

class SpambaseDatasetLoader():
    
    def __init__(self):
        print('init Loader notebook')
    
    def load_dataset(self, url='https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.data'):
        '''
        
        (string) --> None
        
        This function retrieves the spambase.data from University of California Irvine (UCI) - Dataset Repository
        and save into two files:
        
        1. spambase_all.csv - The CSV file that contain all the rows from spambase.data.
        2. spambase_balance.csv - The CSV file that rows balance between spam and not spam records.
        
        '''
        print('Loading dataset.')
        
        columns = [
            'word_freq_make',
            'word_freq_address',
            'word_freq_all',
            'word_freq_3d',
            'word_freq_our',
            'word_freq_over',
            'word_freq_remove',
            'word_freq_internet',
            'word_freq_order',
            'word_freq_mail',
            'word_freq_receive',
            'word_freq_will',
            'word_freq_people',
            'word_freq_report',
            'word_freq_addresses',
            'word_freq_free',
            'word_freq_business',
            'word_freq_email',
            'word_freq_you',
            'word_freq_credit',
            'word_freq_your',
            'word_freq_font',
            'word_freq_000',
            'word_freq_money',
            'word_freq_hp',
            'word_freq_hpl',
            'word_freq_george',
            'word_freq_650',
            'word_freq_lab',
            'word_freq_labs',
            'word_freq_telnet',
            'word_freq_857',
            'word_freq_data',
            'word_freq_415',
            'word_freq_85',
            'word_freq_technology',
            'word_freq_1999',
            'word_freq_parts',
            'word_freq_pm',
            'word_freq_direct',
            'word_freq_cs',
            'word_freq_meeting',
            'word_freq_original',
            'word_freq_project',
            'word_freq_re',
            'word_freq_edu',
            'word_freq_table',
            'word_freq_conference',
            'char_freq_;',
            'char_freq_(',
            'char_freq_[',
            'char_freq_!',
            'char_freq_$',
            'char_freq_#',
            'capital_run_length_average',
            'capital_run_length_longest',
            'capital_run_length_total',
            'spam_nonspam']
        
        # retrieve the spam data from icu.
        spam_data = pd.read_csv(url, header=None, names=columns, index_col=False)

        # make the label the first feature.
        spam_data.insert(0, 'target_spam_nonspam', spam_data['spam_nonspam'])
        spam_data = spam_data.drop('spam_nonspam', axis = 1)

        # save the data to new csv.
        spam_data.to_csv('./datasets/spambase_all.csv', index = False)
        print('Loading spambase_all.csv completed.')
            
        spam = spam_data[spam_data['target_spam_nonspam']==1]
        non_spam = spam_data[spam_data['target_spam_nonspam']==0]
        non_spam = non_spam.sample(n=len(spam), random_state=101)
        spam_data_balance = pd.concat([spam,non_spam],axis=0)            
        print('Loading spambase_balance.csv completed.')
    
    def get_full_dataset(self):
        '''
        
        (None) --> Dataframe
        
        This function returns the spambase imbalance dataset.
        
        '''
        print('get_full_dataset')
        return pd.read_csv('./datasets/spambase_all.csv')        
    
    def get_balance_dataset(self): 
        '''
        
        (None) --> Dataframe
        
        This function returns the spambase dataset based from balance dataset.
                
        '''
        print('get_balance_data')
        return pd.read_csv('./datasets/spambase_balance.csv')

    def backward_elimitation(self, data):
        '''

        (DataFrame) --> Array
        
        This backward elimitation technique used Logistic regression-based model which 
        selects the features based on the p-value score of the feature.
        The features with p-value less than 0.05 are considered to be the more relevant feature
        Source: https://www.analyticsvidhya.com/blog/2021/04/discovering-the-shades-of-feature-selection-methods/
        
        Parameters
        ----------
        data: Dataframe that will be use in feature selection.
        
        Returns
        ----------
        Array of features
        
        '''
        print('Feature Selection using Backward Elimination')
        threshold = 0.05
        selected_columns = data.columns
        selected_columns = selected_columns[1:].values # remove the label
        
        X = data.iloc[:,1:]
        y = data.iloc[:,0]
        
        # Add constant to predictors for statsmodels
        X = sm.add_constant(X)

        logit_model = sm.Logit(y, X).fit()

        # Perform backward elimination
        while len(X.columns) > 1:
            # Fit Logit model with all predictors except one
            logit_model = sm.Logit(y, X.iloc[:, :-1]).fit()

            # Get p-values for each predictor
            p_values = logit_model.pvalues

            # Remove predictor with highest p-value
            max_p_value = p_values.idxmax()
            if p_values[max_p_value] > threshold:
                X = X.drop(max_p_value, axis=1)
            else:
                break
                
        X = X.drop('const', axis=1) # cleanup remove the const column before returning
        return X.columns
    
    def person_correlation(self, data):
        '''
        
        (DataFrame) --> Array 
        
        Pearson Correlation is used to construct a correlation matrix that measures the linear association 
        between two features and gives a value between -1 and 1 indicating how related the two features are to one another.
        This measures the degree to which two features are interdependent by computing the association 
        between each feature and the target variable, the one exerting high impact on the target can be picked out
        
        A value of 1 indicates a positive correlation, -1 indicates a negative correlation and 0 indicates no correlation between the features.
        Source: https://www.analyticsvidhya.com/blog/2021/04/discovering-the-shades-of-feature-selection-methods/
        
        Parameters
        ----------
        data: Dataframe that will be use in feature selection.
        
        Returns
        ----------
        Array of features       
        
        '''
        print('Feature Selection using Person Correlation')
        corr = data.corr()
        cor_target = abs(corr['target_spam_nonspam'])
        
        #Selecting highly correlated features
        relevant_features = cor_target[cor_target>0.2]
        selected_columns = relevant_features.keys().to_list()
       
        selected_columns.remove('target_spam_nonspam')
        return selected_columns
    
    def chi2(self, data):
        '''
        
        (DataFrame) --> Array 
        
        A chi-square test is used in statistical models to check the independence of attributes.
        The model measures the degree of deviation between the expected and actual response.
        The lower the value of Chi-square, the less dependent the variables are to one another, 
        and the higher the value more is their correlation. 
        Source: https://www.analyticsvidhya.com/blog/2021/04/discovering-the-shades-of-feature-selection-methods/        
        
        Parameters
        ----------
        data: Dataframe that will be use in feature selection.
        
        Returns
        ----------
        Array of features
        
        '''
        print('Feature Selection using Chi-squared')
        X = data.iloc[:,1:]
        y = data.iloc[:,0]
        
        chi2_features = SelectKBest(chi2, k=19)
        X_kbest_features = chi2_features.fit_transform(X, y)
        mask=chi2_features.get_support()
        selected_columns =[]
        for bool, feature in zip(mask, X.columns):
            if (bool):
                selected_columns.append(feature)

        return selected_columns
        
    
    def perform_feature_selection(self, data, feature_selection_type):
        '''
                        
        (DataFrame, float) --> Dataframe
        
        
        This function performs feature selection based on provided feature selection type.  
        
        Parameters
        ----------
        data: Dataframe that will be use in feature selection.
        
        feature_selection_type: The type of feature selection to be performed.
        1. stepwise_back - use stepwise backward elimation technique in feature selection..
        2. pearson - perform the feature selection using pearson correlation.
        3. chi2 - perform the feature selection using chi-squared.
        
        Returns
        ----------        
        DataFrame : The DataFrame after performing feature selection.
        
        '''
        print('The shape before feature selection: {}'.format(data.shape))
        
        if feature_selection_type == 'stepwise_back':        
            selected_columns = self.backward_elimitation(data)
        elif feature_selection_type == 'pearson':
            selected_columns = self.person_correlation(data)
        elif feature_selection_type == 'chi2':
            selected_columns = self.chi2(data)
        else:
            raise ValueError('Unknown type of feature selection.')
        
        y = pd.DataFrame()
        y['target_spam_nonspam'] = data.iloc[:,0]
        
        X = pd.DataFrame(data = data.iloc[:,1:], columns = selected_columns)
        print('The shape after feature selection: {}'.format(X.shape))
               
        return X, y

In [3]:
import nbimporter
import spambase_dataset_loader_nb

loader = spambase_dataset_loader_nb.SpambaseDatasetLoader()

init Loader notebook


In [4]:
help(loader.load_dataset)

Help on method load_dataset in module spambase_dataset_loader_nb:

load_dataset(url='https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.data') method of spambase_dataset_loader_nb.SpambaseDatasetLoader instance
    (string) --> None
    
    This function retrieves the spambase.data from University of California Irvine (UCI) - Dataset Repository
    and save into two files:
    
    1. spambase_all.csv - The CSV file that contain all the rows from spambase.data.
    2. spambase_balance.csv - The CSV file that rows balance between spam and not spam records.



In [6]:
help(loader.perform_feature_selection)

Help on method perform_feature_selection in module spambase_dataset_loader_nb:

perform_feature_selection(data, feature_selection_type) method of spambase_dataset_loader_nb.SpambaseDatasetLoader instance
    (DataFrame, float) --> Dataframe
    
    
    This function performs feature selection based on provided feature selection type.  
    
    Parameters
    ----------
    data: Dataframe that will be use in feature selection.
    
    feature_selection_type: The type of feature selection to be performed.
    1. stepwise_back - use stepwise backward elimation technique in feature selection..
    2. pearson - perform the feature selection using pearson correlation.
    3. chi2 - perform the feature selection using chi-squared.
    
    Returns
    ----------        
    DataFrame : The DataFrame after performing feature selection.



In [7]:
help(loader.backward_elimitation)

Help on method backward_elimitation in module spambase_dataset_loader_nb:

backward_elimitation(data) method of spambase_dataset_loader_nb.SpambaseDatasetLoader instance
    (DataFrame) --> Array
    
    This backward elimitation technique used Logistic regression-based model which 
    selects the features based on the p-value score of the feature.
    The features with p-value less than 0.05 are considered to be the more relevant feature
    Source: https://www.analyticsvidhya.com/blog/2021/04/discovering-the-shades-of-feature-selection-methods/
    
    Parameters
    ----------
    data: Dataframe that will be use in feature selection.
    
    Returns
    ----------
    Array of features



In [8]:
help(loader.person_correlation)

Help on method person_correlation in module spambase_dataset_loader_nb:

person_correlation(data) method of spambase_dataset_loader_nb.SpambaseDatasetLoader instance
    (DataFrame) --> Array 
    
    Pearson Correlation is used to construct a correlation matrix that measures the linear association 
    between two features and gives a value between -1 and 1 indicating how related the two features are to one another.
    This measures the degree to which two features are interdependent by computing the association 
    between each feature and the target variable, the one exerting high impact on the target can be picked out
    
    A value of 1 indicates a positive correlation, -1 indicates a negative correlation and 0 indicates no correlation between the features.
    Source: https://www.analyticsvidhya.com/blog/2021/04/discovering-the-shades-of-feature-selection-methods/
    
    Parameters
    ----------
    data: Dataframe that will be use in feature selection.
    
    Returns
 

In [9]:
help(loader.chi2)

Help on method chi2 in module spambase_dataset_loader_nb:

chi2(data) method of spambase_dataset_loader_nb.SpambaseDatasetLoader instance
    (DataFrame) --> Array 
    
    A chi-square test is used in statistical models to check the independence of attributes.
    The model measures the degree of deviation between the expected and actual response.
    The lower the value of Chi-square, the less dependent the variables are to one another, 
    and the higher the value more is their correlation. 
    Source: https://www.analyticsvidhya.com/blog/2021/04/discovering-the-shades-of-feature-selection-methods/        
    
    Parameters
    ----------
    data: Dataframe that will be use in feature selection.
    
    Returns
    ----------
    Array of features

