# Spam Classifier With Gaussian Naive Bayes

In [None]:
import pandas as pd
import numpy as np
# import pingouin as pg , kaggle does not support pingouin
from sklearn.preprocessing import PowerTransformer
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_selection import SelectKBest, f_classif
from imblearn.combine import SMOTETomek
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

In [None]:
# Load the dataset from a relative `../input` directory (Kaggle-style layout);
# adjust the path when running the notebook elsewhere.
spam = pd.read_csv('../input/spambase/spambase_csv.csv')

In [None]:
# Preview the first rows to check the available columns, including `class`.
spam.head()

In [None]:
# Features are every column except the `class` target.
x = spam.drop(columns = 'class')
y = spam['class']

# 70/30 split with a fixed seed so the partition is reproducible.
xtrain, xtest, ytrain, ytest = train_test_split(
    x, y,
    train_size = 0.7,
    random_state = 42,
)

## Feature Selection

- If skewness is less than -1 or greater than 1, the distribution is highly skewed.
- If skewness is between -1 and -0.5 or between 0.5 and 1, the distribution is moderately skewed.
- If skewness is between -0.5 and 0.5, the distribution is approximately symmetric.



In [None]:
def skewness_check(data, skew_cols = False, non_skew = False):
    """Summarise or list the skewness of every column in ``data``.

    Skewness bands (on the absolute value of the skew):
      * High:     |skew| >= 1
      * Moderate: 0.5 <= |skew| < 1
      * None:     everything else (|skew| < 0.5, or an undefined/NaN skew)

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are passed to ``DataFrame.skew``.
    skew_cols : bool, default False
        If truthy, return the columns with |skew| >= 0.5 (moderate or high).
        Takes precedence over ``non_skew``.
    non_skew : bool, default False
        If truthy (and ``skew_cols`` is falsy), return the columns with
        |skew| < 0.5.

    Returns
    -------
    pandas.DataFrame
        With both flags falsy: a 3-row frame indexed ``High``/``Moderate``/
        ``None`` with the number of columns in each band under ``'Skew'``.
        Otherwise: the selected columns (as index) with their skew under
        ``'Skew'``, sorted from most positive to most negative.
    """
    skewness = data.skew().sort_values(ascending = False).to_frame('Skew')
    magnitude = skewness['Skew'].abs()

    # Flags are tested for truthiness so that non-bool values (e.g. None)
    # fall through to the summary instead of silently returning None.
    if skew_cols:
        return skewness[magnitude >= 0.5]
    if non_skew:
        return skewness[magnitude < 0.5]

    high = int((magnitude >= 1).sum())
    moderate = int(((magnitude >= 0.5) & (magnitude < 1)).sum())
    skew_dict = {
        'High': high,
        'Moderate': moderate,
        # Remainder, so NaN skews are counted here like in a plain else-branch.
        'None': len(magnitude) - high - moderate,
    }
    return pd.DataFrame.from_dict(skew_dict, orient = 'index', columns = ['Skew'])
   

In [None]:
# Default mode: count how many training features fall into each skewness band.
skewness_check(xtrain)

All features are highly skewed. We will first drop highly correlated features to prevent redundancy. This also brings the data closer to the feature-independence assumption of Gaussian Naive Bayes, although removing correlated features does not guarantee that the assumption holds.

In [None]:
def correlation(data, threshold = 0.75):
    """Return the columns that are strongly rank-correlated with an earlier column.

    A Spearman correlation matrix is computed for ``data``. A column is
    flagged when its absolute correlation with any column that appears
    *before* it in ``data`` is at least ``threshold``; the earlier column of
    each correlated pair is therefore the one that is kept.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are compared pairwise.
    threshold : float, default 0.75
        Minimum absolute Spearman correlation for a pair to count as redundant.

    Returns
    -------
    list
        Flagged column names, in the order they appear in the correlation
        matrix (deterministic, unlike iterating over a set).
    """
    corr_matrix = data.corr(method = 'spearman')
    strong = corr_matrix.abs().to_numpy() >= threshold

    # Strictly-lower triangle = pairs (i, j) with j < i, which excludes the
    # diagonal (self-correlation) and avoids counting each pair twice.
    flagged = np.tril(strong, k = -1).any(axis = 1)
    return list(corr_matrix.columns[flagged])
    

In [None]:
# Derive the redundant features from the training split only (default
# threshold of 0.75), so the test split does not influence the selection.
high_corr_columns = correlation(xtrain)
high_corr_columns

In [None]:
# Reassign instead of mutating in place, and ignore already-missing columns,
# so this cell can be re-run without raising a KeyError.
xtrain = xtrain.drop(columns = high_corr_columns, errors = 'ignore')
xtest = xtest.drop(columns = high_corr_columns, errors = 'ignore')

In [None]:
# Run on your own Jupyter notebook with pingouin installed.
# pg.homoscedasticity(xtrain, method="levene", alpha=.05)

- H0: Variances are equal.
- H1: Variances are not equal.

The p-value is greater than 0.05, so the test fails to reject the null hypothesis: there is no evidence against homogeneity of variance among the features. We will therefore use the ANOVA F-test for feature selection.

In [None]:
def feature_selection(x,y):
    """Score features with the ANOVA F-test and keep the above-average ones.

    Parameters
    ----------
    x : pandas.DataFrame
        Feature matrix; column names are reported in the result.
    y : array-like
        Class labels passed to ``f_classif``.

    Returns
    -------
    pandas.DataFrame
        Columns ``'Feature'`` and ``'Score'`` for every feature whose F-score
        lies between the mean and the maximum score (both inclusive), sorted
        by score in descending order.
    """
    selector = SelectKBest(score_func = f_classif, k = 'all')
    selector.fit(x, y)

    # With k='all' the support mask keeps every feature; it is applied anyway
    # so names and scores are always filtered consistently.
    kept = selector.get_support()
    records = list(zip(x.columns.values[kept], selector.scores_[kept]))
    ranking = pd.DataFrame(records, columns = ['Feature', 'Score'])

    lower, upper = ranking['Score'].mean(), ranking['Score'].max()
    in_range = ranking['Score'].between(lower, upper)
    return ranking[in_range].sort_values('Score', ascending = False)

In [None]:
# Score the features on the training split only; `df` holds the retained
# features and their F-scores and is reused below to subset both splits.
df = feature_selection(xtrain, ytrain)
df

In [None]:
# Keep the same selected feature columns, in the same order, in both splits.
FS_xtrain, FS_xtest = (frame[df['Feature'].tolist()] for frame in (xtrain, xtest))

## Data Transformation

In [None]:
# Transforming the highly skewed features to reduce skewness to approximate normally distributed data.
def data_transform_PT(data_train, data_test):
    """Apply a Yeo-Johnson power transform fitted on the training data.

    The transformer is fitted on ``data_train`` only and then applied to
    ``data_test``, so no information from the test split leaks into the fit.

    Parameters
    ----------
    data_train : pandas.DataFrame
        Training features; used to fit the transformer.
    data_test : pandas.DataFrame
        Test features; transformed with the parameters learned on the
        training data.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(data_train_transformed, data_test_transformed)``, each with the
        same column names and the same row index as its input so the rows
        stay aligned with their labels.
    """
    # Using yeo-johnson because data contains values of zero.
    pt = PowerTransformer(method = 'yeo-johnson', standardize = False)

    # Carry the original index over: the transformer returns a bare array, and
    # a default RangeIndex would no longer match the shuffled label index.
    data_train_transformed = pd.DataFrame(pt.fit_transform(data_train),
                                          columns = data_train.columns,
                                          index = data_train.index)
    data_test_transformed = pd.DataFrame(pt.transform(data_test),
                                         columns = data_test.columns,
                                         index = data_test.index)
    return data_train_transformed, data_test_transformed

In [None]:
# Transform the selected features; the transformer is fitted on the training
# split and reused for the test split inside data_transform_PT.
xtrain_PT, xtest_PT = data_transform_PT(FS_xtrain,FS_xtest)

In [None]:
# Re-count the skewness bands after the power transform.
skewness_check(xtrain_PT)

Although a significant number of features are still highly skewed, skewness has been greatly reduced. To check this, a function called `skew_comparison` will be created to compare the change in skewness.

In [None]:
def skew_comparison(x_1, x_2):
    """Compare per-column skewness before and after a transformation.

    Parameters
    ----------
    x_1 : pandas.DataFrame
        Data before the transformation.
    x_2 : pandas.DataFrame
        Data after the transformation.

    Returns
    -------
    pandas.DataFrame
        Indexed by the columns present in both inputs (inner join), ordered
        by descending ``'Skew Before'``, with columns:

        * ``'Skew Before'`` - skewness in ``x_1``
        * ``'Skew After'`` - skewness in ``x_2``
        * ``'Skew Reduction'`` - change in absolute skewness,
          ``|after| - |before|``. Negative values mean the column moved
          closer to symmetric; positive values mean skewness got worse.
    """
    skew_before = x_1.skew().sort_values(ascending = False).to_frame('Skew Before')
    skew_after = x_2.skew().sort_values(ascending = False).to_frame('Skew After')

    df = skew_before.merge(skew_after, left_index = True, right_index = True)

    # Compare magnitudes rather than raw values: -abs(before - after) was never
    # positive, so it also reported a "reduction" when skewness increased or
    # merely flipped sign.
    df['Skew Reduction'] = df['Skew After'].abs() - df['Skew Before'].abs()

    return df

In [None]:
# `xtrain` still contains features that were filtered out before the transform;
# the index merge inside skew_comparison keeps only features present in both.
skew_comparison(xtrain, xtrain_PT)

## Correcting Imbalance

In [None]:
# Class counts per split, to see how imbalanced the labels are.
print(ytrain.value_counts())
print(ytest.value_counts())

In [None]:
# Resample the training split only. The test split must keep its original
# samples and class balance: fitting SMOTETomek on it would add synthetic
# points and remove real ones, so the metrics would no longer describe
# performance on real data.
smt = SMOTETomek(random_state = 42)
xtrain_res, ytrain_res = smt.fit_resample(xtrain_PT, ytrain)

# The `_res` names are kept for the evaluation cell below; they now refer to
# the untouched test data.
xtest_res, ytest_res = xtest_PT, ytest

print(ytrain_res.value_counts())
print(ytest_res.value_counts())

## Machine Learning with GaussianNB

In [None]:
# Fit Gaussian Naive Bayes on the resampled training data.
gnb = GaussianNB()
gnb.fit(xtrain_res, ytrain_res)

# Hard class labels are what accuracy and F1 expect.
predictions = gnb.predict(xtest_res)

# ROC AUC needs a continuous score, not thresholded labels; with labels it
# collapses to a single operating point. `classes_` is sorted, so column 1 of
# predict_proba is the probability of the larger (positive) label.
positive_scores = gnb.predict_proba(xtest_res)[:, 1]

accuracy = accuracy_score(ytest_res, predictions)
f1 = f1_score(ytest_res, predictions)
auc = roc_auc_score(ytest_res, positive_scores)

print('accuracy: ', accuracy)
print('f1: ', f1)
print('AUC: ', auc)