In [2]:
# Dependencies

import warnings

import pandas as pd
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Silence every warning for the rest of the session. The filter is installed
# only after the imports above, so import-time warnings are still shown.
# NOTE(review): this blanket filter also hides any warnings the models emit
# while fitting.
warnings.filterwarnings('ignore')

## ETL

In [3]:
# Read the dataset from CSV into a DataFrame (default integer row index).

csv_file = "spam.csv"
spam_df = pd.read_csv(filepath_or_buffer=csv_file)

# Preview the first five rows.
spam_df.head(n=5)

Unnamed: 0,make,address,all,3d,our,over,remove,internet,order,mail,...,semicol,paren,bracket,bang,dollar,pound,cap_avg,cap_long,cap_total,Class
0,0.0,0.0,0.29,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.178,0.0,0.044,0.0,0.0,1.666,10,180,ham
1,0.46,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.125,0.0,0.0,0.0,0.0,1.51,10,74,ham
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.718,11,55,ham
3,0.33,0.44,0.37,0.0,0.14,0.11,0.0,0.07,0.97,1.16,...,0.006,0.159,0.0,0.069,0.221,0.11,3.426,72,819,spam
4,0.0,2.08,0.0,0.0,3.12,0.0,1.04,0.0,0.0,0.0,...,0.0,0.0,0.0,0.263,0.0,0.0,1.428,4,20,spam


In [4]:
# Report the dataset's dimensions; .shape is (number of rows, number of columns).

total_rows, total_cols = spam_df.shape
print('total rows are : ', total_rows)
print('total columns are : ', total_cols)

total rows are :  4601
total columns are :  58


In [5]:
# Null check: evaluates to True if at least one cell in the DataFrame is missing.

spam_df.isna().to_numpy().any()


False

^The check above returns "False", which means there are no null values in the given dataset. Hence, we can move forward with it as-is.

In [6]:
# Split the rows by label and report how many fall into each class.

# Rows labelled 'spam'
is_spam = spam_df['Class'] == 'spam'
class_spam = spam_df[is_spam]
total_spam_rows = class_spam.shape[0]
print('Total number of rows assigned to "Class Spam" =', total_spam_rows)

#------

# Rows labelled 'ham'
is_ham = spam_df['Class'] == 'ham'
class_ham = spam_df[is_ham]
total_ham_rows = class_ham.shape[0]
print('Total number of rows assigned to "Class Ham" =', total_ham_rows)

Total number of rows assigned to "Class Spam" = 1813
Total number of rows assigned to "Class Ham" = 2788


##### Classification Task: Train the classifiers using the first 1000 instances and use the remaining 3601 for testing. 

In [7]:
# Positional split: the first 1000 rows train the classifiers, the remaining rows test them.
n_train = 1000
train_df = spam_df.iloc[:n_train]
test_df = spam_df.iloc[n_train:]
print("Shape of new dataframes - {} , {}".format(train_df.shape, test_df.shape))

Shape of new dataframes - (1000, 58) , (3601, 58)


In [8]:
# List the column labels of the DataFrame (feature columns plus the 'Class' label column).
spam_df.columns

Index(['make', 'address', 'all', '3d', 'our', 'over', 'remove', 'internet',
       'order', 'mail', 'receive', 'will', 'people', 'report', 'addresses',
       'free', 'business', 'email', 'you', 'credit', 'your', 'font', '0',
       'money', 'hp', 'hpl', 'george', '650', 'lab', 'labs', 'telnet', '857',
       'data', '415', '85', 'technology', '1999', 'parts', 'pm', 'direct',
       'cs', 'meeting', 'original', 'project', 're', 'edu', 'table',
       'conference', 'semicol', 'paren', 'bracket', 'bang', 'dollar', 'pound',
       'cap_avg', 'cap_long', 'cap_total', 'Class'],
      dtype='object')

In [9]:
# Distinct label values present in the 'Class' column.
spam_df.loc[:, 'Class'].unique()

array(['ham', 'spam'], dtype=object)

In [10]:
# Create data model for Training
# Features are every column except the 'Class' label. Dropping the label
# yields the same 57 feature columns, in the same order, as listing them by
# hand, without a long hard-coded name list that can silently drift out of
# sync with the dataset.
x_train = train_df.drop(columns=['Class'])

# Target labels for the training rows.
y_train = train_df['Class']

In [11]:
# Create data model for Testing
# Features are every column except the 'Class' label. Dropping the label
# yields the same 57 feature columns, in the same order, as listing them by
# hand, without a long hard-coded name list that can silently drift out of
# sync with the dataset.
x_test = test_df.drop(columns=['Class'])

# Target labels for the held-out test rows.
y_test = test_df['Class']

##### Fuse three classifiers: (1) k-Nearest Neighbor, (2) Random Forest, (3) Logistic Regression 

In [23]:
# Fusion of three classifiers: (1) k-Nearest Neighbor, (2) Random Forest, (3) Logistic Regression
#--------------------------------------------------------------------------------------------------
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
#------------------------------------------------------

# Each base learner is fitted on the training split. The score_* values are
# accuracies measured on that same training data, not on the held-out test rows.
model_KN = KNeighborsClassifier()
model_KN = model_KN.fit(x_train, y_train)
score_KN = model_KN.score(x_train, y_train)

# A random forest is stochastic (bootstrap samples, random feature subsets).
# Fixing random_state makes the reported numbers repeat across kernel restarts.
model_RF = RandomForestClassifier(random_state=42)
model_RF = model_RF.fit(x_train, y_train)
score_RF = model_RF.score(x_train, y_train)

# NOTE(review): warnings are globally suppressed at the top of the notebook, so
# a convergence warning from the default solver settings would go unseen here.
# TODO confirm the fit converges.
model_LR = LogisticRegression()
model_LR = model_LR.fit(x_train, y_train)
score_LR = model_LR.score(x_train, y_train)


def get_models():
    """Return the (name, estimator) pairs for the KNN, random-forest and logistic-regression models."""
    return [
        ('kn', model_KN),
        ('rf', model_RF),
        ('lr', model_LR),
    ]


# Hard voting: the ensemble predicts the class label chosen by the majority of its members.
fused_clf = VotingClassifier(estimators=get_models(), voting='hard').fit(x_train, y_train)

# Accuracy of the ensemble on the held-out test split.
fused_accuracy = fused_clf.score(x_test, y_test)
print("Classification Accuracy of the Fused Classifier ", fused_accuracy)

Classification Accuracy of the Fused Classifier  0.9239100249930575


In [24]:
# Metrics - Fused Classifier
# Predict once on the test split, then derive every metric from those predictions.
fused_predict = fused_clf.predict(x_test)
fused_cm = confusion_matrix(y_true=y_test, y_pred=fused_predict)
fused_class_repo = classification_report(y_true=y_test, y_pred=fused_predict)
fused_accuracy = accuracy_score(y_true=y_test, y_pred=fused_predict)

print("~ Fused Classifier  ~\n")
print("Classification Accuracy: \n", fused_accuracy, "\n")
print("Classification Report: \n",  "(Here, 'recall' is per class classification accuracy.)\n", fused_class_repo)
print("Confusion Matrix: \n", fused_cm, "\n")

~ Fused Classifier  ~

Classification Accuracy: 
 0.9239100249930575 

Classification Report: 
 (Here, 'recall' is per class classification accuracy.)
               precision    recall  f1-score   support

         ham       0.92      0.96      0.94      2182
        spam       0.94      0.86      0.90      1419

    accuracy                           0.92      3601
   macro avg       0.93      0.91      0.92      3601
weighted avg       0.92      0.92      0.92      3601

Confusion Matrix: 
 [[2100   82]
 [ 192 1227]] 



In [18]:
# AdaBoost Ensemble with Decision Tree as the base learner
#-----------------------------------------------------------
from sklearn.ensemble import AdaBoostClassifier
#-----------------------------------------------

# With no base estimator given, AdaBoost boosts a 'DecisionTreeClassifier' initialized with max_depth=1.
# random_state is fixed so the reported accuracy repeats across kernel restarts.
adaboost_clf = AdaBoostClassifier(random_state=42)
adaboost_clf = adaboost_clf.fit(x_train, y_train)

# Accuracy of the boosted ensemble on the held-out test split.
adaboost_accuracy = adaboost_clf.score(x_test, y_test)
print("Classification Accuracy of Adaboost Classifier with decision trees: ", adaboost_accuracy)


Classification Accuracy of Adaboost Classifier with decision trees:  0.9197445154123854


In [19]:
# Metrics - AdaBoost Ensemble
# Predict once on the test split, then derive every metric from those predictions.
adaboost_predict = adaboost_clf.predict(x_test)
adaboost_cm = confusion_matrix(y_true=y_test, y_pred=adaboost_predict)
adaboost_class_repo = classification_report(y_true=y_test, y_pred=adaboost_predict)
adaboost_accuracy = accuracy_score(y_true=y_test, y_pred=adaboost_predict)

print("~ AdaBoost Ensemble ~\n")
print("Classification Accuracy: \n", adaboost_accuracy, "\n")
print("Classification Report: \n",  "(Here, 'recall' is per class classification accuracy.)\n", adaboost_class_repo)
print("Confusion Matrix: \n", adaboost_cm, "\n")

~ AdaBoost Ensemble ~

Classification Accuracy: 
 0.9197445154123854 

Classification Report: 
 (Here, 'recall' is per class classification accuracy.)
               precision    recall  f1-score   support

         ham       0.93      0.94      0.93      2182
        spam       0.90      0.89      0.90      1419

    accuracy                           0.92      3601
   macro avg       0.92      0.91      0.92      3601
weighted avg       0.92      0.92      0.92      3601

Confusion Matrix: 
 [[2049  133]
 [ 156 1263]] 



-----------------------------------------------