# Import packages and libraries

In [16]:
import numpy as np
import pandas as pd
from IPython.display import display
# from wordcloud import WordCloud
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC 
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Read and explore data

In [2]:
def read_data(src):
    """Load the CSV at ``src`` into a pandas DataFrame.

    ``src`` is forwarded unchanged to ``pd.read_csv`` and the parsed
    frame is returned.
    """
    return pd.read_csv(src)

# Dataset location; a relative path, so it is resolved against the working directory.
source = "emails.csv"
# Parse the CSV once; later cells work from `df`.
df = read_data(src=source)

In [3]:
def explore_data(data):
    """Print and display a quick overview of a DataFrame.

    Shows, in order: the shape, the ``info()`` summary, the per-column
    null counts, the first rows (``head()``) and the ``describe()``
    statistics.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to summarise. It is only read, never modified.

    Returns
    -------
    None
    """
    print(f"Shape of the data: {data.shape}\n")

    print("Data information:")
    # DataFrame.info() writes its own report and returns None, so it is
    # called directly; wrapping it in display() rendered a stray "None".
    data.info()

    print(f"\nNumber of null values in each column:\n{data.isnull().sum()}\n")

    print("Data Samples:")
    display(data.head())

    print("Data descriptive statistics:")
    display(data.describe())
    
# Summarise the freshly loaded frame before any preprocessing.
explore_data(data=df)

Shape of the data: (5172, 3002)

Data information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5172 entries, 0 to 5171
Columns: 3002 entries, Email No. to Prediction
dtypes: int64(3001), object(1)
memory usage: 118.5+ MB


None


Number of null values in each column:
Email No.     0
the           0
to            0
ect           0
and           0
             ..
military      0
allowing      0
ff            0
dry           0
Prediction    0
Length: 3002, dtype: int64

Data Samples:


Unnamed: 0,Email No.,the,to,ect,and,for,of,a,you,hou,...,connevey,jay,valued,lay,infrastructure,military,allowing,ff,dry,Prediction
0,Email 1,0,0,1,0,0,0,2,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Email 2,8,13,24,6,6,2,102,1,27,...,0,0,0,0,0,0,0,1,0,0
2,Email 3,0,0,1,0,0,0,8,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Email 4,0,5,22,0,5,1,51,2,10,...,0,0,0,0,0,0,0,0,0,0
4,Email 5,7,6,17,1,5,2,57,0,9,...,0,0,0,0,0,0,0,1,0,0


Data descriptive statistics:


Unnamed: 0,the,to,ect,and,for,of,a,you,hou,in,...,connevey,jay,valued,lay,infrastructure,military,allowing,ff,dry,Prediction
count,5172.0,5172.0,5172.0,5172.0,5172.0,5172.0,5172.0,5172.0,5172.0,5172.0,...,5172.0,5172.0,5172.0,5172.0,5172.0,5172.0,5172.0,5172.0,5172.0,5172.0
mean,6.640565,6.188128,5.143852,3.075599,3.12471,2.62703,55.517401,2.466551,2.024362,10.600155,...,0.005027,0.012568,0.010634,0.098028,0.004254,0.006574,0.00406,0.914733,0.006961,0.290023
std,11.745009,9.534576,14.101142,6.04597,4.680522,6.229845,87.574172,4.314444,6.967878,19.281892,...,0.105788,0.199682,0.116693,0.569532,0.096252,0.138908,0.072145,2.780203,0.098086,0.453817
min,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,1.0,1.0,0.0,1.0,0.0,12.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,3.0,3.0,1.0,1.0,2.0,1.0,28.0,1.0,0.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,8.0,7.0,4.0,3.0,4.0,2.0,62.25,3.0,1.0,12.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
max,210.0,132.0,344.0,89.0,47.0,77.0,1898.0,70.0,167.0,223.0,...,4.0,7.0,2.0,12.0,3.0,4.0,3.0,114.0,4.0,1.0


# Data Preprocessing

## Check for duplicates

In [4]:
# "Email No." is a per-row label ("Email 1", "Email 2", ...). Comparing whole
# rows therefore cannot flag two emails with identical content, so the label
# column is left out of the comparison.
# NOTE(review): assumes the labels are unique across the file — confirm.
duplicateRows = df[df.drop(columns="Email No.").duplicated()]
duplicateRows

Unnamed: 0,Email No.,the,to,ect,and,for,of,a,you,hou,...,connevey,jay,valued,lay,infrastructure,military,allowing,ff,dry,Prediction


In [5]:
# Features: every column between the leading "Email No." label and the
# trailing "Prediction" target.
X = df.iloc[:, 1:-1]
y = df.loc[:, "Prediction"]

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=3
)

# Modelling 

## SVM model

In [19]:
# Standardise the features, then fit a linear-kernel SVM.
# `gamma` is only used by the 'rbf', 'poly' and 'sigmoid' kernels, so it is
# not passed for the linear kernel.
svm_clf = make_pipeline(StandardScaler(), SVC(kernel='linear'))
svm_clf.fit(X_train, y_train)
y_pred_svm = svm_clf.predict(X_test)

# Accuracy is reported as a percentage.
print(f"Accuracy of the SVM model: {accuracy_score(y_test, y_pred_svm) * 100:.3f}")
# Start the matrix on its own line so both rows line up.
print(f"Confusion Matrix of the SVM model:\n{confusion_matrix(y_test, y_pred_svm)}")
print(f"Classification report of the SVM model:\n{classification_report(y_test, y_pred_svm)}")

Accuracy of the SVM model: 95.459
Confusion Matrix of the SVM model:
[[724  26]
 [ 21 264]]
Classification report of the SVM model:
              precision    recall  f1-score   support

           0       0.97      0.97      0.97       750
           1       0.91      0.93      0.92       285

    accuracy                           0.95      1035
   macro avg       0.94      0.95      0.94      1035
weighted avg       0.95      0.95      0.95      1035



## Naive Bayes model

In [20]:
# Fit a Gaussian Naive Bayes classifier on the unscaled training features.
nb_clf = GaussianNB()
nb_clf.fit(X_train, y_train)
y_pred_nb = nb_clf.predict(X_test)

# Accuracy is reported as a percentage.
print(f"Accuracy of the Naive Bayes model: {accuracy_score(y_test, y_pred_nb) * 100:.3f}")
# Start the matrix on its own line so both rows line up.
print(f"Confusion Matrix of the Naive Bayes model:\n{confusion_matrix(y_test, y_pred_nb)}")
print(f"Classification report of the Naive Bayes model:\n{classification_report(y_test, y_pred_nb)}")

Accuracy of the Naive Bayes model: 94.976
Confusion Matrix of the Naive Bayes model:
[[716  34]
 [ 18 267]]
Classification report of the Naive Bayes model:
              precision    recall  f1-score   support

           0       0.98      0.95      0.96       750
           1       0.89      0.94      0.91       285

    accuracy                           0.95      1035
   macro avg       0.93      0.95      0.94      1035
weighted avg       0.95      0.95      0.95      1035

