In [1]:
import re
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

### Loading Dataset

In [2]:
df = pd.read_csv('data/CEAS_08.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39154 entries, 0 to 39153
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   sender    39154 non-null  object
 1   receiver  38692 non-null  object
 2   date      39154 non-null  object
 3   subject   39126 non-null  object
 4   body      39154 non-null  object
 5   label     39154 non-null  int64 
 6   urls      39154 non-null  int64 
dtypes: int64(2), object(5)
memory usage: 2.1+ MB


### Combining subject and body and merging them into one column (email)

In [4]:
df['email'] = df['subject'] + ' ' + df['body']

### Removing unneccessary columns

In [5]:
df = df.drop(columns=['receiver', 'date', 'sender', 'subject', 'body'], axis=1)

In [6]:
df.head(2)

Unnamed: 0,label,urls,email
0,1,1,"Never agree to be a loser Buck up, your troubl..."
1,1,1,Befriend Jenna Jameson \nUpgrade your sex and ...


### Lowering text of email column and removing \n from this text

In [7]:
df['email'] = df['email'].str.lower()
df['email'] = df['email'].str.replace('\n', '')

### Removing special charachters, white spaces and duplicates and nan values from text

In [8]:
special_characters = [
    '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/',
    ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~',
]
pattern = r'\b[a-zA-Z]+\b'

def remove_special_chars_from_text(text):
    temp = ''.join([i if i not in special_characters else ' ' for i in str(text)])
    return ' '.join(re.findall(pattern, temp))

In [9]:
df['email'] = df['email'].apply(remove_special_chars_from_text)
df['email'] = df['email'].str.replace(r'\s+', ' ', regex=True)

In [10]:
df = df.drop_duplicates(subset='email', keep='first')

In [11]:
df.shape

(34516, 3)

In [12]:
min_length = 100
df = df[df['email'].apply(len) >= min_length]

In [13]:
df.shape

(32307, 3)

In [14]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

class EmailPhishingDataProcessor:
    def __init__(self):
        self.vectorizer = CountVectorizer()
        self.X = None
        self.y = None
        self.label_names = None
    
    def preprocess_data(self, df):
        X_email = self.vectorizer.fit_transform(df['email'])
        y = df['label'].values
        email_column_names = self.vectorizer.get_feature_names_out()        
        return X_email, y, email_column_names
    
    def process_text(self, text):
        processed_data = self.vectorizer.transform([text])
        return processed_data

    def fit(self, df, max_features=None):
        if max_features:
            self.vectorizer = CountVectorizer(max_features=max_features)        
        self.X, self.y, self.label_names = self.preprocess_data(df)
        
    def get_features_labels(self):
        return self.X, self.y, self.label_names


class NaiveBayesClassifier:
    def __init__(self):
        self.clf = MultinomialNB()
    
    def fit(self, X_train, y_train):
        self.clf.fit(X_train, y_train)
    
    def test(self, X_test, y_test):
        return self.clf.score(X_test, y_test)
    
    def predict(self, X):
        return self.clf.predict(X)


In [15]:
data_proccessor = EmailPhishingDataProcessor()
data_proccessor.fit(df, max_features=40000)
X, y, feature_names = data_proccessor.get_features_labels()

In [16]:
n_total = X.shape[0]*X.shape[1]
n_ratings = X.nnz
sparsity = n_ratings / n_total
print(f"Matrix sparsity: {round(sparsity*100,2)}%")

Matrix sparsity: 0.25%


In [17]:
X.shape

(32307, 40000)

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [19]:
clf = NaiveBayesClassifier()
clf.fit(X_train, y_train)

In [20]:
y_pred = clf.predict(X_test)

In [21]:
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy:{round(accuracy, 3)*100}%")

Accuracy:99.5%


In [22]:
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      1.00      3389
           1       1.00      0.99      0.99      3073

    accuracy                           1.00      6462
   macro avg       1.00      0.99      1.00      6462
weighted avg       1.00      1.00      1.00      6462



In [27]:
test = '''
my brother, send me some money i am stuck in other country 
and i lost my wallet and i can not efford buying ticket to come back to the country.
'''

In [28]:
new_test = data_proccessor.process_text(test)
new_test_result = clf.predict(new_test)[0]
if new_test_result == 1:
    print('its phishing email')
else:
    print('Its not phishing email')

Its not phishing email


In [29]:
test2 = '''
if you click on the link and use our product, u will win a lot of money
'''

In [30]:
new_test2 = data_proccessor.process_text(test2)
new_test_result2 = clf.predict(new_test2)[0]
if new_test_result2 == 1:
    print('its phishing email')
else:
    print('Its not phishing email')

its phishing email
