In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, roc_curve, auc, RocCurveDisplay
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import nltk
import pickle
import os
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, LSTM, SpatialDropout1D
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Download NLTK data files (if not already installed)
nltk.download('stopwords')
nltk.download('punkt')

def clean_text(text, stop_words):
    """
    Clean the text data by removing punctuation, URLs, converting to lowercase, 
    and removing stop words.
    """
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Replace URLs with the word URL_found
    text = re.sub(r'http\S+|www\S+|https\S+', 'URLfound', text, flags=re.MULTILINE)
    # Convert to lowercase
    text = text.lower()
    # Tokenize the text
    word_tokens = word_tokenize(text)
    # Remove stop words
    filtered_text = [word for word in word_tokens if word not in stop_words]
    return ' '.join(filtered_text)

def stem_text(text):
    """
    Apply stemming to the cleaned text.
    """
    stemmer = PorterStemmer()
    word_tokens = word_tokenize(text)
    stemmed_text = [stemmer.stem(word) for word in word_tokens]
    return ' '.join(stemmed_text)

def ml_preprocess(df):
    """
    Preprocess the email dataset for phishing detection.
    - Fill missing values in the subject column
    - Combine subject and body into a single content column
    - Clean the text data
    - Perform stemming on the cleaned text
    """
    stop_words = set(stopwords.words('english'))
    
    # Fill missing values in the subject column with an empty string
    df['subject'].fillna('', inplace=True)

    # Combine the subject and body into a single column named content
    df['content'] = df['subject'] + ' ' + df['body']

    # Clean the content column
    df['content'] = df['content'].apply(lambda x: clean_text(x, stop_words))

    # Perform stemming on the cleaned content
    df['content'] = df['content'].apply(stem_text)

    # Drop all other columns except label and content
    df = df[['label', 'content']]
    
    return df

# Load the dataset
df = pd.read_csv("Nazario_5.csv")

# Preprocess the dataset
df = ml_preprocess(df)

# Split the data into training and testing sets
X = df['content']
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the TfidfVectorizer
vectorizer = TfidfVectorizer()

# Define the models
rf_model = RandomForestClassifier(random_state=42)
svm_model = SVC(random_state=42)
knn_model = KNeighborsClassifier()

# Create pipelines for each model
rf_pipeline = Pipeline([
    ('vectorizer', vectorizer),
    ('classifier', rf_model)
])

svm_pipeline = Pipeline([
    ('vectorizer', vectorizer),
    ('classifier', svm_model)
])

knn_pipeline = Pipeline([
    ('vectorizer', vectorizer),
    ('classifier', knn_model)
])



ModuleNotFoundError: No module named 'tensorflow.keras.wrappers'

In [None]:
df.head(5)

Unnamed: 0,label,content
0,0,fw june 29 bna inc daili labor report user id ...
1,0,ngx failov plan hi chri tonight roll new repor...
2,0,intranet site rika r new origin messag thoma p...
3,0,fw ena upstream compani inform johngerald curr...
4,0,new master physic gerald staci attach workshee...


In [None]:

# Define the hyperparameters for Grid Search
rf_params = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__max_depth': [None, 10, 20, 30],
    'classifier__min_samples_split': [2, 5, 10]
}





# Perform Grid Search CV
rf_grid = GridSearchCV(rf_pipeline, rf_params, cv=5, n_jobs=-1, verbose=1)

# Train and evaluate Random Forest
rf_grid.fit(X_train, y_train)

print("Best parameters for Random Forest:", rf_grid.best_params_)




Fitting 5 folds for each of 36 candidates, totalling 180 fits
Best parameters for Random Forest: {'classifier__max_depth': None, 'classifier__min_samples_split': 10, 'classifier__n_estimators': 300}


In [None]:

rf_pred = rf_grid.predict(X_test)
print("Random Forest Classification Report:")
print(classification_report(y_test, rf_pred, digits=4))

Random Forest Classification Report:
              precision    recall  f1-score   support

           0     0.9968    0.9840    0.9904       313
           1     0.9836    0.9967    0.9901       300

    accuracy                         0.9902       613
   macro avg     0.9902    0.9903    0.9902       613
weighted avg     0.9903    0.9902    0.9902       613



In [None]:
svm_params = {
    'classifier__C': [0.1, 1, 10],
    'classifier__kernel': ['linear', 'rbf']
}
svm_grid = GridSearchCV(svm_pipeline, svm_params, cv=5, n_jobs=-1, verbose=1)
# Train and evaluate SVM
svm_grid.fit(X_train, y_train)
print("Best parameters for SVM:", svm_grid.best_params_)


Fitting 5 folds for each of 6 candidates, totalling 30 fits
Best parameters for SVM: {'classifier__C': 1, 'classifier__kernel': 'linear'}


In [None]:

svm_pred = svm_grid.predict(X_test)
print("SVM Classification Report:")
print(classification_report(y_test, svm_pred, digits=4))

SVM Classification Report:
              precision    recall  f1-score   support

           0     1.0000    0.9968    0.9984       313
           1     0.9967    1.0000    0.9983       300

    accuracy                         0.9984       613
   macro avg     0.9983    0.9984    0.9984       613
weighted avg     0.9984    0.9984    0.9984       613



In [None]:
knn_params = {
    'classifier__n_neighbors': [3, 5, 7],
    'classifier__weights': ['uniform', 'distance']
}

knn_grid = GridSearchCV(knn_pipeline, knn_params, cv=5, n_jobs=-1, verbose=1)
# Train and evaluate KNN
knn_grid.fit(X_train, y_train)
print("Best parameters for KNN:", knn_grid.best_params_)


Fitting 5 folds for each of 6 candidates, totalling 30 fits
Best parameters for KNN: {'classifier__n_neighbors': 5, 'classifier__weights': 'distance'}


In [None]:

knn_pred = knn_grid.predict(X_test)
print("KNN Classification Report:")
print(classification_report(y_test, knn_pred, digits=4))

KNN Classification Report:
              precision    recall  f1-score   support

           0     1.0000    0.9553    0.9771       313
           1     0.9554    1.0000    0.9772       300

    accuracy                         0.9772       613
   macro avg     0.9777    0.9776    0.9772       613
weighted avg     0.9782    0.9772    0.9772       613



In [None]:

# Tokenize and pad sequences for deep learning models
max_words = 10000
max_len = 500
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_seq, maxlen=max_len)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len)

def create_cnn_model():
    model = Sequential()
    model.add(Embedding(max_words, 128, input_length=max_len))
    model.add(Conv1D(128, 5, activation='relu'))
    model.add(GlobalMaxPooling1D())
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

def create_lstm_model():
    model = Sequential()
    model.add(Embedding(max_words, 128, input_length=max_len))
    model.add(SpatialDropout1D(0.2))
    model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

cnn_model = KerasClassifier(build_fn=create_cnn_model, epochs=5, batch_size=32, verbose=1)
lstm_model = KerasClassifier(build_fn=create_lstm_model, epochs=5, batch_size=32, verbose=1)

# Define the hyperparameters for Grid Search
cnn_params = {
    'epochs': [5, 10],
    'batch_size': [32, 64]
}
lstm_params = {
    'epochs': [5, 10],
    'batch_size': [32, 64]
}




In [None]:

cnn_grid = GridSearchCV(cnn_model, cnn_params, cv=3, n_jobs=-1, verbose=1)

# Train and evaluate CNN
cnn_grid.fit(X_train_pad, y_train)
print("Best parameters for CNN:", cnn_grid.best_params_)
cnn_pred = cnn_grid.predict(X_test_pad)
print("CNN Classification Report:")
print(classification_report(y_test, cnn_pred, digits=4))

In [None]:
lstm_grid = GridSearchCV(lstm_model, lstm_params, cv=3, n_jobs=-1, verbose=1)


# Train and evaluate LSTM
lstm_grid.fit(X_train_pad, y_train)
print("Best parameters for LSTM:", lstm_grid.best_params_)
lstm_pred = lstm_grid.predict(X_test_pad)
print("LSTM Classification Report:")
print(classification_report(y_test, lstm_pred, digits=4))

In [None]:
# Save models
if not os.path.exists('models'):
    os.makedirs('models')

with open('models/rf_model.pkl', 'wb') as f:
    pickle.dump(rf_grid.best_estimator_, f)

with open('models/svm_model.pkl', 'wb') as f:
    pickle.dump(svm_grid.best_estimator_, f)

with open('models/knn_model.pkl', 'wb') as f:
    pickle.dump(knn_grid.best_estimator_, f)

cnn_grid.best_estimator_.model.save('models/cnn_model.h5')
lstm_grid.best_estimator_.model.save('models/lstm_model.h5')