In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, roc_curve, auc, RocCurveDisplay
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import nltk
import pickle
import os
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, LSTM, SpatialDropout1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import keras_tuner as kt
import matplotlib.pyplot as plt

# Download the NLTK resources used by the preprocessing helpers (skipped by
# NLTK when they are already installed). 'punkt_tab' is requested in addition
# to 'punkt' because recent NLTK releases load the tokenizer tables from that
# package when word_tokenize is called; older releases keep using 'punkt'.
for resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(resource)

def clean_text(text, stop_words):
    """Normalise a raw e-mail string for feature extraction.

    Steps, in order:
      1. replace anything that looks like a link (starts with ``http`` or
         ``www``, in any letter case) with the placeholder ``URLfound``;
      2. strip ASCII punctuation;
      3. lowercase (so the placeholder ends up as ``urlfound``);
      4. tokenize and drop every token contained in ``stop_words``.

    Parameters
    ----------
    text : str
        Raw text to clean.
    stop_words : collection of str
        Lowercase tokens to remove after tokenization.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    # Detect links on the untouched text and ignore case: running the pattern
    # after punctuation stripping but before lowercasing missed upper-case
    # links such as "HTTP://..." or "WWW....".
    text = re.sub(r'http\S+|www\S+', 'URLfound', text, flags=re.IGNORECASE)
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = text.lower()
    kept_tokens = [token for token in word_tokenize(text) if token not in stop_words]
    return ' '.join(kept_tokens)

def stem_text(text):
    """Return ``text`` with every token replaced by its Porter stem.

    The text is split with ``word_tokenize``, each token is passed through a
    fresh ``PorterStemmer``, and the stems are joined with single spaces.
    """
    return ' '.join(map(PorterStemmer().stem, word_tokenize(text)))

def ml_preprocess(df):
    """Build the cleaned, stemmed ``content`` column used by every model.

    ``subject`` and ``body`` are concatenated (separated by one space), passed
    through ``clean_text`` with the English NLTK stop-word list, and then
    through ``stem_text``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``subject``, ``body`` and ``label``.
        The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns ``label`` and ``content`` and the same
        index as ``df``.
    """
    stop_words = set(stopwords.words('english'))

    def _as_text(column):
        # Missing values become empty strings so the string concatenation and
        # clean_text never receive NaN. This replaces the chained
        # `df[col].fillna(..., inplace=True)`, which does not update the frame
        # under pandas Copy-on-Write and never covered `body` at all.
        return df[column].fillna('').astype(str)

    content = (_as_text('subject') + ' ' + _as_text('body')).apply(
        lambda text: stem_text(clean_text(text, stop_words))
    )
    # Build the result from a fresh frame so the caller's data is left untouched.
    return df[['label']].assign(content=content)

# Read the raw CSV and run it straight through the text preprocessing
df = ml_preprocess(pd.read_csv("Nazario_5.csv"))

# Features are the cleaned text, targets are the labels
X, y = df['content'], df['label']

# Hold out 20% of the rows for testing (fixed seed for repeatable splits)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# TF-IDF feature extractor placed in front of every classifier
vectorizer = TfidfVectorizer()

# Classifiers to compare (seeded where the estimator accepts a seed)
rf_model = RandomForestClassifier(random_state=42)
svm_model = SVC(probability=True, random_state=42)
knn_model = KNeighborsClassifier()

# One vectorizer -> classifier pipeline per model; the step names
# 'vectorizer' and 'classifier' are what the grid-search parameter keys refer to
rf_pipeline, svm_pipeline, knn_pipeline = (
    Pipeline([('vectorizer', vectorizer), ('classifier', estimator)])
    for estimator in (rf_model, svm_model, knn_model)
)

def run_grid_search(pipeline, param_grid, model_name):
    """Tune ``pipeline`` with 5-fold grid search and evaluate it on the test split.

    Fits on the notebook-level ``X_train``/``y_train``, prints the best
    parameters and a classification report for ``X_test``/``y_test``.

    Parameters
    ----------
    pipeline : sklearn.pipeline.Pipeline
        Pipeline whose 'classifier' step is being tuned.
    param_grid : dict
        Grid passed to ``GridSearchCV``.
    model_name : str
        Label used in the printed messages.

    Returns
    -------
    tuple
        ``(fitted GridSearchCV, test predictions, test scores)`` where the
        scores are column 1 of ``predict_proba``.
    """
    grid = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1, verbose=1)
    grid.fit(X_train, y_train)
    print(f"Best parameters for {model_name}:", grid.best_params_)
    pred = grid.predict(X_test)
    # Column 1 of predict_proba is used as the score for the ROC curves later on
    pred_prob = grid.predict_proba(X_test)[:, 1]
    print(f"{model_name} Classification Report:")
    print(classification_report(y_test, pred, digits=4))
    return grid, pred, pred_prob


# Define the hyperparameters for Grid Search
rf_params = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__max_depth': [None, 10, 20, 30],
    'classifier__min_samples_split': [2, 5, 10]
}

# Perform Grid Search CV for Random Forest
rf_grid, rf_pred, rf_pred_prob = run_grid_search(rf_pipeline, rf_params, "Random Forest")

# Define the hyperparameters for SVM
svm_params = {
    'classifier__C': [0.1, 1, 10],
    'classifier__kernel': ['linear', 'rbf']
}
# Perform Grid Search CV for SVM
svm_grid, svm_pred, svm_pred_prob = run_grid_search(svm_pipeline, svm_params, "SVM")

# Define the hyperparameters for KNN
knn_params = {
    'classifier__n_neighbors': [3, 5, 7],
    'classifier__weights': ['uniform', 'distance']
}
# Perform Grid Search CV for KNN
knn_grid, knn_pred, knn_pred_prob = run_grid_search(knn_pipeline, knn_params, "KNN")

# Sequence settings shared by the deep learning models
max_words = 10000  # passed to the Tokenizer as num_words and to Embedding as its input size
max_len = 500      # length every sequence is padded/truncated to

# The vocabulary is learned from the training texts only
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)

# Texts -> integer sequences -> fixed-length arrays, for both splits
X_train_seq, X_test_seq = map(tokenizer.texts_to_sequences, (X_train, X_test))
X_train_pad, X_test_pad = (
    pad_sequences(sequences, maxlen=max_len)
    for sequences in (X_train_seq, X_test_seq)
)

def build_cnn_model(hp):
    """Build and compile the 1-D CNN text classifier for one tuner trial.

    Tunable hyperparameters (read from ``hp``):
      * ``filters``      - Conv1D filters, 32..256 in steps of 32
      * ``kernel_size``  - Conv1D kernel size, one of 3, 5, 7
      * ``units``        - width of the hidden Dense layer, 32..256 in steps of 32

    Returns the compiled Keras model (Adam, binary cross-entropy, accuracy).
    """
    # Sample the hyperparameters first, in the same order the layers use them
    n_filters = hp.Int('filters', min_value=32, max_value=256, step=32)
    kernel = hp.Choice('kernel_size', values=[3, 5, 7])
    dense_units = hp.Int('units', min_value=32, max_value=256, step=32)

    network = Sequential([
        Embedding(max_words, 128, input_length=max_len),
        Conv1D(filters=n_filters, kernel_size=kernel, activation='relu'),
        GlobalMaxPooling1D(),
        Dense(units=dense_units, activation='relu'),
        # Single sigmoid unit: the model outputs one probability per sample
        Dense(1, activation='sigmoid'),
    ])
    network.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return network

def build_lstm_model(hp):
    """Build and compile the LSTM text classifier for one tuner trial.

    Tunable hyperparameter (read from ``hp``):
      * ``lstm_units`` - LSTM width, 32..256 in steps of 32

    Returns the compiled Keras model (Adam, binary cross-entropy, accuracy).
    """
    recurrent_width = hp.Int('lstm_units', min_value=32, max_value=256, step=32)

    network = Sequential([
        Embedding(max_words, 128, input_length=max_len),
        SpatialDropout1D(0.2),
        LSTM(units=recurrent_width, dropout=0.2, recurrent_dropout=0.2),
        # Single sigmoid unit: the model outputs one probability per sample
        Dense(1, activation='sigmoid'),
    ])
    network.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return network

# Hyperband settings shared by both tuners; only the model builder and the
# project name (sub-folder of 'my_dir') differ
hyperband_settings = dict(
    objective='val_accuracy',
    max_epochs=10,
    hyperband_iterations=2,
    directory='my_dir',
)

# Hyperparameter tuning for CNN
cnn_tuner = kt.Hyperband(build_cnn_model, project_name='cnn_tuning', **hyperband_settings)

# Hyperparameter tuning for LSTM
lstm_tuner = kt.Hyperband(build_lstm_model, project_name='lstm_tuning', **hyperband_settings)

# Search for best hyperparameters: CNN first, then LSTM, each validating on
# 20% of the training data
for tuner in (cnn_tuner, lstm_tuner):
    tuner.search(X_train_pad, y_train, epochs=5, validation_split=0.2)

# Take the top-ranked hyperparameter set from each finished search
best_cnn_hps = cnn_tuner.get_best_hyperparameters(num_trials=1)[0]
best_lstm_hps = lstm_tuner.get_best_hyperparameters(num_trials=1)[0]

# Summarise the winning CNN configuration
print(f"""
The hyperparameter search for CNN is complete. The optimal number of filters in the Conv1D layer is {best_cnn_hps.get('filters')},
the optimal kernel size is {best_cnn_hps.get('kernel_size')}, and the optimal number of units in the Dense layer is {best_cnn_hps.get('units')}.
""")

# Summarise the winning LSTM configuration
print(f"""
The hyperparameter search for LSTM is complete. The optimal number of units in the LSTM layer is {best_lstm_hps.get('lstm_units')}.
""")

# Make sure the output directory exists before the first model file is
# written, so saving does not depend on a directory check that only runs
# later in the notebook.
os.makedirs('models', exist_ok=True)

# Train with best hyperparameters for CNN
cnn_model = cnn_tuner.hypermodel.build(best_cnn_hps)
cnn_model.fit(X_train_pad, y_train, epochs=10, validation_split=0.2)
cnn_model.save('models/best_cnn_model.h5')

# The network ends in a single sigmoid unit, so predict() yields one column;
# flatten it so the scores are 1-D like those of the scikit-learn models.
cnn_pred_prob = cnn_model.predict(X_test_pad).ravel()
cnn_pred = (cnn_pred_prob > 0.5).astype("int32")
print("CNN Classification Report:")
print(classification_report(y_test, cnn_pred, digits=4))

# Train with best hyperparameters for LSTM
lstm_model = lstm_tuner.hypermodel.build(best_lstm_hps)
lstm_model.fit(X_train_pad, y_train, epochs=10, validation_split=0.2)
lstm_model.save('models/best_lstm_model.h5')

# Same flattening as for the CNN scores
lstm_pred_prob = lstm_model.predict(X_test_pad).ravel()
lstm_pred = (lstm_pred_prob > 0.5).astype("int32")
print("LSTM Classification Report:")
print(classification_report(y_test, lstm_pred, digits=4))

# Plot ROC curves
def plot_roc_curve(y_true, y_pred_prob, model_name):
    """Draw the ROC curve for one model and print its AUROC.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred_prob : array-like
        Predicted scores for the positive class.
    model_name : str
        Name shown in the figure title and in the printed summary.
    """
    fpr, tpr, _thresholds = roc_curve(y_true, y_pred_prob)
    roc_auc = auc(fpr, tpr)

    # New figure per call so successive models do not share axes
    _fig, ax = plt.subplots()
    ax.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
    # Diagonal reference line
    ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title(f'Receiver Operating Characteristic - {model_name}')
    ax.legend(loc="lower right")
    plt.show()

    print(f'{model_name} AUROC: {roc_auc:.2f}')

# Create the output folder for the serialized models when it is missing
if not os.path.exists('models'):
    os.makedirs('models')

# Pickle the best pipeline found by each grid search
for model_path, search in (
    ('models/rf_model.pkl', rf_grid),
    ('models/svm_model.pkl', svm_grid),
    ('models/knn_model.pkl', knn_grid),
):
    with open(model_path, 'wb') as f:
        pickle.dump(search.best_estimator_, f)

# One ROC figure per model, in the order the models were trained
for scores, title in (
    (rf_pred_prob, 'Random Forest'),
    (svm_pred_prob, 'SVM'),
    (knn_pred_prob, 'KNN'),
    (cnn_pred_prob, 'CNN'),
    (lstm_pred_prob, 'LSTM'),
):
    plot_roc_curve(y_test, scores, title)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Rohan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Rohan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/Dataset/Nazario_5.csv'

In [None]:
# Preview the first five rows of the frame
df.head(n=5)

Unnamed: 0,label,content
0,0,fw june 29 bna inc daili labor report user id ...
1,0,ngx failov plan hi chri tonight roll new repor...
2,0,intranet site rika r new origin messag thoma p...
3,0,fw ena upstream compani inform johngerald curr...
4,0,new master physic gerald staci attach workshee...


In [3]:
import tensorflow as tf

# Check whether a GPU is available and report its device name
gpu_name = tf.test.gpu_device_name()
if gpu_name:
    print('Default GPU Device: {}'.format(gpu_name))
else:
    print("No GPU found")


No GPU found. Please ensure GPU runtime is selected in Colab.
