In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_curve, auc
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import nltk
import pickle
import os
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import matplotlib.pyplot as plt
from transformers import BertTokenizer, BertForSequenceClassification, DistilBertTokenizer, DistilBertForSequenceClassification, RobertaTokenizer, RobertaForSequenceClassification
import torch

# Download NLTK data files (if not already installed)
nltk.download('stopwords')
nltk.download('punkt')

def clean_text(text, stop_words):
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'http\S+|www\S+|https\S+', 'URLfound', text, flags=re.MULTILINE)
    text = text.lower()
    word_tokens = word_tokenize(text)
    filtered_text = [word for word in word_tokens if word not in stop_words]
    return ' '.join(filtered_text)

def stem_text(text):
    stemmer = PorterStemmer()
    word_tokens = word_tokenize(text)
    stemmed_text = [stemmer.stem(word) for word in word_tokens]
    return ' '.join(stemmed_text)

def ml_preprocess(df):
    stop_words = set(stopwords.words('english'))
    df['subject'].fillna('', inplace=True)
    df['content'] = df['subject'] + ' ' + df['body']
    df['content'] = df['content'].apply(lambda x: clean_text(x, stop_words))
    df['content'] = df['content'].apply(stem_text)
    df = df[['label', 'content']]
    return df

# Load the dataset
df = pd.read_csv("Nazario_5.csv")

# Preprocess the dataset
df = ml_preprocess(df)

# Split the data into training and testing sets
X = df['content']
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Tokenize and pad sequences for deep learning models
max_words = 10000
max_len = 500
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_seq, maxlen=max_len)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len)

# Load traditional ML models
with open('models/rf_pipeline.pkl', 'rb') as f:
    rf_model = pickle.load(f)

with open('models/svm_pipeline.pkl', 'rb') as f:
    svm_model = pickle.load(f)

with open('models/knn_pipeline.pkl', 'rb') as f:
    knn_model = pickle.load(f)

# Load deep learning models
cnn_model = load_model('models/best_cnn_model.h5')
lstm_model = load_model('models/best_lstm_model.h5')

# Load Hugging Face models using safetensors
bert_tokenizer = BertTokenizer.from_pretrained('models/bert_model')
bert_model = BertForSequenceClassification.from_pretrained('models/bert_model', use_safetensors=True)

distilbert_tokenizer = DistilBertTokenizer.from_pretrained('models/distilbert_model')
distilbert_model = DistilBertForSequenceClassification.from_pretrained('models/distilbert_model', use_safetensors=True)

roberta_tokenizer = RobertaTokenizer.from_pretrained('models/roberta_model')
roberta_model = RobertaForSequenceClassification.from_pretrained('models/roberta_model', use_safetensors=True)

def predict_hf_model(model, tokenizer, X_test):
    encodings = tokenizer(X_test.tolist(), truncation=True, padding=True, max_length=512, return_tensors='pt')
    outputs = model(**encodings)
    predictions = torch.argmax(outputs.logits, dim=1).detach().numpy()
    pred_prob = torch.softmax(outputs.logits, dim=1).detach().numpy()[:, 1]
    return predictions, pred_prob

# Predict and print classification report and ROC curve for Random Forest
rf_pred = rf_model.predict(X_test)
rf_pred_prob = rf_model.predict_proba(X_test)[:, 1]
print("Random Forest Classification Report:")
print(classification_report(y_test, rf_pred, digits=4))

# Predict and print classification report and ROC curve for SVM
svm_pred = svm_model.predict(X_test)
svm_pred_prob = svm_model.predict_proba(X_test)[:, 1]
print("SVM Classification Report:")
print(classification_report(y_test, svm_pred, digits=4))

# Predict and print classification report and ROC curve for KNN
knn_pred = knn_model.predict(X_test)
knn_pred_prob = knn_model.predict_proba(X_test)[:, 1]
print("KNN Classification Report:")
print(classification_report(y_test, knn_pred, digits=4))

# Predict and print classification report and ROC curve for CNN
cnn_pred_prob = cnn_model.predict(X_test_pad)
cnn_pred = (cnn_pred_prob > 0.5).astype("int32")
print("CNN Classification Report:")
print(classification_report(y_test, cnn_pred, digits=4))

# Predict and print classification report and ROC curve for LSTM
lstm_pred_prob = lstm_model.predict(X_test_pad)
lstm_pred = (lstm_pred_prob > 0.5).astype("int32")
print("LSTM Classification Report:")
print(classification_report(y_test, lstm_pred, digits=4))

# Predict and print classification report and ROC curve for BERT
bert_pred, bert_pred_prob = predict_hf_model(bert_model, bert_tokenizer, X_test)
print("BERT Classification Report:")
print(classification_report(y_test, bert_pred, digits=4))

# Predict and print classification report and ROC curve for DistilBERT
distilbert_pred, distilbert_pred_prob = predict_hf_model(distilbert_model, distilbert_tokenizer, X_test)
print("DistilBERT Classification Report:")
print(classification_report(y_test, distilbert_pred, digits=4))

# Predict and print classification report and ROC curve for RoBERTa
roberta_pred, roberta_pred_prob = predict_hf_model(roberta_model, roberta_tokenizer, X_test)
print("RoBERTa Classification Report:")
print(classification_report(y_test, roberta_pred, digits=4))

# Plot ROC curves
def plot_roc_curve(y_true, y_pred_prob, model_name):
    fpr, tpr, _ = roc_curve(y_true, y_pred_prob)
    roc_auc = auc(fpr, tpr)
    plt.figure()
    plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'Receiver Operating Characteristic - {model_name}')
    plt.legend(loc="lower right")
    plt.show()
    print(f'{model_name} AUROC: {roc_auc:.2f}')

# Plot ROC curves for all models
plot_roc_curve(y_test, rf_pred_prob, 'Random Forest')
plot_roc_curve(y_test, svm_pred_prob, 'SVM')
plot_roc_curve(y_test, knn_pred_prob, 'KNN')
plot_roc_curve(y_test, cnn_pred_prob, 'CNN')
plot_roc_curve(y_test, lstm_pred_prob, 'LSTM')
plot_roc_curve(y_test, bert_pred_prob, 'BERT')
plot_roc_curve(y_test, distilbert_pred_prob, 'DistilBERT')
plot_roc_curve(y_test, roberta_pred_prob, 'RoBERTa')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Rohan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Rohan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
Some weights of the model checkpoint at models/roberta_model were not used when initializing RobertaForSequenceClassification: ['bert.em

Random Forest Classification Report:
              precision    recall  f1-score   support

           0     0.9968    0.9840    0.9904       313
           1     0.9836    0.9967    0.9901       300

    accuracy                         0.9902       613
   macro avg     0.9902    0.9903    0.9902       613
weighted avg     0.9903    0.9902    0.9902       613

SVM Classification Report:
              precision    recall  f1-score   support

           0     0.9968    0.9904    0.9936       313
           1     0.9901    0.9967    0.9934       300

    accuracy                         0.9935       613
   macro avg     0.9934    0.9935    0.9935       613
weighted avg     0.9935    0.9935    0.9935       613

KNN Classification Report:
              precision    recall  f1-score   support

           0     1.0000    0.9553    0.9771       313
           1     0.9554    1.0000    0.9772       300

    accuracy                         0.9772       613
   macro avg     0.9777    0.9776    

RuntimeError: [enforce fail at alloc_cpu.cpp:80] data. DefaultCPUAllocator: not enough memory: you tried to allocate 7713325056 bytes.