In [1]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve, auc
from datasets import load_dataset
import matplotlib.pyplot as plt
import pandas as pd

In [None]:
# Fetch the e-mail corpus via `load_dataset` and print the returned object so
# its structure is visible before any reshaping.
DATASET_ID = "SetFit/enron_spam"

dataset = load_dataset(DATASET_ID)
print(dataset)

In [None]:
def _to_text_and_label(example):
    """Build the model input and target for a single e-mail record.

    Reads the record's "subject", "message" and "label_text" fields and returns
    the two columns used by the rest of the notebook:

    * "text"  -- subject and message joined into one string
    * "label" -- the value of the record's "label_text" field
    """
    return {
        "text": f"subject: {example['subject']}\ncontent: {example['message']}",
        "label": example["label_text"],
    }


# Only these two columns survive the reshaping below.
_KEPT_COLUMNS = ("text", "label")

_train_columns = list(dataset["train"].features)

# The mapping consumes "subject"/"message"/"label_text" and then removes them,
# so applying it to an already-reshaped `dataset` would fail with a KeyError.
# Skip it when only the kept columns remain, which makes this cell safe to
# re-run without reloading the data.
if set(_train_columns) != set(_KEPT_COLUMNS):
    dataset = dataset.map(
        _to_text_and_label,
        remove_columns=[col for col in _train_columns if col not in _KEPT_COLUMNS],
    )
print(dataset)

In [None]:
# Show one training record (text and label) as a quick sanity check of the
# reshaped data. The index is arbitrary; change it to inspect another e-mail.
SAMPLE_INDEX = 14

sample = dataset["train"][SAMPLE_INDEX]
email_text, email_label = sample["text"], sample["label"]

print("Example of an e-mail in the dataset:\n")
print(email_text)
print("\nlabel:", email_label)

In [None]:
def _naive_bayes_pipeline(vectorizer):
    """Return a pipeline that feeds `vectorizer` output into a new MultinomialNB."""
    return make_pipeline(vectorizer, MultinomialNB())


# Two pipelines that differ only in how text is turned into features:
# TF-IDF weights versus raw token counts (bag of words).
model_tfidf = _naive_bayes_pipeline(TfidfVectorizer())
model_bow = _naive_bayes_pipeline(CountVectorizer())

In [None]:
# Fit both pipelines on the same training texts and labels.
train_split = dataset["train"]
train_texts, train_labels = train_split["text"], train_split["label"]

for pipeline in (model_tfidf, model_bow):
    pipeline.fit(train_texts, train_labels)

print("Models fitted")

In [None]:
# A few hand-written messages to compare the two fitted models side by side.
samples = [
    "You inherited 1M USD!!",
    "This seminar is pretty cool!",
    "free iPhone in exchange for credit card information",
    "Reminder: free iPhone",
]

predictions_tfidf, predictions_bow = (
    fitted_model.predict(samples) for fitted_model in (model_tfidf, model_bow)
)

# One row per message, one prediction column per model; the bare expression at
# the end makes the notebook render the table.
comparison_columns = {
    "Message": samples,
    "Prediction (BoW)": predictions_bow,
    "Prediction (TF-IDF)": predictions_tfidf,
}
pd.DataFrame(comparison_columns)

In [None]:
# Reference labels of the test split, shared by the evaluation cells below.
test_split = dataset["test"]
y_true = test_split["label"]

In [None]:
# Score the bag-of-words pipeline on the test split and print the report
# produced by `classification_report`, with three decimal places.
test_texts = dataset["test"]["text"]
y_pred_bow = model_bow.predict(test_texts)

report_bow = classification_report(y_true=y_true, y_pred=y_pred_bow, digits=3)
print(report_bow)

In [None]:
# Same evaluation as for the bag-of-words model, now for the TF-IDF pipeline.
test_texts = dataset["test"]["text"]
y_pred_tfidf = model_tfidf.predict(test_texts)

report_tfidf = classification_report(y_true=y_true, y_pred=y_pred_tfidf, digits=3)
print(report_tfidf)

In [None]:
# ROC curve for the bag-of-words model, treating "spam" as the positive class.

# predict_proba returns one column per entry of `classes_`; look up which
# column belongs to "spam" instead of assuming its position.
spam_class_index = model_bow.classes_.tolist().index("spam")
class_probabilities = model_bow.predict_proba(dataset["test"]["text"])  # (n_test_samples, n_classes)
probabilities = class_probabilities[:, spam_class_index]

# The thresholds returned by roc_curve are not needed for the plot.
fpr, tpr, _ = roc_curve(y_true, probabilities, pos_label="spam")
roc_auc = auc(fpr, tpr)

fig, ax = plt.subplots()
ax.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:0.2f})')
# Diagonal from (0, 0) to (1, 1) as a visual reference line.
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic')
ax.legend(loc="lower right")
plt.show()