In [83]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix
)
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB




df = pd.read_csv("mail_l7_dataset.csv")

# Encode the text labels as integers: spam -> 0, ham -> 1.
# The labels are normalized once and mapped into a fresh integer column
# rather than writing ints into the string column cell by cell. The in-place
# pattern re-reads a half-converted column through `.str` (which fails if the
# column ends up holding only ints) and is rejected when pandas stores the
# column with a dedicated string dtype.
LABEL_CODES = {"spam": 0, "ham": 1}
normalized_labels = df["Category"].astype(str).str.strip().str.lower()
encoded_labels = normalized_labels.map(LABEL_CODES)

# Fail loudly, naming the offending values, instead of letting the later
# integer cast raise an opaque conversion error.
unrecognized = encoded_labels.isna()
if unrecognized.any():
    bad_values = sorted(set(normalized_labels[unrecognized]))
    raise ValueError(f"Unrecognized Category labels: {bad_values}")

df["Category"] = encoded_labels.astype(int)

# Features are the raw message text; targets are the integer labels.
X = df["Message"].astype(str)
y = df["Category"].astype(int)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# The vectorizer is fitted on the training messages only and then applied
# to the test messages, so no test-set vocabulary leaks into training.
tfidf = TfidfVectorizer(min_df=1, stop_words="english", lowercase=True)
X_train_features = tfidf.fit_transform(X_train)
X_test_features = tfidf.transform(X_test)

# Logistic-regression baseline.
lr = LogisticRegression(max_iter=1000, random_state=42)
lr.fit(X_train_features, y_train)
lr_predict = lr.predict(X_test_features)

def print_metrics(name, y_true, y_pred, pos_label=0):
    """Print accuracy, precision, recall and F1 for one set of predictions.

    Args:
        name: Display name used in the heading line.
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.
        pos_label: Label treated as the positive class for precision,
            recall and F1. Defaults to 0.
    """
    # Insertion order of this dict is the order the lines are printed in.
    scores = {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": precision_score(y_true, y_pred, pos_label=pos_label),
        "recall": recall_score(y_true, y_pred, pos_label=pos_label),
        "F1-score": f1_score(y_true, y_pred, pos_label=pos_label),
    }

    print(f"{name} performance: ")
    for metric_name, value in scores.items():
        print(f"{metric_name}: {value:.3f}")
     
def print_confmat(name, y_true, y_pred):
    """Print a labelled confusion matrix for one set of predictions.

    Rows are the actual classes and columns the predicted classes, both
    ordered as label 1 first and label 0 second.

    Args:
        name: Display name used in the heading line.
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.
    """
    # Fix the class order explicitly so the row/column captions below
    # always line up with the matrix cells.
    class_order = [1, 0]
    row_captions = ["actual Ham (1)", "actual spam (0)"]
    column_captions = ["pred Ham (1)", "pred spam (0)"]

    counts = confusion_matrix(y_true, y_pred, labels=class_order)
    table = pd.DataFrame(counts, index=row_captions, columns=column_captions)
    print(f"{name} - confusion_matrix: \n {table}")

# Report the logistic-regression baseline on the held-out split.
# The display name carries no trailing ": " so the headings match the
# other two models instead of printing a doubled colon.
print_metrics("Logistic Regression", y_test, lr_predict)
print_confmat("Logistic Regression", y_test, lr_predict)

print("--------------------------------------\n\n")


# Random forest trained on the same TF-IDF features.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train_features, y_train)

rf_predict = rf.predict(X_test_features)

print_metrics("Random Forest", y_test, rf_predict)
print_confmat("Random Forest", y_test, rf_predict)

print("--------------------------------------\n\n")


# Multinomial naive Bayes trained on the same TF-IDF features.
nb = MultinomialNB()
nb.fit(X_train_features, y_train)

nb_predict = nb.predict(X_test_features)

print_metrics("Naive Bayes", y_test, nb_predict)
print_confmat("Naive Bayes", y_test, nb_predict)

print("--------------------------------------\n\n")


# Spot-check a single held-out message against all three models.
i = 3
sample_text = X_test.iloc[i]
true_label = y_test.iloc[i]

# Vectorize the sample once and reuse the features for every model.
sample_features = tfidf.transform([sample_text])

# Each prediction must come from its own model; previously all three
# values were produced by the logistic-regression model.
lr_pred_one = int(lr.predict(sample_features)[0])
rf_pred_one = int(rf.predict(sample_features)[0])
nb_pred_one = int(nb.predict(sample_features)[0])

def lab2str(r):
    """Return the display string for a numeric class label.

    A label equal to 0 is rendered as spam; any other value is rendered
    as ham.
    """
    if r == 0:
        return "Spam (0)"
    return "Ham (1)"

# Show the true label next to each model's prediction for the sample.
for caption, label in (
    ("Actual : ", true_label),
    ("LR prediction : ", lr_pred_one),
    ("RF prediction : ", rf_pred_one),
    ("NB prediction : ", nb_pred_one),
):
    print(caption, lab2str(label))



    




Logistic Regression:  performance: 
accuracy: 1.000
precision: 1.000
recall: 1.000
F1-score: 1.000
Logistic Regression:  - confusion_matrix: 
                  pred Ham (1)  pred spam (0)
actual Ham (1)            101              0
actual spam (0)             0             99
--------------------------------------


Random Forest performance: 
accuracy: 1.000
precision: 1.000
recall: 1.000
F1-score: 1.000
Random Forest - confusion_matrix: 
                  pred Ham (1)  pred spam (0)
actual Ham (1)            101              0
actual spam (0)             0             99
--------------------------------------


Naive Bayes performance: 
accuracy: 1.000
precision: 1.000
recall: 1.000
F1-score: 1.000
Naive Bayes - confusion_matrix: 
                  pred Ham (1)  pred spam (0)
actual Ham (1)            101              0
actual spam (0)             0             99
--------------------------------------


Actual :  Ham (1)
LR prediction :  Ham (1)
RF prediction :  Ham (1)
NB predicti