In [1]:
import pandas as pd

df = pd.read_csv("./archive/spam_ham_dataset.csv", usecols=["label", "text"])

train_set = df.sample(frac=0.8, random_state=42)

train_set.head()

Unnamed: 0,label,text
1566,ham,"Subject: hpl nom for march 30 , 2001\r\n( see ..."
1988,spam,Subject: online pharxmacy 80 % off all meds\r\...
1235,ham,Subject: re : nom / actual volume for april 17...
2868,ham,Subject: re : meter 8740 dec 99\r\nrobert and ...
4903,ham,Subject: re : coastal oil & gas corporation\r\...


In [2]:
train_set.describe()

Unnamed: 0,label,text
count,4137,4137
unique,2,4028
top,ham,Subject: calpine daily gas nomination\r\n>\r\n...
freq,2952,17


In [3]:
train_set.groupby("label").describe()

Unnamed: 0_level_0,text,text,text,text
Unnamed: 0_level_1,count,unique,top,freq
label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,2952,2867,Subject: calpine daily gas nomination\r\n>\r\n...,17
spam,1185,1161,Subject: \r\n,11


In [4]:
train_set["length"] = train_set["text"].apply(len)

train_set.groupby("label").describe()

Unnamed: 0_level_0,length,length,length,length,length,length,length,length
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
ham,2952.0,966.733062,1397.986685,30.0,223.75,515.0,1219.25,32258.0
spam,1185.0,1214.724051,1846.855705,11.0,276.0,564.0,1237.0,22073.0


In [5]:
train_set.length.describe()

count     4137.000000
mean      1037.767464
std       1543.832150
min         11.000000
25%        236.000000
50%        535.000000
75%       1221.000000
max      32258.000000
Name: length, dtype: float64

In [6]:
import string
from nltk.corpus import stopwords


def clean_text(text: str) -> list[str]:
    text = text.translate(str.maketrans("", "", string.punctuation + string.digits))
    words = [
        word for word in text.split() if word.lower() not in stopwords.words("english")
    ] # type: ignore

    return words


sample_text = (
    "Hey there! This is a sample review, which happens to contain punctuations."
)
clean_text(sample_text)

['Hey', 'sample', 'review', 'happens', 'contain', 'punctuations']

In [7]:
train_set["clean_text"] = train_set["text"].apply(clean_text)

In [8]:
spam_emails = train_set[train_set["label"] == "spam"]["clean_text"].apply(pd.Series).stack()
ham_emails = train_set[train_set["label"] == "ham"]["clean_text"].apply(pd.Series).stack()

words_f_spam = spam_emails.value_counts() / spam_emails.count()
words_f_ham = ham_emails.value_counts() / ham_emails.count()

words_f_spam.head()

Subject    0.008456
http       0.005245
com        0.004995
company    0.004438
e          0.003475
Name: count, dtype: float64

In [9]:

frequency = train_set["label"].value_counts() / train_set["label"].count()

global_p_ham, global_p_spam = frequency["ham"], frequency["spam"]

In [10]:
def predict(text: str) -> str:
    text = clean_text(text)

    p_ham, p_spam = global_p_ham, global_p_spam

    for word in text:
        p_ham *= words_f_ham[word] if word in words_f_ham else 0
        p_spam *= words_f_spam[word] if word in words_f_spam else 0

    return "spam" if p_spam > p_ham else "ham"


predict("""
Subject: photoshop , windows , office . cheap . main trending
abasements darer prudently fortuitous undergone
lighthearted charm orinoco taster
railroad affluent pornographic cuvier
irvin parkhouse blameworthy chlorophyll
robed diagrammatic fogarty clears bayda
inconveniencing managing represented smartness hashish
academies shareholders unload badness
danielson pure caffein
spaniard chargeable levin
""")

'ham'

In [11]:
train_set["prediction"] = train_set["text"].apply(predict)

train_set.head()

Unnamed: 0,label,text,length,clean_text,prediction
1566,ham,"Subject: hpl nom for march 30 , 2001\r\n( see ...",96,"[Subject, hpl, nom, march, see, attached, file...",ham
1988,spam,Subject: online pharxmacy 80 % off all meds\r\...,22073,"[Subject, online, pharxmacy, meds, disscount, ...",ham
1235,ham,Subject: re : nom / actual volume for april 17...,409,"[Subject, nom, actual, volume, april, th, agre...",ham
2868,ham,Subject: re : meter 8740 dec 99\r\nrobert and ...,3519,"[Subject, meter, dec, robert, put, heads, toge...",ham
4903,ham,Subject: re : coastal oil & gas corporation\r\...,13448,"[Subject, coastal, oil, gas, corporation, meli...",ham


In [12]:
print(train_set[["label", "prediction"]].head())

true_positives = train_set[(train_set["label"] == "spam") & (train_set["prediction"] == "spam")]
true_negatives = train_set[(train_set["label"] == "ham") & (train_set["prediction"] == "ham")]
false_positives = train_set[(train_set["label"] == "ham") & (train_set["prediction"] == "spam")]
false_negatives = train_set[(train_set["label"] == "spam") & (train_set["prediction"] == "ham")]

print(f"True positives: {len(true_positives)}")
print(f"True negatives: {len(true_negatives)}")
print(f"False positives: {len(false_positives)}")
print(f"False negatives: {len(false_negatives)}")

assert len(train_set) == len(true_positives) + len(true_negatives) + len(false_positives) + len(false_negatives)

accuracy = (len(true_positives) + len(true_negatives)) / len(train_set)
precision = len(true_positives) / (len(true_positives) + len(false_positives))
recall = len(true_positives) / (len(true_positives) + len(false_negatives))

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")

     label prediction
1566   ham        ham
1988  spam        ham
1235   ham        ham
2868   ham        ham
4903   ham        ham
True positives: 741
True negatives: 2947
False positives: 5
False negatives: 444
Accuracy: 0.891467246797196
Precision: 0.9932975871313673
Recall: 0.6253164556962025


In [13]:
test_set = df.drop(train_set.index)

test_set["prediction"] = test_set["text"].apply(predict)

test_set.head()

Unnamed: 0,label,text,prediction
3,spam,"Subject: photoshop , windows , office . cheap ...",ham
4,ham,Subject: re : indian springs\r\nthis deal is t...,ham
5,ham,Subject: ehronline web address change\r\nthis ...,ham
9,ham,"Subject: nominations for oct . 21 - 23 , 2000\...",ham
16,ham,Subject: re : first delivery - wheeler operati...,ham


In [14]:
test_set[["label", "prediction"]]

true_positives = test_set[(test_set["label"] == "spam") & (test_set["prediction"] == "spam")]
true_negatives = test_set[(test_set["label"] == "ham") & (test_set["prediction"] == "ham")]
false_positives = test_set[(test_set["label"] == "ham") & (test_set["prediction"] == "spam")]
false_negatives = test_set[(test_set["label"] == "spam") & (test_set["prediction"] == "ham")]

accuracy = (len(true_positives) + len(true_negatives)) / len(test_set)
precision = len(true_positives) / (len(true_positives) + len(false_positives))
recall = len(true_positives) / (len(true_positives) + len(false_negatives))

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")

Accuracy: 0.7311411992263056
Precision: 1.0
Recall: 0.11464968152866242
