# SPAM Hit Man

Naive bayes approach to spam detection

[Dataset](https://github.com/MWiechmann/enron_spam_data)

In [2]:
import polars as pl

df_spam = pl.read_csv("enron_spam_data.csv")

df_spam = df_spam.filter(pl.col("Message").is_not_null())

df_spam = df_spam.sample(fraction=1, shuffle=True)

test_size = 5000

df_test, df_train = df_spam.head(test_size), df_spam.tail(-test_size)

## Most common words

In the naive bayes approach the presence of a word is a Bernoulli variable.
Each one of them conditionaly indipendent from one another, given the evidence.

The evidence in this case is also referred to as the ground truth.

The 'spam', 'ham' classification is the ground truth

Here we take top 500 most common words to build the bayesian network, every word becomes a variable in the network

In [3]:
most_common_words = pl.Series(
    ' '.join(df_train['Message'].str.to_lowercase()).split()
).value_counts()

most_common_words = most_common_words.filter(pl.col('').str.contains(r'^\w+$'))

In [None]:
most_common_words['']
top_words = most_common_words.sort('count', descending=True)[100:600][''] # cutting the "tail"
top_words

"""only"""
"""mail"""
"""vince"""
"""over"""
"""what"""
…
"""45"""
"""commission"""
"""opportunity"""
"""really"""
"""earnings"""


## Building and training the model

idk

In [6]:
parameter_training_dataset = df_train.with_columns(
    pl.col('Message').str.contains(f"{word}").cast(pl.Int32).alias(word) for word in top_words
)

In [7]:
parameter_training_dataset

Message ID,Subject,Message,Spam/Ham,Date,only,mail,vince,over,what,01,need,m,thanks,some,power,market,like,d,7,corp,www,should,o,into,re,stock,12,year,inc,6,could,just,8,who,30,sent,…,june,center,executive,required,expected,electricity,capital,done,reserved,case,type,action,read,chairman,increase,contracts,oo,80,capacity,better,private,public,el,edu,equity,na,paid,daily,department,material,steve,volume,45,commission,opportunity,really,earnings
i64,str,str,str,str,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,…,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32
27380,"""failure notice""","""hi . this is the qmail - send …","""spam""","""2005-07-19""",0,1,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,0,1,0,1,0,0,0,1,1,0,1,1,0,1,0,…,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0
7198,"""re : enron / stanford program""","""vince , i will call paul racic…","""ham""","""2000-10-10""",0,0,1,0,0,0,0,1,1,0,0,0,0,1,1,0,0,0,1,0,1,0,1,0,1,0,0,0,0,0,0,1,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
18333,"""fwd : need meds xanaix | = v @…","""we believe ordering medication…","""spam""","""2004-02-10""",0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0
20365,"""re : notice""","""fri , 12 nov 2004 19 : 52 : 05…","""spam""","""2004-11-12""",1,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,1,0,1,0,1,0,1,0,1,0,0,0,0,0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0
18554,"""2004 xp , adobe , autodesk , c…","""same day shipping ( see site f…","""spam""","""2004-03-26""",1,1,0,0,0,0,0,1,0,0,1,0,0,1,1,0,0,0,1,0,1,0,0,0,0,0,0,0,1,1,0,0,…,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
12186,"""in case you didn ' t notice""","""louise , office depot has the …","""ham""","""2001-07-16""",0,0,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,0,1,0,1,0,0,0,1,1,0,0,1,0,0,0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
9592,"""financial freedom""","""dear friend , how would you li…","""spam""","""2002-05-18""",0,1,0,0,1,0,0,1,0,0,0,0,1,1,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
17468,"""schedule crawler : hourahead f…","""start date : 12 / 26 / 01 ; ho…","""ham""","""2001-12-26""",0,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,1,0,1,0,1,0,0,1,0,0,0,0,0,0,…,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
8174,"""re : progress""","""steve , thanks a lot . i think…","""ham""","""2001-01-11""",1,0,1,1,1,1,1,1,1,1,0,0,1,1,0,0,0,0,1,1,1,1,0,0,1,1,1,1,1,0,0,1,…,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,1,1,1,0


In [8]:
word_counts = parameter_training_dataset.group_by('Spam/Ham').agg(
    pl.sum(word).alias(f'count_{word}') for word in top_words
)
word_counts

Spam/Ham,count_only,count_mail,count_vince,count_over,count_what,count_01,count_need,count_m,count_thanks,count_some,count_power,count_market,count_like,count_d,count_7,count_corp,count_www,count_should,count_o,count_into,count_re,count_stock,count_12,count_year,count_inc,count_6,count_could,count_just,count_8,count_who,count_30,count_sent,count_were,count_deal,count_statements,count_report,…,count_june,count_center,count_executive,count_required,count_expected,count_electricity,count_capital,count_done,count_reserved,count_case,count_type,count_action,count_read,count_chairman,count_increase,count_contracts,count_oo,count_80,count_capacity,count_better,count_private,count_public,count_el,count_edu,count_equity,count_na,count_paid,count_daily,count_department,count_material,count_steve,count_volume,count_45,count_commission,count_opportunity,count_really,count_earnings
str,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,…,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32
"""spam""",2941,5134,185,4042,2090,1413,2701,14044,1045,2528,1180,1641,2587,14051,5461,946,2444,1683,14251,1925,13660,1184,2059,1803,5289,5462,1387,2501,5275,2146,2228,2702,1334,1119,639,1192,…,274,508,211,656,386,75,332,572,198,555,624,1707,2332,118,1065,164,7988,1587,62,583,849,977,10585,1064,269,9112,772,242,578,898,81,238,1192,263,514,548,94
"""ham""",1732,3145,2494,2954,2545,5787,3629,13363,4572,3194,1769,2037,2926,13753,7391,2710,935,2651,13888,1837,12504,400,3486,1888,5620,7068,1925,2205,7246,2367,2941,4091,1676,2304,140,1589,…,751,575,574,744,503,403,544,877,241,779,708,1308,2133,375,598,591,6442,1263,573,629,264,652,9893,3515,316,9521,374,758,603,418,1025,1101,1467,341,623,587,218


In [9]:
totals = parameter_training_dataset.group_by('Spam/Ham').len().rename({"len": "total"})

parameter_training_dataset = word_counts.join(totals, on='Spam/Ham')

# column naming is not very intuitive, but "melts" better
# +1 and +2 terms are used for Laplace Smoothing, assuming uniform probability
parameter_training_dataset = parameter_training_dataset.with_columns(
    (
        (pl.col(f"count_{word}") + 1) / 
        (pl.col("total") + 2 )
    ).alias(f"{word}") for word in top_words
)

parameter_training_dataset

Spam/Ham,count_only,count_mail,count_vince,count_over,count_what,count_01,count_need,count_m,count_thanks,count_some,count_power,count_market,count_like,count_d,count_7,count_corp,count_www,count_should,count_o,count_into,count_re,count_stock,count_12,count_year,count_inc,count_6,count_could,count_just,count_8,count_who,count_30,count_sent,count_were,count_deal,count_statements,count_report,…,june,center,executive,required,expected,electricity,capital,done,reserved,case,type,action,read,chairman,increase,contracts,oo,80,capacity,better,private,public,el,edu,equity,na,paid,daily,department,material,steve,volume,45,commission,opportunity,really,earnings
str,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""spam""",2941,5134,185,4042,2090,1413,2701,14044,1045,2528,1180,1641,2587,14051,5461,946,2444,1683,14251,1925,13660,1184,2059,1803,5289,5462,1387,2501,5275,2146,2228,2702,1334,1119,639,1192,…,0.019186,0.035512,0.014791,0.045838,0.027001,0.005302,0.023233,0.039978,0.013884,0.038792,0.043606,0.119166,0.162771,0.008303,0.074374,0.011512,0.557385,0.110793,0.004395,0.040745,0.059304,0.068234,0.738575,0.074304,0.018838,0.635805,0.053931,0.016954,0.040396,0.062722,0.005721,0.016675,0.083234,0.018419,0.035931,0.038303,0.006628
"""ham""",1732,3145,2494,2954,2545,5787,3629,13363,4572,3194,1769,2037,2926,13753,7391,2710,935,2651,13888,1837,12504,400,3486,1888,5620,7068,1925,2205,7246,2367,2941,4091,1676,2304,140,1589,…,0.053653,0.041096,0.041025,0.053154,0.035959,0.028824,0.038884,0.062643,0.017266,0.055651,0.050585,0.093393,0.152255,0.026826,0.042737,0.042237,0.459689,0.090183,0.040953,0.044949,0.018907,0.04659,0.705908,0.250856,0.022617,0.679366,0.026755,0.054152,0.043094,0.029894,0.073202,0.078624,0.104737,0.024401,0.044521,0.041952,0.015625


In [10]:
probabilities = parameter_training_dataset.select([
    "Spam/Ham", *[ f"{word}" for word in top_words ]
    ])

probabilities = probabilities.unpivot(index="Spam/Ham", variable_name="word").pivot("Spam/Ham", index="word")

probabilities

word,spam,ham
str,f64,f64
"""only""",0.205261,0.123644
"""mail""",0.358264,0.224458
"""vince""",0.012977,0.178011
"""over""",0.282076,0.21083
"""what""",0.145887,0.18165
…,…,…
"""45""",0.083234,0.104737
"""commission""",0.018419,0.024401
"""opportunity""",0.035931,0.044521
"""really""",0.038303,0.041952


# Testing the model

fitting is done, now its time to test the model

> Note: apparently writing a funtion that does the prediction is terribly inefficent.
> This is probably due to how python handles passig parameters around.

In [11]:
import numpy as np

total_emails = df_spam.shape[0]

spam_emails = totals.filter(pl.col("Spam/Ham") == "spam").select("total").row(0)[0]

p_spam = spam_emails/total_emails

p_ham = 1 - p_spam

In [26]:
email_text = df_test.sample()

print(email_text)

print(f"Ground Truth: {email_text["Spam/Ham"][0]}\n---")

words = pl.Series(
    ' '.join(email_text['Message'].str.to_lowercase()).split()
)

spam_score = np.log(p_spam)

ham_score = np.log(p_ham)

for word in words:
    word_probs = probabilities.filter(pl.col("word") == word)
    if not word_probs.is_empty():
        P_word_spam = word_probs["spam"][0]
        P_word_ham = word_probs["ham"][0]
        spam_score += np.log(P_word_spam)
        ham_score += np.log(P_word_ham)

print(f"Spam Score: {spam_score}")
print(f"Ham Score : {ham_score}")

spam_probability = np.pow(10, spam_score) / (np.pow(10, spam_score) + np.pow(10, ham_score))

print(f"Computed spam probability: {spam_probability}")
print(f"Verdict: {"spam" if spam_score > ham_score else "ham"}")

shape: (1, 5)
┌────────────┬─────────────────────────────────┬─────────────────────┬──────────┬────────────┐
│ Message ID ┆ Subject                         ┆ Message             ┆ Spam/Ham ┆ Date       │
│ ---        ┆ ---                             ┆ ---                 ┆ ---      ┆ ---        │
│ i64        ┆ str                             ┆ str                 ┆ str      ┆ str        │
╞════════════╪═════════════════════════════════╪═════════════════════╪══════════╪════════════╡
│ 19439      ┆ tpntfhb up ~ d _ ate : medi ~ … ┆ boorgaten bekorstte ┆ spam     ┆ 2004-07-26 │
│            ┆                                 ┆ aggregaat           ┆          ┆            │
│            ┆                                 ┆ …                   ┆          ┆            │
└────────────┴─────────────────────────────────┴─────────────────────┴──────────┴────────────┘
Ground Truth: spam
---
Spam Score: -60.06694277371278
Ham Score : -54.58877083500348
Computed spam probability: 3.3252677253236977e

# Accuracy calculation

Lets see how good is this

> Note: as said before a function that does the prediction runs at `<10 it/s` however the approach below goes up to `50ish it/s` (likely just a skill issue)

In [28]:
from tqdm import tqdm

email_text = df_test.sample()

#print(email_text)

#print(f"Ground Truth: {email_text['Spam/Ham'][0]}\n---")

#predict(probabilities, email_text['Message'].str.to_lowercase().item(), p_spam)

num_correct = 0
total = len(df_test)

#print(f"Total emails in test set: {total}")

for email in tqdm(df_test.iter_rows()):
    ground_truth = email[3]
    email_message = email[2].lower()
    words = set(email_message.split())
    
    spam_score = np.log(p_spam)
    ham_score = np.log(p_ham)
    
    for word in words:
        word_probs = probabilities.filter(pl.col("word") == word)
        if not word_probs.is_empty():
            P_word_spam = word_probs["spam"][0]
            P_word_ham = word_probs["ham"][0]
            spam_score += np.log(P_word_spam)
            ham_score += np.log(P_word_ham)
    
    verdict = "spam" if spam_score > ham_score else "ham"
    if verdict == ground_truth:
        num_correct += 1

print(f"Accuracy: {num_correct/total}")

0it [00:00, ?it/s]

5000it [01:53, 44.18it/s]

Accuracy: 0.8724



