# SPAM Hit Man

Naive bayes approach to spam detection

[Dataset](https://github.com/MWiechmann/enron_spam_data)

In [3]:
import polars as pl

df_spam = pl.read_csv("enron_spam_data.csv")
df_spam

Message ID,Subject,Message,Spam/Ham,Date
i64,str,str,str,str
0,"""christmas tree farm pictures""",,"""ham""","""1999-12-10"""
1,"""vastar resources , inc .""","""gary , production from the hig…","""ham""","""1999-12-13"""
2,"""calpine daily gas nomination""","""- calpine daily gas nomination…","""ham""","""1999-12-14"""
3,"""re : issue""","""fyi - see note below - already…","""ham""","""1999-12-14"""
4,"""meter 7268 nov allocation""","""fyi . - - - - - - - - - - - - …","""ham""","""1999-12-14"""
…,…,…,…,…
33711,"""= ? iso - 8859 - 1 ? q ? good …","""hello , welcome to gigapharm o…","""spam""","""2005-07-29"""
33712,"""all prescript medicines are on…","""i got it earlier than expected…","""spam""","""2005-07-29"""
33713,"""the next generation online pha…","""are you ready to rock on ? let…","""spam""","""2005-07-30"""
33714,"""bloow in 5 - 10 times the time""","""learn how to last 5 - 10 times…","""spam""","""2005-07-30"""


## Most common words

In the naive bayes approach the presence of a word is a Bernoulli variable.
Each one of them conditionaly indipendent from one another, given the evidence.

The evidence in this case is also referred to as the ground truth.

The 'spam', 'ham' classification is the ground truth

Here we take top 500 most common words to build the bayesian network, every word becomes a variable in the network

In [4]:
from collections import Counter

df_filtered = df_spam.filter(pl.col("Message").is_not_null())
most_common_words = pl.Series(
    ' '.join(df_filtered['Message'].str.to_lowercase()).split()
).value_counts()

most_common_words = most_common_words.filter(pl.col('').str.contains(r'^\w+$'))

In [5]:
most_common_words['']
top_words = most_common_words.sort('count', descending=True)[:500]['']
top_words

"""the"""
"""to"""
"""and"""
"""of"""
"""a"""
…
"""street"""
"""90"""
"""put"""
"""sure"""
"""states"""


## Building and training the model

idk

In [6]:
df_filtered = df_filtered.with_columns(
    pl.col('Message').str.contains(f"{word}").cast(pl.Int32).alias(word) for word in top_words
)

In [7]:
df_filtered

Message ID,Subject,Message,Spam/Ham,Date,the,to,and,of,a,in,for,you,is,_,this,enron,on,that,i,s,with,be,your,we,as,from,have,will,it,are,ect,or,at,by,not,our,…,rights,point,v,quarter,texas,director,short,once,numbers,schedule,reply,low,special,another,july,issues,check,meter,move,ena,north,using,lon,soon,regarding,issue,employees,customer,k,stop,et,terms,street,90,put,sure,states
i64,str,str,str,str,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,…,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32
1,"""vastar resources , inc .""","""gary , production from the hig…","""ham""","""1999-12-13""",1,1,1,1,1,1,1,1,1,0,0,1,1,1,1,1,1,1,0,1,1,1,0,0,1,1,1,1,1,1,1,1,…,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0
2,"""calpine daily gas nomination""","""- calpine daily gas nomination…","""ham""","""1999-12-14""",0,0,0,0,1,1,0,0,0,0,0,0,1,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,"""re : issue""","""fyi - see note below - already…","""ham""","""1999-12-14""",0,1,0,1,1,1,1,1,1,0,1,0,1,0,1,1,1,1,0,1,1,1,1,0,1,1,1,1,1,1,1,0,…,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0
4,"""meter 7268 nov allocation""","""fyi . - - - - - - - - - - - - …","""ham""","""1999-12-14""",1,1,1,1,1,1,1,0,1,0,1,0,1,0,1,1,0,1,0,0,1,1,1,0,1,1,1,1,1,1,0,0,…,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,1,0
5,"""mcmullen gas for 11 / 99""","""jackie , since the inlet to 3 …","""ham""","""1999-12-14""",1,1,1,1,1,1,1,0,1,0,1,0,1,0,1,1,0,1,0,1,1,1,0,0,0,0,0,1,1,0,0,0,…,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
33711,"""= ? iso - 8859 - 1 ? q ? good …","""hello , welcome to gigapharm o…","""spam""","""2005-07-29""",1,1,1,1,1,1,0,1,0,0,0,0,1,0,1,1,0,0,1,1,1,0,1,0,0,0,0,0,1,0,0,1,…,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0
33712,"""all prescript medicines are on…","""i got it earlier than expected…","""spam""","""2005-07-29""",1,1,1,1,1,1,1,1,1,0,0,0,1,1,1,1,1,1,1,1,1,0,0,1,1,0,1,1,1,0,0,1,…,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0
33713,"""the next generation online pha…","""are you ready to rock on ? let…","""spam""","""2005-07-30""",1,1,0,1,1,1,0,1,1,0,0,0,1,0,1,1,0,1,1,1,0,0,1,0,1,1,0,1,1,0,0,1,…,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0
33714,"""bloow in 5 - 10 times the time""","""learn how to last 5 - 10 times…","""spam""","""2005-07-30""",0,1,0,0,1,1,0,0,0,0,0,0,1,0,1,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0


In [8]:
word_counts = df_filtered.group_by('Spam/Ham').agg(
    pl.sum(word).alias(f'count_{word}') for word in top_words
)
word_counts

Spam/Ham,count_the,count_to,count_and,count_of,count_a,count_in,count_for,count_you,count_is,count__,count_this,count_enron,count_on,count_that,count_i,count_s,count_with,count_be,count_your,count_we,count_as,count_from,count_have,count_will,count_it,count_are,count_ect,count_or,count_at,count_by,count_not,count_our,count_com,count_if,count_company,count_1,…,count_rights,count_point,count_v,count_quarter,count_texas,count_director,count_short,count_once,count_numbers,count_schedule,count_reply,count_low,count_special,count_another,count_july,count_issues,count_check,count_meter,count_move,count_ena,count_north,count_using,count_lon,count_soon,count_regarding,count_issue,count_employees,count_customer,count_k,count_stop,count_et,count_terms,count_street,count_90,count_put,count_sure,count_states
str,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,…,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32
"""spam""",13017,14801,12632,12114,16649,15888,11502,12538,14446,1725,7297,1,15456,5967,16698,16604,7513,12843,9526,11509,13003,6075,7013,4961,14547,9151,7452,15311,14745,5577,7201,11966,10362,8721,2467,9219,…,377,948,15849,374,169,864,745,1675,664,110,1408,5210,2027,530,288,157,1515,295,2687,1398,576,988,3220,911,450,624,90,1698,14924,1967,12811,713,359,1441,2059,2012,1129
"""ham""",13834,13948,12378,10996,16436,15037,13225,10979,14173,1223,8948,7439,14820,7753,16341,16388,8189,12244,5827,11856,13769,7930,8819,8204,12594,8789,10567,14768,15122,6770,7048,9763,8746,10077,1584,12767,…,316,1681,14738,464,1093,1054,1057,1943,671,2638,389,5644,866,950,989,1235,962,917,1341,1863,1334,1037,3056,1120,1339,2087,776,1074,14898,698,12563,606,572,1236,1991,2682,408


In [9]:
totals = df_filtered.group_by('Spam/Ham').len().rename({"len": "total"})

probabilities = word_counts.join(totals, on='Spam/Ham')

# column naming is not very intuitive, but "melts" better
# +1 and +2 terms are used for Laplace Smoothing, assuming 50% chance of getting a spam email
probabilities = probabilities.with_columns(
    (
        (pl.col(f"count_{word}") + 1) / 
        (pl.col("total") + 2 )
    ).alias(f"{word}") for word in top_words
)

probabilities

Spam/Ham,count_the,count_to,count_and,count_of,count_a,count_in,count_for,count_you,count_is,count__,count_this,count_enron,count_on,count_that,count_i,count_s,count_with,count_be,count_your,count_we,count_as,count_from,count_have,count_will,count_it,count_are,count_ect,count_or,count_at,count_by,count_not,count_our,count_com,count_if,count_company,count_1,…,rights,point,v,quarter,texas,director,short,once,numbers,schedule,reply,low,special,another,july,issues,check,meter,move,ena,north,using,lon,soon,regarding,issue,employees,customer,k,stop,et,terms,street,90,put,sure,states
str,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""spam""",13017,14801,12632,12114,16649,15888,11502,12538,14446,1725,7297,1,15456,5967,16698,16604,7513,12843,9526,11509,13003,6075,7013,4961,14547,9151,7452,15311,14745,5577,7201,11966,10362,8721,2467,9219,…,0.022428,0.056307,0.94043,0.02225,0.010087,0.051323,0.044262,0.099442,0.039457,0.006586,0.0836,0.309185,0.120328,0.031506,0.017147,0.009375,0.089949,0.017563,0.159487,0.083007,0.034235,0.05868,0.191112,0.054112,0.026759,0.037083,0.005399,0.100807,0.885546,0.116768,0.760176,0.042364,0.02136,0.085558,0.122226,0.119438,0.067046
"""ham""",13834,13948,12378,10996,16436,15037,13225,10979,14173,1223,8948,7439,14820,7753,16341,16388,8189,12244,5827,11856,13769,7930,8819,8204,12594,8789,10567,14768,15122,6770,7048,9763,8746,10077,1584,12767,…,0.019218,0.10197,0.893543,0.02819,0.066323,0.063959,0.064141,0.117854,0.04074,0.159988,0.023644,0.342225,0.052561,0.057654,0.060018,0.074932,0.058381,0.055653,0.081358,0.113004,0.080934,0.062928,0.185329,0.06796,0.081237,0.126584,0.047105,0.065171,0.903243,0.042376,0.761685,0.036799,0.034738,0.074992,0.120764,0.162655,0.024795


In [10]:
final_df = probabilities.select([
    "Spam/Ham", *[ f"{word}" for word in top_words ]
    ])

final_df = final_df.unpivot(index="Spam/Ham", variable_name="word").pivot("Spam/Ham", index="word")

final_df

word,spam,ham
str,f64,f64
"""the""",0.772398,0.838739
"""to""",0.878248,0.84565
"""and""",0.749555,0.75047
"""of""",0.71882,0.666687
"""a""",0.987896,0.996484
…,…,…
"""street""",0.02136,0.034738
"""90""",0.085558,0.074992
"""put""",0.122226,0.120764
"""sure""",0.119438,0.162655


# Testing the model

fitting is done, now its time to test the model

In [None]:
import numpy as np

total_emails = df_filtered.shape[0]

spam_emails = totals.filter(pl.col("Spam/Ham") == "spam").select("total").row(0)[0]

p_spam = spam_emails/total_emails

p_ham = 1 - p_spam

In [51]:
email_text = df_filtered.select("Message", "Spam/Ham")[33100]

print(email_text)

print(f"Ground Truth: {email_text["Spam/Ham"][0]}\n---")

words = pl.Series(
    ' '.join(email_text['Message'].str.to_lowercase()).split()
)

score_spam = np.log(p_spam)

score_ham = np.log(p_ham)

for word in words:
    word_probs = final_df.filter(pl.col("word") == word)
    if not word_probs.is_empty():
        P_word_spam = word_probs["spam"][0]
        P_word_ham = word_probs["ham"][0]
        score_spam += np.log(P_word_spam)
        score_ham += np.log(P_word_ham)

print(f"Spam Score: {score_spam}")
print(f"Ham Score : {score_ham}")

spam_probability = np.pow(10, score_spam) / (np.pow(10, score_spam) + np.pow(10, score_ham))

print(f"Computed spam probability: {spam_probability}")
print(f"Verdict: {"spam" if score_spam > score_ham else "ham"}")

shape: (1, 2)
┌─────────────────────────┬──────────┐
│ Message                 ┆ Spam/Ham │
│ ---                     ┆ ---      │
│ str                     ┆ str      │
╞═════════════════════════╪══════════╡
│ hello ,                 ┆ spam     │
│ did you ejaculate befo… ┆          │
└─────────────────────────┴──────────┘
Ground Truth: spam
---
Spam Score: -65.7027319955643
Ham Score : -70.79989174359073
Computed spam probability: 0.9999920046628925
Verdict: spam
