# SPAM Hit Man

Naive bayes approach to spam detection

[Dataset](https://github.com/MWiechmann/enron_spam_data)

In [10]:
import polars as pl

df_spam = pl.read_csv("enron_spam_data.csv")

df_spam = df_spam.filter(pl.col("Message").is_not_null())

df_spam = df_spam.sample(fraction=1, shuffle=True)

test_size = 5000

df_test, df_train = df_spam.head(test_size), df_spam.tail(-test_size)

## Most common words

In the naive bayes approach the presence of a word is a Bernoulli variable.
Each one of them conditionaly indipendent from one another, given the evidence.

The evidence in this case is also referred to as the ground truth.

The 'spam', 'ham' classification is the ground truth

Here we take top 500 most common words to build the bayesian network, every word becomes a variable in the network

In [None]:
most_common_words = pl.Series(
    ' '.join(df_train['Message'].str.to_lowercase()).split()
).value_counts()

most_common_words = most_common_words.filter(pl.col('').str.contains(r'^\w+$'))

In [13]:
most_common_words['']
top_words = most_common_words.sort('count', descending=True)[:500]['']
top_words

"""the"""
"""to"""
"""and"""
"""of"""
"""a"""
…
"""customer"""
"""k"""
"""communications"""
"""90"""
"""events"""


## Building and training the model

idk

In [15]:
parameter_training_dataset = df_train.with_columns(
    pl.col('Message').str.contains(f"{word}").cast(pl.Int32).alias(word) for word in top_words
)

In [16]:
parameter_training_dataset

Message ID,Subject,Message,Spam/Ham,Date,the,to,and,of,a,in,for,you,is,_,this,enron,on,that,i,s,with,be,your,we,as,from,have,will,it,are,ect,or,at,by,not,our,…,director,july,special,schedule,conference,low,employees,once,v,issues,reply,texas,rights,north,lon,using,another,move,stop,quarter,check,ees,soon,numbers,meter,regarding,sure,issue,et,states,mmbtu,less,customer,k,communications,90,events
i64,str,str,str,str,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,…,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32
18401,"""order prescriptions directly f…","""order prescriptions directly f…","""spam""","""2004-02-22""",0,0,1,0,1,1,1,0,1,0,0,0,1,0,1,1,0,1,0,1,1,1,0,0,1,0,1,1,0,0,0,0,…,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
30751,"""your reliable pharmacy at the …","""save more on your prescription…","""spam""","""2004-12-11""",1,1,1,1,1,1,1,1,1,0,1,0,1,1,1,1,1,1,1,1,1,0,1,1,1,1,0,1,1,0,0,1,…,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,1,0,0,0
14651,"""re : enrononline patent applic…","""netco . mark mark haedicke""","""ham""","""2002-01-08""",0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0
10817,"""more then 70 great pornstars s…","""come explore the world ' s lar…","""spam""","""2005-07-18""",1,0,0,0,1,1,0,0,1,0,0,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,1,1,1,1,0,0,…,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
4783,"""you are our lucky winner ! !""","""de national lottery po box 101…","""spam""","""2005-02-26""",1,1,1,1,1,1,1,1,1,0,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,…,1,0,0,0,0,0,0,0,1,0,1,0,0,1,1,0,0,0,0,1,0,0,1,1,0,0,0,0,1,1,0,0,0,1,0,1,0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
7384,"""message from charles shen at w…","""dear vince : how are you ? tha…","""ham""","""2000-10-25""",1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,…,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0
5626,"""re : trip to san francisco 3 /…","""bryan , i talked to vasant abo…","""ham""","""2000-03-01""",1,1,1,1,1,1,1,1,1,0,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,0,1,1,…,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0
9652,"""free step - by - step seminar …","""affluent senior lead program m…","""spam""","""2002-06-05""",1,1,1,1,1,1,1,0,1,0,1,0,1,0,1,1,1,1,0,1,1,1,0,0,1,1,1,1,1,1,1,1,…,0,0,1,0,0,1,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0
28749,"""oneok letter""","""attached is a revised draft of…","""ham""","""2001-08-07""",1,1,1,1,1,1,1,1,1,0,1,0,1,0,1,1,0,1,0,1,1,1,0,1,1,0,0,1,1,0,0,0,…,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0


In [17]:
word_counts = parameter_training_dataset.group_by('Spam/Ham').agg(
    pl.sum(word).alias(f'count_{word}') for word in top_words
)
word_counts

Spam/Ham,count_the,count_to,count_and,count_of,count_a,count_in,count_for,count_you,count_is,count__,count_this,count_enron,count_on,count_that,count_i,count_s,count_with,count_be,count_your,count_we,count_as,count_from,count_have,count_will,count_it,count_are,count_ect,count_or,count_at,count_by,count_not,count_our,count_if,count_com,count_company,count_1,…,count_director,count_july,count_special,count_schedule,count_conference,count_low,count_employees,count_once,count_v,count_issues,count_reply,count_texas,count_rights,count_north,count_lon,count_using,count_another,count_move,count_stop,count_quarter,count_check,count_ees,count_soon,count_numbers,count_meter,count_regarding,count_sure,count_issue,count_et,count_states,count_mmbtu,count_less,count_customer,count_k,count_communications,count_90,count_events
str,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,…,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32
"""ham""",11705,11821,10498,9296,13913,12714,11193,9287,11983,1022,7546,6296,12553,6555,13833,13872,6953,10388,4939,10012,11647,6693,7474,6920,10654,7429,8949,12501,12796,5726,5954,8261,8531,7421,1342,10814,…,908,845,730,2221,821,4781,659,1647,12478,1059,324,926,255,1131,2582,877,796,1124,591,391,820,1479,940,561,766,1108,2283,1777,10623,345,623,1102,897,12625,421,1041,289
"""spam""",11131,12640,10784,10340,14207,13568,9801,10717,12305,1477,6219,1,13181,5063,14250,14166,6390,10933,8136,9814,11082,5167,5956,4256,12390,7783,6318,13079,12579,4756,6131,10200,7431,8852,2101,7857,…,741,247,1735,92,59,4459,77,1434,13520,139,1192,139,326,501,2712,829,449,2297,1684,321,1274,701,790,556,263,388,1736,541,10940,959,0,1633,1447,12744,251,1224,596


In [18]:
totals = parameter_training_dataset.group_by('Spam/Ham').len().rename({"len": "total"})

parameter_training_dataset = word_counts.join(totals, on='Spam/Ham')

# column naming is not very intuitive, but "melts" better
# +1 and +2 terms are used for Laplace Smoothing, assuming uniform probability
parameter_training_dataset = parameter_training_dataset.with_columns(
    (
        (pl.col(f"count_{word}") + 1) / 
        (pl.col("total") + 2 )
    ).alias(f"{word}") for word in top_words
)

parameter_training_dataset

Spam/Ham,count_the,count_to,count_and,count_of,count_a,count_in,count_for,count_you,count_is,count__,count_this,count_enron,count_on,count_that,count_i,count_s,count_with,count_be,count_your,count_we,count_as,count_from,count_have,count_will,count_it,count_are,count_ect,count_or,count_at,count_by,count_not,count_our,count_if,count_com,count_company,count_1,…,director,july,special,schedule,conference,low,employees,once,v,issues,reply,texas,rights,north,lon,using,another,move,stop,quarter,check,ees,soon,numbers,meter,regarding,sure,issue,et,states,mmbtu,less,customer,k,communications,90,events
str,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""ham""",11705,11821,10498,9296,13913,12714,11193,9287,11983,1022,7546,6296,12553,6555,13833,13872,6953,10388,4939,10012,11647,6693,7474,6920,10654,7429,8949,12501,12796,5726,5954,8261,8531,7421,1342,10814,…,0.065087,0.060576,0.052341,0.159101,0.058857,0.342403,0.047258,0.118001,0.893527,0.075899,0.023271,0.066375,0.01833,0.081054,0.184949,0.062867,0.057067,0.080553,0.042389,0.028068,0.058786,0.105972,0.067378,0.040241,0.054919,0.079407,0.16354,0.127309,0.760705,0.024774,0.04468,0.078978,0.064299,0.904053,0.030216,0.07461,0.020765
"""spam""",11131,12640,10784,10340,14207,13568,9801,10717,12305,1477,6219,1,13181,5063,14250,14166,6390,10933,8136,9814,11082,5167,5956,4256,12390,7783,6318,13079,12579,4756,6131,10200,7431,8852,2101,7857,…,0.051589,0.017243,0.120698,0.006466,0.004172,0.310088,0.005423,0.099771,0.940068,0.009734,0.082945,0.009734,0.022735,0.034902,0.188625,0.057707,0.031287,0.159772,0.117152,0.022388,0.088646,0.048808,0.054995,0.038726,0.018355,0.027046,0.120768,0.037683,0.76069,0.066745,7e-05,0.113606,0.100674,0.886116,0.017521,0.08517,0.041507


In [19]:
probabilities = parameter_training_dataset.select([
    "Spam/Ham", *[ f"{word}" for word in top_words ]
    ])

probabilities = probabilities.unpivot(index="Spam/Ham", variable_name="word").pivot("Spam/Ham", index="word")

probabilities

word,ham,spam
str,f64,f64
"""the""",0.838178,0.773969
"""to""",0.846484,0.878885
"""and""",0.751754,0.749844
"""of""",0.665688,0.718974
"""a""",0.996277,0.987833
…,…,…
"""customer""",0.064299,0.100674
"""k""",0.904053,0.886116
"""communications""",0.030216,0.017521
"""90""",0.07461,0.08517


# Testing the model

fitting is done, now its time to test the model

> Note: apparently writing a funtion that does the prediction is terribly inefficent.
> This is probably due to how python handles passig parameters around.

In [21]:
import numpy as np

total_emails = df_spam.shape[0]

spam_emails = totals.filter(pl.col("Spam/Ham") == "spam").select("total").row(0)[0]

p_spam = spam_emails/total_emails

p_ham = 1 - p_spam

In [None]:
email_text = df_test.sample()

print(email_text)

print(f"Ground Truth: {email_text["Spam/Ham"][0]}\n---")

words = pl.Series(
    ' '.join(email_text['Message'].str.to_lowercase()).split()
)

spam_score = np.log(p_spam)

ham_score = np.log(p_ham)

for word in words:
    word_probs = probabilities.filter(pl.col("word") == word)
    if not word_probs.is_empty():
        P_word_spam = word_probs["spam"][0]
        P_word_ham = word_probs["ham"][0]
        spam_score += np.log(P_word_spam)
        ham_score += np.log(P_word_ham)

print(f"Spam Score: {spam_score}")
print(f"Ham Score : {ham_score}")

spam_probability = np.pow(10, spam_score) / (np.pow(10, spam_score) + np.pow(10, ham_score))

print(f"Computed spam probability: {spam_probability}")
print(f"Verdict: {"spam" if spam_score > ham_score else "ham"}")

shape: (1, 5)
┌────────────┬───────────────────────────┬─────────────────────────┬──────────┬────────────┐
│ Message ID ┆ Subject                   ┆ Message                 ┆ Spam/Ham ┆ Date       │
│ ---        ┆ ---                       ┆ ---                     ┆ ---      ┆ ---        │
│ i64        ┆ str                       ┆ str                     ┆ str      ┆ str        │
╞════════════╪═══════════════════════════╪═════════════════════════╪══════════╪════════════╡
│ 25441      ┆ are you ready to get it ? ┆ hello !                 ┆ spam     ┆ 2005-06-24 │
│            ┆                           ┆ viagra is the # 1 med … ┆          ┆            │
└────────────┴───────────────────────────┴─────────────────────────┴──────────┴────────────┘
Ground Truth: spam
---
Spam Score: -25.85226975054966
Ham Score : -31.174482023240977
Computed spam probability: 0.9999952380409193
Verdict: spam


# Accuracy calculation

Lets see how good is this

> Note: as said before a function that does the prediction runs at `<10 it/s` however the approach below goes up to `50ish it/s` (likely just a skill issue)

In [28]:
from tqdm import tqdm

email_text = df_test.sample()

#print(email_text)

#print(f"Ground Truth: {email_text['Spam/Ham'][0]}\n---")

#predict(probabilities, email_text['Message'].str.to_lowercase().item(), p_spam)

num_correct = 0
total = len(df_test)

#print(f"Total emails in test set: {total}")

for email in tqdm(df_test.iter_rows()):
    ground_truth = email[3]
    email_message = email[2].lower()
    words = set(email_message.split())
    
    spam_score = np.log(p_spam)
    ham_score = np.log(p_ham)
    
    for word in words:
        word_probs = probabilities.filter(pl.col("word") == word)
        if not word_probs.is_empty():
            P_word_spam = word_probs["spam"][0]
            P_word_ham = word_probs["ham"][0]
            spam_score += np.log(P_word_spam)
            ham_score += np.log(P_word_ham)
    
    verdict = "spam" if spam_score > ham_score else "ham"
    if verdict == ground_truth:
        num_correct += 1

print(f"Accuracy: {num_correct/total}")

5000it [01:51, 44.68it/s]

Accuracy: 0.8548



