In [142]:
import numpy as np
import pandas as pd
import string
from sklearn.model_selection import train_test_split

In [143]:
df = pd.read_csv('spam_ham_dataset.csv', on_bad_lines='skip')
df = df.drop(['Unnamed: 0', 'label_num'], axis=1)
df.head()

Unnamed: 0,label,text
0,ham,Subject: enron methanol ; meter # : 988291\r\n...
1,ham,"Subject: hpl nom for january 9 , 2001\r\n( see..."
2,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar..."
3,spam,"Subject: photoshop , windows , office . cheap ..."
4,ham,Subject: re : indian springs\r\nthis deal is t...


In [144]:
df['text'][3]

'Subject: photoshop , windows , office . cheap . main trending\r\nabasements darer prudently fortuitous undergone\r\nlighthearted charm orinoco taster\r\nrailroad affluent pornographic cuvier\r\nirvin parkhouse blameworthy chlorophyll\r\nrobed diagrammatic fogarty clears bayda\r\ninconveniencing managing represented smartness hashish\r\nacademies shareholders unload badness\r\ndanielson pure caffein\r\nspaniard chargeable levin\r\n'

In [145]:
def clean_text(s): 
    s = s.replace("Subject: ", "")
    for cs in s:
        if  not cs in string.ascii_letters:
            s = s.replace(cs, ' ')
    return s.rstrip('\r\n')

def remove_little(s): 
    words = s.split()
    words_clean = [w for w in words if len(w) > 2]
    resultString = ' '.join(words_clean)
    return resultString

def split_text(s):
    words = s.split()
    return list(set(words))

df['text'] = df['text'].apply(lambda x: clean_text(x))
df['text'] = df['text'].apply(lambda x: remove_little(x))
df['text'] = df['text'].apply(lambda x: split_text(x))

df.head()

Unnamed: 0,label,text
0,ham,"[preliminary, gas, from, asap, provided, for, ..."
1,ham,"[for, xls, hpl, hplnol, file, attached, januar..."
2,ham,"[conference, brad, prevail, outside, years, we..."
3,spam,"[undergone, chlorophyll, photoshop, managing, ..."
4,ham,"[need, whether, giving, lets, pvr, associated,..."


In [146]:
trainset, testset = train_test_split(df, test_size=0.1)
trainsize = len(trainset)
testsize = len(testset)

print(f"train: {trainsize}, test: {testsize}")

train: 4653, test: 518


In [147]:
trainset.head()

Unnamed: 0,label,text
1554,spam,"[managing, commercial, paste, browser, sale, a..."
2366,ham,"[need, these, make, for, one, requests, and, s..."
2401,ham,"[these, paste, new, send, inbox, thanks, folde..."
409,ham,"[teco, for, enron, hpl, actuals, iferc, novemb..."
2445,spam,"[secrets, help, city, brenton, stars, cock, ra..."


In [148]:
spamset = trainset[trainset['label']=='spam']
hamset = trainset[trainset['label']=='ham']

p_spam = len(spamset)/len(trainset)
p_ham = len(hamset)/len(trainset)

In [149]:
vocab = {}
def count_words(s, vocab):
    for w in s:
        if not w in vocab:
             vocab[w] = 1
        else:
             vocab[w] += 1

def count_table(df):
     vocab = {}
     df['text'].apply(lambda x: count_words(x, vocab))
     vocab = sorted(vocab.items(), key=lambda x:x[1], reverse=True)
     return pd.DataFrame(vocab), set([w for w,_ in vocab])

total, vocab = count_table(trainset)
spamcount,_ = count_table(spamset)
hamcount,_ = count_table(hamset)

In [150]:
temp = pd.merge(total, spamcount, on=0, how='outer').fillna(0)
count = pd.merge(temp,hamcount,on=0, how='outer').fillna(0)
count = count.astype({'1_x':int, '1_y':int, 1:int})
count = count.rename(columns={0:'word', '1_x':'total', '1_y':'spam', 1:'ham'})
count.head()

Unnamed: 0,word,total,spam,ham
0,for,3237,730,2507
1,the,3234,807,2427
2,and,2532,776,1756
3,you,2465,774,1691
4,this,2037,503,1534


In [151]:
spamsize = len(spamset)
hamsize = len(hamset)

count['prob-spam'] = np.log((count['spam']+1)/(spamsize+2))
count['prob-ham'] = np.log((count['ham']+1)/(hamsize+2))
count.head()

Unnamed: 0,word,total,spam,ham,prob-spam,prob-ham
0,for,3237,730,2507,-0.613446,-0.276556
1,the,3234,807,2427,-0.513298,-0.308974
2,and,2532,776,1756,-0.55242,-0.632434
3,you,2465,774,1691,-0.554997,-0.67013
4,this,2037,503,1534,-0.985284,-0.767511


In [152]:
# count = count[count['total']>=500]

In [153]:
testset.head()

Unnamed: 0,label,text
1964,ham,"[jpanos, all, need, jason, from, for, and, gre..."
3889,spam,"[strategy, first, software, dedicated, line, p..."
17,ham,"[first, morris, make, from, and, rodriguez, br..."
499,spam,"[more, injections]"
4310,ham,"[for, xls, hpl, attached, hplno, march, file, ..."


In [154]:
len(testset)

518

In [155]:
def predict(s):
    ps, ph = np.log(p_spam), np.log(p_ham)
    for w in s:
        if w not in vocab:
            continue
        
        ps += count[count['word']==w]['prob-spam'].values[0]
        ph += count[count['word']==w]['prob-ham'].values[0]

    if ps > ph:
        return 'spam'
    else: return 'ham'

In [156]:
testset['y_pred'] = testset['text'].apply(lambda x: predict(x))
testset['correct'] = (testset['label']==testset['y_pred']).astype(int)
testset.head()

Unnamed: 0,label,text,y_pred,correct
1964,ham,"[jpanos, all, need, jason, from, for, and, gre...",ham,1
3889,spam,"[strategy, first, software, dedicated, line, p...",spam,1
17,ham,"[first, morris, make, from, and, rodriguez, br...",ham,1
499,spam,"[more, injections]",spam,1
4310,ham,"[for, xls, hpl, attached, hplno, march, file, ...",ham,1


In [140]:
ex1 = 'click here to earn free credits in your AI course'
ex2 = 'hello darkness my old friend, Ive come to talk with you again'

print(predict(ex1.split()))
print(predict(ex2.split()))

spam
spam
