In [1]:
import pandas as pd

## Spam Detector

In [2]:
# The file is tab-separated and has no header row, so the two columns are
# named explicitly: the class label first, then the message text.
path = 'sms.tsv.txt'
data = pd.read_csv(
    path,
    sep='\t',
    header=None,
    names=['label', 'message'],
)
data

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [3]:
# class balance: how many messages carry each label
data['label'].value_counts()

ham     4825
spam     747
Name: label, dtype: int64

In [4]:
# spam example: message text of the row at position 10 among the spam rows
print(data.loc[data['label'] == 'spam', 'message'].iloc[10])

SMS. ac Sptv: The New Jersey Devils and the Detroit Red Wings play Ice Hockey. Correct or Incorrect? End? Reply END SPTV


In [5]:
# ham example: message text of the row at position 10 among the ham rows
print(data.loc[data['label'] == 'ham', 'message'].iloc[10])

Eh u remember how 2 spell his name... Yes i did. He v naughty make until i v wet.


**Naive Bayes**

- based on Bayes' theorem 
    - $\text{p(A|B)} = \frac{\text{p(B|A)p(A)}}{\text{p(B)}}$
    - "prob of A given B"
- setting 
    - labels: 1,2,...,k
        - 1 = spam, 2 = ham
    - data: n documents 
    - vocabulary: $w_1,w_2,...,w_d$ (d words)
- for each document 
    - x = [$n_1,n_2,...,n_d$]   "count vector"
    - $n_i$ = number of times word $w_i$ appears in the document
    - $\text{p(label = i|x) =} \frac{\text{p(x|label = i)p(label = i)}}{\text{p(x)}}$
    - $\text{p(x|label=i) = p(}n_1\text{|label=i)p(}n_2\text{|label=i)...p(}n_d\text{|label=i)}$
    - $p(n_j|label=i) = \theta_{ij}^{\,n_j}$, where $\theta_{ij} = \frac{\text{number of times } w_j \text{ appears in label-i documents} + \alpha}{\text{total count of words in label-i documents} + d\alpha}$
        - given the label, probability that $w_j$ appears $n_j$ times 
        - smoothing parameter $\alpha$
            - $0 \leq \alpha \leq 1$
            - prevents 0 probabilities
    - $\text{p(label=i)} = \frac{\text{number of label-i documents}}{\text{number of documents}}$
- prediction rule: predict the label with the largest probability 
    - $\text{p(label=1|x)}$ \
      $\text{p(label=2|x)}$ \
      $\text{.}$ \
      $\text{.}$ \
      $\text{.}$ \
      $\text{p(label=k|x)}$

In [6]:
# p(label=ham): fraction of all messages labelled ham.
# Computed from `data` rather than from hard-coded counts so the prior
# stays correct if the dataset changes. The mean of a boolean column is
# (number of True rows) / (number of rows).
float((data['label'] == 'ham').mean())

0.8659368269921034

In [7]:
# p(label=spam): fraction of all messages labelled spam.
# Computed from `data` rather than from hard-coded counts so the prior
# stays correct if the dataset changes. The mean of a boolean column is
# (number of True rows) / (number of rows).
float((data['label'] == 'spam').mean())

0.13406317300789664

In [1]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix

In [9]:
# Two-step text classifier: raw messages -> token count vectors -> Naive Bayes.
pipe = Pipeline(
    steps=[
        # vocabulary is capped at 1000 terms
        ('vect', CountVectorizer(max_features=1000)),
        # multinomial Naive Bayes on the count vectors (default settings)
        ('clf', MultinomialNB()),
    ]
)

In [10]:
# features are the raw message strings; targets are the ham/spam labels
X, y = data['message'], data['label']

In [11]:
from sklearn.model_selection import train_test_split

# Hold out part of the data for evaluation (default split proportions).
# random_state is fixed so the split, and every number derived from it
# below (confusion matrix, per-class word counts), is the same on each
# Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [12]:
# fit pipeline to training data: the vectorizer and the classifier are
# both fitted on the training messages only
pipe.fit(X=X_train, y=y_train)

Pipeline(steps=[('vect', CountVectorizer(max_features=1000)),
                ('clf', MultinomialNB())])

In [13]:
# evaluate the pipeline on the held-out messages
y_test_pred = pipe.predict(X_test)
# rows are the true labels, columns are the predicted labels
confusion_matrix(y_true=y_test, y_pred=y_test_pred)

array([[1197,   10],
       [  12,  174]], dtype=int64)

## How does Naive Bayes choose between spam and ham?

In [22]:
# store vocabulary: the terms the fitted vectorizer kept as features
words = pipe.named_steps['vect'].get_feature_names_out()

In [23]:
# NB counts the number of times each word appears in each class
# (one row per class, one column per vocabulary word)
pipe.named_steps['clf'].feature_count_

array([[ 0.,  0.,  0., ...,  8., 17., 36.],
       [26., 12., 10., ...,  8.,  1.,  0.]])

In [24]:
# class labels known to the fitted classifier
pipe.named_steps['clf'].classes_

array(['ham', 'spam'], dtype='<U4')

In [25]:
# Rows of feature_count_ follow the order of classes_ shown above:
# row 0 is 'ham', row 1 is 'spam'.
per_class_counts = pipe.named_steps['clf'].feature_count_
ham_word_count = per_class_counts[0]
spam_word_count = per_class_counts[1]

In [27]:
# Build a table of per-class word counts indexed by the vocabulary word.
# Every count is increased by 1 so the ham/spam ratios computed later
# never divide by 0.
df = (
    pd.DataFrame({
        'words': words,
        'ham': ham_word_count,
        'spam': spam_word_count,
    })
    .set_index('words')
    .assign(
        spam=lambda counts: counts['spam'] + 1,
        ham=lambda counts: counts['ham'] + 1,
    )
)
df

Unnamed: 0_level_0,ham,spam
words,Unnamed: 1_level_1,Unnamed: 2_level_1
000,1.0,27.0
03,1.0,13.0
04,1.0,11.0
0800,1.0,10.0
08000839402,1.0,12.0
...,...,...
you,1489.0,236.0
your,296.0,204.0
yours,9.0,9.0
yourself,18.0,2.0


In [28]:
# convert counts into frequencies: each class column is divided by its
# own total
df['spam'] = df['spam'].div(df['spam'].sum())
df['ham'] = df['ham'].div(df['ham'].sum())

In [29]:
# display the table of word statistics in its current state
df

Unnamed: 0_level_0,ham,spam
words,Unnamed: 1_level_1,Unnamed: 2_level_1
000,0.000025,0.002507
03,0.000025,0.001207
04,0.000025,0.001022
0800,0.000025,0.000929
08000839402,0.000025,0.001114
...,...,...
you,0.037762,0.021917
your,0.007507,0.018945
yours,0.000228,0.000836
yourself,0.000456,0.000186


In [30]:
# For each word, compare its frequency in one class to its frequency in
# the other: ham_ratio = ham / spam, spam_ratio = spam / ham.
df['ham_ratio'] = df['ham'].div(df['spam'])
df['spam_ratio'] = df['spam'].div(df['ham'])
df

Unnamed: 0_level_0,ham,spam,ham_ratio,spam_ratio
words,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
000,0.000025,0.002507,0.010114,98.870449
03,0.000025,0.001207,0.021007,47.604290
04,0.000025,0.001022,0.024826,40.280553
0800,0.000025,0.000929,0.027308,36.618685
08000839402,0.000025,0.001114,0.022757,43.942422
...,...,...,...,...
you,0.037762,0.021917,1.722979,0.580390
your,0.007507,0.018945,0.396240,2.523720
yours,0.000228,0.000836,0.273085,3.661868
yourself,0.000456,0.000186,2.457762,0.406874


In [31]:
# top 20 spam words: largest spam-to-ham frequency ratio first
(
    df
    .sort_values('spam_ratio', ascending=False)
    .iloc[:20]
)

Unnamed: 0_level_0,ham,spam,ham_ratio,spam_ratio
words,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
claim,2.5e-05,0.008079,0.003139,318.582559
prize,2.5e-05,0.006779,0.003741,267.3164
uk,2.5e-05,0.005386,0.004708,212.388373
150p,2.5e-05,0.004736,0.005355,186.755293
tone,2.5e-05,0.0039,0.006502,153.798477
18,2.5e-05,0.003436,0.007381,135.489134
1000,2.5e-05,0.003158,0.008032,124.503529
guaranteed,2.5e-05,0.003065,0.008275,120.84166
cs,2.5e-05,0.003065,0.008275,120.84166
500,2.5e-05,0.002786,0.009103,109.856055


In [32]:
# top 20 ham words: largest ham-to-spam frequency ratio first
(
    df
    .sort_values('ham_ratio', ascending=False)
    .iloc[:20]
)

Unnamed: 0_level_0,ham,spam,ham_ratio,spam_ratio
words,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
gt,0.005757,9.3e-05,61.990211,0.016132
lt,0.005681,9.3e-05,61.170957,0.016348
he,0.004463,9.3e-05,48.062895,0.020806
she,0.003246,9.3e-05,34.954832,0.028608
lor,0.002866,9.3e-05,30.858563,0.032406
da,0.002866,9.3e-05,30.858563,0.032406
did,0.002561,9.3e-05,27.581548,0.036256
later,0.002435,9.3e-05,26.216124,0.038144
come,0.004362,0.000186,23.485278,0.04258
work,0.001801,9.3e-05,19.389009,0.051576
