<a href="https://colab.research.google.com/github/sizhky/naive-bayes-demo/blob/main/spam-ham/step_2_coarse_tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture
!pip install torch_snippets

In [2]:
## Setup and imports
import os
from pathlib import Path
import urllib.request
import pandas as pd
from sklearn.model_selection import train_test_split
from torch_snippets import unzip_file, line

# Fetch and unpack the UCI SMS Spam Collection archive, but only when the
# extracted data file is not already present in the working directory.
if not os.path.exists('SMSSpamCollection'):
    urllib.request.urlretrieve(
        "https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip",
        "smsspamcollection.zip",
    )
    # Unpack the downloaded archive into the current directory.
    unzip_file('smsspamcollection.zip', './')

Exception: No module named 'sklego'


In [3]:
# The data file is tab-separated and has no header row, so the two column
# names are supplied here directly.
df = pd.read_csv('SMSSpamCollection', sep='\t', header=None, names=['class', 'content'])
# Fix the seed so the train/validation split -- and therefore every metric
# printed by the experiments -- is reproducible across kernel restarts.
trn_df, val_df = train_test_split(df, random_state=42)

In [4]:
# Show which vectorizer classes the text feature-extraction module exposes.
from sklearn.feature_extraction import text
print(list(filter(lambda name: 'Vector' in name, dir(text))))

# Likewise, show the naive Bayes classes exposed by sklearn.naive_bayes.
from sklearn import naive_bayes
print(list(filter(lambda name: 'NB' in name, dir(naive_bayes))))

['CountVectorizer', 'HashingVectorizer', 'TfidfVectorizer', 'VectorizerMixin', '_VectorizerMixin']
['BaseDiscreteNB', 'BaseNB', 'BernoulliNB', 'CategoricalNB', 'ComplementNB', 'GaussianNB', 'MultinomialNB', '_BaseDiscreteNB', '_BaseNB']


In [5]:
from sklearn.feature_extraction.text import (
    CountVectorizer, TfidfVectorizer
)
from sklearn.naive_bayes import (
    MultinomialNB,
    BernoulliNB
)
from sklearn.metrics import confusion_matrix, classification_report
import numpy as np

# print([f for f in dir(naive_bayes) if 'NB' in f])

# Display names used when printing the top features of each class. They are
# indexed by the row order of `feature_log_prob_`; this assumes the fitted
# model orders its classes as ('ham', 'spam'), matching the report output.
classes = ["HAM", "SPAM"]
def experiment(**kwargs):
    """Fit one vectorizer + naive Bayes combination and print its validation results.

    The function reads the notebook-level ``trn_df`` and ``val_df`` frames
    (columns ``content`` and ``class``). The vectorizer is fitted on the
    training messages only, the classifier is trained on the resulting
    features, and the confusion matrix, classification report and the
    highest-probability features of each class are printed.

    Keyword Args:
        vectorizer: Vectorizer class (not an instance) to instantiate.
            Defaults to ``CountVectorizer``.
        max_df: Forwarded to the vectorizer. Defaults to ``1.0``.
        min_df: Forwarded to the vectorizer. Defaults to ``1``.
        stop_words: Forwarded to the vectorizer. Defaults to ``None``.
        nb: Naive Bayes class (not an instance) to instantiate.
            Defaults to ``MultinomialNB``.
        alpha: Smoothing parameter passed to the classifier. Defaults to ``1.0``.
        k: Number of top features printed per class. Defaults to ``15``.

    Unrecognised keyword arguments are ignored.

    Returns:
        None. All results are printed.
    """
    _vectorizer = kwargs.get('vectorizer', CountVectorizer)
    vectorizer = _vectorizer(
        max_df=kwargs.get('max_df', 1.0),
        min_df=kwargs.get('min_df', 1),
        stop_words=kwargs.get('stop_words')
    )
    vectorizer.fit(trn_df['content'])
    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back only on releases that predate it.
    if hasattr(vectorizer, 'get_feature_names_out'):
        words = vectorizer.get_feature_names_out()
    else:
        words = vectorizer.get_feature_names()

    _nb_model = kwargs.get('nb', MultinomialNB)

    # Keep the document-term matrices sparse: the naive Bayes estimators
    # accept sparse input, whereas .todense() produces an np.matrix (rejected
    # by recent scikit-learn) and materialises every zero entry in memory.
    trn = vectorizer.transform(trn_df['content'])
    val = vectorizer.transform(val_df['content'])

    if _nb_model is BernoulliNB:
        # BernoulliNB models presence/absence, so reduce counts or weights
        # to 0/1 indicators.
        trn = (trn > 0).astype(np.float64)
        val = (val > 0).astype(np.float64)

    # alpha must be passed by keyword: the estimator constructors are
    # keyword-only in current scikit-learn releases.
    nb = _nb_model(alpha=kwargs.get('alpha', 1.0))
    nb.fit(trn, trn_df['class'])

    _y = nb.predict(val)
    y = val_df['class']

    print(confusion_matrix(y, _y))
    print(classification_report(y, _y))

    feature_probs = np.asarray(nb.feature_log_prob_)
    k = kwargs.get('k', 15)
    for i, log_probs in enumerate(feature_probs):
        print(f'Top {k} features for {classes[i]}')
        # argsort is ascending, so the last k indices are the k features with
        # the highest log-probability, with the most probable printed last.
        _f = np.argsort(log_probs)[-k:]
        print([str(words[j]) for j in _f])


In [6]:
# Baseline run with experiment()'s defaults (CountVectorizer + MultinomialNB,
# no stop-word filtering).
experiment()
line()
# Same pipeline, but with stop_words='english' passed to the vectorizer.
experiment(stop_words='english')

[[1192    5]
 [  26  170]]
              precision    recall  f1-score   support

         ham       0.98      1.00      0.99      1197
        spam       0.97      0.87      0.92       196

    accuracy                           0.98      1393
   macro avg       0.98      0.93      0.95      1393
weighted avg       0.98      0.98      0.98      1393

Top 15 features for HAM
['but', 'can', 'have', 'for', 'of', 'that', 'is', 'it', 'me', 'my', 'and', 'in', 'the', 'to', 'you']
Top 15 features for SPAM
['text', 'mobile', 'is', 'on', 'ur', 'txt', 'now', 'or', 'the', 'for', 'your', 'free', 'you', 'call', 'to']
[[1191    6]
 [  23  173]]
              precision    recall  f1-score   support

         ham       0.98      0.99      0.99      1197
        spam       0.97      0.88      0.92       196

    accuracy                           0.98      1393
   macro avg       0.97      0.94      0.96      1393
weighted avg       0.98      0.98      0.98      1393

Top 15 features for HAM
['need', '

In [7]:
# Baseline run with experiment()'s defaults (CountVectorizer + MultinomialNB).
experiment()
line()
# Swap the vectorizer for TfidfVectorizer; the classifier stays MultinomialNB.
experiment(vectorizer=TfidfVectorizer)

[[1192    5]
 [  26  170]]
              precision    recall  f1-score   support

         ham       0.98      1.00      0.99      1197
        spam       0.97      0.87      0.92       196

    accuracy                           0.98      1393
   macro avg       0.98      0.93      0.95      1393
weighted avg       0.98      0.98      0.98      1393

Top 15 features for HAM
['but', 'can', 'have', 'for', 'of', 'that', 'is', 'it', 'me', 'my', 'and', 'in', 'the', 'to', 'you']
Top 15 features for SPAM
['text', 'mobile', 'is', 'on', 'ur', 'txt', 'now', 'or', 'the', 'for', 'your', 'free', 'you', 'call', 'to']
[[1197    0]
 [  73  123]]
              precision    recall  f1-score   support

         ham       0.94      1.00      0.97      1197
        spam       1.00      0.63      0.77       196

    accuracy                           0.95      1393
   macro avg       0.97      0.81      0.87      1393
weighted avg       0.95      0.95      0.94      1393

Top 15 features for HAM
['have', '

In [8]:
# Baseline run with experiment()'s defaults (CountVectorizer + MultinomialNB).
experiment()
line()
# Swap the classifier for BernoulliNB; experiment() reduces the features to
# presence/absence (value > 0) for this model.
experiment(nb=BernoulliNB)

[[1192    5]
 [  26  170]]
              precision    recall  f1-score   support

         ham       0.98      1.00      0.99      1197
        spam       0.97      0.87      0.92       196

    accuracy                           0.98      1393
   macro avg       0.98      0.93      0.95      1393
weighted avg       0.98      0.98      0.98      1393

Top 15 features for HAM
['but', 'can', 'have', 'for', 'of', 'that', 'is', 'it', 'me', 'my', 'and', 'in', 'the', 'to', 'you']
Top 15 features for SPAM
['text', 'mobile', 'is', 'on', 'ur', 'txt', 'now', 'or', 'the', 'for', 'your', 'free', 'you', 'call', 'to']
[[1197    0]
 [  48  148]]
              precision    recall  f1-score   support

         ham       0.96      1.00      0.98      1197
        spam       1.00      0.76      0.86       196

    accuracy                           0.97      1393
   macro avg       0.98      0.88      0.92      1393
weighted avg       0.97      0.97      0.96      1393

Top 15 features for HAM
['but', 'c

In [9]:
# Baseline run with experiment()'s defaults (CountVectorizer + MultinomialNB).
experiment()
line()
# TfidfVectorizer combined with BernoulliNB. experiment() reduces the features
# to presence/absence (value > 0) for BernoulliNB, so the TF-IDF weights
# themselves are discarded before fitting.
experiment(vectorizer=TfidfVectorizer, nb=BernoulliNB)

[[1192    5]
 [  26  170]]
              precision    recall  f1-score   support

         ham       0.98      1.00      0.99      1197
        spam       0.97      0.87      0.92       196

    accuracy                           0.98      1393
   macro avg       0.98      0.93      0.95      1393
weighted avg       0.98      0.98      0.98      1393

Top 15 features for HAM
['but', 'can', 'have', 'for', 'of', 'that', 'is', 'it', 'me', 'my', 'and', 'in', 'the', 'to', 'you']
Top 15 features for SPAM
['text', 'mobile', 'is', 'on', 'ur', 'txt', 'now', 'or', 'the', 'for', 'your', 'free', 'you', 'call', 'to']
[[1197    0]
 [  48  148]]
              precision    recall  f1-score   support

         ham       0.96      1.00      0.98      1197
        spam       1.00      0.76      0.86       196

    accuracy                           0.97      1393
   macro avg       0.98      0.88      0.92      1393
weighted avg       0.97      0.97      0.96      1393

Top 15 features for HAM
['but', 'c

In [10]:
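# Best combination in the coarse comparison above (highest spam recall and
# macro-averaged F1 on the validation split):
#     CountVectorizer,
#     MultinomialNB,
#     stop_words='english'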