<a href="https://colab.research.google.com/github/sizhky/naive-bayes-demo/blob/main/spam-ham/step_0_data_inspection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%%capture
!pip install torch_snippets

In [None]:
## Setup and imports
import os
from pathlib import Path
import urllib.request
import pandas as pd
from sklearn.model_selection import train_test_split
from torch_snippets import unzip_file

Exception: No module named 'sklego'


In [None]:
if not os.path.exists('SMSSpamCollection'):
    urllib.request.urlretrieve ("https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip", "smsspamcollection.zip")
    # Extracting the dataset
    unzip_file('smsspamcollection.zip', './')

In [61]:
df = pd.read_csv('SMSSpamCollection', sep='\t', header=None)
df.columns = ['class','content']

In [62]:
from collections import Counter
def get_word_count(sentences):
    words = [word for sentence in sentences for word in sentence.split()]
    return Counter(words)

In [63]:
ham_sentences = df[df['class'] == 'ham'].content.tolist()
ham_word_count = get_word_count(ham_sentences)
print("HAM")
print(ham_word_count.most_common(20))

spam_sentences = df[df['class'] == 'spam'].content.tolist()
spam_word_count = get_word_count(spam_sentences)
print("SPAM")
print(spam_word_count.most_common(20))

HAM
[('to', 1538), ('you', 1462), ('I', 1439), ('the', 1029), ('a', 977), ('i', 742), ('and', 739), ('in', 736), ('u', 651), ('is', 645), ('my', 621), ('me', 541), ('of', 499), ('for', 481), ('that', 399), ('it', 376), ('your', 374), ('on', 352), ('have', 349), ('at', 334)]
SPAM
[('to', 607), ('a', 360), ('your', 187), ('call', 185), ('or', 185), ('the', 178), ('2', 169), ('for', 169), ('you', 164), ('is', 143), ('Call', 136), ('on', 136), ('have', 128), ('and', 119), ('from', 116), ('ur', 107), ('with', 101), ('&', 98), ('4', 93), ('of', 93)]


Both lists are giving similar top words. 
Let's remove the common words to see if the classes have different set of most_common words

In [64]:
def remove_common_words(word_count, common_words):
    for word in common_words:
        del word_count[word]
        
all_words = get_word_count(df.content)
common_words = all_words.most_common(50)
common_words = [word for word,count in common_words]
common_words[:10]

['to', 'you', 'I', 'a', 'the', 'and', 'in', 'is', 'i', 'u']

In [65]:
ham_sentences = df[df['class'] == 'ham'].content.tolist()
ham_word_count = get_word_count(ham_sentences)
remove_common_words(ham_word_count, common_words)
print("HAM")
print(ham_word_count.most_common(20))

spam_sentences = df[df['class'] == 'spam'].content.tolist()
spam_word_count = get_word_count(spam_sentences)
remove_common_words(spam_word_count, common_words)
print("SPAM")
print(spam_word_count.most_common(20))

HAM
[('got', 200), ('come', 198), ('all', 193), ('was', 191), ('?', 181), ('am', 176), ('out', 167), ('...', 162), ('about', 143), ('want', 142), ('going', 141), ('then', 138), ("I'll", 138), ('time', 138), ('need', 136), ('How', 132), ('n', 131), ('But', 131), ('what', 131), ('still', 129)]
SPAM
[('Call', 136), ('&', 98), ('FREE', 89), ('mobile', 81), ('our', 76), ('To', 73), ('claim', 73), ('Your', 71), ('txt', 68), ('text', 68), ('now', 64), ('Txt', 63), ('reply', 58), ('free', 56), ('contact', 56), ('-', 55), ('now!', 49), ('send', 46), ('won', 45), ('only', 45)]


Words like 'Your' are still coming as common words. This is likely because of capitalization. Let's lower case and retry

In [66]:
df['content'] = df['content'].str.lower()

all_words = get_word_count(df.content)
common_words = all_words.most_common(50)
common_words = [word for word,count in common_words]
common_words[:10]

ham_sentences = df[df['class'] == 'ham'].content.tolist()
ham_word_count = get_word_count(ham_sentences)
remove_common_words(ham_word_count, common_words)
print("HAM")
print(ham_word_count.most_common(20))

spam_sentences = df[df['class'] == 'spam'].content.tolist()
spam_word_count = get_word_count(spam_sentences)
remove_common_words(spam_word_count, common_words)
print("SPAM")
print(spam_word_count.most_common(20))

HAM
[('got', 228), ('like', 223), ('was', 221), ('come', 218), ('know', 208), ('am', 204), ('its', 203), ('then', 195), ('good', 189), ('?', 181), ('he', 180), ('out', 173), ("i'll", 168), ('...', 162), ('going', 157), ('ü', 157), ('ok', 156), ('want', 154), ('love', 153), ('time', 153)]
SPAM
[('free', 180), ('txt', 136), ('text', 112), ('mobile', 109), ('claim', 106), ('reply', 101), ('&', 98), ('stop', 90), ('our', 85), ('now!', 71), ('new', 69), ('send', 66), ('only', 66), ('won', 64), ('nokia', 64), ('win', 58), ('prize', 58), ('cash', 56), ('contact', 56), ('-', 55)]


# Inferences
* Sentences should be lower cased
* Common words should be removed 
* Words like free, claim, won, win, prize are clearly spammy words.
* A simple naive bayes model should be able to separate spam from ham