In [1]:
# Naive Bayes Spam Filtering

In [2]:
# https://towardsdatascience.com/how-to-build-and-apply-naive-bayes-classification-for-spam-filtering-2b8d3308501

In [3]:
# https://towardsdatascience.com/comparing-a-variety-of-naive-bayes-classification-algorithms-fc5fa298379e

In [6]:
import pandas as pd

In [12]:
sms_data = pd.read_csv('SMSSpamCollection', header=None, sep='\t', names=['Label', 'SMS'])

In [13]:
sms_data.head()

Unnamed: 0,Label,SMS
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [14]:
sms_data.groupby('Label').count()

Unnamed: 0_level_0,SMS
Label,Unnamed: 1_level_1
ham,4825
spam,747


In [None]:
# Scratch Model

In [16]:
sms_data_clean = sms_data.copy()

In [4]:
# Data Preparation

In [5]:
# Remove Punctuation
# Convert all text to lower-case
# Split into separate words

In [17]:
sms_data_clean['SMS'] = sms_data_clean['SMS'].str.replace('\W+', ' ').str.replace('\s+', ' ').str.strip()
sms_data_clean['SMS'] = sms_data_clean['SMS'].str.lower()
sms_data_clean['SMS'] = sms_data_clean['SMS'].str.split()

In [20]:
sms_data_clean['Label'].value_counts() / sms_data.shape[0] * 100

ham     86.593683
spam    13.406317
Name: Label, dtype: float64

In [None]:
# Train/Test Splitting

In [18]:
train_data = sms_data_clean.sample(frac=0.8,random_state=1).reset_index(drop=True)
test_data = sms_data_clean.drop(train_data.index).reset_index(drop=True)
train_data = train_data.reset_index(drop=True)

In [21]:
train_data['Label'].value_counts() / train_data.shape[0] * 100

ham     86.54105
spam    13.45895
Name: Label, dtype: float64

In [22]:
test_data['Label'].value_counts() / test_data.shape[0] * 100

ham     86.983842
spam    13.016158
Name: Label, dtype: float64

In [25]:
train_data.shape

(4458, 2)

In [26]:
test_data.shape

(1114, 2)

In [27]:
# Prepare Vocabulary

In [28]:
vocabulary = list(set(train_data['SMS'].sum()))

In [29]:
vocabulary[11:20]

['seem', 'withdraw', 'short', 'country', 'bt', 'save', 'bruv', 'pen', 'sweets']

In [30]:
len(vocabulary)

7783

In [31]:
word_counts_per_sms = pd.DataFrame([
    [row[1].count(word) for word in vocabulary]
    for _, row in train_data.iterrows()], columns=vocabulary)

In [32]:
# Concatenate these counts to the DF
train_data = pd.concat([train_data.reset_index(), word_counts_per_sms], axis=1).iloc[:,1:]

In [34]:
# set alpha = 1
alpha = 1

In [35]:
# the size of the vocabulary
Nvoc = len(train_data.columns) - 3

In [36]:
# probability of message to be spam
Pspam = train_data['Label'].value_counts()['spam'] / train_data.shape[0]

In [37]:
# probability of non-spam message
Pham = train_data['Label'].value_counts()['ham'] / train_data.shape[0]

In [38]:
# the number of words in spam messages
Nspam = train_data.loc[train_data['Label'] == 'spam', 'SMS'].apply(len).sum()

In [39]:
# the number of words in non-spam messages
Nham = train_data.loc[train_data['Label'] == 'ham', 'SMS'].apply(len).sum()

In [40]:
# Probability belongs to spam
def p_w_spam(word):
    if word in train_data.columns:
        return (train_data.loc[train_data['Label'] == 'spam', word].sum() + alpha) / (Nspam + alpha*Nvoc)
    else:
        return 1

In [41]:
# Probability does not belong to spam
def p_w_ham(word):
    if word in train_data.columns:
        return (train_data.loc[train_data['Label'] == 'ham', word].sum() + alpha) / (Nham + alpha*Nvoc)
    else:
        return 1

In [42]:
# Classifier
def classify(message):
    p_spam_given_message = Pspam
    p_ham_given_message = Pham
    for word in message:
        p_spam_given_message *= p_w_spam(word)
        p_ham_given_message *= p_w_ham(word)
    if p_ham_given_message > p_spam_given_message:
        return 'ham'
    elif p_ham_given_message < p_spam_given_message:
        return 'spam'
    else:
        return 'needs human classification'

In [43]:
# Predict

In [44]:
test_data['predicted'] = test_data['SMS'].apply(classify)

In [45]:
correct = (test_data['predicted'] == test_data['Label']).sum() / test_data.shape[0] * 100

In [46]:
test_data.loc[test_data['predicted'] != test_data['Label']]

Unnamed: 0,Label,SMS,predicted
56,spam,"[money, i, have, won, wining, number, 946, wot...",ham
99,ham,"[gettin, rdy, to, ship, comp]",spam
142,ham,"[have, you, laid, your, airtel, line, to, rest]",spam
218,spam,"[hi, babe, its, chloe, how, r, u, i, was, smas...",ham
245,ham,[anytime],spam
404,ham,"[nokia, phone, is, lovly]",spam
491,spam,"[hi, this, is, amy, we, will, be, sending, you...",ham
588,ham,"[we, have, sent, jd, for, customer, service, c...",spam
646,ham,"[a, boy, loved, a, gal, he, propsd, bt, she, d...",needs human classification
912,spam,"[dating, i, have, had, two, of, these, only, s...",ham


In [47]:
correct

99.10233393177738

In [57]:
# Other Implemetations

In [58]:
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

In [59]:
sms_data_clean = sms_data.copy()
sms_data_clean['SMS'] = sms_data_clean['SMS'].str.replace('\W+', ' ').str.replace('\s+', ' ').str.strip()
sms_data_clean['SMS'] = sms_data_clean['SMS'].str.lower()
sms_data_clean['SMS'] = sms_data_clean['SMS'].str.split()

In [60]:
le = preprocessing.LabelEncoder()
sms_data_clean['Label'] = le.fit_transform(sms_data_clean['Label'])

In [61]:
X_train, X_test, y_train, y_test = train_test_split(sms_data_clean['SMS'], sms_data_clean['Label'], test_size=0.2, random_state=42)

In [62]:
# train part to the vocabulary and count the repetitions of every vocabulary word in every message (in both train and test parts)

In [63]:
vocabulary = list(set(X_train.sum()))

In [64]:
X_train_voc = pd.DataFrame([
    [row.count(word) for word in vocabulary]
    for row in X_train], columns=vocabulary)

In [65]:
X_test_voc = pd.DataFrame([
    [row.count(word) for word in vocabulary]
    for row in X_test], columns=vocabulary)

In [69]:
from sklearn import naive_bayes as nb
from sklearn import metrics

In [66]:
# Gaussian

In [96]:
'''This approach is built on the assumption of a normal distribution of probabilities. It means, that spam and not-spam classes of messages have frequencies of the words from vocabulary distributed by the Gaussian law'''

'This approach is built on the assumption of a normal distribution of probabilities. It means, that spam and not-spam classes of messages have frequencies of the words from vocabulary distributed by the Gaussian law'

In [70]:
classif_gauss = nb.GaussianNB()

In [71]:
classif_gauss.fit(X_train_voc, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [72]:
res_gauss = classif_gauss.predict(X_test_voc)

In [73]:
print( "Accuracy: {:.2f}%".format(metrics.accuracy_score(y_test, res_gauss) * 100) )

Accuracy: 91.03%


In [74]:
# Multinomial

In [97]:
'''Multinomial classification suits best for the discrete values like word counts. So we expect it to show the best accuracy. In this case distribution of probabilities for each event bases on the formula'''

'Multinomial classification suits best for the discrete values like word counts. So we expect it to show the best accuracy. In this case distribution of probabilities for each event bases on the formula'

In [98]:
'''Ny is the total number of features of the event y (total number of words in all spam messages), Nyi — count of each feature (summary number of repetitions of a word in all spam messages), n — the number of features (number of words in the vocabulary) and α is a smoothing Laplace parameter to discard the influence of words absent in the vocabulary. The same formula applies to the set of not-spam messages.'''

'Ny is the total number of features of the event y (total number of words in all spam messages), Nyi — count of each feature (summary number of repetitions of a word in all spam messages), n — the number of features (number of words in the vocabulary) and α is a smoothing Laplace parameter to discard the influence of words absent in the vocabulary. The same formula applies to the set of not-spam messages.'

In [75]:
classif_multinom = nb.MultinomialNB()

In [76]:
classif_multinom.fit(X_train_voc, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [77]:
res_multinom = classif_multinom.predict(X_test_voc)

In [78]:
print( "Accuracy: {:.2f}%".format(metrics.accuracy_score(y_test, res_multinom) * 100) )

Accuracy: 99.19%


In [79]:
# Complement

In [99]:
'''This approach is almost the same as the Multinomial, though now we count the occurrences of a word in the complement to the class.'''

'This approach is almost the same as the Multinomial, though now we count the occurrences of a word in the complement to the class.'

In [100]:
'''Nc — total number of words in the opposite class (for the spam parameter — number of non-spam words), Nci — repetitions of a word in the opposite class (for a word from spam message — the number of repetitions in all non-spam messages). We also use the same smoothing parameters. After the calculation of basic values we start working with the real parameters:'''

'Nc — total number of words in the opposite class (for the spam parameter — number of non-spam words), Nci — repetitions of a word in the opposite class (for a word from spam message — the number of repetitions in all non-spam messages). We also use the same smoothing parameters. After the calculation of basic values we start working with the real parameters:'

In [101]:
'''It is the weight for each word in the message of k words.'''

'It is the weight for each word in the message of k words.'

In [102]:
'''So, the classification result is the class with the minimum value of the sum of weights for each word in the message.'''

'So, the classification result is the class with the minimum value of the sum of weights for each word in the message.'

In [80]:
classif_complement = nb.ComplementNB()

In [81]:
classif_complement.fit(X_train_voc, y_train)

ComplementNB(alpha=1.0, class_prior=None, fit_prior=True, norm=False)

In [82]:
res_complement = classif_complement.predict(X_test_voc)

In [83]:
print( "Accuracy: {:.2f}%".format(metrics.accuracy_score(y_test, res_complement) * 100) )

Accuracy: 98.12%


In [84]:
# Bernoulli

In [103]:
'''Bernoulli formula is close to the multinomial one, though the input is the set of boolean values (the word is present in the message or not) instead of the set of frequencies.'''

'Bernoulli formula is close to the multinomial one, though the input is the set of boolean values (the word is present in the message or not) instead of the set of frequencies.'

In [104]:
'''So, the algorithm explicitly penalizes the non-occurrence of a feature (word in the message is absent in the vocabulary) while the multinomial approach uses the smoothing parameter for the absent values. sklearn Bernoulli algorithm binarizes input values, so, no additional actions required.'''

'So, the algorithm explicitly penalizes the non-occurrence of a feature (word in the message is absent in the vocabulary) while the multinomial approach uses the smoothing parameter for the absent values. sklearn Bernoulli algorithm binarizes input values, so, no additional actions required.'

In [85]:
classif_bernoulli = nb.BernoulliNB()

In [86]:
classif_bernoulli.fit(X_train_voc, y_train)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [87]:
res_bernoulli = classif_bernoulli.predict(X_test_voc)

In [88]:
print( "Accuracy: {:.2f}%".format(metrics.accuracy_score(y_test, res_bernoulli) * 100) )

Accuracy: 98.30%


In [89]:
# Categorical

In [105]:
'''Categorical Naive Bayes is suitable for the categorical values — if the example has the set of features or not. In our case, it means, that the vocabulary is treated as the set of features, and the occurrence of a word in the message is treated as the matching with the feature. All formulas are the same as for the multinomial approach but with the occurrences instead of repetitions.
Since the algorithm needs categorical values, we convert the frequencies of words to the presence of words: 1 — the message contains the word, 0 — the word is absent in the message.'''

'Categorical Naive Bayes is suitable for the categorical values — if the example has the set of features or not. In our case, it means, that the vocabulary is treated as the set of features, and the occurrence of a word in the message is treated as the matching with the feature. All formulas are the same as for the multinomial approach but with the occurrences instead of repetitions.\nSince the algorithm needs categorical values, we convert the frequencies of words to the presence of words: 1 — the message contains the word, 0 — the word is absent in the message.'

In [90]:
classif_cat = nb.CategoricalNB()

AttributeError: module 'sklearn.naive_bayes' has no attribute 'CategoricalNB'

In [91]:
X_train_voc_cat = X_train_voc.applymap(lambda el: 1 if el > 0 else 0)

In [92]:
classif_cat.fit(X_train_voc_cat, y_train)

NameError: name 'classif_cat' is not defined

In [93]:
X_test_voc_cat = X_test_voc.applymap(lambda el: 1 if el > 0 else 0)

In [94]:
res_cat = classif_cat.predict(X_test_voc_cat)

NameError: name 'classif_cat' is not defined

In [95]:
print( "Accuracy: {:.2f}%".format(metrics.accuracy_score(y_test, res_cat) * 100) )

NameError: name 'res_cat' is not defined