In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    paths = [os.path.join(root, name) for name in names]
    if paths:
        # One path per line, same as printing each individually.
        print('\n'.join(paths))

# Any results you write to the current directory are saved as output.

Part 1: Deriving Bayes theorem from conditional probability

**Conditional probability**

I covered conditional probability in more depth here.
Conditional probability tells us the probability of an event occurring, given another event.

**P(A|B) = P(A ∩ B) / P(B) is the probability that A occurs, in cases where we know B occurs. It’s calculated as the probability that both A and B occur, divided by the probability that B occurs.
But what if we wanted to find the reverse: the probability of B, in cases where A occurs?**

**sometimes it’s easier to use Bayes theorem.**

**Deriving Bayes theorem**

**We start with the formula for conditional probability, which can be written either as “A given B” or as “B given A”:**

$$P(A|B) = \frac{P(A \cap B)}{P(B)} \qquad\qquad P(B|A) = \frac{P(B \cap A)}{P(A)}$$

**We’ll start with the 1st formula, P(A|B) = P(A∩B) / P(B).
Multiply both sides by P(B). This cancels the P(B) denominator on the right, leaving us with: P(A|B) * P(B) = P(A∩B).**

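**What we can now see (more easily if we swap the left and right sides) is that P(A∩B) = P(A|B) * P(B). We’ll plug this back into our 2nd original formula, P(B|A) = P(B∩A) / P(A). Since P(B∩A) = P(A∩B), this gives Bayes' theorem: P(B|A) = P(A|B) * P(B) / P(A).**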

![img](https://miro.medium.com/max/2310/1*UTDk1rC_gv90NNSyeOe4eA.png)

 Part 2: Predicting if an SMS message is spam

Bayesian inference has a long history in spam detection. We’ll get into the basics here with some real data.

![img](https://miro.medium.com/max/2738/1*vmYmcPTWQlJLLaY9DAXwGg.png)

**In our case, the probability an SMS is spam, given some word, is equal to the probability of the word, given it is in a spam SMS, multiplied by the probability of spam, all divided by the probability of the word.**

In [None]:
# Load the raw SMS spam dataset and inspect the first few rows.
import pandas as pd

# Named `csv_path` rather than `dir` so the builtin dir() is not shadowed
# for the rest of the notebook.
csv_path = '../input/sms-spam-collection-dataset/spam.csv'

# The file is read as ISO-8859-1 (Latin-1), as in the original cell.
df = pd.read_csv(csv_path, encoding='ISO-8859-1')
df.head()

The columns in the original CSV don’t make sense. So we’ll move the useful information into 2 new columns, one of which is a boolean indicating if the SMS is spam.
FYI, “ham” means “not spam”.

In [None]:
import pandas as pd

# Expose the message text under a readable name, and encode the label
# column as an integer flag: 1 where v1 == 'spam', 0 otherwise.
df['sms'] = df['v2']
df['spam'] = (df['v1'] == 'spam').astype(int)
df.head()

now drop the old columns

In [None]:
# Keep only the two derived columns; the original CSV columns are dropped.
kept_columns = ['sms', 'spam']
df = df.loc[:, kept_columns]
df.head()

Check the number of records.

In [None]:
# Number of rows (SMS messages) in the frame.
df.shape[0]

That’s a lot. Let’s work with a sample of 25% of the original data.

In [None]:
# Draw a 25% random sample of the messages.
# random_state pins the draw so the sample, and every count and probability
# derived from it, is identical on each Restart & Run All.
sample_df = df.sample(frac=0.25, random_state=42)
len(sample_df)


Now split the data into 2 separate dataframes, one for spam and one for ham.

In [None]:
# Split the sample into spam and ham frames.
# The masks are built from sample_df itself: a mask taken from the full `df`
# only works here because .loc silently re-aligns it on the index, which is
# fragile (it breaks if the index is reset or not unique).
is_spam = sample_df['spam'] == 1
spam_df = sample_df.loc[is_spam]
ham_df = sample_df.loc[sample_df['spam'] == 0]

print(len(spam_df))
print(len(ham_df))

We’ll use sklearn’s TFIDF vectorizer to eyeball some words important in the spam messages, and pick one to plug into our formula.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit a TF-IDF vectorizer on the spam messages only, with English stop words
# removed and the vocabulary capped at 30 terms, then show that vocabulary.
tfidf_options = {'stop_words': 'english', 'max_features': 30}
vectorizer_spam = TfidfVectorizer(**tfidf_options)
vectorizer_spam.fit(spam_df['sms'])

vectorizer_spam.vocabulary_

We need to pick a word to use in our formula so I’m going to choose the word “win”, though it would be interesting to try this for other words as well.

P(W|S) = probability of the word “win” being in a spam message

P(S) = probability of a spam message overall

P(W) = probability of the word “win” in a message overall

In [None]:
# The word W whose spam probability is estimated in the cells below.
# It is matched with `word in <sms text>`, i.e. as a case-sensitive substring.
word = 'win'

Calculate P(W|S)

In [None]:
# P(W|S): the fraction of spam messages that contain the word.
word = 'win'

# Total number of spam messages in the sample.
spam_count = len(spam_df)

# Spam messages containing `word` (case-sensitive substring match).
spam_with_word_count = sum(word in text for text in spam_df['sms'])

# Fix: the original divided spam_count by spam_with_word_count, which is the
# inverse of P(W|S). That yields a value >= 1 and raises ZeroDivisionError
# when no spam message contains the word.
probability_of_word_given_spam = spam_with_word_count / spam_count
print(probability_of_word_given_spam)


Calculate P(S)

In [None]:
# P(S): share of the sampled messages that are spam.
n_spam = spam_df.shape[0]
n_sampled = sample_df.shape[0]
probability_of_spam = n_spam / n_sampled
print(probability_of_spam)


Calculate P(W)

In [None]:
# P(W): share of all sampled messages (spam and ham) that contain the word.
sms_count = len(sample_df)

# `word in text` is a case-sensitive substring test on each message.
word_in_sms_count = sum(1 for text in sample_df['sms'] if word in text)

probability_of_word = word_in_sms_count / sms_count
print(probability_of_word)

Now putting it all together

In [None]:
# Bayes' theorem: P(S|W) = P(W|S) * P(S) / P(W)
probability_of_spam_given_word = (
    probability_of_word_given_spam * probability_of_spam
) / probability_of_word
probability_of_spam_given_word

Boom. What this tells us is that if an SMS contains the word “win”, there is a 58% probability that the message is spam. (The exact figure depends on which messages end up in the 25% sample.)