In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found in it.
for folder, _subfolders, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt

import seaborn as sns

from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

from sklearn.feature_extraction.text import CountVectorizer

from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Load the raw SMS spam collection.
# NOTE: the Kaggle copy of spam.csv is widely reported to contain bytes that are
# not valid UTF-8, which makes pandas' default decoder raise UnicodeDecodeError.
# latin-1 maps every byte to a character, so the file always decodes.
df_spam = pd.read_csv(
    "../input/sms-spam-collection-dataset/spam.csv",
    encoding="latin-1",
)

### Let us use only 2 columns

In [None]:


# Keep only the two columns used below: v1 (spam/ham label) and v2 (message text).
kept_columns = ["v1", "v2"]
df_spam_data = df_spam.loc[:, kept_columns]

df_spam_data.head()

### Make Spam Ham Labels as numeric

- Spam as 1
- Ham as 0

In [None]:
# Encode the labels numerically: spam -> 1, ham -> 0.
# The mapping reads from the untouched raw frame (df_spam) rather than from
# df_spam_data['v1'] itself, so re-running this cell gives the same 0/1 column
# instead of mapping the already-encoded values to NaN.
df_spam_data['v1'] = df_spam['v1'].map({"spam": 1, "ham": 0})
df_spam_data['v1']

## Preprocessing of data
1. Removal of stopwords

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [None]:
def all_stop():
    """Return NLTK's English stop words followed by their capitalized forms.

    The result is a list holding every word from
    ``stopwords.words('english')`` in its original order, then the same words
    passed through ``str.capitalize``.
    """
    lowercase = stopwords.words('english')
    capitalized = [word.capitalize() for word in lowercase]
    return lowercase + capitalized

### Let us remove the stop words from the data set

In [None]:
def remove_stop_words(sent, stop_words=None):
    """Return ``sent`` with its stop-word tokens dropped.

    The text is split on whitespace and every token that exactly matches a
    stop word is discarded; the surviving tokens are re-joined with single
    spaces. Matching is per whole token, never per substring, so removing
    the stop word "a" leaves a word such as "cat" intact.

    Parameters
    ----------
    sent : str
        Text to filter.
    stop_words : iterable of str, optional
        Words to remove. When omitted, ``all_stop()`` is used.

    Returns
    -------
    str
        The filtered text, with whitespace runs collapsed to single spaces.
    """
    if stop_words is None:
        stop_words = all_stop()
    # A set gives constant-time membership checks for each token.
    blocked = frozenset(stop_words)
    return ' '.join(token for token in sent.split() if token not in blocked)

In [None]:
# Strip stop words from every message, then preview the cleaned text.
cleaned_messages = df_spam_data['v2'].apply(remove_stop_words)
df_spam_data['v2'] = cleaned_messages

df_spam_data['v2'].head()

### Let us now build a naive bayes model on top of the data

#### First let us split the data into training and test set

In [None]:
# 75% of the messages go to training, the rest to testing; the fixed seed
# keeps the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    df_spam_data["v2"],
    df_spam_data["v1"],
    train_size=0.75,
    random_state=100,
)

### Let us convert the train set into bag of words

1. First we will try CountVectorizer
2. Then we will try TfidfVectorizer
3. Multinomial Naive Bayes and Bernoulli Naive Bayes

In [None]:
# Learn the vocabulary from the training messages only, then encode both
# splits as token-count matrices over that vocabulary.
cv = CountVectorizer()
x_bag, x_test_bag = cv.fit_transform(X_train), cv.transform(X_test)

In [None]:
# Convert both count matrices to dense NumPy arrays.
x_bag, x_test_bag = x_bag.toarray(), x_test_bag.toarray()


In [None]:
# Train multinomial Naive Bayes on the training counts, then label the test set.
mn = MultinomialNB().fit(x_bag, y_train)
y_test_pred = mn.predict(x_test_bag)




### Let's evaluate accuracy

In [None]:
def get_metrics(cf):
    """Derive accuracy, sensitivity and specificity from a 2x2 confusion matrix.

    ``cf`` is indexed as ``cf[row, col]``; row/column 0 is treated as the
    negative class and row/column 1 as the positive class.

    Returns
    -------
    tuple
        ``(accuracy, sensitivity, specificity)`` where sensitivity is
        ``cf[1,1] / (cf[1,1] + cf[1,0])`` and specificity is
        ``cf[0,0] / (cf[0,0] + cf[0,1])``.
    """
    true_neg, false_pos = cf[0, 0], cf[0, 1]
    false_neg, true_pos = cf[1, 0], cf[1, 1]

    accuracy = (true_neg + true_pos) / (true_neg + true_pos + false_pos + false_neg)
    sensitivity = true_pos / (true_pos + false_neg)
    specificity = true_neg / (true_neg + false_pos)
    return accuracy, sensitivity, specificity

### Let us evaluate test set 

In [None]:
# Confusion matrix of the test predictions, followed by the metrics derived from it.
cf = confusion_matrix(y_test, y_test_pred)
print(cf)

print(" accuracy:{0} \n sensitivity: {1}\n specificity: {2}".format(*get_metrics(cf)))

### Evaluation of train set

In [None]:
# Predict on the training data itself and report the same metrics.
y_train_pred = mn.predict(x_bag)

cf = confusion_matrix(y_train, y_train_pred)
print(cf)

print(" accuracy:{0} \n sensitivity: {1}\n specificity: {2}".format(*get_metrics(cf)))

### Let us repeat same using tfidf vectorizer

In [None]:
# Re-encode both splits with TF-IDF weights (fitted on the training messages
# only) and convert the results straight to dense arrays.
tfidf = TfidfVectorizer()

x_bag = tfidf.fit_transform(X_train).toarray()
x_test_bag = tfidf.transform(X_test).toarray()

x_bag

### Predictions on train and test set

In [None]:
# Refit multinomial Naive Bayes on the TF-IDF features and predict both splits.
mn = MultinomialNB().fit(x_bag, y_train)

y_train_pred, y_test_pred = mn.predict(x_bag), mn.predict(x_test_bag)

In [None]:

# Print the confusion matrix and derived metrics for the training split,
# then for the test split.
for banner, y_true, y_pred in (
    ("Training----------------------------------------------------------------------------\n", y_train, y_train_pred),
    ("Testing----------------------------------------------------------------------------\n", y_test, y_test_pred),
):
    print(banner)
    cf = confusion_matrix(y_true, y_pred)
    print(cf)

    print(" accuracy:{0} \n sensitivity: {1}\n specificity: {2}".format(*get_metrics(cf)))

### The count vectorizer does well when applying naive Bayes

#### Now let us try bernoulli naive bayes. 

The bag of words in bernoulli naive bayes

- In Bernoulli Naive Bayes the frequency of the words doesn't matter
- If a word is present it is counted as 1, no matter how many times it appears in the text


In [None]:
# Bernoulli Naive Bayes on bag-of-words counts. With its default binarize=0.0,
# BernoulliNB thresholds the counts, so only presence/absence of a word is used.
bn = BernoulliNB()

cv = CountVectorizer()

# Vocabulary is learned from the training messages only.
x_bag_bernoulli = cv.fit_transform(X_train).toarray()
x_text_bag_bern = cv.transform(X_test).toarray()

# Fitting the model
bernoulli_model = bn.fit(x_bag_bernoulli, y_train)

y_train_pred = bn.predict(x_bag_bernoulli)
y_test_pred = bn.predict(x_text_bag_bern)

# Each confusion matrix is labelled with the split it was computed on.
print("train metrics\n", confusion_matrix(y_train, y_train_pred))
print("test metrics\n", confusion_matrix(y_test, y_test_pred))

In [None]:
# Sensitivity, specificity and accuracy of the current training-set predictions.
cf = confusion_matrix(y_train, y_train_pred)

# Row-major flattening of the matrix: [0,0], [0,1], [1,0], [1,1].
tn, fp, fn, tp = cf.ravel()

si = tp / (fn + tp)
sp = tn / (tn + fp)
acc = (tp + tn) / (tn + fp + fn + tp)

print("sensitivity = {0}\nSpecificity = {1}\nAccuracy = {2}".format(si, sp, acc))

In [None]:
# Evaluating on the test set: sensitivity, specificity and accuracy of the
# current test-set predictions.
cf = confusion_matrix(y_test, y_test_pred)

# Row-major flattening of the matrix: [0,0], [0,1], [1,0], [1,1].
tn, fp, fn, tp = cf.ravel()

si = tp / (fn + tp)
sp = tn / (tn + fp)
acc = (tp + tn) / (tn + fp + fn + tp)

print("sensitivity = {0}\nSpecificity = {1}\nAccuracy = {2}".format(si, sp, acc))

## We want to predict spam messages correctly 
1. Here sensitivity is very important
2. As a result we will go with bernoulliNB
3. The Sensitivity is higher

# I am  new to data science
## Please suggest alternate approaches
Feedback welcomed