 ## 1. Importing the required libraries 

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud # to Visualize our dataset in the WordCloud  

In [None]:
# Load the spam dataset from CSV, decoding the file as ISO-8859-1 (Latin-1).
data = pd.read_csv("../input/spam.csv", encoding = 'ISO-8859-1')

In [None]:
# Display the (rows, columns) dimensions of the loaded DataFrame.
data.shape

In [None]:
# Preview the first rows of the raw data.
data.head()

## 2. Data Pre-processing

In [None]:
# Discard the unnamed extra columns; per the original note they hold NaN values.
unused_columns = ["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"]
data = data.drop(columns=unused_columns)

In [None]:
# Preview the data again after dropping the unnamed columns.
data.head()

In [None]:
# Rename the two remaining columns: "labels" is the class label column and
# "data" is the message text column used by the later cells.
data.columns = ["labels", "data"]
# Display the column index to confirm the rename.
data.columns

In [None]:
# Encode the text labels as integers: ham -> 0, spam -> 1.
label_to_int = {'ham': 0, 'spam': 1}
data["b_labels"] = data["labels"].map(label_to_int)
# Preview the frame with the new numeric label column.
data.head()

In [None]:
# Target vector: the binary labels extracted from the DataFrame as an array.
Y = data["b_labels"].values

In [None]:
# Convert the message text into token-count vectors with CountVectorizer.
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split

cv = CountVectorizer(decode_error = 'ignore')
# NOTE(review): the vocabulary is learned from the full dataset here, before
# the train/test split in the next section, so words that occur only in the
# held-out messages still shape the feature space. Consider splitting the raw
# text first and fitting the vectorizer on the training portion only.
X = cv.fit_transform(data["data"])

## 3. Splitting the data into train and test datasets

In [None]:
# Hold out 33% of the samples for testing. The seed is fixed so that the split,
# and therefore every score and misclassification reported afterwards, is
# reproducible from run to run.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.33, random_state=42)

 ## 4. Building the model, training it and printing the scores

In [None]:
from sklearn.naive_bayes import MultinomialNB
# Train a multinomial Naive Bayes classifier on the training split
# (fit returns the estimator itself, so construction and fitting are chained).
model = MultinomialNB().fit(Xtrain, Ytrain)

# Report mean accuracy on the training data and on the held-out data.
train_accuracy = model.score(Xtrain, Ytrain)
test_accuracy = model.score(Xtest, Ytest)
print("train score:", train_accuracy)
print("test score:", test_accuracy)

# Keep the test-set predictions for the confusion matrix computed later.
Ypred = model.predict(Xtest)

 #### We can also try fitting a TF-IDF vectorizer to the data

In [None]:
"""tfidf = TfidfVectorizer(decode_error = "ignore")
Xt = tfidf.fit_transform(data['data'])

Xt_train, Xt_test, Y_train , Y_test = train_test_split(X,Y, test_size = 0.33)
model.fit(Xt_train, Ytrain)
print("train score:", model.score(Xt_train, Ytrain))
print("test score:", model.score(Xt_test, Ytest))"""

## 5. Visualizing the data with a word cloud

In [None]:
def visualize(label):
    """Show a word cloud built from every message that carries ``label``.

    Args:
        label: Value compared against the ``labels`` column of the global
            ``data`` frame (this notebook calls it with 'spam' and 'ham').

    The matching messages are lower-cased, joined into a single text and
    rendered as a 600x400 word cloud with the plot axes hidden.
    """
    messages = data.loc[data['labels'] == label, 'data']
    # Build the corpus with one join instead of growing a string with += on
    # every iteration, which copies the accumulated text again and again.
    words = ' '.join(msg.lower() for msg in messages)
    wordcloud = WordCloud(width=600, height=400).generate(words)
    plt.imshow(wordcloud)
    plt.axis('off')
    plt.show()
    
# Draw one word cloud per class, spam first and then ham.
for label in ('spam', 'ham'):
    visualize(label)

## 6. Evaluating the test-set predictions with a confusion matrix

In [None]:
from sklearn.metrics import confusion_matrix
# confusion_matrix expects (y_true, y_pred): rows are the actual classes and
# columns are the predicted classes. Passing the arguments the other way
# round yields the transposed matrix.
cm = confusion_matrix(Ytest, Ypred)
print(cm)
# Correct predictions sit on the diagonal, so accuracy is trace / total.
accuracy = cm.trace() / cm.sum()
print(accuracy)

## 7. Finding where our model goes wrong
#### There should be few misclassified messages, as our accuracy is around 98%

In [None]:
# Predict a label for every message in the dataset. X covers the rows the
# model was trained on as well as the held-out rows, so the predictions stored
# here are not test-only.
data['predictions'] = model.predict(X)
# Preview the frame with the new predictions column.
data.head()

In [None]:
# Missed spam: the true label is spam (1) but the model predicted not-spam (0).
is_missed_spam = (data['b_labels'] == 1) & (data['predictions'] == 0)
sneaky_spam = data.loc[is_missed_spam, "data"]
for text in sneaky_spam:
    print(text)

In [None]:
# False alarms: the true label is not-spam (0) but the model predicted spam (1).
is_false_alarm = (data['b_labels'] == 0) & (data['predictions'] == 1)
sneaky_not_spam = data.loc[is_false_alarm, "data"]
for text in sneaky_not_spam:
    print(text)