# Spam Filter using Naive Bayes Classifier

In [None]:
import os
# Show which files are present in the input directory.
print(os.listdir("../input"))

**Import libraries**

In [None]:
import numpy as np
import pandas as pd

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

**Read csv file**

In [None]:
# Load the raw CSV, keep only the label/text columns, and give them
# descriptive names.
df = (
    pd.read_csv('../input/spam.csv', encoding='latin-1')
    .loc[:, ['v1', 'v2']]
    .rename(columns={'v1': 'label', 'v2': 'message'})
)
df.head()

**Describe dataset and visualize ham/spam count**

In [None]:
# Summary statistics of the dataset, computed separately for each label.
df.groupby('label').describe()

In [None]:
# Bar chart of how many messages carry each label.
sns.countplot(data=df, x='label')

In [None]:
import string
from nltk.corpus import stopwords
from nltk import PorterStemmer as Stemmer
def process(text):
    """Normalize a raw message into a list of stemmed tokens.

    Steps, in order: lowercase the text, delete every character listed in
    ``string.punctuation``, split on whitespace, drop English stopwords and
    stem the remaining tokens.

    Punctuation is removed *before* the stopword check, so contractions are
    compared without their apostrophe (e.g. "it's" is looked up as "its").

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    list of str
        Stemmed tokens in their original order (empty if nothing survives).
    """
    # lowercase it
    text = text.lower()
    # remove punctuation in a single pass
    text = text.translate(str.maketrans('', '', string.punctuation))
    # remove stopwords: build the lookup set once per call rather than
    # re-reading the stopword list for every token
    stop_words = set(stopwords.words('english'))
    tokens = [t for t in text.split() if t not in stop_words]
    # stemming
    st = Stemmer()
    # return token list
    return [st.stem(t) for t in tokens]

In [None]:
# Quick check of process() on a hand-written sentence
process("It's holiday and we are playing cricket. Jeff is playing very well!!!")

In [None]:
# Run the tokenizer over the first 20 messages of the dataset
df['message'].head(20).apply(process)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Pass process() as the analyzer so each message is turned into tokens by
# our own cleaning/stemming function.
tfidfv = TfidfVectorizer(analyzer=process)
# Fit on every message and keep the resulting TF-IDF matrix.
data = tfidfv.fit_transform(df['message'])

In [None]:
# Use the message at position 2 as a worked example.
mess = df['message'].iloc[2]
print(mess)

In [None]:
# TF-IDF representation of the sample message, as returned by the vectorizer.
print(tfidfv.transform([mess]))

**A better view**

In [None]:
# Resolve the vocabulary once, outside the loop. get_feature_names() was
# deprecated in scikit-learn 1.0 and removed in 1.2, so prefer
# get_feature_names_out() and only fall back on older releases.
if hasattr(tfidfv, 'get_feature_names_out'):
    feature_names = tfidfv.get_feature_names_out()
else:
    feature_names = tfidfv.get_feature_names()

# Dense TF-IDF row for the sample message.
mess_tfidf = tfidfv.transform([mess]).toarray()[0]

print('index\tidf\ttfidf\tterm')
for i, weight in enumerate(mess_tfidf):
    # Non-zero weights are the terms that actually occur in the message.
    if weight != 0:
        print(i, format(tfidfv.idf_[i], '.4f'), format(weight, '.4f'), feature_names[i], sep='\t')

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
# Two-stage model: raw messages -> TF-IDF weights -> Naive Bayes label.
spam_filter = Pipeline(steps=[
    # messages to weighted TFIDF score
    ('vectorizer', TfidfVectorizer(analyzer=process)),
    # train on TFIDF vectors with Naive Bayes
    ('classifier', MultinomialNB()),
])

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the messages for evaluation; the fixed seed keeps the
# split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    df['message'],
    df['label'],
    test_size=0.20,
    random_state=21,
)

In [None]:
# Fit the whole pipeline (vectorizer, then classifier) on the training split.
spam_filter.fit(x_train, y_train)

**Predict for test cases**

In [None]:
# Predicted label for every message in the held-out test split.
predictions = spam_filter.predict(x_test)

In [None]:
# Tally the test messages whose predicted label differs from the true one.
count = sum(
    1 for actual, predicted in zip(y_test, predictions) if actual != predicted
)
print('Total number of test cases', len(y_test))
print('Number of wrong of predictions', count)

In [None]:
# Show the test messages that were misclassified.
x_test.loc[y_test != predictions]

In [None]:
from sklearn.metrics import classification_report
# classification_report expects (y_true, y_pred). Passing the predictions
# first swaps precision and recall (and the support counts) in the table.
print(classification_report(y_test, predictions))

Every actual ham message in the test set was classified as ham (this is the 1.00 score for ham in the report), so all of the wrong predictions came from spam predicted as ham. That is acceptable: the cost of predicting spam as ham is negligible compared to that of predicting ham as spam.

A function to predict whether a given message is ham or spam

In [None]:
def detect_spam(s):
    """Classify a single message with the fitted ``spam_filter`` pipeline.

    Parameters
    ----------
    s : str
        Raw message text.

    Returns
    -------
    The label predicted by ``spam_filter`` for ``s``.
    """
    # predict() works on a batch, so wrap the message in a one-item list
    # and unpack the single result.
    (label,) = spam_filter.predict([s])
    return label
detect_spam('Your cash-balance is currently 500 pounds - to maximize your cash-in now, send COLLECT to 83600.')