# Importing Different Packages Required



In [1]:
#We import nltk for using natural language processing techniques

import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/pinkman/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk import TweetTokenizer
from nltk.stem import PorterStemmer
import string
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import cross_val_score


# Reading The Dataset

In [3]:
# In this section, we read the data from 
# https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip(downloaded and 
# unzipped) and label it accordingly.


messages=pd.read_csv('SMSSpamCollection', sep='\t', names= ["label","sms"])
messages.head()

Unnamed: 0,label,sms
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


# Exploring the Data

In [4]:
messages.describe()

Unnamed: 0,label,sms
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


# Refining the Data

In [5]:
#In this section, we drop the duplicate messages

messages.drop_duplicates(subset='sms', inplace=True)

In [6]:
#these are redundant, just to check the messages
messages.describe()

Unnamed: 0,label,sms
count,5169,5169
unique,2,5169
top,ham,Eh sorry leh... I din c ur msg. Not sad alread...
freq,4516,1


In [7]:
messages.groupby('label').count().reset_index()

Unnamed: 0,label,sms
0,ham,4516
1,spam,653


In [8]:
#This function at first tokenizes the sentences into words, then removes all the punctuations and the stopwords.
# Then it does stemming e.g. "go", "going", "goes" all these words are replaced with its root word "go".

def refinement(text):
    token = TweetTokenizer()
    wordsandpuncts = token.tokenize(text)
    puncts = ["'", "!","?",",",".","@", "#","$","%","^","&","*","(",")","<",">","/",";",":","|"]
    words = [x for x in wordsandpuncts if x not in puncts]
    
    stop_words = set(stopwords.words('english'))
    start_words = [w for w in words if not w in stop_words]
    stemmer = PorterStemmer() 
    clean_words=[stemmer.stem(word) for word in start_words]                 
    
    return clean_words

In [9]:
#Just for checking how does the function work?
messages['sms'].apply(refinement).head()

0    [Go, jurong, point, crazi, .., avail, bugi, n,...
1               [Ok, lar, ..., joke, wif, u, oni, ...]
2    [free, entri, 2, wkli, comp, win, FA, cup, fin...
3    [U, dun, say, earli, hor, ..., U, c, alreadi, ...
4      [nah, I, think, goe, usf, live, around, though]
Name: sms, dtype: object

# Train-Test Split

In [10]:
#Train-test Split

from sklearn.model_selection import train_test_split

X = messages['sms']
y = messages['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

#print("Shape of X is {}".format(X.shape))
#print("Shape of X_train is {} and shape of y_train is {}".format(X_train.shape, y_train.shape))
#print("Shape of X_test is {} and shape of y_test is {}".format(X_test.shape, y_test.shape))


# Fitting Models with 3 Classifiers: NaiveBayes, SVM, Logistic Regression

In [11]:
# This is the pipeline for our three classifiers:
# 1. Naive-Bayes : This classifier check if certain word or a certain set of words appearing in a message
#    to declare it as spam or ham. Some words have probabilities of appearing in spam and legitimate mail.
#    It learns about these probabilities(it also considers the frequency of the words appearing).
#    from the training batch. Then, given a message from the test set or the user, it checks the set of words 
#    appearing in the message and using Bayes theorem count the probability of the message being spam or ham.
#
# 2. SVM : SVM or Support Vector Machine checks the training data and tries to find a separating hyperplane
#          between spams(negative points) and hams(positive points) which maximizes the distance of the plane
#          from both the set of points.
# 
# 3. Logistic Regression: Logistic Regression model indicates spam messages as 0 and ham messages as 1. It 
#                         measures the relationship between the categorical dependent variable and one or more 
#                         independent variables by estimating probabilities using a logistic function. So, from
#                         the training set it tries to produce the logistic or the sigmoid function, that helps
#                         determine if some messages from the test set is spam or ham, according to its logistic
#                         function value is 0 or 1.
#
# We do these classifications using pipeline in three steps:
# 1. First take the training set and create a bag of words calling our "refinement" function as an analyzer and
#    convert the collection of text documents to a matrix of token counts using "CountVectorizer".
# 2. Then, we produce the tf-idf vector from them.
# 3. Then we use the desired classifier.

NB = Pipeline([
    ('bow',CountVectorizer(analyzer=refinement)),
    ('tfidf',TfidfTransformer()), 
    ('classifier',MultinomialNB())
])

SVM = Pipeline([
    ('bow',CountVectorizer(analyzer=refinement)),
    ('tfidf',TfidfTransformer()), 
    ('classifier',LinearSVC())
])

LR = Pipeline([
    ('bow',CountVectorizer(analyzer=refinement)),
    ('tfidf',TfidfTransformer()), 
    ('classifier',LogisticRegression())
])

# Calculate cross-validation score

In [12]:
#This produces 5-fold cross validation
def cross_val(i):
    print("The cross validation scores are: {}".format(cross_val_score(i, X=X_train, y=y_train, cv=5)))

# Fit training data to Model

In [13]:
# We fit and train the data in the pipeline
def fitting(i):
    i.fit(X_train, y_train)

# Final Evauations on Test Data

In [14]:
#Then we use the model on the test data
def final_result(i):
    y_test_predicted = i.predict(X_test)

#This produces the accuracy score
    print("The fraction of correctly classified samples is {}".format(accuracy_score(y_test, y_test_predicted)))
    print("The number of correctly classified samples is {}".format(accuracy_score(y_test, y_test_predicted, normalize=False)))
    print pd.DataFrame(confusion_matrix(y_test, y_test_predicted), 
                 index={'true ham', 'true spam'}, 
                 columns={'pred ham', 'pred spam'})

#This produces the precision and recall on the test data
    print("The classification report is the following:")
    print(classification_report(y_test,y_test_predicted))

# User Input and Results

In [15]:
# Here we ask for the user input for which classifier they want to use and print the cross-validation score and
# also the final evaluations on the test data. This also produces the classification report, which contains
# precision and recall
def classifier(i):
    if i==NB:
        print("The classifier used here is Naive-Bayes.")
    elif i== SVM:
        print("The classifier used here is SVM.")
    elif i== LR:
        print("The classifier used here is Logistic Regression.")
    cross_val(i)
    fitting (i)
    final_result(i)
note= '''This is the code for spam filter. We have used three classifier. Choose which classifier you
         want to use:
         Enter 1 for Naive-Bayes
         Enter 2 for SVM
         Enter 3 for Logistic Regression'''

print(note)
switch = int(input("Enter choice: "))

if switch == 1:
    classifier(NB)
elif switch == 2:
    classifier(SVM)
elif switch == 3:
    classifier(LR)
else:
    print("wrong choice")

Enter choice: 2
The classifier used here is SVM.
The cross validation scores are: [0.98840206 0.98195876 0.98064516 0.99225806 0.97674419]
The fraction of correctly classified samples is 0.986078886311
The number of correctly classified samples is 1275
           pred spam  pred ham
true spam       1123         1
true ham          17       152
The classification report is the following:
             precision    recall  f1-score   support

        ham       0.99      1.00      0.99      1124
       spam       0.99      0.90      0.94       169

avg / total       0.99      0.99      0.99      1293

