# A classification model to filter the spam messages.

##### Import the required libraries

In [4]:
import os
import pandas as pd
import numpy as np

##### Set path to the data directory

In [5]:
os.chdir("/Users/ajeet/Google Drive/NLP /smsspamcollection")

In [6]:
ls

SMSSpamCollection  readme


##### Load the data 

In [7]:
data = pd.read_csv("SMSSpamCollection",delimiter='\t',header=None)

In [8]:
data.head()

Unnamed: 0,0,1
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [9]:
#Rename the column names
data.rename(columns={0:'Prediction', 1:"message"},inplace= True)

In [10]:
data.head()

Unnamed: 0,Prediction,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


##### Encode the prediction variables in form of 0 and 1

In [11]:
data['Prediction'] = data['Prediction'].replace('spam',1)
data['Prediction'] = data['Prediction'].replace('ham',0)

In [12]:
data.head()

Unnamed: 0,Prediction,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [13]:
data.shape

(5572, 2)

In [14]:
data['Prediction'].value_counts()

0    4825
1     747
Name: Prediction, dtype: int64

So, there is 747 Spam messages and 4825 is not a spam.

##### <font color = red>Model is good only when with accuracy it also have a less error rate to detecting - not a spam to SPAM. Because we may miss any important information .</font>

##### Extract the message column from the data and clean it and build bag of words model to feed into machine learning model

In [15]:
message = data['message']

In [16]:
type(message)

pandas.core.series.Series

In [17]:
message.count()

5572

In [18]:
message.head()

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
Name: message, dtype: object

## 1- Filtering characters from the message

In [19]:
import re

In [20]:
#like take the first message-
message[0]

'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

In [21]:
message[0] = re.sub('[^a-zA-Z]', ' ', message[0])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [22]:
message[0]

'Go until jurong point  crazy   Available only in bugis n great world la e buffet    Cine there got amore wat   '

##### Will store all the cleaned meaasged in a list name- Corpus

In [23]:
corpus = []
#loop to filter the characyer from whole dataset
for i in range(0, message.count()):
    msg = re.sub('[^a-zA-Z]', ' ', message[i])
    corpus.append(msg)


In [24]:
corpus[1]

'Ok lar    Joking wif u oni   '

In [25]:
message[1]

'Ok lar... Joking wif u oni...'

In [26]:
corpus[5571]

'Rofl  Its true to its name'

In [27]:
message[5571]

'Rofl. Its true to its name'

In [28]:
type(corpus)

list

###### Now corpus is list coontaining all the characters only , will proceed next cleaning process

## 2: Make all lower case

In [29]:
for i in range(0, message.count()):
    corpus[i] = corpus[i].lower()
    

In [30]:
corpus[0]

'go until jurong point  crazy   available only in bugis n great world la e buffet    cine there got amore wat   '

In [31]:
corpus[len(corpus)-1]

'rofl  its true to its name'

##### Now the characters are in lower case

## 3-Removing all Insignificant words(STOPWORDS)

In [32]:
import nltk

In [33]:
#nltk.download()

In [34]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [35]:
for i in range(0, message.count()):
    review = corpus[i].split()
    ps = PorterStemmer()
    review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
    review = ' '.join(review)
    corpus[i] = review

In [36]:
len(corpus)

5572

In [37]:
corpus[0]

'go jurong point crazi avail bugi n great world la e buffet cine got amor wat'

In [38]:
corpus[len(corpus)-1]

'rofl true name'

## 4-Creating the Bag of Words model(Tokenization)

In [39]:
# Creating the Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 5000)
X = cv.fit_transform(corpus).toarray()
y = data['Prediction'].values

In [40]:
X.shape

(5572, 5000)

In [41]:
vocab = cv.get_feature_names()
print(vocab[0:10])

['aa', 'aah', 'aaniy', 'aaooooright', 'aathi', 'ab', 'abbey', 'abdomen', 'abeg', 'abel']


In [42]:
y[0:5]

array([0, 0, 1, 0, 0])

In [43]:
y

array([0, 0, 1, ..., 0, 0, 0])

In [44]:
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

# Model Building

## 1-Naive Bayes

In [45]:
# Splitting the dataset into the Training set and Test set
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

# Fitting Naive Bayes to the Training set
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)




In [46]:
cm

array([[822, 133],
       [ 16, 144]])

In [47]:
# calculate accuracy
from sklearn import metrics
print(metrics.accuracy_score(y_test, y_pred))

0.8663677130044843


In [48]:
(822+144)/(822+144+133+16)

0.8663677130044843

#### Accuracy 86.63%

## 2-Random Forest Model

In [49]:
from sklearn.ensemble import RandomForestClassifier

In [50]:
# Initialize a Random Forest classifier with 100 trees
classifier = RandomForestClassifier(n_estimators = 500) 

classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred_RF = classifier.predict(X_test)

In [51]:
# calculate accuracy
from sklearn import metrics
print(metrics.accuracy_score(y_test, y_pred_RF))

0.9820627802690582


In [52]:
# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred_RF)

In [53]:
cm

array([[955,   0],
       [ 20, 140]])

#### FPR is critical in this case

Because if any message is not spam and it ir predicted as SPAM then we loose any important information, SO in spam filtering, FPR shud be very low

Here we get 0% FPR so it is good 