In [6]:
#importing libraries

import sys
import nltk      #nlp
import sklearn   #importing ML classifier
import pandas   #pandas dataframe for storing data
import numpy

print('Python: {}'.format(sys.version))
print('NLTK: {}'.format(nltk.__version__))
print('Scikit-learn: {}'.format(sklearn.__version__))
print('Pandas: {}'.format(pandas.__version__))
print('Numpy: {}'.format(numpy.__version__))

Python: 2.7.15 |Anaconda, Inc.| (default, May  1 2018, 18:37:12) [MSC v.1500 32 bit (Intel)]
NLTK: 3.3
Scikit-learn: 0.19.2
Pandas: 0.23.4
Numpy: 1.15.1


## 1.Load the Dataset

In [7]:
import pandas as pd
import numpy as np

#Loading dataset of sms messages
df = pd.read_table('SMSSpamCollection', header = None, encoding='utf-8')

In [8]:
#print useful information about dataset
print(df.info())
print(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
0    5572 non-null object
1    5572 non-null object
dtypes: object(2)
memory usage: 43.6+ KB
None
      0                                                  1
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [9]:
#check class distribution(ham,spam)

classes = df[0]
print(classes.value_counts())

ham     4825
spam     747
Name: 0, dtype: int64


## 2.Preprocess the data

In [10]:
#Convert class labels into binary values , 0=ham, 1=spam

from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
Y = encoder.fit_transform(classes)

print(df[:10])
print(Y[:10])

      0                                                  1
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...
5  spam  FreeMsg Hey there darling it's been 3 week's n...
6   ham  Even my brother is not like to speak with me. ...
7   ham  As per your request 'Melle Melle (Oru Minnamin...
8  spam  WINNER!! As a valued network customer you have...
9  spam  Had your mobile 11 months or more? U R entitle...
[0 0 1 0 0 1 0 0 1 1]


In [11]:
# Storing sms message data

text_messages = df[1]
print(text_messages[:10])

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
5    FreeMsg Hey there darling it's been 3 week's n...
6    Even my brother is not like to speak with me. ...
7    As per your request 'Melle Melle (Oru Minnamin...
8    WINNER!! As a valued network customer you have...
9    Had your mobile 11 months or more? U R entitle...
Name: 1, dtype: object


In [12]:
# Use regular expressions to replace email addr, urls, phone, other symbols

# replace email addresses with 'emailaddr'
processed = text_messages.str.replace(r'^.+@[^\.].*\.[a-z]{2.}$', 'emailaddr')

#replace urls with webaddr
processed = processed.str.replace(r'^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$','webaddress')

#replace money symbols with 'moneysymb'
processed = processed.str.replace(r'£|\$','moneysymb')

#replace 10 digit phone number with 'phonenumber'
processed = processed.str.replace(r'^\(?[\d]{3}\)?[\s-]?[\d]{3}[\s-]?[\d]{4}$','phonenumber')

#replace normal number with 'numbr'
processed = processed.str.replace(r'\d+(\.\d+)?', 'numbr')

In [13]:
#remove punctuation
processed = processed.str.replace(r'[^\w\d\s]', ' ')

#replace whitespace with single space
processed = processed.str.replace(r'\s+', ' ')

#remove leading and trailing whitespace
processed = processed.str.replace(r'^\s+|\s+?$', '')

In [14]:
#change words to lower case
processed = processed.str.lower()
print(processed)

0       go until jurong point crazy available only in ...
1                                 ok lar joking wif u oni
2       free entry in numbr a wkly comp to win fa cup ...
3             u dun say so early hor u c already then say
4       nah i don t think he goes to usf he lives arou...
5       freemsg hey there darling it s been numbr week...
6       even my brother is not like to speak with me t...
7       as per your request melle melle oru minnaminun...
8       winner as a valued network customer you have b...
9       had your mobile numbr months or more u r entit...
10      i m gonna be home soon and i don t want to tal...
11      six chances to win cash from numbr to numbr nu...
12      urgent you have won a numbr week free membersh...
13      i ve been searching for the right words to tha...
14                      i have a date on sunday with will
15      xxxmobilemovieclub to use your credit click th...
16                                 oh k i m watching here
17      eh u r

In [15]:
#remove stop words from text messages

from nltk.corpus import stopwords
#stopwords are a,the,an ..
stop_words = set(stopwords.words('english')) 

processed = processed.apply(lambda x: ' '.join(term for term in x.split() if term not in stop_words))


In [16]:
# remove word stems using a porter stemmer
ps = nltk.PorterStemmer()
#stem of friendship is friend

processed = processed.apply(lambda x: ' '.join(ps.stem(term) for term in x.split()))

In [17]:
print processed


0       go jurong point crazi avail bugi n great world...
1                                   ok lar joke wif u oni
2       free entri numbr wkli comp win fa cup final tk...
3                     u dun say earli hor u c alreadi say
4                    nah think goe usf live around though
5       freemsg hey darl numbr week word back like fun...
6           even brother like speak treat like aid patent
7       per request mell mell oru minnaminungint nurun...
8       winner valu network custom select receivea num...
9       mobil numbr month u r entitl updat latest colo...
10      gonna home soon want talk stuff anymor tonight...
11      six chanc win cash numbr numbr numbr pound txt...
12      urgent numbr week free membership numbr numbr ...
13      search right word thank breather promis wont t...
14                                            date sunday
15      xxxmobilemovieclub use credit click wap link n...
16                                             oh k watch
17      eh u r

In [18]:
from nltk.tokenize import word_tokenize

#creating a bag of words
all_words = []

for message in processed:
    words = word_tokenize(message)   #selecting tokens from a message
    for w in words:
        all_words.append(w)          #appending tokens to a list
      
all_words = nltk.FreqDist(all_words)    

In [19]:
#print total no of words and also 15 most common words
print('Number of words:{}'.format(len(all_words)))
print('Most Common words:{}'.format(all_words.most_common(15)))

Number of words:6567
Most Common words:[(u'numbr', 2961), (u'u', 1207), (u'call', 679), (u'go', 456), (u'get', 452), (u'ur', 391), (u'gt', 318), (u'lt', 316), (u'come', 304), (u'ok', 293), (u'free', 284), (u'day', 276), (u'know', 275), (u'love', 266), (u'like', 261)]


In [20]:
#Use the 1500 most common words as features
word_features = list(all_words.keys())[:1500]

In [21]:
#define a find_features function
def find_features(message):
    words = word_tokenize(message)
    features = {}
    for word in word_features:
        features[word] = (word in words)
        
    return features

#Lets see an example
features = find_features(processed[0])
for key,value in features.items():
    if value == True:
        print key

avail
buffet
world
great


In [22]:
print processed[0]

go jurong point crazi avail bugi n great world la e buffet cine got amor wat


In [23]:
#find features for all messages
messages = zip(processed, Y)

#define a seed for reproducibility
seed = 1
np.random.seed = seed
np.random.shuffle(messages)

#call find_features function for each sms message
featuresets = [(find_features(text),label) for (text,label) in messages]

In [24]:
#splitting training and testing datasets using sklearn
from sklearn import model_selection

training, testing = model_selection.train_test_split(featuresets, test_size=0.25, random_state=seed)

In [25]:
print(len(training))
print(len(testing))

4179
1393


## 4.Sci-kit Learn Classifier with NLTK

In [26]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [27]:
from numpy.core.umath_tests import inner1d

In [35]:
#Define models to train
name = ['K Nearest neighbors', 'Decision Tree', 'Random Forest', 'Logistic Regression','SGD Classifier','Naive Bayes','SVM Linear']

classifier = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(),
    SGDClassifier(max_iter=100),
    MultinomialNB(),
    SVC(kernel='linear')
]

models = zip(name,classifier)

In [29]:
#Wrap models in NLTk
from nltk.classify.scikitlearn import SklearnClassifier

for name,model in models:
    nltk_model = SklearnClassifier(model)
    nltk_model.train(training)
    accuracy = nltk.classify.accuracy(nltk_model,testing)*100
    print('{}: Accuracy: {}'.format(name, accuracy))

K Nearest neighbors: Accuracy: 93.1801866475
Decision Tree: Accuracy: 94.615936827
Random Forest: Accuracy: 95.1902368988
Logistic Regression: Accuracy: 94.7595118449
SGD Classifier: Accuracy: 95.2620244078
Naive Bayes: Accuracy: 94.8312993539
SVM Linear: Accuracy: 94.9030868629


In [36]:
#Ensemble method- Voting classifier
from sklearn.ensemble import VotingClassifier
#DEFIne  models to train
name = ['K Nearest neighbors', 'Decision Tree', 'Random Forest', 'Logistic Regression','SGD Classifier','Naive Bayes','SVM Linear']

classifier = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(),
    SGDClassifier(max_iter=100),
    MultinomialNB(),
    SVC(kernel='linear')
]

models = zip(name,classifier)

nltk_ensemble = SklearnClassifier(VotingClassifier(estimators=models,voting='hard',n_jobs=-1))
nltk_ensemble.train(training)
accuracy=nltk.classify.accuracy(nltk_ensemble,testing)*100
print('Ensemble method Accuracy: {}'.format(accuracy))

Ensemble method Accuracy: 95.4773869347


  if diff:


In [32]:
#make class label prediction for testing set
txt_features, labels = zip(*testing)

prediction = nltk_ensemble.classify_many(txt_features)

  if diff:


In [34]:
#print confusion matrix and classification report

print(classification_report(labels,prediction))

pd.DataFrame(
    confusion_matrix(labels,prediction),
    index = [['actual','actual'], ['ham','spam']],
    columns = [['predicted','predicted'], ['ham','spam']])

             precision    recall  f1-score   support

          0       0.96      0.99      0.97      1207
          1       0.90      0.71      0.80       186

avg / total       0.95      0.95      0.95      1393



Unnamed: 0_level_0,Unnamed: 1_level_0,predicted,predicted
Unnamed: 0_level_1,Unnamed: 1_level_1,ham,spam
actual,ham,1193,14
actual,spam,54,132
