In [2]:
# import all the libraries

import sys
import nltk
import sklearn
import pandas as pd
import numpy as np

print('Python : {}'.format(sys.version))
print('NLTK : {}'.format(nltk.__version__))
print('Sklearn : {}'.format(sklearn.__version__))
print('Pandas : {}'.format(pd.__version__))
print('Numpy : {}'.format(np.__version__))

Python : 3.5.2 (default, Nov 23 2017, 16:37:01) 
[GCC 5.4.0 20160609]
NLTK : 3.4
Sklearn : 0.20.0
Pandas : 0.23.4
Numpy : 1.15.4


## Load the Dataset

In [3]:
# load the dataset

dataset = pd.read_table('SMSSpamCollection', header = None, encoding='utf-8')

# print information about the dataset

print(dataset.info())
print(dataset.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
0    5572 non-null object
1    5572 non-null object
dtypes: object(2)
memory usage: 87.1+ KB
None
      0                                                  1
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [4]:
# separate the classes value(ham/spam) 

classes = dataset[0]
print(classes.value_counts())

ham     4825
spam     747
Name: 0, dtype: int64


## Preprocess the data

In [5]:
# convert classes labels into binary values

from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
Y = encoder.fit_transform(classes)

In [6]:
# store the SMS message data

X = dataset[1]

In [7]:
# use regular expressions to replace email, urls, phone numbers, other numbers, symbols, etc

# replace email addresses with 'emailaddr'
processed = X.str.replace(r'^([0-9a-zA-Z]([-.\w]*[0-9a-zA-Z])*@(([0-9a-zA-Z])+([-\w]*[0-9a-zA-Z])*\.)+[a-zA-Z]{2,9})$', 'emailaddr')

# replace web urls with 'webaddr'
processed = processed.str.replace(r'^((((H|h)(T|t)|(F|f))(T|t)(P|p)((S|s)?))\://)?(www.|[a-zA-Z0-9].)[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,6}(\:[0-9]{1,5})*(/($|[a-zA-Z0-9\.\,\;\?\'\\\+&amp;%\$#\=~_\-]+))*$', 'webaddr')

# replace money symbols with 'moneysymb'
processed = processed.str.replace(r'£|\$', 'moneysymb')

# replace 10 digit numbers with 'phoneno'
processed = processed.str.replace(r'^\D?(\d{3})\D?\D?(\d{3})\D?(\d{4})$', 'phoneno')

# replace numbers with 'number'
processed = processed.str.replace(r'\d+(\.\d+)?', 'number')

In [8]:
# remove punctuations
processed = processed.str.replace(r'[^\w\d\s]', '')

# replace white spaces in between words with single spaces
processed = processed.str.replace(r'\s+', ' ')

# remove leading and trailing whitespaces
processed = processed.str.replace(r'^\s+|\s+?$', '')

In [9]:
# change words to lowercase
processed = processed.str.lower()

In [10]:
# remove stop words

from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

processed = processed.apply(lambda x: ' '.join(term for term in x.split() if term not in stop_words))

In [14]:
# remove word stems

ps = nltk.PorterStemmer()
processed = processed.apply(lambda x: ' '.join(ps.stem(term) for term in x.split()))

In [15]:
# tokenize each word

from nltk.tokenize import word_tokenize

all_words = []
for message in processed:
    words  =word_tokenize(message)
    for w in words:
        all_words.append(w)

all_words = nltk.FreqDist(all_words)

In [16]:
# print total words
print('Total words : {}'.format(len(all_words)))

# print 15 most common words
print('15 most common words : {}'.format(all_words.most_common(15)))

Total words : 7392
15 most common words : [('number', 2359), ('u', 1132), ('call', 655), ('im', 474), ('go', 452), ('get', 447), ('ur', 390), ('come', 300), ('dont', 298), ('ok', 278), ('ltgt', 276), ('free', 275), ('know', 270), ('moneysymbnumb', 260), ('like', 257)]


In [17]:
# use 1500 most common words as features
word_features = list(all_words.keys())[:1500]

In [22]:
# define find_features function

def find_features(message):
    words = word_tokenize(message)
    features = {}
    for word in word_features:
        features[word] = (word in words)
        
    return features



In [23]:
# find features of all messages

messages = list(zip(processed, Y))

# define seed
seed = 1
np.random.seed = seed

# shuffle the dataset
np.random.shuffle(messages)

# call find_features function for every message in messages
featureset = [(find_features(text), label) for (text, label) in messages]

In [26]:
# split training and testing dataset
from sklearn.model_selection import train_test_split

training, testing = train_test_split(featureset, test_size = 0.25, random_state = seed)

## Scikit Learn Classifiers with NLTK

In [30]:
# import sklearn classifiers

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [32]:
# define models

names = ['K Neighbours', 'Decision Tree', 'Random Forest', 'Logistic Regression', 'SGD', 'Naive Bayes']

classifiers = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(),
    SGDClassifier(max_iter = 100),
    MultinomialNB(),
    SVC(kernel = 'linear')
]

models = list(zip(names, classifiers))

In [35]:
# wrap models with nltk

from nltk.classify.scikitlearn import SklearnClassifier

for name, model in models:
    nltk_model = SklearnClassifier(model)
    nltk_model.train(training)
    accuracy = nltk.classify.accuracy(nltk_model, testing) * 100
    print('{} accuracy : {}'.format(name, accuracy))

K Neighbours accuracy : 92.60588657573582
Decision Tree accuracy : 94.54414931801867
Random Forest accuracy : 94.83129935391243




Logistic Regression accuracy : 93.82627422828428
SGD accuracy : 94.54414931801867
Naive Bayes accuracy : 94.61593682699211


In [36]:
# ensemble method - Voter classifier

from sklearn.ensemble import VotingClassifier

names = ['K Neighbours', 'Decision Tree', 'Random Forest', 'Logistic Regression', 'SGD', 'Naive Bayes']

classifiers = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(),
    SGDClassifier(max_iter = 100),
    MultinomialNB(),
    SVC(kernel = 'linear')
]

models = list(zip(names, classifiers))

nltk_ensemble = SklearnClassifier(VotingClassifier(estimators = models, voting = 'hard', n_jobs = -1))
nltk_ensemble.train(training)
accuracy = nltk.classify.accuracy(nltk_ensemble, testing) * 100
print('NLTK Ensemble accuracy : {}'.format(accuracy))

NLTK Ensemble accuracy : 94.54414931801867


In [37]:
# make class label prediction fro==or testing sets

txt_features, labels = zip(*testing)

prediction = nltk_ensemble.classify_many(txt_features)

In [None]:
# print classification report and confusion matrix

print(classification_report(labels, prediction))
pd.DataFrame(
    confusion_matrix(labels, prediction),
    index = [['actual','actual'],['ham','spam']],
    columns = [['predicted','predicted'],['ham','spam']]
)

              precision    recall  f1-score   support

           0       0.94      1.00      0.97      1200
           1       0.96      0.63      0.76       193

   micro avg       0.95      0.95      0.95      1393
   macro avg       0.95      0.81      0.87      1393
weighted avg       0.95      0.95      0.94      1393



Unnamed: 0_level_0,Unnamed: 1_level_0,predicted,predicted
Unnamed: 0_level_1,Unnamed: 1_level_1,ham,spam
actual,ham,1195,5
actual,spam,71,122
