In [1]:
import sys
import nltk
import sklearn
import pandas
import numpy
nltk.download('stopwords')
nltk.download('punkt')
print('python: {}'.format(sys.version))
print('nltk:{}'.format(nltk.__version__))
print('sklearn: {}'.format(sklearn.__version__))
print('pandas: {}'.format(pandas.__version__))
print('numpy: {}'.format(numpy.__version__))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...


python: 3.7.1 (default, Dec 10 2018, 22:54:23) [MSC v.1915 64 bit (AMD64)]
nltk:3.4
sklearn: 0.20.1
pandas: 0.23.4
numpy: 1.15.4


[nltk_data]   Package punkt is already up-to-date!


In [None]:
#everything in python are objects
#class is an object constructor or a bluprint for creating objects
# a method inside a class is a function that belongs to the object

## 1.LOAD THE DATATSET

In [2]:
import pandas as pd
import numpy as np

#load the datset of sms messages
df = pd.read_table('SMSSpamCollection', header = None, encoding='utf-8')

In [3]:
#print useful information about head
print(df.info())
print(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
0    5572 non-null object
1    5572 non-null object
dtypes: object(2)
memory usage: 87.1+ KB
None
      0                                                  1
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [4]:
#check the class distribution
classes=df[0]
print(classes.value_counts())

ham     4825
spam     747
Name: 0, dtype: int64


## 2.Preprocess the Data

In [5]:
#convert  class labels to decimal values, ham=0, spam=1
from sklearn.preprocessing import LabelEncoder
encoder=LabelEncoder()
Y=encoder.fit_transform(classes)

print(classes[:10])
print(Y[:10])

0     ham
1     ham
2    spam
3     ham
4     ham
5    spam
6     ham
7     ham
8    spam
9    spam
Name: 0, dtype: object
[0 0 1 0 0 1 0 0 1 1]


In [6]:
#store the sms message data
text_messages = df[1]
print(text_messages[:10])

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
5    FreeMsg Hey there darling it's been 3 week's n...
6    Even my brother is not like to speak with me. ...
7    As per your request 'Melle Melle (Oru Minnamin...
8    WINNER!! As a valued network customer you have...
9    Had your mobile 11 months or more? U R entitle...
Name: 1, dtype: object


# 2.1 Regular expressions

In [7]:
#use regular expressions to replace email addresses, urls, phone numbers, numbers etc.

#replace email adresses with 'email addr'
processed=text_messages.str.replace(r'^.+@[^\.].*\.[a-z]{2,}$','emailaddr')

# replace urls with 'webaddress'
processed=processed.str.replace(r'^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$', 'webaddress')

#replace money symbols with moneysymb 
processed=processed.str.replace(r'£|\$', 'moneysymb')

#replace 10 digit phone numbers with'phonenumbers'
processed=processed.str.replace(r'^\(?[\d]{3}\)?[\s-]?[\d]{3}[\s-]?[\d]{4}$', 'phone number')

 # replace normal numbers with 'nmber'
processed=processed.str.replace(r'\d+(\.\d+)?', 'numbers')


In [8]:
#remove punctuation
processed=processed.str.replace(r'[^\w\d\s]', ' ')

#replace whitespace between terms with a single space
processed=processed.str.replace(r'\s+', ' ')

#replace loading and trailing whitespace
processed=processed.str.replace(r'^\s+|\s+?$','')

In [9]:
#changing the words to lower case, HELLO, Helo, hello are all the same words
processed = processed.str.lower()
print(processed)

0       go until jurong point crazy available only in ...
1                                 ok lar joking wif u oni
2       free entry in numbers a wkly comp to win fa cu...
3             u dun say so early hor u c already then say
4       nah i don t think he goes to usf he lives arou...
5       freemsg hey there darling it s been numbers we...
6       even my brother is not like to speak with me t...
7       as per your request melle melle oru minnaminun...
8       winner as a valued network customer you have b...
9       had your mobile numbers months or more u r ent...
10      i m gonna be home soon and i don t want to tal...
11      six chances to win cash from numbers to number...
12      urgent you have won a numbers week free member...
13      i ve been searching for the right words to tha...
14                      i have a date on sunday with will
15      xxxmobilemovieclub to use your credit click th...
16                                 oh k i m watching here
17      eh u r

In [10]:
#remove stop words from text messages

from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

processed = processed.apply(lambda x: ' '.join(term for term in x.split() if term not in stop_words))

In [11]:
# remove word stems using porter stemmer

ps=nltk.PorterStemmer()

processed=processed.apply(lambda x: ' '.join(ps.stem(term) for term in x.split()) )

In [12]:
print(processed)

0       go jurong point crazi avail bugi n great world...
1                                   ok lar joke wif u oni
2       free entri number wkli comp win fa cup final t...
3                     u dun say earli hor u c alreadi say
4                    nah think goe usf live around though
5       freemsg hey darl number week word back like fu...
6           even brother like speak treat like aid patent
7       per request mell mell oru minnaminungint nurun...
8       winner valu network custom select receivea mon...
9       mobil number month u r entitl updat latest col...
10      gonna home soon want talk stuff anymor tonight...
11      six chanc win cash number number number pound ...
12      urgent number week free membership moneysymbnu...
13      search right word thank breather promis wont t...
14                                            date sunday
15      xxxmobilemovieclub use credit click wap link n...
16                                             oh k watch
17      eh u r

In [14]:
from nltk.tokenize import word_tokenize

# creating a bag of words
all_words=[]

for message in processed:
    words=word_tokenize(message)
    for w in words:
        all_words.append(w)
        
all_words=nltk.FreqDist(all_words)

In [15]:
# print the total number and the 15 most common words

print('The total number of words: {}'.format(len(all_words)))
print('15 most common words: {}'.format(all_words.most_common(15)))

The total number of words: 6578
15 most common words: [('number', 2752), ('u', 1207), ('call', 674), ('go', 456), ('get', 451), ('ur', 391), ('gt', 318), ('lt', 316), ('come', 304), ('moneysymbnumb', 303), ('ok', 293), ('free', 284), ('day', 276), ('know', 275), ('love', 266)]


In [16]:
# use 1500 most common words as features(the more features we have the longer it takes for the algorithm to learn)
word_features=list(all_words.most_common(1500))

In [17]:
# define find features functions
def find_features(message):
    words = word_tokenize(message)
    features = {}
    for word,numb in word_features:
        features[word] = (word in words)
    return features

#lets see some results
features = find_features(processed[0])
for (key,value) in features.items():
    if value == True:
        print (key)
    

go
got
n
great
wat
e
world
point
avail
crazi
bugi
la
cine


In [18]:
# find features of all meassages
messages = [[processed[i],Y[i]] for i in range(len(processed))]

# define a seed for reproducibilty
seed = 1
np.random.seed = seed
np.random.shuffle(messages)

#call find features function for each sms message
featuresets=[(find_features(text), label) for (text,label) in messages]

In [19]:
#splitting training and testing data sets using sklearn
from sklearn import model_selection

training,testing =  model_selection.train_test_split(featuresets,test_size=0.25, random_state=seed)

In [20]:
print('training: {}'.format(len(training)))
print('testing: {}'.format(len(testing)))

training: 4179
testing: 1393


## 3. SKlearn Classifiers with NLTK

In [21]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix


In [22]:
# define models to train
names=['K Nearest Neighbours', 'Decision Tree', 'Random Forrest','Logistic Regression','SGD Classifier', 'Naive Bayes', 'SVM Linear']
classifiers=[
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(),
    SGDClassifier(max_iter=100),
    MultinomialNB(),
    SVC(kernel = 'linear')
]
models= [[names[i],classifiers[i]] for i in range(len(classifiers))]


In [25]:
# wrap models in NLTK
from nltk.classify.scikitlearn import SklearnClassifier

for name, model in models:
    nltk_model= SklearnClassifier(model)
    nltk_model.train(training)
    accuracy= nltk.classify.accuracy(nltk_model, testing) *100
    print('{} accuracy= {}'.format(name, accuracy))
    

K Nearest Neighbours accuracy= 93.89806173725772
Decision Tree accuracy= 97.48743718592965
Random Forrest accuracy= 98.1335247666906




Logistic Regression accuracy= 98.92318736539842




SGD Classifier accuracy= 97.98994974874373
Naive Bayes accuracy= 98.63603732950466
SVM Linear accuracy= 98.42067480258436


In [29]:
# ensemble method - voting classifier
from sklearn.ensemble import VotingClassifier

#define models to train
classifiers=[
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(),
    SGDClassifier(max_iter=100),
    MultinomialNB(),
    SVC(kernel = 'linear')
]

models= [[names[i],classifiers[i]] for i in range(len(classifiers))]

nltk_ensemble=SklearnClassifier(VotingClassifier(estimators = models, voting = 'hard', n_jobs = -1))
# n_jobs defines how many cores of comuter to be used to train these in parallel
nltk_ensemble.train(training)
accuracy= nltk.classify.accuracy(nltk_model,testing) * 100
print('ensemble method accuracy: {}'.format(accuracy))


ensemble method accuracy: 98.42067480258436


In [30]:
#make class label prediction for testing set
txt_features, labels= zip(*testing)

prediction=nltk_ensemble.classify_many(txt_features)

In [33]:
# print a confusion matrix and a classification report
print(classification_report(labels,prediction))

pd.DataFrame(
confusion_matrix(labels,prediction),
index=[['actual','actual'],['ham','spam']],
columns=[['predicted','predicted'],['ham','spam']]
)

              precision    recall  f1-score   support

           0       0.99      1.00      1.00      1207
           1       1.00      0.94      0.97       186

   micro avg       0.99      0.99      0.99      1393
   macro avg       1.00      0.97      0.98      1393
weighted avg       0.99      0.99      0.99      1393



Unnamed: 0_level_0,Unnamed: 1_level_0,predicted,predicted
Unnamed: 0_level_1,Unnamed: 1_level_1,ham,spam
actual,ham,1207,0
actual,spam,12,174
