In [17]:
import pandas as pd
import numpy as np
import nltk
import sys
import sklearn

print("Python:",sys.version)
print("NLTK:",nltk.__version__)
print("Pandas:",pd.__version__)
print("Scikit learn:",sklearn.__version__)
print("Numpy:",np.__version__)

Python: 3.7.4 (default, Aug  9 2019, 18:34:13) [MSC v.1915 64 bit (AMD64)]
NLTK: 3.4.5
Pandas: 0.25.1
Scikit learn: 0.21.3
Numpy: 1.16.5


## 1. Loading the Dataset

In [18]:
df=pd.read_table("SMSSpamCollection",header=None,encoding='utf-8')

In [19]:
#print useful information
print(df.info())
print(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
0    5572 non-null object
1    5572 non-null object
dtypes: object(2)
memory usage: 87.2+ KB
None
      0                                                  1
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [20]:
#Check Class distribution
classes=df[0]
print(classes.value_counts())

ham     4825
spam     747
Name: 0, dtype: int64


## Preprocess the Data

In [21]:
#Classification 0:Ham 1:Spam
from sklearn.preprocessing import LabelEncoder

encoder=LabelEncoder()
Y=encoder.fit_transform(classes)

print(classes[:10])
print(Y[:10])

0     ham
1     ham
2    spam
3     ham
4     ham
5    spam
6     ham
7     ham
8    spam
9    spam
Name: 0, dtype: object
[0 0 1 0 0 1 0 0 1 1]


In [22]:
#store sms message data
text_messages=df[1]
print(text_messages[:10])

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
5    FreeMsg Hey there darling it's been 3 week's n...
6    Even my brother is not like to speak with me. ...
7    As per your request 'Melle Melle (Oru Minnamin...
8    WINNER!! As a valued network customer you have...
9    Had your mobile 11 months or more? U R entitle...
Name: 1, dtype: object


### Some common regular expressions
From Wikipedia

^  Matches the starting position within the string. In line-based tools, it matches the starting position of any line.

.  Matches any single character (many applications exclude newlines, and exactly which characters are considered newlines is flavor-, character-encoding-, and platform-specific, but it is safe to assume that the line feed character is included). Within POSIX bracket expressions, the dot character matches a literal dot. For example, a.c matches "abc", etc., but [a.c] matches only "a", ".", or "c".

[ ]  A bracket expression. Matches a single character that is contained within the brackets. For example, [abc] matches "a", "b", or "c". [a-z] specifies a range which matches any lowercase letter from "a" to "z". These forms can be mixed: [abcx-z] matches "a", "b", "c", "x", "y", or "z", as does [a-cx-z].

The - character is treated as a literal character if it is the last or the first (after the ^, if present) character within the brackets: [abc-], [-abc]. Note that backslash escapes are not allowed. The ] character can be included in a bracket expression if it is the first (after the ^) character: []abc].

[^ ]  Matches a single character that is not contained within the brackets. For example, [^abc] matches any character other than "a", "b", or "c". [^a-z] matches any single character that is not a lowercase letter from "a" to "z". Likewise, literal characters and ranges can be mixed.

$    Matches the ending position of the string or the position just before a string-ending newline. In line-based tools, it matches the ending position of any line.

( )   Defines a marked subexpression. The string matched within the parentheses can be recalled later (see the next entry, \n). A marked subexpression is also called a block or capturing group. BRE mode requires \( \).

\n   Matches what the nth marked subexpression matched, where n is a digit from 1 to 9. This construct is vaguely defined in the POSIX.2 standard. Some tools allow referencing more than nine capturing groups. Also known as a backreference.

*    Matches the preceding element zero or more times. For example, ab*c matches "ac", "abc", "abbbc", etc. [xyz]* matches "", "x", "y", "z", "zx", "zyx", "xyzzy", and so on. (ab)* matches "", "ab", "abab", "ababab", and so on.

{m,n}  Matches the preceding element at least m and not more than n times. For example, a{3,5} matches only "aaa", "aaaa", and "aaaaa". This is not found in a few older instances of regexes. BRE mode requires \{m,n\}.

In [23]:
# use regular expressions to replace address,urls,phone numbers,symbols
#replace email address with emailaddr

processed=text_messages.replace(r'^(?:(?:[\w\.\-_]+@[\w\d]+(?:\.[\w]{2,6})+)[,;]?\s?)+$','emailaddr')

#replace urls with 'webaddress'
processed=processed.str.replace(r'^(http(s?)\:\/\/)*[0-9a-zA-Z]([-.\w]*[0-9a-zA-Z])*(:(0-9)*)*(\/?)([a-zA-Z0-9\-\.\?\,\'\/\\\+&amp;%\$#_]*)?$','webaddress')

#process money symbols with moneysmb

processed=processed.str.replace(r'£$\|','moneysmb')

#process 10 digit phone number with phone number

processed=processed.str.replace(r'([+]?\d[ ]?[(]?\d{3}[)]?[ ]?\d{2,3}[- ]?\d{2}[- ]?\d{2})','phonenumbr')

#process normal numbers with numbr

processed=processed.str.replace(r'\d+(\.\d)+?','numbr')

In [24]:
#remove punctuation
processed=processed.str.replace(r'[^\w\d\s]'," ")
#replace whitespace with single space
processed=processed.str.replace(r'\s+',' ')
#replace trailing whitespace
processed=processed.str.replace(r'^\s+|\s+?$',' ')

In [25]:
#change words to lower
pd.set_option('display.max_rows', None)
processed=processed.str.lower()
print(processed)

0       go until jurong point crazy available only in ...
1                                ok lar joking wif u oni 
2       free entry in 2 a wkly comp to win fa cup fina...
3            u dun say so early hor u c already then say 
4       nah i don t think he goes to usf he lives arou...
5       freemsg hey there darling it s been 3 week s n...
6       even my brother is not like to speak with me t...
7       as per your request melle melle oru minnaminun...
8       winner as a valued network customer you have b...
9       had your mobile 11 months or more u r entitled...
10      i m gonna be home soon and i don t want to tal...
11      six chances to win cash from 100 to 20 000 pou...
12      urgent you have won a 1 week free membership i...
13      i ve been searching for the right words to tha...
14                     i have a date on sunday with will 
15      xxxmobilemovieclub to use your credit click th...
16                                oh k i m watching here 
17      eh u r

In [26]:
from nltk.corpus import stopwords
stop_words=set(stopwords.words('english'))
processed=processed.apply(lambda x:' '.join(term for term in x.split() if term not in stop_words))

In [27]:
#Remove word stems using porter stemmer
ps=nltk.PorterStemmer()
processed=processed.apply(lambda x:' '.join(ps.stem(term) for term in x.split()))

In [28]:
pd.set_option('display.max_rows', None)
print(processed)

0       go jurong point crazi avail bugi n great world...
1                                   ok lar joke wif u oni
2       free entri 2 wkli comp win fa cup final tkt 21...
3                     u dun say earli hor u c alreadi say
4                    nah think goe usf live around though
5       freemsg hey darl 3 week word back like fun sti...
6           even brother like speak treat like aid patent
7       per request mell mell oru minnaminungint nurun...
8       winner valu network custom select receivea 900...
9       mobil 11 month u r entitl updat latest colour ...
10      gonna home soon want talk stuff anymor tonight...
11      six chanc win cash 100 20 000 pound txt csh11 ...
12      urgent 1 week free membership 100 000 prize ja...
13      search right word thank breather promis wont t...
14                                            date sunday
15      xxxmobilemovieclub use credit click wap link n...
16                                             oh k watch
17       eh u 

In [31]:
from nltk.tokenize import word_tokenize
all_words=[]

for message in processed:
    words=word_tokenize(message)
    for w in words:
        all_words.append(w)

all_words=nltk.FreqDist(all_words)

In [32]:
print('Number of words:',len(all_words))
print('Most common words:',all_words.most_common(15))

Number of words: 7007
Most common words: [('u', 1207), ('call', 679), ('2', 528), ('go', 456), ('get', 452), ('phonenumbr', 412), ('ur', 391), ('gt', 318), ('4', 316), ('lt', 316), ('come', 304), ('free', 284), ('day', 276), ('know', 275), ('ok', 274)]


In [33]:
#use 1500 most common words as features
word_features=list(all_words.keys())[:1500]

In [34]:
#define find features
def find_features(message):
    words=word_tokenize(message)
    features={}
    for word in word_features:
        features[word]=(word in words)
    return features
features=find_features(processed[0])
for key,value in features.items():
    if value==True:
        print(key)

go
jurong
point
crazi
avail
bugi
n
great
world
la
e
buffet
cine
got
amor
wat


In [43]:
messages=list(zip(processed,Y))
seed=1
np.random.seed=seed
np.random.shuffle(messages)

featuresets=[(find_features(text),label) for (text,label) in messages]

In [44]:
from sklearn import model_selection
train,test=model_selection.train_test_split(featuresets,test_size=0.25,random_state=seed)

In [45]:
print(len(train))
print(len(test))

4179
1393


## Scikit learn classifiers with NLTK

In [46]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report,accuracy_score,confusion_matrix

In [53]:
#Define models to train
names=['K nearest neighbors','Decision Trees','Random forest','Logistic Regression','SGD classifier','Naive Bayes','SVM linear']

classifier=[KNeighborsClassifier(),
            DecisionTreeClassifier(),
            RandomForestClassifier(),
            LogisticRegression(),
            SGDClassifier(max_iter=100),
            MultinomialNB(),
            SVC(kernel='linear')]
models=list(zip(names,classifier))


[('K nearest neighbors', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')), ('Decision Trees', DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')), ('Random forest', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_lea

In [55]:
#wrap models in nltk
from nltk.classify.scikitlearn import SklearnClassifier

for name,model in models:
    nltk_model=SklearnClassifier(model)
    nltk_model.train(train)
    accuracy=nltk.classify.accuracy(nltk_model,test)*100
    print('{}:Accuracy:{}'.format(name,accuracy))

K nearest neighbors:Accuracy:92.60588657573582
Decision Trees:Accuracy:97.84637473079684




Random forest:Accuracy:98.63603732950466




Logistic Regression:Accuracy:99.13854989231874
SGD classifier:Accuracy:99.13854989231874
Naive Bayes:Accuracy:98.7078248384781
SVM linear:Accuracy:99.28212491026561


In [57]:
#ensemble method -Voting classifier
from sklearn.ensemble import VotingClassifier
names=['K nearest neighbors','Decision Trees','Random forest','Logistic Regression','SGD classifier','Naive Bayes','SVM linear']

classifier=[KNeighborsClassifier(),
            DecisionTreeClassifier(),
            RandomForestClassifier(),
            LogisticRegression(),
            SGDClassifier(max_iter=100),
            MultinomialNB(),
            SVC(kernel='linear')]
models=list(zip(names,classifier))

nltk_ensemble=SklearnClassifier(VotingClassifier(estimators=models,voting='hard',n_jobs=-1))
nltk_ensemble.train(train)
accuracy=nltk.classify.accuracy(nltk_ensemble,test)*100
print('Ensemble method accuracy:',accuracy)

Ensemble method accuracy: 99.21033740129216


In [58]:
#make class label predictions for testing set
txt_features,labels=zip(*test)
prediction=nltk_ensemble.classify_many(txt_features)

In [61]:
#print confusion matrix  and a classification report
print(classification_report(labels,prediction))
pd.DataFrame(
    confusion_matrix(labels,prediction),
    index=[['actual','actual'],['ham','spam']],
    columns=[['predicted','predicted'],['ham','spam']])

              precision    recall  f1-score   support

           0       0.99      1.00      1.00      1205
           1       1.00      0.94      0.97       188

    accuracy                           0.99      1393
   macro avg       1.00      0.97      0.98      1393
weighted avg       0.99      0.99      0.99      1393



Unnamed: 0_level_0,Unnamed: 1_level_0,predicted,predicted
Unnamed: 0_level_1,Unnamed: 1_level_1,ham,spam
actual,ham,1205,0
actual,spam,11,177
