In [2]:
import sys
import nltk
import sklearn
import pandas
import numpy

print('Python:{}'.format(sys.version))
print('NLTK:{}'.format(nltk.__version__))
print('Sklearn:{}'.format(sklearn.__version__))
print('Pandas:{}'.format(pandas.__version__))
print('Numpy:{}'.format(numpy.__version__))

Python:3.6.4 |Anaconda, Inc.| (default, Jan 16 2018, 10:21:59) [MSC v.1900 32 bit (Intel)]
NLTK:3.2.5
Sklearn:0.19.1
Pandas:0.22.0
Numpy:1.14.0


!. Load the dataset

In [3]:
import pandas as pd
import numpy as np

#load the dataset of sms messages
df=pd.read_table('SMSSpamCollection',header=None,encoding='utf-8')

In [5]:
#print useful information about the dataset
print(df.info())
print(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
0    5572 non-null object
1    5572 non-null object
dtypes: object(2)
memory usage: 43.6+ KB
None
      0                                                  1
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [8]:
#check class distribution
classes=df[0]
print(classes.value_counts())

ham     4825
spam     747
Name: 0, dtype: int64


2.Preprocess The data

In [41]:
# convert calss tables to binary value, 0=ham, 1=spam

from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
Y=encoder.fit_transform(classes)

print(Y[:20])

[0 0 1 0 0 1 0 0 1 1 0 1 1 0 0 1 0 0 0 1]


In [14]:
#store the sms message data

text_message = df[1]
print(text_message[:10])

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
5    FreeMsg Hey there darling it's been 3 week's n...
6    Even my brother is not like to speak with me. ...
7    As per your request 'Melle Melle (Oru Minnamin...
8    WINNER!! As a valued network customer you have...
9    Had your mobile 11 months or more? U R entitle...
Name: 1, dtype: object


2.1 REGULAR EXPRESSIONS

^	Matches the starting position within the string. In line-based tools, it matches the starting position of any line.

.	Matches any single character (many applications exclude newlines, and exactly which characters are considered newlines is flavor-, character-encoding-, and platform-specific, but it is safe to assume that the line feed character is included). Within POSIX bracket expressions, the dot character matches a literal dot. For example, a.c matches "abc", etc., but [a.c] matches only "a", ".", or "c".

[ ]	A bracket expression. Matches a single character that is contained within the brackets. For example, [abc] matches "a", "b", or "c". [a-z] specifies a range which matches any lowercase letter from "a" to "z". These forms can be mixed: [abcx-z] matches "a", "b", "c", "x", "y", or "z", as does [a-cx-z].
The - character is treated as a literal character if it is the last or the first (after the ^, if present) character within the brackets: [abc-], [-abc]. Note that backslash escapes are not allowed. The ] character can be included in a bracket expression if it is the first (after the ^) character: []abc].
        
[^ ]	Matches a single character that is not contained within the brackets. For example, [^abc] matches any character other than "a", "b", or "c". [^a-z] matches any single character that is not a lowercase letter from "a" to "z". Likewise, literal characters and ranges can be mixed.

$	Matches the ending position of the string or the position just before a string-ending newline. In line-based tools, it matches the ending position of any line.

( )	Defines a marked subexpression. The string matched within the parentheses can be recalled later (see the next entry, \n). A marked subexpression is also called a block or capturing group. BRE mode requires \( \).

\n	Matches what the nth marked subexpression matched, where n is a digit from 1 to 9. This construct is vaguely defined in the POSIX.2 standard. Some tools allow referencing more than nine capturing groups.

*	Matches the preceding element zero or more times. For example, ab*c matches "ac", "abc", "abbbc", etc. [xyz]* matches "", "x", "y", "z", "zx", "zyx", "xyzzy", and so on. (ab)* matches "", "ab", "abab", "ababab", and so on.

{m,n}	Matches the preceding element at least m and not more than n times. For example, a{3,5} matches only "aaa", "aaaa", and "aaaaa". This is not found in a few older instances of regexes. BRE mode requires \{m,n\}.


In [19]:
#use regular expressions to replace email,url,phone number,other number,symbols

#replace email address with 'emailaddr'
processed=text_message.str.replace(r'^\w+@[a-zA-Z_]+?\.[a-zA-Z]{2,3}$','emailaddr')

#replace url address with 'webaddr'
processed=processed.str.replace(r'^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$','webaddr')

#replace money symbol with 'moneysym'
processed=processed.str.replace(r'£|\$','moneysym')

#replace 10 digit phone number with 'phonenum'
processed=processed.str.replace(r'^[2-9]\d{2}-\d{3}-\d{4}$','phonenum')

#replace normal numbers with 'numbr'
processed=processed.str.replace(r'^[0-9]+$','numbr')

#remove punctuations
processed=processed.str.replace(r'[^\w\d\s]',' ')

#replace dual white spaces between terms with a single space
processed=processed.str.replace(r'\s+',' ')
                                

In [None]:
#change words to lower case
processed=processed.str.lower()
print(processed)

In [22]:
#remove stopwords from text messages
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

processed= processed.apply(lambda x: ' '.join(term for term in x.split() if term not in stop_words))

In [None]:
print(processed)

In [25]:
#remove words stems using a Porter Stemmer
ps=nltk.PorterStemmer()

processed = processed.apply(lambda x: ' '.join(ps.stem(term) for term in x.split()))

In [None]:
print(processed)

In [29]:
from nltk.tokenize import word_tokenize

#creating a bag of words
all_words=[]

for message in processed:
    words=word_tokenize(message)
    for w in words:
        all_words.append(w)
        
        
all_words = nltk.FreqDist(all_words)

In [31]:
#print the total number of words and the 15 most common words

print('No of words:{}'.format(len(all_words)))
print('Most Common words:{}'.format(all_words.most_common(15)))

No of words:7315
Most Common words:[('u', 1207), ('call', 674), ('2', 519), ('go', 456), ('get', 452), ('ur', 391), ('4', 323), ('gt', 318), ('lt', 316), ('come', 304), ('ok', 293), ('free', 284), ('day', 276), ('know', 275), ('love', 266)]


In [32]:
#use the 1500 most common words as features
word_features = list(all_words.keys())[:1500]

In [35]:
#define a find_feature function
def find_feature(message):
    words = word_tokenize(message)
    features = {}
    for word in word_features:
        features[word]=(word in words)
    return features

#lets see an example
features = find_feature(processed[0])
for key,value in features.items():
    if value==True:
        print(key)

go
jurong
point
crazi
avail
bugi
n
great
world
la
e
buffet
cine
got
amor
wat


In [36]:
print(processed[0])

go jurong point crazi avail bugi n great world la e buffet cine got amor wat


In [46]:
#find_feature for all messages
messages = list(zip(processed,Y))

#define a seed for reproduciblity
seed=1
np.random.seed=seed
np.random.shuffle(messages)

#call find_features function for each SMS messages
featuresets=[(find_feature(text),label) for (text,label) in messages]

In [48]:
#split training and tetsing set using sklearn
from sklearn import model_selection

training ,testing = model_selection.train_test_split(featuresets,test_size=0.25,random_state=seed)

In [49]:
print('traning:{}'.format(len(training)))
print('testing:{}'.format(len(testing)))

traning:4179
testing:1393


Deploying Sklearn Classifier

In [54]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score,confusion_matrix

In [55]:
#define models to train
names=['K Nearest Neighbors','Decision Tree','Random Forest','Logistic Regression','SGD Classifier','Naive Bayes','SVM Linear']

classifier=[
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(),
    SGDClassifier(max_iter=100),
    MultinomialNB(),
    SVC(kernel='linear')
]

models=list(zip(names,classifier))
print(models)

[('K Nearest Neighbors', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')), ('Decision Tree', DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')), ('Random Forest', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))

In [57]:
#wrap models in nltk
from nltk.classify.scikitlearn import SklearnClassifier

for name,model in models:
    nltk_model=SklearnClassifier(model)
    nltk_model.train(training)
    accuracy=nltk.classify.accuracy(nltk_model,testing)*100
    print('{}:Accuracy:{}'.format(name,accuracy))

K Nearest Neighbors:Accuracy:92.17516152189519
Decision Tree:Accuracy:96.8413496051687
Random Forest:Accuracy:97.77458722182341
Logistic Regression:Accuracy:98.49246231155779
SGD Classifier:Accuracy:98.20531227566404
Naive Bayes:Accuracy:98.49246231155779
SVM Linear:Accuracy:98.27709978463747


In [58]:
#ensemble method-voting classifier
from sklearn.ensemble import VotingClassifier

nltk_ensemble=SklearnClassifier(VotingClassifier(estimators=models,voting='hard',n_jobs=-1))
nltk_ensemble.train(training)
accuracy=nltk.classify.accuracy(nltk_ensemble,testing)*100
print('Ensemble Method Accuracy:{}'.format(accuracy))

Ensemble Method Accuracy:98.49246231155779


  if diff:


In [59]:
#make class label prediction for testing data
txt_features, labels = zip(*testing)

prediction = nltk_ensemble.classify_many(txt_features)

  if diff:


In [62]:
#print a confusion matrix and a classification report

print(classification_report(labels,prediction))

pd.DataFrame(
    confusion_matrix(labels,prediction),
    index=[['actual','actual'],['ham','spam']],
    columns=[['predicted','predicted'],['ham','spam']]
)

             precision    recall  f1-score   support

          0       0.99      1.00      0.99      1208
          1       0.98      0.90      0.94       185

avg / total       0.98      0.98      0.98      1393



Unnamed: 0_level_0,Unnamed: 1_level_0,predicted,predicted
Unnamed: 0_level_1,Unnamed: 1_level_1,ham,spam
actual,ham,1205,3
actual,spam,18,167
