In [1]:
import sys
import nltk
import sklearn
import numpy as np
import pandas as pd

In [2]:
print("Python: {}".format(sys.version))
print("NLTK: {}".format(nltk.__version__))
print("Scikit-learn: {}".format(sklearn.__version__))

Python: 3.7.4 (default, Aug  9 2019, 18:34:13) [MSC v.1915 64 bit (AMD64)]
NLTK: 3.4.5
Scikit-learn: 0.21.3


## 1. Load the Dataset

In [3]:
df=pd.read_table("SMSSpamCollection", header=None, encoding="utf-8")

In [4]:
print(df.info())
print(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
0    5572 non-null object
1    5572 non-null object
dtypes: object(2)
memory usage: 87.2+ KB
None
      0                                                  1
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [5]:
classes=df[0]
print(classes.value_counts())

ham     4825
spam     747
Name: 0, dtype: int64


## 2. Preprocess the data

In [6]:
from sklearn.preprocessing import LabelEncoder
encoder=LabelEncoder()
Y=encoder.fit_transform(classes)
print(Y[0:9])

[0 0 1 0 0 1 0 0 1]


In [7]:
text_messages=df[1]
print(text_messages[:10])

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
5    FreeMsg Hey there darling it's been 3 week's n...
6    Even my brother is not like to speak with me. ...
7    As per your request 'Melle Melle (Oru Minnamin...
8    WINNER!! As a valued network customer you have...
9    Had your mobile 11 months or more? U R entitle...
Name: 1, dtype: object


In [8]:
# replacing stuff like emails or phone numbers to a much general form.
processed=text_messages.str.replace(r'^.+@[^\].*[a-z]{2,}$', 'emailaddr')
processed=processed.str.replace(r'^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$','webaddress')
processed=processed.str.replace(r'£|\$','moneysymb')
processed=processed.str.replace(r'^\(?[\d]{3}\)?[\s-]?[\d]{3}[\s-]?[\d]{4}$','phonenumbr')
processed=processed.str.replace(r'\d+(\.\d+)?', 'numbr')

In [9]:
#remove punctutaion
processed=processed.str.replace(r'[^\w\d\s]', ' ')
#replacing whitespace with single space
processed=processed.str.replace(r'\s+', ' ')
# remove leading and traiiling whitespaces
processed=processed.str.replace(r'^\s+|\s+?$', '')

In [10]:
#change words to lowercase
processed=processed.str.lower()
print(processed)

0       go until jurong point crazy available only in ...
1                                 ok lar joking wif u oni
2       free entry in numbr a wkly comp to win fa cup ...
3             u dun say so early hor u c already then say
4       nah i don t think he goes to usf he lives arou...
                              ...                        
5567    this is the numbrnd time we have tried numbr c...
5568                  will ü b going to esplanade fr home
5569    pity was in mood for that so any other suggest...
5570    the guy did some bitching but i acted like i d...
5571                            rofl its true to its name
Name: 1, Length: 5572, dtype: object


In [11]:
# remove stop words from text messages
# nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
processed=processed.apply(lambda x: ' '.join(term for term in x.split() if term not in stop_words))

In [12]:
# remove word stems using porter stemmer
ps = nltk.PorterStemmer()
processed=processed.apply(lambda x:' '.join(ps.stem(term) for term in x.split()))

In [13]:
print(processed)

0       go jurong point crazi avail bugi n great world...
1                                   ok lar joke wif u oni
2       free entri numbr wkli comp win fa cup final tk...
3                     u dun say earli hor u c alreadi say
4                    nah think goe usf live around though
                              ...                        
5567    numbrnd time tri numbr contact u u moneysymbnu...
5568                              ü b go esplanad fr home
5569                                    piti mood suggest
5570    guy bitch act like interest buy someth els nex...
5571                                       rofl true name
Name: 1, Length: 5572, dtype: object


In [14]:
#nltk.download('punkt')
from nltk.tokenize import word_tokenize

#creating a bag-of-words
all_words=[]
for message in processed:
    words=word_tokenize(message)
    for w in words:
        all_words.append(w)
        
all_words = nltk.FreqDist(all_words)

In [15]:
# print total number of words and also 15 most commmon ones
print('Number of words: {}'.format(len(all_words)))
print('Most Common Words: {}'.format(all_words.most_common(15)))

Number of words: 6584
Most Common Words: [('numbr', 2648), ('u', 1207), ('call', 674), ('go', 456), ('get', 452), ('ur', 391), ('gt', 318), ('lt', 316), ('come', 304), ('moneysymbnumbr', 303), ('ok', 293), ('free', 284), ('day', 276), ('know', 275), ('love', 266)]


In [16]:
# use 1500 most common words as features
word_features= list(all_words.keys())[:1500]

In [17]:
def find_features(message):
    words=word_tokenize(message)
    features={}
    for word in word_features:
        features[word] = (word in words)
        
    return features

features=find_features(processed[0])
for key,value in features.items():
    if value == True:
        print(key)

go
jurong
point
crazi
avail
bugi
n
great
world
la
e
buffet
cine
got
amor
wat


In [18]:
print(processed[0])

go jurong point crazi avail bugi n great world la e buffet cine got amor wat


In [19]:
messages =zip(processed, Y)

seed =1
np.random.seed = seed
#np.random.shuffle(messages)

featuresets=[(find_features(text), label) for (text, label) in messages]

In [20]:
from sklearn.model_selection import train_test_split
training, testing =train_test_split(featuresets, test_size=0.25, random_state=seed)

In [21]:
print(len(training))
print(len(testing))

4179
1393


## 3.Scikit-learn Classifiers with NLTK

In [22]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression ,SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [23]:
names=['K Nearest Neighbors', 'Decision Tree', 'Random Forest', 'Logistic Regression', 'SGD Classifier', 'Naive Bayes', 'SVM linear']

classifier=[
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(),
    SGDClassifier(max_iter = 100),
    MultinomialNB(),
    SVC(kernel = 'linear')
]

models = zip(names, classifier)
print(models)

<zip object at 0x000002985EC57AC8>


In [24]:
# wrap models in NLTK
from nltk.classify.scikitlearn import SklearnClassifier

for name, model in models:
    nltk_model=SklearnClassifier(model)
    nltk_model.train(training)
    accuracy=nltk.classify.accuracy(nltk_model, testing) * 100
    print('{}: Accuracy:{}'.format(name, accuracy))

K Nearest Neighbors: Accuracy:94.75951184493898
Decision Tree: Accuracy:97.98994974874373




Random Forest: Accuracy:98.7078248384781




Logistic Regression: Accuracy:98.56424982053123
SGD Classifier: Accuracy:98.63603732950466
Naive Bayes: Accuracy:98.7078248384781
SVM linear: Accuracy:98.77961234745155


In [26]:
from sklearn.ensemble import VotingClassifier
names=['K Nearest Neighbors', 'Decision Tree', 'Random Forest', 'Logistic Regression', 'SGD Classifier', 'Naive Bayes', 'SVM linear']

classifier=[
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(),
    SGDClassifier(max_iter = 100),
    MultinomialNB(),
    SVC(kernel = 'linear')
]
models = list(zip(names, classifier))


nltk_ensemble=SklearnClassifier(VotingClassifier(estimators=models, voting='hard', n_jobs=-1))
nltk_ensemble.train(training)
accuracy=nltk.classify.accuracy(nltk_ensemble, testing)*100
print("Ensemble method accuracy: {}".format(accuracy))

Ensemble method accuracy: 98.92318736539842


In [27]:
txt_features, labels = zip(*testing)
prediction=nltk_ensemble.classify_many(txt_features)

In [29]:
print(classification_report(labels, prediction))



              precision    recall  f1-score   support

           0       0.99      1.00      0.99      1208
           1       0.99      0.93      0.96       185

    accuracy                           0.99      1393
   macro avg       0.99      0.96      0.98      1393
weighted avg       0.99      0.99      0.99      1393



In [32]:
pd.DataFrame(
    confusion_matrix(labels, prediction),
    index= [['actual', 'actual'], ['ham', 'spam']],
    columns=[['predicted', 'predicted'], ['ham', 'spam']]
)

Unnamed: 0_level_0,Unnamed: 1_level_0,predicted,predicted
Unnamed: 0_level_1,Unnamed: 1_level_1,ham,spam
actual,ham,1206,2
actual,spam,13,172
