In [3]:
import pandas as pd
import numpy as np

# load the dataset of sms messages
df = pd.read_table('smsspamcollection\SMSSpamCollection', header=None, encoding='utf-8')

  """


In [4]:
print(df.info())
print(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
0    5572 non-null object
1    5572 non-null object
dtypes: object(2)
memory usage: 87.1+ KB
None
      0                                                  1
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [5]:
# Check class distribution
classes = df[0]
print(classes.value_counts())

ham     4825
spam     747
Name: 0, dtype: int64


In [8]:
# Preprocess the data
# convert class labels to binary values 0 -ham; 1 -spam

from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
y = encoder.fit_transform(classes)

print(classes[:10])
print(y[:10])

0     ham
1     ham
2    spam
3     ham
4     ham
5    spam
6     ham
7     ham
8    spam
9    spam
Name: 0, dtype: object
[0 0 1 0 0 1 0 0 1 1]


In [9]:
# Store the SMS message data
text_messages = df[1]
print(text_messages[:10])

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
5    FreeMsg Hey there darling it's been 3 week's n...
6    Even my brother is not like to speak with me. ...
7    As per your request 'Melle Melle (Oru Minnamin...
8    WINNER!! As a valued network customer you have...
9    Had your mobile 11 months or more? U R entitle...
Name: 1, dtype: object


In [18]:
# Check for regular expressions tutorial: https://www.geeksforgeeks.org/write-regular-expressions
# To try online use: regexlib.com


# Use regular expressions to replace all email addresses, urls, phone numbers, other numbers and symbols

# Replace email addresses with 'emailaddr'

processed = text_messages.str.replace(r'^.+@[^\.].*\.[a-z]{2,}$', 'emailaddr')

# replace urls with 'webaddr'
processed = processed.str.replace(r'^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$', 'webaddr')

# replace money symbols with 'moneysymb' $/£
processed = processed.str.replace(r'£|\$', 'moneysymb')

# replace phone numbers with 'phonenumber'
processed = processed.str.replace(r'^\(?[\d]{3}\)?[\s-]?[\d]{3}[\s-]?[\d]{4}$', 'phonenumber')

# replace other numbers with 'numbr'
processed = processed.str.replace(r'\d+(\.\d+)?', 'numbr')


In [19]:
# remove punctuation
processed = processed.str.replace(r'[^\w\d\s]', ' ')

# remove white space between terms with a single space
processed = processed.str.replace(r'\s+|\s+?', ' ')

In [20]:
# change words to lowercase - Hello HELLO and hello all mean the same thing
processed = processed.str.lower()
print(processed)

0       go until jurong point crazy available only in ...
1                                ok lar joking wif u oni 
2       free entry in numbr a wkly comp to win fa cup ...
3            u dun say so early hor u c already then say 
4       nah i don t think he goes to usf he lives arou...
5       freemsg hey there darling it s been numbr week...
6       even my brother is not like to speak with me t...
7       as per your request melle melle oru minnaminun...
8       winner as a valued network customer you have b...
9       had your mobile numbr months or more u r entit...
10      i m gonna be home soon and i don t want to tal...
11      six chances to win cash from numbr to numbr nu...
12      urgent you have won a numbr week free membersh...
13      i ve been searching for the right words to tha...
14                     i have a date on sunday with will 
15      xxxmobilemovieclub to use your credit click th...
16                                oh k i m watching here 
17      eh u r

In [21]:
# remove stop words from text messages
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

processed = processed.apply(lambda x: ' '.join(term for term in x.split() if term not in stop_words))

In [22]:
import nltk
# remove word stems using a Porter stemmer
ps = nltk.PorterStemmer()

processed = processed.apply(lambda x: ' '.join(ps.stem(term) for term in x.split()))

In [24]:
print(processed)

0       go jurong point crazi avail bugi n great world...
1                                   ok lar joke wif u oni
2       free entri numbr wkli comp win fa cup final tk...
3                     u dun say earli hor u c alreadi say
4                    nah think goe usf live around though
5       freemsg hey darl numbr week word back like fun...
6           even brother like speak treat like aid patent
7       per request mell mell oru minnaminungint nurun...
8       winner valu network custom select receivea mon...
9       mobil numbr month u r entitl updat latest colo...
10      gonna home soon want talk stuff anymor tonight...
11      six chanc win cash numbr numbr numbr pound txt...
12      urgent numbr week free membership moneysymbnum...
13      search right word thank breather promis wont t...
14                                            date sunday
15      xxxmobilemovieclub use credit click wap link n...
16                                             oh k watch
17      eh u r

In [26]:
from nltk.tokenize import word_tokenize

all_words = []

for message in processed:
    tokens = word_tokenize(message)
    for token in tokens:
        all_words.append(token)
        
all_words = nltk.FreqDist(all_words)

In [27]:
print('Number of words: {}'.format(len(all_words)))
print('Most common words: {}'.format(all_words.most_common(15)))

Number of words: 6579
Most common words: [('numbr', 2648), ('u', 1207), ('call', 674), ('go', 456), ('get', 451), ('ur', 391), ('gt', 318), ('lt', 316), ('come', 304), ('moneysymbnumbr', 303), ('ok', 293), ('free', 284), ('day', 276), ('know', 275), ('love', 266)]


In [33]:
# use the most common 1500 words as features
word_features = list(all_words.keys())[:1500]

In [35]:
# define find_features function
def find_features(message):
    words = word_tokenize(message)
    features = {}
    for word in word_features:
        features[word] = (word in words)
        
    return features

features = find_features(processed[0])
for key, value in features.items():
    if value == True:
        print(key)

go
jurong
point
crazi
avail
bugi
n
great
world
la
e
buffet
cine
got
amor
wat


In [38]:
# find features for all messages
messages = zip(processed, y)

# define a seed for reproduceability
seed = 1
np.random.seed = seed
#np.random.shuffle(item for item in messages.items()) 
# shuffle is not working in python3

# call find_features method for each SMS message
featuresets = [(find_features(text), label) for (text, label) in messages]

In [39]:
from sklearn import model_selection

training, testing = model_selection.train_test_split(featuresets, test_size = 0.25, random_state=seed)

In [41]:
print('Trinaing: {}'.format(len(training)))
print('Testing: {}'.format(len(testing)))

Trinaing: 4179
Testing: 1393


In [42]:
# Scikit-Learn Classifiers with NLTK
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [51]:
# Define models to train
names = ['K Nearest Neighbors', 'Decision Tree', 'Random Forest', 'Logistic Regression', 'SGD Classifier', 
         'Naive Bayes', 'SVM Linear']

classifiers = [ KNeighborsClassifier(),
                DecisionTreeClassifier(),
                RandomForestClassifier(),
                LogisticRegression(),
                SGDClassifier(),
                MultinomialNB(),
                SVC()
              ]

models = zip (names, classifiers)
    
# wrap models in NLTK
from nltk.classify.scikitlearn import SklearnClassifier
    
for name, model in models:
    print(name)
    nltk_model = SklearnClassifier(model)
    nltk_model.train(training)
    accuracy = nltk.classify.accuracy(nltk_model, testing) * 100
    print('{}: Accuracy :{}'.format(name, accuracy))

K Nearest Neighbors
K Nearest Neighbors: Accuracy :94.11342426417804
Decision Tree
Decision Tree: Accuracy :97.77458722182341
Random Forest




Random Forest: Accuracy :98.49246231155779
Logistic Regression




Logistic Regression: Accuracy :98.63603732950466
SGD Classifier




SGD Classifier: Accuracy :98.49246231155779
Naive Bayes
Naive Bayes: Accuracy :98.63603732950466
SVM Linear




SVM Linear: Accuracy :87.00646087580762


In [56]:
# wrap models in NLTK
from sklearn.ensemble import VotingClassifier

# Define models to train
models = [ ('K Nearest Neighbors', KNeighborsClassifier()),
                ('Decision Tree', DecisionTreeClassifier()),
                ('Random Forest', RandomForestClassifier()),
                ('Logistic Regression', LogisticRegression()),
                ('SGD Classifier', SGDClassifier(max_iter=100)),
                ('Naive Bayes', MultinomialNB()),
                ('SVM Linear', SVC(kernel='linear'))
         ]

nltk_ensemble = SklearnClassifier(VotingClassifier(estimators=models, voting='hard', n_jobs=-1))
nltk_ensemble.train(training)
accuracy = nltk.classify.accuracy(nltk_ensemble, testing) * 100
print('Ensemble method accuracy: {}'.format(accuracy))

Ensemble method accuracy: 98.63603732950466


In [57]:
# make class label predictions
txt_features, labels = zip(*testing)

prediction = nltk_ensemble.classify_many(txt_features)

In [58]:
# print a confusion matrix and a classification report
print(classification_report(labels, prediction))

pd.DataFrame(
    confusion_matrix(labels, prediction), index = [['actual', 'actual'], ['ham', 'spam']], 
    columns = [['predicted', 'predicted'], ['ham', 'spam']]
    )

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      1208
           1       0.98      0.92      0.95       185

   micro avg       0.99      0.99      0.99      1393
   macro avg       0.98      0.96      0.97      1393
weighted avg       0.99      0.99      0.99      1393



Unnamed: 0_level_0,Unnamed: 1_level_0,predicted,predicted
Unnamed: 0_level_1,Unnamed: 1_level_1,ham,spam
actual,ham,1204,4
actual,spam,15,170
