In [1]:
import nltk
import numpy as np
import pandas as pd
import sklearn


In [2]:
#Load the dataset

df=pd.read_table("SMSSpamCollection",encoding='utf-8',header=None)
df.head()

Unnamed: 0,0,1
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
#Check class distribution
classes=df[0]
print(classes.value_counts())

ham     4825
spam     747
Name: 0, dtype: int64


In [4]:
#Data Preprocessing

#COnvert classs names to binary values

from sklearn.preprocessing import LabelEncoder

encoder=LabelEncoder()

y=encoder.fit_transform(classes)

y

array([0, 0, 1, ..., 0, 0, 0])

In [5]:
#Store the sms message data

text_messages=df[1]
print(text_messages[:10])

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
5    FreeMsg Hey there darling it's been 3 week's n...
6    Even my brother is not like to speak with me. ...
7    As per your request 'Melle Melle (Oru Minnamin...
8    WINNER!! As a valued network customer you have...
9    Had your mobile 11 months or more? U R entitle...
Name: 1, dtype: object


In [6]:
# use regular expressions to replace email addresses, URLs, phone numbers, other numbers

# Replace email addresses with 'email'
processed = text_messages.str.replace(r'^.+@[^\.].*\.[a-z]{2,}$',
                                 'emailaddress')

# Replace URLs with 'webaddress'
processed = processed.str.replace(r'^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$',
                                  'webaddress')

# Replace money symbols with 'moneysymb' (£ can by typed with ALT key + 156)
processed = processed.str.replace(r'£|\$', 'moneysymb')
    
# Replace 10 digit phone numbers (formats include paranthesis, spaces, no spaces, dashes) with 'phonenumber'
processed = processed.str.replace(r'^\(?[\d]{3}\)?[\s-]?[\d]{3}[\s-]?[\d]{4}$',
                                  'phonenumbr')
    
# Replace numbers with 'numbr'
processed = processed.str.replace(r'\d+(\.\d+)?', 'numbr')

In [7]:
# Remove punctuation
processed = processed.str.replace(r'[^\w\d\s]', ' ')

# Replace whitespace between terms with a single space
processed = processed.str.replace(r'\s+', ' ')

# Remove leading and trailing whitespace
processed = processed.str.replace(r'^\s+|\s+?$', '')

In [8]:
#Change words to lowercase HELLO hello HelLo are the same

processed=processed.str.lower()
processed

0       go until jurong point crazy available only in ...
1                                 ok lar joking wif u oni
2       free entry in numbr a wkly comp to win fa cup ...
3             u dun say so early hor u c already then say
4       nah i don t think he goes to usf he lives arou...
                              ...                        
5567    this is the numbrnd time we have tried numbr c...
5568                  will ü b going to esplanade fr home
5569    pity was in mood for that so any other suggest...
5570    the guy did some bitching but i acted like i d...
5571                            rofl its true to its name
Name: 1, Length: 5572, dtype: object

In [9]:
#Remove stopwords

from nltk.corpus import stopwords

stop_words=set(stopwords.words('english'))

processed=processed.apply(lambda x:' '.join(term for term in x.split() if term not in stop_words))


In [10]:
#Remove word stems using porterstemmer

ps=nltk.PorterStemmer()

processed=processed.apply(lambda x: ' '.join(ps.stem(term) for term in x.split()))

In [11]:
from nltk.tokenize import word_tokenize

#Creating a bag-of-words

all_words=[]

for message in processed:
    words=word_tokenize(message)
    for w in words:
        all_words.append(w)
        
all_words=nltk.FreqDist(all_words)


In [12]:
#Print the total number of words and 15 common words

print('NUmber of words:{}'.format(len(all_words)))
print('MOst common words:{}'.format(all_words.most_common(15)))

NUmber of words:6579
MOst common words:[('numbr', 2648), ('u', 1207), ('call', 674), ('go', 456), ('get', 451), ('ur', 391), ('gt', 318), ('lt', 316), ('come', 304), ('moneysymbnumbr', 303), ('ok', 293), ('free', 284), ('day', 276), ('know', 275), ('love', 266)]


In [26]:
#Use 1500 most common words as features

word_features=list(all_words.keys())[:1500]


In [28]:
#Defining a find features function

def find_features(message):
    words=word_tokenize(message)
    features={}
    for word in word_features:
        features[word]=(word in words)
    return features

In [30]:
#Example
features=find_features(processed[1])
for key,value in features.items():
    if value==True:
        print(key)

ok
lar
joke
wif
u
oni


In [32]:
features

{'go': False,
 'jurong': False,
 'point': False,
 'crazi': False,
 'avail': False,
 'bugi': False,
 'n': False,
 'great': False,
 'world': False,
 'la': False,
 'e': False,
 'buffet': False,
 'cine': False,
 'got': False,
 'amor': False,
 'wat': False,
 'ok': True,
 'lar': True,
 'joke': True,
 'wif': True,
 'u': True,
 'oni': True,
 'free': False,
 'entri': False,
 'numbr': False,
 'wkli': False,
 'comp': False,
 'win': False,
 'fa': False,
 'cup': False,
 'final': False,
 'tkt': False,
 'numbrst': False,
 'may': False,
 'text': False,
 'receiv': False,
 'question': False,
 'std': False,
 'txt': False,
 'rate': False,
 'c': False,
 'appli': False,
 'numbrovernumbr': False,
 'dun': False,
 'say': False,
 'earli': False,
 'hor': False,
 'alreadi': False,
 'nah': False,
 'think': False,
 'goe': False,
 'usf': False,
 'live': False,
 'around': False,
 'though': False,
 'freemsg': False,
 'hey': False,
 'darl': False,
 'week': False,
 'word': False,
 'back': False,
 'like': False,
 'fun': 

In [36]:
#Find features for all messages
messages=list(zip(processed,y))

#Define a seed for reproducability

seed=1
np.random.seed=seed
np.random.shuffle(messages)

#Call find features fn for each message

featuresets=[(find_features(text),label) for (text,label) in messages]

In [38]:
#Split training and testing datasets using sklearn

from sklearn.model_selection import train_test_split

training,testing=train_test_split(featuresets,test_size=0.2,random_state=seed)

In [41]:
print('Testing:{}'.format(len(testing)))
print('Training:{}'.format(len(training)))


Testing:1115
Training:4457


In [43]:
#Scikit learn classifiers with nltk

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report,accuracy_score,confusion_matrix



In [46]:
# Define models to train
names = ["K Nearest Neighbors", "Decision Tree", "Random Forest", "Logistic Regression",
         "Naive Bayes", "SVM Linear"]

classifiers = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(),
    MultinomialNB(),
    SVC(kernel = 'linear')
]

models=list(zip(names,classifiers))
print(models)

[('K Nearest Neighbors', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')), ('Decision Tree', DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')), ('Random Forest', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))

In [51]:
#Wrap models in NLTK

from nltk.classify.scikitlearn import SklearnClassifier

for name,model in models:
    nltk_model=SklearnClassifier(model)
    nltk_model.train(training)
    accuracy=nltk.classify.accuracy(nltk_model,testing)*100
    print('{} :Accuracy: {} '.format(name,accuracy))

K Nearest Neighbors :Accuracy: 94.97757847533632 
Decision Tree :Accuracy: 97.30941704035875 
Random Forest :Accuracy: 97.9372197309417 
Logistic Regression :Accuracy: 98.38565022421525 
Naive Bayes :Accuracy: 98.11659192825111 
SVM Linear :Accuracy: 98.38565022421525 
