# Natural Language Processing for Text Classification with NLTK and Scikit-learn FOR SMS SPAM DETECTION

Libraries

In [1]:
import sys
import nltk
import pandas as pd
import numpy as np
import sklearn

 Load Datasets

In [2]:
df=pd.read_csv(r"C:\Users\SRKT\Desktop\SMSspam",sep='\t',header=None,encoding='utf-8')
df.head()

Unnamed: 0,0,1
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
#check class distribution
df[0].value_counts() # describe whic is spam or ham

ham     4825
spam     747
Name: 0, dtype: int64

Preprocessing Data

In [4]:
# Label Encoder convert class level to binary values, 1=spam / 0=ham
from sklearn.preprocessing import LabelEncoder
encoder=LabelEncoder()
y=encoder.fit_transform(df[0])
y[0:10]

array([0, 0, 1, 0, 0, 1, 0, 0, 1, 1])

In [5]:
#store the sms msg data
text_msg=df[1]
text_msg[:10]

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
5    FreeMsg Hey there darling it's been 3 week's n...
6    Even my brother is not like to speak with me. ...
7    As per your request 'Melle Melle (Oru Minnamin...
8    WINNER!! As a valued network customer you have...
9    Had your mobile 11 months or more? U R entitle...
Name: 1, dtype: object

In [6]:
# use Regular expression to replace email address,URL,phone no.,other no.

#replace email address with email
processed=text_msg.str.replace(r'^.+@[^\.].*\.[a-z]{2,}$','emailaddress')

#replace URL with webaddress
processed=processed.str.replace(r'^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$','webaddress')

#replace 10 digit phone no with phonenumber
processed=processed.str.replace(r'^\(?[\d]{3}\)?[\s-]?[\d]{3}[\s-]?[\d]{4}$','phonenumber')

#replace money symbol with moneysymb
processed=processed.str.replace(r'£|\$', 'moneysymb')

#replace numbers with numbrs
processed=processed.str.replace(r'£|\$', 'moneysymb')

In [7]:
#remove punctuation
processed=processed.str.replace(r'[^\w\d\s]',' ')

#replace whitespace betweens terms with single space
processed=processed.str.replace(r'\s+', ' ')

# Remove leading and trailing whitespace
processed = processed.str.replace(r'^\s+|\s+?$', '')

In [8]:
#chnage words to lowercase
processed=processed.str.lower()
processed[0:20]

0     go until jurong point crazy available only in ...
1                               ok lar joking wif u oni
2     free entry in 2 a wkly comp to win fa cup fina...
3           u dun say so early hor u c already then say
4     nah i don t think he goes to usf he lives arou...
5     freemsg hey there darling it s been 3 week s n...
6     even my brother is not like to speak with me t...
7     as per your request melle melle oru minnaminun...
8     winner as a valued network customer you have b...
9     had your mobile 11 months or more u r entitled...
10    i m gonna be home soon and i don t want to tal...
11    six chances to win cash from 100 to 20 000 pou...
12    urgent you have won a 1 week free membership i...
13    i ve been searching for the right words to tha...
14                    i have a date on sunday with will
15    xxxmobilemovieclub to use your credit click th...
16                               oh k i m watching here
17    eh u remember how 2 spell his name yes i d

In [9]:
#remove stopwords from text
from nltk.corpus import stopwords
stop_words=set(stopwords.words('english'))

processed=processed.apply(lambda x: ' '.join(term for term in x.split() if term not in stop_words))

#remove word stems using a Porter Stemmer
ps=nltk.PorterStemmer()
processed=processed.apply(lambda x: ' '.join(ps.stem(term) for term in x.split()))


In [10]:
processed[0:10]

0    go jurong point crazi avail bugi n great world...
1                                ok lar joke wif u oni
2    free entri 2 wkli comp win fa cup final tkt 21...
3                  u dun say earli hor u c alreadi say
4                 nah think goe usf live around though
5    freemsg hey darl 3 week word back like fun sti...
6        even brother like speak treat like aid patent
7    per request mell mell oru minnaminungint nurun...
8    winner valu network custom select receivea mon...
9    mobil 11 month u r entitl updat latest colour ...
Name: 1, dtype: object

Generate Tokenize

In [11]:
#Create a Bag of Words
from nltk.tokenize import word_tokenize
all_words=[]
for message in processed:
    words=word_tokenize(message)
    for w in words:
        all_words.append(w)
all_words=nltk.FreqDist(all_words)

In [12]:
print("total no of words",len(all_words))
print("most common words",all_words.most_common(15))

total no of words 7309
most common words [('u', 1207), ('call', 674), ('2', 519), ('go', 456), ('get', 451), ('ur', 391), ('4', 323), ('gt', 318), ('lt', 316), ('come', 304), ('ok', 293), ('free', 284), ('day', 276), ('know', 275), ('love', 266)]


In [54]:
#use 1500 most common words
word_feature=list(all_words.keys())[:1500]

In [19]:
#the find_feature function will determine which of the 1500 word features are contained in the review
def find_features(message):
    words=word_tokenize(message)
    feature={}
    for word in word_feature:
        feature[word]=(word in words)
    return feature

#example
features=find_features(processed[0])
for key, value in features.items():
    if value == True:
        print(key)

go
jurong
point
crazi
avail
bugi
n
great
world
la
e
buffet
cine
got
amor
wat


In [25]:
#now lets do for all message 
messages=zip(processed,y)

featuresets = [(find_features(text), label) for (text, label) in messages]

In [56]:
# we can split the featuresets into training and testing datasets using sklearn
from sklearn import model_selection

# split the data into training and testing datasets
training, testing = model_selection.train_test_split(featuresets, test_size = 0.25, random_state=1)

In [57]:
print(len(training))
print(len(testing))

4179
1393


In [58]:
# We can use sklearn algorithms in NLTK
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.svm import SVC

model = SklearnClassifier(SVC(kernel = 'linear'))

# train the model on the training data
model.train(training)

# and test on the testing dataset!
accuracy = nltk.classify.accuracy(model, testing)*100
print("SVC Accuracy: {}".format(accuracy))

SVC Accuracy: 98.49246231155779


Scikit-Learn Classifiers with NLTK

In [61]:
#we can use sklearn algorithm in nltk
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.svm import SVC

model=SklearnClassifier(SVC(kernel='linear'))
model.train(training)

#add test on testing datasets
accuracy=nltk.classify.accuracy(model,testing)*100

In [62]:
accuracy

98.49246231155779