In [1]:
import pandas as pd
import numpy as np
import sys
import sklearn
import nltk

### The dataset is taking from UCI Machine Learning repository.It contains over 5000 SMS labeled messages that have been collected for mobile phone spam research.link of dataset:https://archive.ics.uci.edu/ml/datasets/sms+spam+collection

In [2]:
df=pd.read_table("SMSSpamCollection",header=None,encoding="utf-8")

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
0    5572 non-null object
1    5572 non-null object
dtypes: object(2)
memory usage: 87.1+ KB


In [6]:
df.head()

Unnamed: 0,0,1
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
myclass=df[0]

In [5]:
myclass.value_counts()

ham     4825
spam     747
Name: 0, dtype: int64

### As we observe there is two classes of dataset, ham (total meassages=4825)and spam (total meassages=747)

## Preprocess data

### Preprocessing the data is an essential step in natural language process. In the following cells, we will convert our class labels to binary values using the LabelEncoder from sklearn, replace email addresses, URLs, phone numbers, and other symbols by using regular expressions, remove stop words, and extract word stems.

In [6]:
#convrting class labels into binary values,0=ham and 1=spam
from sklearn.preprocessing import LabelEncoder

In [7]:
encoder=LabelEncoder()
y=encoder.fit_transform(myclass)

In [8]:
print(y[:10])

[0 0 1 0 0 1 0 0 1 1]


In [9]:
# varibale used for storing the sms messages data
text_message=df[1]

In [10]:
text_message[:10]

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
5    FreeMsg Hey there darling it's been 3 week's n...
6    Even my brother is not like to speak with me. ...
7    As per your request 'Melle Melle (Oru Minnamin...
8    WINNER!! As a valued network customer you have...
9    Had your mobile 11 months or more? U R entitle...
Name: 1, dtype: object

## Use regular exp  to replace email addess,urls....

In [14]:
#replace email adress with "emailadder"
my_mess=text_message.str.replace(r"^\w+@[a-zA-Z_]+?\.[a-zA-Z]{2,3}$","emailadder")
#replace Url "webaddress" 
my_mess=text_message.str.replace(r"/^(?:([A-Za-z]+):)?(\/{0,3})([0-9.\-A-Za-z]+)(?::(\d+))?(?:\/([^?#]*))?(?:\?([^#]*))?(?:#(.*))?$/;","webaddress")
#replace number with "number"
my_mess=text_message.str.replace(r"^[0-9]","number")
#replace money with "moneysymbol"
my_mess=text_message.str.replace(r"^\$(\d{1,3}(\,\d{3})*|(\d+))(\.\d{2})?$","moneysymbol")
#replace 10 digit phone number with "contact"
my_mess=text_message.str.replace(r"^[2-9]{2}[0-9]{8}$","contact")

In [15]:
#remove punctuations and white spaces
my_mess=text_message.str.replace(r"\d+(\.\d+)?"," ")
my_mess=text_message.str.replace(r"\s+"," ")
my_mess=text_message.str.replace(r"^\s+?$","")

In [16]:
#change the words into lower case
my_mess=text_message.str.lower()
my_mess

0       go until jurong point, crazy.. available only ...
1                           ok lar... joking wif u oni...
2       free entry in 2 a wkly comp to win fa cup fina...
3       u dun say so early hor... u c already then say...
4       nah i don't think he goes to usf, he lives aro...
5       freemsg hey there darling it's been 3 week's n...
6       even my brother is not like to speak with me. ...
7       as per your request 'melle melle (oru minnamin...
8       winner!! as a valued network customer you have...
9       had your mobile 11 months or more? u r entitle...
10      i'm gonna be home soon and i don't want to tal...
11      six chances to win cash! from 100 to 20,000 po...
12      urgent! you have won a 1 week free membership ...
13      i've been searching for the right words to tha...
14                    i have a date on sunday with will!!
15      xxxmobilemovieclub: to use your credit, click ...
16                             oh k...i'm watching here:)
17      eh u r

In [21]:
from nltk.corpus import stopwords
#removing stop words from dataset
stop_words=set(stopwords.words("english"))
my_mess=my_mess.apply(lambda x: " ".join(term for term in x.split() if term not in stop_words ))

In [23]:
# removing stems using Porter Stemmer
from nltk import PorterStemmer
ps=PorterStemmer()
my_mess=my_mess.apply(lambda x:" ".join(ps.stem(term) for term in x.split()))

In [24]:
my_mess

0       go jurong point, crazy.. avail bugi n great wo...
1                             ok lar... joke wif u oni...
2       free entri 2 wkli comp win fa cup final tkt 21...
3               u dun say earli hor... u c alreadi say...
4                   nah think goe usf, live around though
5       freemsg hey darl 3 week' word back! i'd like f...
6       even brother like speak me. treat like aid pat...
7       per request 'mell mell (oru minnaminungint nur...
8       winner!! valu network custom select receivea £...
9       mobil 11 month more? u r entitl updat latest c...
10      i'm gonna home soon want talk stuff anymor ton...
11      six chanc win cash! 100 20,000 pound txt> csh1...
12      urgent! 1 week free membership £100,000 prize ...
13      i'v search right word thank breather. promis w...
14                                     date sunday will!!
15      xxxmobilemovieclub: use credit, click wap link...
16                                oh k...i'm watch here:)
17      eh u r

In [55]:
# Tokenizing the each words
from nltk.tokenize import word_tokenize
#creating the bag of words
all_words=[]
for message in my_mess:
    words=word_tokenize(message)
    for w in words:
        all_words.append(w)
all_words=nltk.FreqDist(all_words)        
    

In [26]:
#print totol number of words
print("Number of words:{}".format(len(all_words)))

Number of words:9313


In [28]:
#use most common words as features
word_features=list(all_words.keys())[:1500]

In [30]:
# The find_features function determined the 1500 words features are contained in the review
def find_features(message):
    words=word_tokenize(message)
    features={}
    for word in word_features:
        features[word]=(word in words)
        
    return features
features =find_features(processed[0])
for key,value in features.items():
    if value==True:
        print(key)
        

go
jurong
point
,
crazy..
n
great
world
la
e
buffet
...
cine
got
wat


In [32]:
# lets do it for all messages
messages=zip(my_mess, y)
seed=1
np.random.seed=seed
#np.random.shuffle(messages)
featuresets=[(find_features(text),label) for (text,label) in messages]

In [33]:
# spliting the features dataset into training dataset and testing dataset using Scikit-learn
from sklearn import model_selection
training,testing=model_selection.train_test_split(featuresets,test_size=0.25,random_state=seed)


In [34]:
#size of splited datasets
print(len(training))
print(len(testing))

4179
1393


## sklearn classifier with nltk

In [35]:
# We can use sklearn algorithms in NLTK
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.svm import SVC

model = SklearnClassifier(SVC(kernel = 'linear'))

# train the model on the training data
model.train(training)

# and test on the testing dataset!
accuracy = nltk.classify.accuracy(model, testing)*100
print("SVC Accuracy: {}".format(accuracy))

SVC Accuracy: 98.63603732950466


### Wow ! Model predict with 98.63% accuracy 

### Lets we chack other algorithms with hope of better accuracy

In [36]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report,accuracy_score,confusion_matrix

  from numpy.core.umath_tests import inner1d


In [49]:
#define models to train
names=["KN neighbor","Decision tree","Random forest","Logistic regression","SGD classifier","Naive bayes","SVM linear"]

classifiers=[
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(),
    SGDClassifier(max_iter=100),
    MultinomialNB(),
    SVC(kernel="linear")
    
    
]
models=zip(names,classifiers)


In [50]:
from nltk.classify.scikitlearn import SklearnClassifier
for name,model in models:
    nltk_model=SklearnClassifier(model)
    nltk_model.train(training)
    accuracy=nltk.classify.accuracy(nltk_model,testing)*100
    print(name,accuracy)
    

KN neighbor 92.24694903086863
Decision tree 96.33883704235463
Random forest 97.77458722182341
Logistic regression 98.49246231155779
SGD classifier 98.06173725771716
Naive bayes 97.98994974874373
SVM linear 98.63603732950466


### As observe SVM linear predict with highest accuracy 98.63%  and  KN neighbor predict least accuracy 92.24% in all above algorithms

# Lets try ensemble method for more better accuracy

In [51]:
#ensemble method-volting classifier
from sklearn.ensemble import VotingClassifier
# define model to train
names=["KN neighbors","Decision tree","Random forest","Logistic regression","SGD classifier","Naive bayes","SVM linear"]
classifiers=[
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(),
    SGDClassifier(max_iter=100),
    MultinomialNB(),
    SVC(kernel="linear")
] 
models=list(zip(names,classifiers))

In [52]:
nltk_ensemble=SklearnClassifier(VotingClassifier(estimators=models,voting ="hard",n_jobs=-1))
nltk_ensemble.train(training)
accuracy=nltk.classify.accuracy(nltk_ensemble,testing)*100
print(accuracy)

98.63603732950466


  if diff:


### With ensemble method ,we also got 98.63% accuracy 

In [53]:
#make class label prediction for testing report
txt_features,labels=zip(*testing)
prediction=nltk_ensemble.classify_many(txt_features)


  if diff:


In [54]:
#print a confusion matrics and classification report
print(classification_report(labels,prediction))
pd.DataFrame(
    confusion_matrix(labels,prediction),
    index=[["actual","actual"],["ham","spam"]],
    columns=[["predicted","predicted"],["ham","spam"]]
)

             precision    recall  f1-score   support

          0       0.99      1.00      0.99      1208
          1       0.99      0.91      0.95       185

avg / total       0.99      0.99      0.99      1393



Unnamed: 0_level_0,Unnamed: 1_level_0,predicted,predicted
Unnamed: 0_level_1,Unnamed: 1_level_1,ham,spam
actual,ham,1206,2
actual,spam,17,168
