In [0]:
import sys
import nltk
import sklearn
import pandas as pd
import numpy as np


#1.Load the dataset


In [1]:

from google.colab import files
uploaded=files.upload()


Saving SMSSpamCollection to SMSSpamCollection


####Load the dataset of sms messages

In [0]:

df=pd.read_table('SMSSpamCollection',header=None,encoding='utf-8')

####print useful information about the dataset

In [5]:

print(df.info())
print(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
0    5572 non-null object
1    5572 non-null object
dtypes: object(2)
memory usage: 87.1+ KB
None
      0                                                  1
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [7]:
#check class distribution
classes=df[0]
print(classes.value_counts())

ham     4825
spam     747
Name: 0, dtype: int64


#2.Preprocess the data


In [10]:
#Convert class labels to binary value; 0 for ham and 1 for spam
from sklearn.preprocessing import LabelEncoder

encoder=LabelEncoder()
Y=encoder.fit_transform(classes)



[0 0 1 0 0 1 0 0 1 1]


In [0]:
#store sms message data
text_messages=df[1]

In [0]:
#use regular expressions to replace email addresses,urls, phone numbers, other numbers, symbols

#replace email addresses with 'emailaddr'
processed=text_messages.str.replace(r'^.+@[^\.].*\.[a-z]{2,}$','emailaddr')

#replace URLs with 'webaddr'
processed=processed.str.replace(r'^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$','webaddr')

#replace 10 digit phone numbers with 'phonenumber'
processed=processed.str.replace(r'^\(?[\d]{3}\)?[\s-]?[\d]{3}[\s-]?[\d]{4}$','phonenumber')

#replace other number with 'numbr'
processed=processed.str.replace(r'\d+(\.\d+)?','numbr')

#replace money symbols with 'moneySymbol'
processed=processed.str.replace(r'£|\$','moneySymbol')

In [65]:
#remove punctuation
processed=processed.str.replace(r'[^\w\d\s]',' ')

#replace whitespace between terms with a single space
processed=processed.str.replace(r'\s+', ' ')

#Remove leading and trailing whotespace
processed = processed.str.replace(r'^\s+|\s+?$', '')

#change words to lower case
processed=processed.str.lower()

print(processed)

0       go until jurong point crazy available only in ...
1                                 ok lar joking wif u oni
2       free entry in numbr a wkly comp to win fa cup ...
3             u dun say so early hor u c already then say
4       nah i don t think he goes to usf he lives arou...
5       freemsg hey there darling it s been numbr week...
6       even my brother is not like to speak with me t...
7       as per your request melle melle oru minnaminun...
8       winner as a valued network customer you have b...
9       had your mobile numbr months or more u r entit...
10      i m gonna be home soon and i don t want to tal...
11      six chances to win cash from numbr to numbr nu...
12      urgent you have won a numbr week free membersh...
13      i ve been searching for the right words to tha...
14                      i have a date on sunday with will
15      xxxmobilemovieclub to use your credit click th...
16                                 oh k i m watching here
17      eh u r

In [66]:
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
# remove stop words from text messages

stop_words = set(stopwords.words('english'))

processed = processed.apply(lambda x: ' '.join(term for term in x.split() if term not in stop_words))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [0]:
#remove word stems using a Porter stemmer
ps=nltk.PorterStemmer()
processed=processed.apply(lambda x:' '.join(ps.stem(term) for term in x.split()))

#3.Generating features


In [68]:
from nltk.tokenize import word_tokenize
nltk.download('punkt')
#create bag-of-words
all_words=[]
for message in processed:
  words=word_tokenize(message)
  for w in words:
    all_words.append(w)

    
all_words=nltk.FreqDist(all_words)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [69]:
#print total number of words and 15 most common words
print('Number of words: {}'.format(len(all_words)))
print('Most common words: {}'.format(all_words.most_common(15)))

Number of words: 6562
Most common words: [(u'numbr', 2961), (u'u', 1207), (u'call', 679), (u'go', 456), (u'get', 451), (u'ur', 391), (u'gt', 318), (u'lt', 316), (u'come', 304), (u'ok', 293), (u'free', 284), (u'day', 276), (u'know', 275), (u'love', 266), (u'like', 261)]


In [0]:
#use 1500 most common words as features
word_features=list(all_words.keys())[:1500]

In [71]:
#define a find feature function
def find_features(message):
  words=word_tokenize(message)
  features={}
  for word in word_features:
    features[word]=word in words
  
  return features

# lets see an example
features=find_features(processed[0])
for key, value in features.items():
  if value==True:
    print key
  

avail
buffet
world
great


In [0]:
#Now let's do it for all the messages
messages=zip(processed,Y)

#define a seed for reproducibility
seed=1
np.random.seed=seed
np.random.shuffle(messages)

#call find_features function for each of these SMS messages
featuresets = [(find_features(text), label) for (text, label) in messages]


In [73]:
#split featuresets into training and testing data using sklearn
from sklearn import model_selection
training,testing=model_selection.train_test_split(featuresets,test_size=0.25,random_state=seed)
print(len(training))
print(len(testing))

4179
1393


#4.Scikit-learn classifiers with nltk

In [0]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

#define models to train
names=["K Nearest Neighbors","Decision Tree","Random Forest","Logistic Regression","SGD Classifier","Naives Bayes","SVM Linear"]
classifiers=[KNeighborsClassifier(),DecisionTreeClassifier(),RandomForestClassifier(),LogisticRegression(),SGDClassifier(max_iter=100),MultinomialNB(),SVC(kernel='linear')]
models=zip(names,classifiers)


In [75]:
#wrap models in NLTK
from nltk.classify.scikitlearn import SklearnClassifier

for name,model in models:
  nltk_model=SklearnClassifier(model)
  nltk_model.train(training)
  accuracy=nltk.classify.accuracy(nltk_model,testing)*100
  print('{}: Accuracy :{}'.format(name,accuracy))

K Nearest Neighbors: Accuracy :93.3237616655
Decision Tree: Accuracy :94.544149318
Random Forest: Accuracy :94.472361809
Logistic Regression: Accuracy :94.4005743001
SGD Classifier: Accuracy :94.615936827
Naives Bayes: Accuracy :95.0466618808
SVM Linear: Accuracy :94.687724336


In [76]:
#ensemble method-Voting classifier
from sklearn.ensemble import VotingClassifier


models = zip(names, classifiers)

nltk_ensemble = SklearnClassifier(VotingClassifier(estimators = models, voting = 'hard', n_jobs = -1))
nltk_ensemble.train(training)
accuracy=nltk.classify.accuracy(nltk_model,testing)*100
print('Ensemble method Accuracy :{}'.format(accuracy))


Ensemble method Accuracy :94.687724336


In [0]:
#make class label prediction for testing set
txt_features,label=zip(*testing)

prediction=nltk_ensemble.classify_many(txt_features)

In [79]:
#print a confusion matrix and a classification report
print(classification_report(label,prediction))

pd.DataFrame(
    confusion_matrix(label, prediction),
    index = [['actual', 'actual'], ['ham', 'spam']],
    columns = [['predicted', 'predicted'], ['ham', 'spam']])


              precision    recall  f1-score   support

           0       0.95      0.99      0.97      1202
           1       0.93      0.68      0.78       191

   micro avg       0.95      0.95      0.95      1393
   macro avg       0.94      0.83      0.88      1393
weighted avg       0.95      0.95      0.95      1393



Unnamed: 0_level_0,Unnamed: 1_level_0,predicted,predicted
Unnamed: 0_level_1,Unnamed: 1_level_1,ham,spam
actual,ham,1193,9
actual,spam,62,129
