IMPORT PACKAGES 

In [1]:
import sys
import sklearn
import nltk 
import pandas as pd 
import numpy as np

LOAD DATASET 



In [2]:
df = pd.read_table('SMSSpamCollection',header = None, encoding = 'utf-8')

  """Entry point for launching an IPython kernel.


In [3]:
print(df.info())
print(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
0    5572 non-null object
1    5572 non-null object
dtypes: object(2)
memory usage: 87.1+ KB
None
      0                                                  1
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [4]:
#class distribution 
classes = df[0]
print(classes.value_counts())

ham     4825
spam     747
Name: 0, dtype: int64


PREPROCESSING OF DATA 


In [6]:
#changing the class labels into binary values - ham : 0 & spam : 1
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
Y = encoder.fit_transform(classes)
print(Y[:5])

[0 0 1 0 0]


In [7]:
text_messages = df[1]
print(text_messages[:5])

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
Name: 1, dtype: object


In [9]:
#REGULAR EXPRESSION 
# use regular expressions to replace email addresses, URLs, phone numbers, other numbers

# Replace email addresses with 'email'
processed = text_messages.str.replace(r'^.+@[^\.].*\.[a-z]{2,}$',
                                 'emailaddress')

# Replace URLs with 'webaddress'
processed = processed.str.replace(r'^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$',
                                  'webaddress')

# Replace money symbols with 'moneysymb' (£ can by typed with ALT key + 156)
processed = processed.str.replace(r'£|\$', 'moneysymb')
    
# Replace 10 digit phone numbers (formats include paranthesis, spaces, no spaces, dashes) with 'phonenumber'
processed = processed.str.replace(r'^\(?[\d]{3}\)?[\s-]?[\d]{3}[\s-]?[\d]{4}$',
                                  'phonenumbr')
    
# Replace numbers with 'numbr'
processed = processed.str.replace(r'\d+(\.\d+)?', 'numbr')



In [10]:
# Remove punctuation
processed = processed.str.replace(r'[^\w\d\s]', ' ')

# Replace whitespace between terms with a single space
processed = processed.str.replace(r'\s+', ' ')

# Remove leading and trailing whitespace
processed = processed.str.replace(r'^\s+|\s+?$', '')

In [12]:
#Standardize 
processed = processed.str.lower()
print(processed[:5])

0    go until jurong point crazy available only in ...
1                              ok lar joking wif u oni
2    free entry in numbr a wkly comp to win fa cup ...
3          u dun say so early hor u c already then say
4    nah i don t think he goes to usf he lives arou...
Name: 1, dtype: object


In [13]:
#removing stop words 
from nltk.corpus import stopwords 
stop_words = set(stopwords.words("english"))
processed = processed.apply(lambda x: ' '.join(trem for trem in x.split() if trem not in stop_words))
print(processed[:10])

0    go jurong point crazy available bugis n great ...
1                              ok lar joking wif u oni
2    free entry numbr wkly comp win fa cup final tk...
3                  u dun say early hor u c already say
4               nah think goes usf lives around though
5    freemsg hey darling numbr week word back like ...
6       even brother like speak treat like aids patent
7    per request melle melle oru minnaminunginte nu...
8    winner valued network customer selected receiv...
9    mobile numbr months u r entitled update latest...
Name: 1, dtype: object


In [16]:
#Stemmer - PorterStemmer 

ps = nltk.PorterStemmer()

processed = processed.apply(lambda x : ' '.join(ps.stem(trem) for trem in x.split()))
print(processed[:10])

0    go jurong point crazi avail bugi n great world...
1                                ok lar joke wif u oni
2    free entri numbr wkli comp win fa cup final tk...
3                  u dun say earli hor u c alreadi say
4                 nah think goe usf live around though
5    freemsg hey darl numbr week word back like fun...
6        even brother like speak treat like aid patent
7    per request mell mell oru minnaminungint nurun...
8    winner valu network custom select receivea mon...
9    mobil numbr month u r entitl updat latest colo...
Name: 1, dtype: object


GENERATING FEATURES


In [24]:
from nltk.tokenize import word_tokenize 

all_words = []

for message in processed:
    words = word_tokenize(message)
    for w in word:
        all_words.append(w)

all_words = nltk.FreqDist(all_words)

In [25]:
print('Number of words : {}'.format(len(all_words)))
print('Top 15 common words : {}'.format(all_words.most_common(15)))

Number of words : 3
Top 15 common words : [('rofl', 5572), ('true', 5572), ('name', 5572)]


In [30]:
#Getting 1500 common words as features
word_features = list(all_words.keys())[:1500]

In [39]:
#The find_features function will determine which of the 1500 word features are contained in the review
def find_features(message):
    words = word_tokenize(message)
    
    features = {}
    
    for word in word_features:
        features[word] = (word in words)
    
    return features


In [41]:
#Lets do it for all the messages
messages = zip(processed,Y)

#Using seed for repoductivity 
#seed = 1
#np.random.seed = seed
#np.random.shuffle(messages)

#calling find_feature on each message to create a featureset 
featureset = [(find_features(text),label) for (text,label) in messages]

In [44]:
#Spliting into traning and testing data
from sklearn import model_selection
training , testing = model_selection.train_test_split(featureset,test_size = 0.25)

In [45]:
print('Training: ',len(training))
print('Testing:',len(testing))

Training:  4179
Testing: 1393


In [48]:

# We can use sklearn algorithms in NLTK
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Define models to train
names = ["K Nearest Neighbors", "Decision Tree", "Random Forest", "Logistic Regression", "SGD Classifier",
         "Naive Bayes", "SVM Linear"]

classifiers = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(),
    SGDClassifier(max_iter = 100),
    MultinomialNB(),
    SVC(kernel = 'linear')
]

models = zip(names, classifiers)

for name, model in models:
    nltk_model = SklearnClassifier(model)
    nltk_model.train(training)
    accuracy = nltk.classify.accuracy(nltk_model, testing)*100
    print("{} Accuracy: {}".format(name, accuracy))

K Nearest Neighbors Accuracy: 87.15003589375449
Decision Tree Accuracy: 87.07824838478105
Random Forest Accuracy: 87.07824838478105
Logistic Regression Accuracy: 87.07824838478105
SGD Classifier Accuracy: 87.07824838478105
Naive Bayes Accuracy: 87.07824838478105




SVM Linear Accuracy: 87.07824838478105


In [55]:
#Can use emsemle voting classifier for better training accuracy 