In [66]:
import pandas as pd
import numpy as np

In [67]:
df = pd.read_csv("E:\\projects\\sms spam classification\\smsspamcollection\\SMSSpamCollection", sep = '\t', header = None, encoding = 'utf-8') 

In [68]:
print(df.head())

      0                                                  1
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [69]:
classes = df[0]
#print(classes)
print(classes.value_counts())

ham     4825
spam     747
Name: 0, dtype: int64


# Preprocess The Data

In [70]:
# Convert to binary classification 0=ham, 1=spam
from sklearn.preprocessing import LabelEncoder

In [71]:
encoder = LabelEncoder()
Y = encoder.fit_transform(classes)
print(classes[:10])
print(Y[:10])

0     ham
1     ham
2    spam
3     ham
4     ham
5    spam
6     ham
7     ham
8    spam
9    spam
Name: 0, dtype: object
[0 0 1 0 0 1 0 0 1 1]


In [72]:
# Store the text messages data
tm = df[1]
print(tm[:10])

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
5    FreeMsg Hey there darling it's been 3 week's n...
6    Even my brother is not like to speak with me. ...
7    As per your request 'Melle Melle (Oru Minnamin...
8    WINNER!! As a valued network customer you have...
9    Had your mobile 11 months or more? U R entitle...
Name: 1, dtype: object


In [73]:
# replace email's, url's, etc. because every sms can have unique mail address, url, etc.
# No help to learn for machine learning 
processed = tm.str.replace(r'^.+@[^\.].*\.[a-z]{2,}$','emailaddr')
processed = processed.str.replace(r'^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$', 'webaddress')
processed = processed.str.replace(r'£|\$', 'moneysymb')
processed = processed.str.replace(r'^\(?[\d]{3}\)?[\s-]?[\d]{3}[\s-]?[\d]{4}$', 'phonenumber')
processed = processed.str.replace(r'\d+(\.\d+)?','number')

In [74]:
processed = processed.str.replace(r'[^\w\d\s]',' ') # Punctuations
processed = processed.str.replace(r'\s+',' ') # whitespaces between terms with a single space
processed = processed.str.replace(r'^\s+|\s+?$','') # Remove leading and trailing whitespaces

In [75]:
# Change words to lowercase
processed = processed.str.lower()
print(processed)

0       go until jurong point crazy available only in ...
1                                 ok lar joking wif u oni
2       free entry in number a wkly comp to win fa cup...
3             u dun say so early hor u c already then say
4       nah i don t think he goes to usf he lives arou...
                              ...                        
5567    this is the numbernd time we have tried number...
5568                  will ü b going to esplanade fr home
5569    pity was in mood for that so any other suggest...
5570    the guy did some bitching but i acted like i d...
5571                            rofl its true to its name
Name: 1, Length: 5572, dtype: object


In [76]:
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

processed = processed.apply(lambda x: ' '.join(term for term in x.split() if term not in stop_words))

In [77]:
import nltk
ps = nltk.PorterStemmer()

processed = processed.apply(lambda x: ' '.join(ps.stem(term) for term in x.split()))

In [78]:
print(processed)

0       go jurong point crazi avail bugi n great world...
1                                   ok lar joke wif u oni
2       free entri number wkli comp win fa cup final t...
3                     u dun say earli hor u c alreadi say
4                    nah think goe usf live around though
                              ...                        
5567    numbernd time tri number contact u u moneysymb...
5568                              ü b go esplanad fr home
5569                                    piti mood suggest
5570    guy bitch act like interest buy someth els nex...
5571                                       rofl true name
Name: 1, Length: 5572, dtype: object


In [79]:
from nltk.tokenize import word_tokenize

In [80]:
# Creating bag-of-words
all_words = []
for msg in processed:
    words = word_tokenize(msg)
    for w in words:
        all_words.append(w)
        
all_words = nltk.FreqDist(all_words)

In [81]:
print('Number of Words: {}'.format(len(all_words)) )
print('Most Common Words: {}'.format(all_words.most_common(15)))

Number of Words: 6574
Most Common Words: [('number', 2759), ('u', 1207), ('call', 674), ('go', 456), ('get', 451), ('ur', 391), ('gt', 318), ('lt', 316), ('come', 304), ('moneysymbnumb', 303), ('ok', 293), ('free', 284), ('day', 276), ('know', 275), ('love', 266)]


In [82]:
# Using 1500 most common features
word_features = list(all_words.keys())[:1500]
print(word_features)

['go', 'jurong', 'point', 'crazi', 'avail', 'bugi', 'n', 'great', 'world', 'la', 'e', 'buffet', 'cine', 'got', 'amor', 'wat', 'ok', 'lar', 'joke', 'wif', 'u', 'oni', 'free', 'entri', 'number', 'wkli', 'comp', 'win', 'fa', 'cup', 'final', 'tkt', 'numberst', 'may', 'text', 'receiv', 'question', 'std', 'txt', 'rate', 'c', 'appli', 'numberovernumb', 'dun', 'say', 'earli', 'hor', 'alreadi', 'nah', 'think', 'goe', 'usf', 'live', 'around', 'though', 'freemsg', 'hey', 'darl', 'week', 'word', 'back', 'like', 'fun', 'still', 'tb', 'xxx', 'chg', 'send', 'moneysymbnumb', 'rcv', 'even', 'brother', 'speak', 'treat', 'aid', 'patent', 'per', 'request', 'mell', 'oru', 'minnaminungint', 'nurungu', 'vettam', 'set', 'callertun', 'caller', 'press', 'copi', 'friend', 'winner', 'valu', 'network', 'custom', 'select', 'receivea', 'prize', 'reward', 'claim', 'call', 'code', 'klnumber', 'valid', 'hour', 'mobil', 'month', 'r', 'entitl', 'updat', 'latest', 'colour', 'camera', 'co', 'gon', 'na', 'home', 'soon', 'wa

In [83]:
def find_features(msg):
    words = word_tokenize(msg)
    features = {}
    for word in word_features:
        features[word] = (word in words)
        
    return features

features = find_features(processed[0])
for key, value in features.items():
    if value == True:
        print(key)

go
jurong
point
crazi
avail
bugi
n
great
world
la
e
buffet
cine
got
amor
wat


In [84]:
messages = list(zip(processed, Y))

seed=1
np.random.seed = seed
np.random.shuffle(messages)

# find_features for each sms messages
featuresets = [(find_features(text), label) for (text, label) in messages]

In [85]:
print(featuresets[0])

({'go': False, 'jurong': False, 'point': False, 'crazi': False, 'avail': False, 'bugi': False, 'n': True, 'great': False, 'world': False, 'la': False, 'e': True, 'buffet': False, 'cine': False, 'got': False, 'amor': False, 'wat': False, 'ok': False, 'lar': False, 'joke': False, 'wif': False, 'u': True, 'oni': False, 'free': False, 'entri': False, 'number': False, 'wkli': False, 'comp': False, 'win': False, 'fa': False, 'cup': False, 'final': False, 'tkt': False, 'numberst': False, 'may': False, 'text': False, 'receiv': False, 'question': False, 'std': False, 'txt': False, 'rate': False, 'c': False, 'appli': False, 'numberovernumb': False, 'dun': False, 'say': False, 'earli': False, 'hor': False, 'alreadi': False, 'nah': False, 'think': False, 'goe': False, 'usf': False, 'live': False, 'around': False, 'though': False, 'freemsg': False, 'hey': False, 'darl': False, 'week': False, 'word': False, 'back': False, 'like': False, 'fun': False, 'still': False, 'tb': False, 'xxx': False, 'chg':

In [86]:
from sklearn.model_selection import train_test_split

In [87]:
train, test = train_test_split(featuresets, test_size = 0.25, random_state = seed)

In [88]:
print('Training: ', len(train))
print('Test: ', len(test))

Training:  4179
Test:  1393


In [89]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [90]:
names = ['K Nearest Neighbor', 'Decision Tree', 'Random Forest', 'Logistic Regression', 'SGD Classifier', 'Naive Bayes', 'SVM Linear']

classifiers = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(),
    SGDClassifier(max_iter=100),
    MultinomialNB(),
    SVC(kernel='linear')
]
models = list(zip(names, classifiers))

In [91]:
models

[('K Nearest Neighbor',
  KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                       metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                       weights='uniform')),
 ('Decision Tree',
  DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                         max_features=None, max_leaf_nodes=None,
                         min_impurity_decrease=0.0, min_impurity_split=None,
                         min_samples_leaf=1, min_samples_split=2,
                         min_weight_fraction_leaf=0.0, presort=False,
                         random_state=None, splitter='best')),
 ('Random Forest',
  RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                         max_depth=None, max_features='auto', max_leaf_nodes=None,
                         min_impurity_decrease=0.0, min_impurity_split=None,
                         min_samples_leaf=1, min_samples_split=2,
                    

In [96]:
from nltk.classify.scikitlearn import SklearnClassifier

for name, model in models:
    nltk_models = SklearnClassifier(model)
    nltk_models.train(train)
    accuracy = nltk.classify.accuracy(nltk_model, test)*100
    print('{}: Accuracy: {}'.format(name, accuracy))
    

K Nearest Neighbor: Accuracy: 98.85139985642498
Decision Tree: Accuracy: 98.85139985642498
Random Forest: Accuracy: 98.85139985642498




Logistic Regression: Accuracy: 98.85139985642498
SGD Classifier: Accuracy: 98.85139985642498
Naive Bayes: Accuracy: 98.85139985642498
SVM Linear: Accuracy: 98.85139985642498


In [97]:
txt_features, labels = zip(*test)

prediction = nltk_models.classify_many(txt_features)


In [99]:
# Print confusion matrix 
print(classification_report(labels, prediction))

pd.DataFrame(
    confusion_matrix(labels, prediction),
    index = [['actual', 'actual'],['ham', 'spam']],
    columns = [['predicted', 'predicted'],['ham', 'spam']]
) 

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      1216
           1       0.97      0.94      0.95       177

    accuracy                           0.99      1393
   macro avg       0.98      0.97      0.97      1393
weighted avg       0.99      0.99      0.99      1393



Unnamed: 0_level_0,Unnamed: 1_level_0,predicted,predicted
Unnamed: 0_level_1,Unnamed: 1_level_1,ham,spam
actual,ham,1211,5
actual,spam,11,166
