In [4]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from textblob import TextBlob
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix

In [11]:
data = pd.read_csv('D:\\Analytics\\Python ML\\Naive Bayes\\SMSSpamDataset',sep='\t',names=["label", "message"])
data.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [13]:
data.shape

(5572, 2)

In [14]:
data.describe()

Unnamed: 0,label,message
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


In [17]:
data.groupby('label').describe()

Unnamed: 0_level_0,message,message,message,message
Unnamed: 0_level_1,count,unique,top,freq
label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,653,Please call our customer service representativ...,4


In [19]:
data['label'].head()

0     ham
1     ham
2    spam
3     ham
4     ham
Name: label, dtype: object

In [20]:
# Power of textblob
# writing a function to split the words. To define it as tokens

def split_into_tokens(message):
    message = str(message)
    return TextBlob(message).words

In [23]:
#data.message.head()
data['message'].head()

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
Name: message, dtype: object

In [31]:
#applying the split_into_tokens() - function to message column
data['message'].apply(split_into_tokens)

0       [Go, until, jurong, point, crazy, Available, o...
1                          [Ok, lar, Joking, wif, u, oni]
2       [Free, entry, in, 2, a, wkly, comp, to, win, F...
3       [U, dun, say, so, early, hor, U, c, already, t...
4       [Nah, I, do, n't, think, he, goes, to, usf, he...
5       [FreeMsg, Hey, there, darling, it, 's, been, 3...
6       [Even, my, brother, is, not, like, to, speak, ...
7       [As, per, your, request, 'Melle, Melle, Oru, M...
8       [WINNER, As, a, valued, network, customer, you...
9       [Had, your, mobile, 11, months, or, more, U, R...
10      [I, 'm, gon, na, be, home, soon, and, i, do, n...
11      [SIX, chances, to, win, CASH, From, 100, to, 2...
12      [URGENT, You, have, won, a, 1, week, FREE, mem...
13      [I, 've, been, searching, for, the, right, wor...
14             [I, HAVE, A, DATE, ON, SUNDAY, WITH, WILL]
15      [XXXMobileMovieClub, To, use, your, credit, cl...
16                         [Oh, k, i, 'm, watching, here]
17      [Eh, u

In [32]:
data['message'].apply(split_into_tokens).head()

0    [Go, until, jurong, point, crazy, Available, o...
1                       [Ok, lar, Joking, wif, u, oni]
2    [Free, entry, in, 2, a, wkly, comp, to, win, F...
3    [U, dun, say, so, early, hor, U, c, already, t...
4    [Nah, I, do, n't, think, he, goes, to, usf, he...
Name: message, dtype: object

In [54]:
#For example a sentence with pos
TextBlob("Hello world, how is it going?").tags

[('Hello', 'NNP'),
 ('world', 'NN'),
 ('how', 'WRB'),
 ('is', 'VBZ'),
 ('it', 'PRP'),
 ('going', 'VBG')]

In [53]:
TextBlob(str(data['message'].head(2))).tags

[('0', 'CD'),
 ('Go', 'NNP'),
 ('until', 'IN'),
 ('jurong', 'JJ'),
 ('point', 'NN'),
 ('crazy..', 'NN'),
 ('Available', 'JJ'),
 ('only', 'RB'),
 ('1', 'CD'),
 ('Ok', 'NNP'),
 ('lar', 'NN'),
 ('Joking', 'VBG'),
 ('wif', 'NN'),
 ('u', 'JJ'),
 ('oni', 'NN'),
 ('Name', 'NN'),
 ('message', 'NN'),
 ('dtype', 'NN'),
 ('object', 'NN')]

In [55]:
# function to split words depending on lemmas
def split_into_lemmas(message):
    message = str(message).lower()
    words = TextBlob(message).words
    # for each word, take its "base form" = lemma
    return [each_word.lemma for each_word in words]

#Better. You can probably think of many more ways to improve the preprocessing: decoding HTML entities (those &amp;
#and &lt; we saw above); filtering out stop words (pronouns etc); adding more features, 
#such as an word-in-all-caps indicator and so on.

In [56]:
data['message'].apply(split_into_lemmas)

0       [go, until, jurong, point, crazy, available, o...
1                          [ok, lar, joking, wif, u, oni]
2       [free, entry, in, 2, a, wkly, comp, to, win, f...
3       [u, dun, say, so, early, hor, u, c, already, t...
4       [nah, i, do, n't, think, he, go, to, usf, he, ...
5       [freemsg, hey, there, darling, it, 's, been, 3...
6       [even, my, brother, is, not, like, to, speak, ...
7       [a, per, your, request, 'melle, melle, oru, mi...
8       [winner, a, a, valued, network, customer, you,...
9       [had, your, mobile, 11, month, or, more, u, r,...
10      [i, 'm, gon, na, be, home, soon, and, i, do, n...
11      [six, chance, to, win, cash, from, 100, to, 20...
12      [urgent, you, have, won, a, 1, week, free, mem...
13      [i, 've, been, searching, for, the, right, wor...
14             [i, have, a, date, on, sunday, with, will]
15      [xxxmobilemovieclub, to, use, your, credit, cl...
16                         [oh, k, i, 'm, watching, here]
17      [eh, u

In [57]:
data['message'].apply(split_into_lemmas).head()

0    [go, until, jurong, point, crazy, available, o...
1                       [ok, lar, joking, wif, u, oni]
2    [free, entry, in, 2, a, wkly, comp, to, win, f...
3    [u, dun, say, so, early, hor, u, c, already, t...
4    [nah, i, do, n't, think, he, go, to, usf, he, ...
Name: message, dtype: object

In [66]:
#Now converting strings to vectors using countvectorizer and tfidftransformer
#Use %time before the statement for checking time taken
#we are creating a vocablury from data['message'] along with their column-index
bow_transformer = CountVectorizer(analyzer=split_into_lemmas).fit(data['message'])
#print(bow_transformer.vocabulary_['go'])
print(len(bow_transformer.vocabulary_))

8859


In [67]:
print(bow_transformer.vocabulary_)



In [71]:
# create a dummy message for checking the prediction
data1 = "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"

In [75]:
#change the dummy message into vectors
bow4 = bow_transformer.transform([data1])
print(bow4)
#Col1: Row Col2: Word-Col-index in vocablury Col3: Count of that word in the dummy message

  (0, 21)	2
  (0, 103)	1
  (0, 452)	1
  (0, 472)	1
  (0, 484)	1
  (0, 893)	1
  (0, 948)	1
  (0, 1272)	1
  (0, 1897)	1
  (0, 2263)	1
  (0, 2476)	1
  (0, 3042)	2
  (0, 3173)	2
  (0, 3283)	1
  (0, 3433)	1
  (0, 4179)	1
  (0, 5019)	1
  (0, 6287)	1
  (0, 6340)	1
  (0, 6398)	1
  (0, 7300)	1
  (0, 7523)	1
  (0, 7637)	1
  (0, 7790)	1
  (0, 7801)	3
  (0, 7985)	1
  (0, 8470)	1
  (0, 8510)	1


In [83]:
#transforming all the messages into vectors
data_bow = bow_transformer.transform(data['message'])
print(data_bow)

  (0, 1189)	1
  (0, 1411)	1
  (0, 1847)	1
  (0, 1849)	1
  (0, 2156)	1
  (0, 2418)	1
  (0, 2908)	1
  (0, 3628)	1
  (0, 3673)	1
  (0, 3710)	1
  (0, 4179)	1
  (0, 4450)	1
  (0, 4597)	1
  (0, 5315)	1
  (0, 5654)	1
  (0, 6021)	1
  (0, 7693)	1
  (0, 8090)	1
  (0, 8334)	1
  (0, 8548)	1
  (1, 4421)	1
  (1, 4631)	1
  (1, 5618)	1
  (1, 5650)	1
  (1, 8002)	1
  :	:
  (5570, 1882)	1
  (5570, 2686)	1
  (5570, 2987)	1
  (5570, 3382)	1
  (5570, 3433)	1
  (5570, 3547)	1
  (5570, 3766)	1
  (5570, 3858)	1
  (5570, 4098)	2
  (5570, 4179)	1
  (5570, 4252)	1
  (5570, 4310)	1
  (5570, 4722)	1
  (5570, 5417)	1
  (5570, 7117)	1
  (5570, 7126)	1
  (5570, 7673)	1
  (5570, 7801)	1
  (5570, 8002)	1
  (5570, 8377)	1
  (5571, 4310)	2
  (5571, 5334)	1
  (5571, 6595)	1
  (5571, 7801)	1
  (5571, 7939)	1


In [84]:
print(data_bow[0]) #vector values 

  (0, 1189)	1
  (0, 1411)	1
  (0, 1847)	1
  (0, 1849)	1
  (0, 2156)	1
  (0, 2418)	1
  (0, 2908)	1
  (0, 3628)	1
  (0, 3673)	1
  (0, 3710)	1
  (0, 4179)	1
  (0, 4450)	1
  (0, 4597)	1
  (0, 5315)	1
  (0, 5654)	1
  (0, 6021)	1
  (0, 7693)	1
  (0, 8090)	1
  (0, 8334)	1
  (0, 8548)	1


In [89]:
#Now applying tfidf to messages - vectors data
tfidf_transformer = TfidfTransformer().fit(data_bow)

#transforming all messages to tfidf
data_tfidf = tfidf_transformer.transform(data_bow)
print(data_tfidf)

#Here we are getting Tfidf value for each column/feature/lemma-word

  (0, 8548)	0.21319191519519087
  (0, 8334)	0.1787434254996649
  (0, 8090)	0.22406429043314388
  (0, 7693)	0.15215940657554947
  (0, 6021)	0.21715183499644708
  (0, 5654)	0.15318647535765337
  (0, 5315)	0.17079893890270292
  (0, 4597)	0.2686658868215042
  (0, 4450)	0.31802154557344775
  (0, 4179)	0.10475446843704307
  (0, 3710)	0.1766689019165984
  (0, 3673)	0.14742644203898994
  (0, 3628)	0.14095995379541812
  (0, 2908)	0.1885140858192191
  (0, 2418)	0.24628579544040377
  (0, 2156)	0.2686658868215042
  (0, 1849)	0.2686658868215042
  (0, 1847)	0.3035859407893093
  (0, 1411)	0.24182966437241885
  (0, 1189)	0.31802154557344775
  (1, 8456)	0.4233689861580337
  (1, 8002)	0.19064241079409208
  (1, 5650)	0.5361629383828198
  (1, 5618)	0.2696099118247978
  (1, 4631)	0.4005112552353877
  :	:
  (5570, 5417)	0.20694564879601926
  (5570, 4722)	0.15681683321451362
  (5570, 4310)	0.1125873302642354
  (5570, 4252)	0.28317596401115275
  (5570, 4179)	0.11041203606114545
  (5570, 4098)	0.15065981393088

In [90]:
print(data_tfidf[0])

  (0, 8548)	0.21319191519519087
  (0, 8334)	0.1787434254996649
  (0, 8090)	0.22406429043314388
  (0, 7693)	0.15215940657554947
  (0, 6021)	0.21715183499644708
  (0, 5654)	0.15318647535765337
  (0, 5315)	0.17079893890270292
  (0, 4597)	0.2686658868215042
  (0, 4450)	0.31802154557344775
  (0, 4179)	0.10475446843704307
  (0, 3710)	0.1766689019165984
  (0, 3673)	0.14742644203898994
  (0, 3628)	0.14095995379541812
  (0, 2908)	0.1885140858192191
  (0, 2418)	0.24628579544040377
  (0, 2156)	0.2686658868215042
  (0, 1849)	0.2686658868215042
  (0, 1847)	0.3035859407893093
  (0, 1411)	0.24182966437241885
  (0, 1189)	0.31802154557344775


In [92]:
print(data_tfidf.shape)

(5572, 8859)


In [98]:
#transforming dummy message to tfidf 
dummy_tfidf4 = tfidf_transformer.transform(bow4)
print(dummy_tfidf4)

  (0, 8510)	0.17971875390044798
  (0, 8470)	0.1382546322887404
  (0, 7985)	0.11822139451363602
  (0, 7801)	0.1710754494891446
  (0, 7790)	0.20826046348403296
  (0, 7637)	0.11294903529123612
  (0, 7523)	0.13017347967701054
  (0, 7300)	0.1855159750882623
  (0, 6398)	0.15770613879372739
  (0, 6340)	0.15770613879372739
  (0, 6287)	0.1584592287304719
  (0, 5019)	0.14843980091398046
  (0, 4179)	0.07644104078087112
  (0, 3433)	0.10913428445053755
  (0, 3283)	0.17357742458716732
  (0, 3173)	0.44306320535284216
  (0, 3042)	0.33953739608831124
  (0, 2476)	0.1902526496899536
  (0, 2263)	0.1855159750882623
  (0, 1897)	0.1197254675480583
  (0, 1272)	0.16003424328960916
  (0, 948)	0.06442218176557092
  (0, 893)	0.20826046348403296
  (0, 484)	0.21405768467184733
  (0, 472)	0.21405768467184733
  (0, 452)	0.09448139031327672
  (0, 103)	0.22153160267642108
  (0, 21)	0.1837624874244498


In [96]:
#Using naive bayes to classify the messages
spam_detector = MultinomialNB().fit(data_tfidf,data['label'])

In [104]:
#Use %time before the statement for checking time taken
%time print('Predicted:',spam_detector.predict(dummy_tfidf4)[0])

Predicted: spam
Wall time: 0 ns


In [114]:
# for checking accuracy

all_predictions = spam_detector.predict(data_tfidf)
print(all_predictions)
print(all_predictions[0:5])

['ham' 'ham' 'spam' ... 'ham' 'ham' 'ham']
['ham' 'ham' 'spam' 'ham' 'ham']


In [115]:
# accuracy
print('accuracy:', accuracy_score(data['label'], all_predictions))
print('confusion matrix\n', confusion_matrix(data['label'], all_predictions))

accuracy: 0.9694903086862886
confusion matrix
 [[4825    0]
 [ 170  577]]


In [110]:
print(classification_report(data['label'], all_predictions))

             precision    recall  f1-score   support

        ham       0.97      1.00      0.98      4825
       spam       1.00      0.77      0.87       747

avg / total       0.97      0.97      0.97      5572



In [116]:
data['label']

0        ham
1        ham
2       spam
3        ham
4        ham
5       spam
6        ham
7        ham
8       spam
9       spam
10       ham
11      spam
12      spam
13       ham
14       ham
15      spam
16       ham
17       ham
18       ham
19      spam
20       ham
21       ham
22       ham
23       ham
24       ham
25       ham
26       ham
27       ham
28       ham
29       ham
        ... 
5542     ham
5543     ham
5544     ham
5545     ham
5546     ham
5547    spam
5548     ham
5549     ham
5550     ham
5551     ham
5552     ham
5553     ham
5554     ham
5555     ham
5556     ham
5557     ham
5558     ham
5559     ham
5560     ham
5561     ham
5562     ham
5563     ham
5564     ham
5565     ham
5566    spam
5567    spam
5568     ham
5569     ham
5570     ham
5571     ham
Name: label, Length: 5572, dtype: object