https://medium.com/coinmonks/spam-detector-using-naive-bayes-c22cc740e257

In [1]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report


In [3]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import matplotlib.pyplot as plt
from wordcloud import WordCloud

In [5]:
from sklearn.feature_extraction.text import CountVectorizer 
from math import log, sqrt

In [6]:
names = ['spam_check' , 'message_data']

In [7]:

df = pd.read_csv("C:\\Users\\PRIYANSH\\Desktop\\spam_detection\\spam.csv", encoding='latin-1')   # encoding has been used due to having some unspecified character or not understable characters 
print(df.shape)

(5571, 5)


In [8]:
df.head()

Unnamed: 0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...",Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,Ok lar... Joking wif u oni...,,,
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
2,ham,U dun say so early hor... U c already then say...,,,
3,ham,"Nah I don't think he goes to usf, he lives aro...",,,
4,spam,FreeMsg Hey there darling it's been 3 week's n...,,,


In [9]:
df = df.drop(['Unnamed: 2','Unnamed: 3','Unnamed: 4'],axis=1)

In [10]:
df.shape

(5571, 2)

In [12]:
df.to_csv("C:\\Users\\PRIYANSH\\Desktop\\spam_detection\\spam_corrected.csv",index=False)

In [14]:
data = pd.read_csv("C:\\Users\\PRIYANSH\\Desktop\\spam_detection\\spam_corrected.csv" , names = names)
data.head()

Unnamed: 0,spam_check,message_data
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


# Count the number of words in each Text

In [16]:
# Replace ham with 0 and spam with 1
data = data.replace(['ham','spam'],[0, 1])
data.head()

Unnamed: 0,spam_check,message_data
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [17]:
data['Count']=0
l = len(data.message_data)  
for i in np.arange(0,l):     # for i in range(0,l):
    data.loc[i,'Count'] = len(data.loc[i,'message_data'])    # datacolumn property such as read and write 

In [18]:
data.head()

Unnamed: 0,spam_check,message_data,Count
0,0,"Go until jurong point, crazy.. Available only ...",111
1,0,Ok lar... Joking wif u oni...,29
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,155
3,0,U dun say so early hor... U c already then say...,49
4,0,"Nah I don't think he goes to usf, he lives aro...",61


In [19]:
data.shape

(5572, 3)

In [20]:
#data = data.drop(['Count1'] , axis=1)

**# Total ham(0) and spam(1) messages**

In [21]:
data['spam_check'].value_counts()

0    4825
1     747
Name: spam_check, dtype: int64

In [22]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 3 columns):
spam_check      5572 non-null int64
message_data    5572 non-null object
Count           5572 non-null int64
dtypes: int64(2), object(1)
memory usage: 108.9+ KB


In [23]:
data.describe()    # evaluate continuous or numerical values

Unnamed: 0,spam_check,Count
count,5572.0,5572.0
mean,0.134063,80.118808
std,0.340751,59.690841
min,0.0,2.0
25%,0.0,36.0
50%,0.0,61.0
75%,0.0,121.0
max,1.0,910.0


In [24]:
corpus = []   # collection of texts 
ps = PorterStemmer()

In [25]:
# Original Messages

print (data['message_data'][0])
print (data['message_data'][1])

Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...
Ok lar... Joking wif u oni...


#Processing Messages

In [26]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\PRIYANSH\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [27]:
changes = []  # contains changes
# we will print very first 2 rows of 'message_data' column , just to see the result
for i in range(0, l):

    # Applying Regular Expression
    
    '''
    Replace email addresses with 'emailaddr'
    Replace URLs with 'httpaddr'
    Replace money symbols with 'moneysymb'
    Replace phone numbers with 'phonenumbr'
    Replace numbers with 'numbr'
    '''
    
    
    msg = data['message_data'][i]    # read row one by one    
    msg = re.sub('\b[\w\-.]+?@\w+?\.\w{2,4}\b', 'emailaddr', data['message_data'][i])
    msg = re.sub('(http[s]?\S+)|(\w+\.[A-Za-z]{2,4}\S*)', 'httpaddr', data['message_data'][i])
    msg = re.sub('£|\$', 'moneysymb',data['message_data'][i])
    msg = re.sub('\b(\+\d{1,2}\s)?\d?[\-(.]?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}\b', 'phonenumbr', data['message_data'][i])
    msg = re.sub('\d+(\.\d+)?', 'numbr', data['message_data'][i])
    
    changes.append(msg)
    
    
    # Remove all punctuations 
    msg = re.sub('[^\w\d\s]', ' ', data['message_data'][i])
    
    if i<2:
        print("\t\t\t\t MESSAGE ", i)
    
    if i<2:
        print("\n After Regular Expression - Message ", i, " : ", msg)
    
    
    
    
    # Each word to lower case
    msg = msg.lower()  
    
    if i<2:
        print("\n Lower case Message ", i, " : ", msg)
    
    
    #Tokenization
    # Splitting words to Tokenize 
    msg = msg.split()    
    
    if i<2:
        print("\n After Splitting - Message ", i, " : ", msg)
    
    
    
    # Stemming with PorterStemmer handling Stop Words
    # find the root-word of the given text/token  if it is not a 'Stopword'  (useless / redundant words in sentimental analysis)
    # if it is stopwords then remove the given text/token from the list 
    msg = [ps.stem(word) for word in msg if not word in set(stopwords.words('english'))]
    
    if i<2:
        print("\n After Stemming - Message ", i, " : ", msg)
    
    
    
    # preparing Messages with Remaining Tokens
    msg = ' '.join(msg)
    
    if i<2:
        print("\n Final Prepared - Message ", i, " : ", msg, "\n\n")
    
    
    # Preparing WordVector Corpus
    # create a corpus list and add all the token from the given article or document
    corpus.append(msg)

				 MESSAGE  0

 After Regular Expression - Message  0  :  Go until jurong point  crazy   Available only in bugis n great world la e buffet    Cine there got amore wat   

 Lower case Message  0  :  go until jurong point  crazy   available only in bugis n great world la e buffet    cine there got amore wat   

 After Splitting - Message  0  :  ['go', 'until', 'jurong', 'point', 'crazy', 'available', 'only', 'in', 'bugis', 'n', 'great', 'world', 'la', 'e', 'buffet', 'cine', 'there', 'got', 'amore', 'wat']

 After Stemming - Message  0  :  ['go', 'jurong', 'point', 'crazi', 'avail', 'bugi', 'n', 'great', 'world', 'la', 'e', 'buffet', 'cine', 'got', 'amor', 'wat']

 Final Prepared - Message  0  :  go jurong point crazi avail bugi n great world la e buffet cine got amor wat 


				 MESSAGE  1

 After Regular Expression - Message  1  :  Ok lar    Joking wif u oni   

 Lower case Message  1  :  ok lar    joking wif u oni   

 After Splitting - Message  1  :  ['ok', 'lar', 'joking', 'wif', '

In [39]:
print(len(changes))
changes[0:16]

5572


['Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...',
 'Ok lar... Joking wif u oni...',
 "Free entry in numbr a wkly comp to win FA Cup final tkts numbrst May numbr. Text FA to numbr to receive entry question(std txt rate)T&C's apply numbrovernumbr's",
 'U dun say so early hor... U c already then say...',
 "Nah I don't think he goes to usf, he lives around here though",
 "FreeMsg Hey there darling it's been numbr week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, å£numbr to rcv",
 'Even my brother is not like to speak with me. They treat me like aids patent.',
 "As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your callertune for all Callers. Press *numbr to copy your friends Callertune",
 'WINNER!! As a valued network customer you have been selected to receivea å£numbr prize reward! To claim call numbr. Claim code KLnumbr. Valid numbr hours 

# Make a new dataframe and store all the data in to a csvfile to see the changes

In [35]:
sub = pd.DataFrame()
sub['msg'] = changes 

In [37]:
sub.to_csv('C:\\Users\\PRIYANSH\\Desktop\\spam_detection\\changes.csv' ,index=False )

In [38]:
print(len(corpus))
corpus[:16]

5572


['go jurong point crazi avail bugi n great world la e buffet cine got amor wat',
 'ok lar joke wif u oni',
 'free entri 2 wkli comp win fa cup final tkt 21st may 2005 text fa 87121 receiv entri question std txt rate c appli 08452810075over18',
 'u dun say earli hor u c alreadi say',
 'nah think goe usf live around though',
 'freemsg hey darl 3 week word back like fun still tb ok xxx std chg send å 1 50 rcv',
 'even brother like speak treat like aid patent',
 'per request mell mell oru minnaminungint nurungu vettam set callertun caller press 9 copi friend callertun',
 'winner valu network custom select receivea å 900 prize reward claim call 09061701461 claim code kl341 valid 12 hour',
 'mobil 11 month u r entitl updat latest colour mobil camera free call mobil updat co free 08002986030',
 'gonna home soon want talk stuff anymor tonight k cri enough today',
 'six chanc win cash 100 20 000 pound txt csh11 send 87575 cost 150p day 6day 16 tsandc appli repli hl 4 info',
 'urgent 1 week free

# Word Embedding

In [45]:
print(list(corpus[0].split(" ")))    # convert a string into a list
print(len(list(corpus[0].split(" "))))

['go', 'jurong', 'point', 'crazi', 'avail', 'bugi', 'n', 'great', 'world', 'la', 'e', 'buffet', 'cine', 'got', 'amor', 'wat']
16


In [49]:
#total words in corppus
add = 0
for i in range(0,len(corpus)):
    add = add + len(list(corpus[i].split(" ")))
print(add) 

53422


In [52]:
# calculate the unique words in corpus
unique = []
for i in range(0,len(corpus)):
    for j in list(corpus[i].split(" ")):
        unique.append(j)
print(len(np.unique(unique)))

7247


In [53]:
cv = CountVectorizer()
x = cv.fit_transform(corpus).toarray()    # convert into numpy array

In [54]:
print(len(x))
x

5572


array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

# How many words are  in one row  and how many are unique words

In [58]:
for i in range(0,5572):
    if i <5:
        print(len(x[i]))
        print(np.unique(x[i]))

7213
[0 1]
7213
[0 1]
7213
[0 1 2]
7213
[0 1 2]
7213
[0 1]


# Applying Classification
Input : Prepared Sparse Matrix       
Ouput : Labels (Spam or Ham)

In [70]:
y = data['spam_check']
print(y.value_counts())   # check how many vlaues are 'spam' or how many 'not-spam(ham)'
print(type(y))
y[:5]

0    4825
1     747
Name: spam_check, dtype: int64
<class 'pandas.core.series.Series'>


0    0
1    0
2    1
3    0
4    0
Name: spam_check, dtype: int64

In [71]:
le = LabelEncoder()
y = le.fit_transform(y)   # convert pandas dataframe to numpy array
print(type(y))
y

<class 'numpy.ndarray'>


array([0, 0, 1, ..., 0, 0, 0], dtype=int32)

# Splitting to Training and Testing DATA

In [72]:
xtrain, xtest, ytrain, ytest = train_test_split(x, y,test_size= 0.20, random_state = 0)

In [76]:
print(xtrain.shape)
xtrain

(4457, 7213)


array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [77]:
print(ytrain.shape)
ytrain

(4457,)


array([0, 0, 0, ..., 0, 0, 0], dtype=int32)

In [78]:
print(xtest.shape)
xtest

(1115, 7213)


array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [79]:
print(ytest.shape)
ytest

(1115,)


array([0, 0, 0, ..., 0, 0, 0], dtype=int32)

# Applying Guassian Naive Bayes

In [80]:
bayes_classifier = GaussianNB()
bayes_classifier.fit(xtrain, ytrain)    # model is getting trained

GaussianNB(priors=None, var_smoothing=1e-09)

In [81]:
# Predicting
y_pred = bayes_classifier.predict(xtest)

In [82]:
y_pred

array([0, 0, 0, ..., 0, 0, 0], dtype=int32)

In [85]:
new = pd.DataFrame()
new['ytest'] = ytest
new['y_pred'] = y_pred
new

Unnamed: 0,ytest,y_pred
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0
5,0,0
6,0,0
7,0,0
8,0,0
9,1,1


# check the performance of the model by using confusion matrix

In [86]:
# Evaluating
cm = confusion_matrix(ytest, y_pred)

In [87]:
cm

array([[824, 125],
       [ 19, 147]], dtype=int64)

In [89]:
print ("Accuracy : %0.5f \n\n" % accuracy_score(ytest,y_pred))
print (classification_report(ytest, y_pred))

Accuracy : 0.87085 


              precision    recall  f1-score   support

           0       0.98      0.87      0.92       949
           1       0.54      0.89      0.67       166

   micro avg       0.87      0.87      0.87      1115
   macro avg       0.76      0.88      0.80      1115
weighted avg       0.91      0.87      0.88      1115



# Applying Decision Tree

In [90]:
dt = DecisionTreeClassifier(random_state=50)
dt.fit(xtrain, ytrain)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=50,
            splitter='best')

In [92]:
# Predicting
y_pred_dt = dt.predict(xtest)

In [93]:
# Again use confusion matrix to check the performance the model (binary classification model)
# Evaluating
cm = confusion_matrix(ytest, y_pred_dt)

print(cm)

[[943   6]
 [ 29 137]]


In [94]:
print ("Accuracy : %0.5f \n\n" % accuracy_score(ytest,  y_pred_dt))
print (classification_report(ytest,  y_pred_dt))

Accuracy : 0.96861 


              precision    recall  f1-score   support

           0       0.97      0.99      0.98       949
           1       0.96      0.83      0.89       166

   micro avg       0.97      0.97      0.97      1115
   macro avg       0.96      0.91      0.93      1115
weighted avg       0.97      0.97      0.97      1115



# Final Accuracy
Decision Tree : 96.861%     
Guassian NB : 87.085%