In [1]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
import string

In [2]:
df = pd.read_csv('email.csv')

df.head(5)

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


In [3]:
df.shape

(5728, 2)

In [4]:
df.columns

Index(['text', 'spam'], dtype='object')

In [5]:
#check for duplicates
df.drop_duplicates(inplace=True)

In [6]:
df.shape

(5695, 2)

In [7]:
#show the number of missing data
df.isnull().sum()

text    0
spam    0
dtype: int64

In [8]:
#download the stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/xueweisun/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [9]:
def process_text(text):
  #remove punctation
  nopunc = [char for char in text if char not in string.punctuation]
  nopunc = ''.join(nopunc)

  #remove stopwords
  clean_words = [word for word in nopunc.split() if word.lower() not in stopwords.words('english')]

  return clean_words

In [10]:
#show the tokenization (a list of tokens also lemmas)
df['text'].head().apply(process_text)

0    [Subject, naturally, irresistible, corporate, ...
1    [Subject, stock, trading, gunslinger, fanny, m...
2    [Subject, unbelievable, new, homes, made, easy...
3    [Subject, 4, color, printing, special, request...
4    [Subject, money, get, software, cds, software,...
Name: text, dtype: object

In [26]:
#convert collection of text to a matrix of tokens
from sklearn.feature_extraction.text import CountVectorizer
messages_bow = CountVectorizer(analyzer=process_text).fit_transform(df['text'])
print(messages_bow)

  (0, 3638)	1
  (0, 23369)	1
  (0, 18841)	1
  (0, 10065)	1
  (0, 17696)	1
  (0, 21140)	1
  (0, 27986)	1
  (0, 16674)	1
  (0, 28110)	1
  (0, 9296)	3
  (0, 21654)	2
  (0, 15429)	1
  (0, 32602)	1
  (0, 18238)	1
  (0, 18886)	1
  (0, 16089)	2
  (0, 8054)	1
  (0, 20952)	3
  (0, 32319)	1
  (0, 31968)	1
  (0, 24838)	1
  (0, 36025)	2
  (0, 21431)	2
  (0, 33037)	1
  (0, 23040)	2
  :	:
  (5694, 24818)	2
  (5694, 21624)	1
  (5694, 5729)	9
  (5694, 30934)	1
  (5694, 2828)	3
  (5694, 13338)	1
  (5694, 13127)	1
  (5694, 17388)	1
  (5694, 14130)	1
  (5694, 20273)	1
  (5694, 31827)	1
  (5694, 13128)	1
  (5694, 20467)	1
  (5694, 35288)	1
  (5694, 8629)	1
  (5694, 30082)	1
  (5694, 13522)	5
  (5694, 36185)	1
  (5694, 959)	2
  (5694, 2797)	1
  (5694, 30287)	1
  (5694, 17590)	1
  (5694, 33923)	1
  (5694, 10373)	1
  (5694, 11386)	1


In [12]:
#split data %80 training %20 test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(messages_bow, df['spam'],test_size=0.20, random_state=0)

In [13]:
#get the shapes of messages_bow
messages_bow.shape

(5695, 37229)

In [14]:
# create and train naive bayes classification
from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB().fit(X_train,y_train)

In [15]:
#print predictions
print(classifier.predict(X_train))

#print values
print(y_train)

[0 0 0 ... 0 0 0]
3337    0
2104    0
3905    0
461     1
314     1
       ..
4950    0
3273    0
1653    0
2611    0
2736    0
Name: spam, Length: 4556, dtype: int64


In [16]:
#evaluate the model on the Training Dataset
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
pred = classifier.predict(X_train)
print(classification_report(y_train,pred))
print()
print('Confusion Matrix: /n', confusion_matrix(y_train,pred))
print()
print('Accuracy: ', accuracy_score(y_train,pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3457
           1       0.99      1.00      0.99      1099

    accuracy                           1.00      4556
   macro avg       0.99      1.00      1.00      4556
weighted avg       1.00      1.00      1.00      4556


Confusion Matrix: /n [[3445   12]
 [   1 1098]]

Accuracy:  0.9971466198419666


In [17]:
#print predictions
print(classifier.predict(X_test))

#print values
print(y_test)

[1 0 0 ... 0 0 0]
977     1
3275    0
4163    0
751     1
3244    0
       ..
4506    0
1050    1
3366    0
2191    0
3911    0
Name: spam, Length: 1139, dtype: int64


In [18]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
pred = classifier.predict(X_test)
print(classification_report(y_test,pred))
print()
print('Confusion Matrix: /n', confusion_matrix(y_test,pred))
print()
print('Accuracy: ', accuracy_score(y_test,pred))

              precision    recall  f1-score   support

           0       1.00      0.99      0.99       870
           1       0.97      1.00      0.98       269

    accuracy                           0.99      1139
   macro avg       0.98      0.99      0.99      1139
weighted avg       0.99      0.99      0.99      1139


Confusion Matrix: /n [[862   8]
 [  1 268]]

Accuracy:  0.9920983318700615


In [19]:
#get accuracy using Cross Validation
from sklearn.model_selection import KFold, cross_val_score
kfold = KFold(n_splits=5,shuffle=True)
print("Accuracy using Cross Validation is :",np.mean(cross_val_score(MultinomialNB(),messages_bow,df['spam'],cv=kfold,scoring="accuracy"))*100," %")


Accuracy using Cross Validation is : 99.13959613696225  %


In [20]:
#build a neural network model and use the train data to fit the model
from sklearn.neural_network import MLPClassifier
model = MLPClassifier()
model.fit(X_train,y_train)


MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(100,), learning_rate='constant',
              learning_rate_init=0.001, max_iter=200, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=None, shuffle=True, solver='adam', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)

In [21]:
#use the model to predict the results of train data
pred_train=dict()
pred_train = model.predict(X_train)

In [22]:
#compare the real value and the results of train data using the model
print(classification_report(y_train,pred_train))
print('Accuracy: ', accuracy_score(y_train,pred_train))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3457
           1       1.00      1.00      1.00      1099

    accuracy                           1.00      4556
   macro avg       1.00      1.00      1.00      4556
weighted avg       1.00      1.00      1.00      4556

Accuracy:  1.0


In [23]:
#use the model to predict the results of test data
prediction = dict()
prediction = dict()
prediction["NN"] = model.predict(X_test)


In [24]:
#compare the real value and the results of test data using the model
print(classification_report(y_test,prediction["NN"]))
accuracy_score(y_test,prediction["NN"])

print('Confusion Matrix: /n', confusion_matrix(y_test,prediction["NN"]))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       870
           1       1.00      0.96      0.98       269

    accuracy                           0.99      1139
   macro avg       0.99      0.98      0.99      1139
weighted avg       0.99      0.99      0.99      1139

Confusion Matrix: /n [[869   1]
 [ 11 258]]


In [25]:
#get accuracy using Cross Validation
from sklearn.model_selection import KFold, cross_val_score
kfold = KFold(n_splits=5,shuffle=True)
print("Accuracy using Cross Validation is :",np.mean(cross_val_score(model,messages_bow,df['spam'],cv=kfold,scoring="accuracy"))*100," %")


Accuracy using Cross Validation is : 98.92888498683057  %
