In [2]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
import string

In [3]:
df = pd.read_csv('email.csv')

df.head(5)

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


In [None]:
df.shape

(5728, 2)

In [None]:
df.columns

Index(['text', 'spam'], dtype='object')

In [None]:
#check for duplicates
df.drop_duplicates(inplace=True)

In [None]:
df.shape

(5695, 2)

In [None]:
#show the number of missing data
df.isnull().sum()

text    0
spam    0
dtype: int64

In [None]:
#download the stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/xueweisun/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
def process_text(text):
  #remove punctation
  nopunc = [char for char in text if char not in string.punctuation]
  nopunc = ''.join(nopunc)

  #remove stopwords
  clean_words = [word for word in nopunc.split() if word.lower() not in stopwords.words('english')]

  return clean_words

In [None]:
#show the tokenization (a list of tokens also lemmas)
df['text'].head().apply(process_text)

0    [Subject, naturally, irresistible, corporate, ...
1    [Subject, stock, trading, gunslinger, fanny, m...
2    [Subject, unbelievable, new, homes, made, easy...
3    [Subject, 4, color, printing, special, request...
4    [Subject, money, get, software, cds, software,...
Name: text, dtype: object

In [None]:
#get the tokenization(a list of tokens also lemmas)
text=df['text'].head().apply(process_text)
import smart_open
smart_open.open = smart_open.smart_open
from gensim.models import Word2Vec
#build the word2vec model
model_word2vec= Word2Vec(window=10, min_count=2, size=100,workers=4)
#build vocabulary from a sequence of sentences
model_word2vec.build_vocab(text,progress_per=1000)

In [None]:
#get epochs to avoid common mistakes around the model’s ability to do multiple training passes itself
model_word2vec.epochs

5

In [None]:
#train the word2vec model
model_word2vec.train(text,total_examples=model_word2vec.corpus_count,epochs=model_word2vec.epochs)

(109, 1485)

In [None]:
#save the word2vec model 
model_word2vec.save("word2vec.model")

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [None]:
#get the tokens of sentences
def to_review_vector(text):
    global word_vec
    
    word_vec=np.zeros((1,100))
 #add word vectors and calculate the mean to get vectors of sentences   
    for word in text:
        if word in model_word2vec:
            word_vec+=np.array([model_word2vec[word]])
    return pd.Series(word_vec.mean(axis=0))
#use the defined function in the actual dataset
messages_bow2=df.text.apply(to_review_vector)
messages_bow2.head()

  
  if __name__ == '__main__':


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,0.485879,-0.422665,0.259873,-0.063737,-0.351703,-0.310604,0.343882,0.285293,0.359695,-0.408639,...,-0.106639,0.504653,-0.274965,0.216136,0.105973,0.41928,0.03176,0.45189,0.487262,0.553028
1,0.21691,-0.18869,0.116015,-0.028454,-0.15701,-0.138662,0.153519,0.127363,0.160578,-0.182428,...,-0.047607,0.225292,-0.122752,0.096489,0.04731,0.187179,0.014179,0.201736,0.217528,0.246887
2,0.156175,-0.135857,0.083531,-0.020487,-0.113047,-0.099837,0.110534,0.091701,0.115616,-0.131348,...,-0.034277,0.16221,-0.088381,0.069472,0.034063,0.134769,0.010209,0.14525,0.15662,0.177759
3,0.138823,-0.120761,0.074249,-0.018211,-0.100487,-0.088744,0.098252,0.081512,0.10277,-0.116754,...,-0.030468,0.144187,-0.078561,0.061753,0.030278,0.119794,0.009074,0.129111,0.139218,0.158008
4,0.104117,-0.090571,0.055687,-0.013658,-0.075365,-0.066558,0.073689,0.061134,0.077077,-0.087565,...,-0.022851,0.10814,-0.058921,0.046315,0.022709,0.089846,0.006806,0.096833,0.104413,0.118506


In [None]:
#split data %80 training %20 test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(messages_bow2, df['spam'],test_size=0.20, random_state=0)

In [None]:
#build a neural network model and use the training data to fit the model
from sklearn.neural_network import MLPClassifier
model = MLPClassifier()
model.fit(X_train,y_train)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(100,), learning_rate='constant',
              learning_rate_init=0.001, max_iter=200, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=None, shuffle=True, solver='adam', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)

In [None]:
#use the model to predict the results of training data
pred_train=dict()
pred_train = model.predict(X_train)

In [None]:
#evaluate on training data based on the built model 
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
print(classification_report(y_train,pred_train))
print('Accuracy: ', accuracy_score(y_train,pred_train))

              precision    recall  f1-score   support

           0       0.76      1.00      0.86      3457
           1       0.00      0.00      0.00      1099

    accuracy                           0.76      4556
   macro avg       0.38      0.50      0.43      4556
weighted avg       0.58      0.76      0.65      4556

Accuracy:  0.7587796312554873


  'precision', 'predicted', average, warn_for)


In [None]:
#use the model to predict the results of the testing data
prediction = dict()
prediction["NN"] = model.predict(X_test)

In [None]:
#evaluate the neural network model on testing data
print(classification_report(y_test,prediction["NN"]))
accuracy_score(y_test,prediction["NN"])

print('Confusion Matrix: /n', confusion_matrix(y_test,prediction["NN"]))

              precision    recall  f1-score   support

           0       0.76      1.00      0.87       870
           1       0.00      0.00      0.00       269

    accuracy                           0.76      1139
   macro avg       0.38      0.50      0.43      1139
weighted avg       0.58      0.76      0.66      1139

Confusion Matrix: /n [[870   0]
 [269   0]]


In [None]:
#get accuracy using Cross Validation
from sklearn.model_selection import KFold, cross_val_score
kfold = KFold(n_splits=5,shuffle=True)
print("Accuracy using Cross Validation is :",np.mean(cross_val_score(model,messages_bow2,df['spam'],cv=kfold,scoring="accuracy"))*100," %")


Accuracy using Cross Validation is : 75.94381035996489  %
