In [1]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
import string

In [2]:
df = pd.read_csv('email.csv')

df.head(5)

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


In [3]:
df.shape

(5728, 2)

In [4]:
df.columns

Index(['text', 'spam'], dtype='object')

In [5]:
#check for duplicates
df.drop_duplicates(inplace=True)

In [6]:
df.shape

(5695, 2)

In [7]:
#show the number of missing data
df.isnull().sum()

text    0
spam    0
dtype: int64

In [8]:
#download the stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/xueweisun/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [9]:
def process_text(text):
  #remove punctation
  nopunc = [char for char in text if char not in string.punctuation]
  nopunc = ''.join(nopunc)

  #remove stopwords
  clean_words = [word for word in nopunc.split() if word.lower() not in stopwords.words('english')]

  return clean_words

In [10]:
#show the tokenization (a list of tokens also lemmas)
df['text'].head().apply(process_text)

0    [Subject, naturally, irresistible, corporate, ...
1    [Subject, stock, trading, gunslinger, fanny, m...
2    [Subject, unbelievable, new, homes, made, easy...
3    [Subject, 4, color, printing, special, request...
4    [Subject, money, get, software, cds, software,...
Name: text, dtype: object

In [11]:
#get the tokenization(a list of tokens also lemmas)
text=df['text'].head().apply(process_text)
import smart_open
smart_open.open = smart_open.smart_open
from gensim.models import Word2Vec
#build the word2vec model
model_word2vec= Word2Vec(window=10, min_count=2, size=100,workers=4)
#build vocabulary from a sequence of sentences
model_word2vec.build_vocab(text,progress_per=1000)

In [12]:
#get epochs to avoid common mistakes around the model’s ability to do multiple training passes itself
model_word2vec.epochs

5

In [13]:
#train the word2vec model
model_word2vec.train(text,total_examples=model_word2vec.corpus_count,epochs=model_word2vec.epochs)

(109, 1485)

In [14]:
#save the word2vec model 
model_word2vec.save("word2vec.model")

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [15]:
#get the tokens of sentences
def to_review_vector(text):
    global word_vec
    
    word_vec=np.zeros((1,100))
    
    for word in text:
        if word in model_word2vec:
            word_vec+=np.array([model_word2vec[word]])
    return pd.Series(word_vec.mean(axis=0))

messages_bow2=df.text.apply(to_review_vector)
messages_bow2.head()

  
  if __name__ == '__main__':


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,-0.437484,-0.437489,0.252785,-0.015725,0.230238,0.187766,0.020453,-0.103712,0.199441,0.340459,...,0.314083,-0.345686,-0.002111,0.490596,-0.304742,-0.429651,0.364303,0.290646,-0.285389,-0.183512
1,-0.195305,-0.195307,0.112851,-0.00702,0.102785,0.083824,0.009131,-0.0463,0.089036,0.151991,...,0.140216,-0.154324,-0.000943,0.219016,-0.136046,-0.191808,0.162635,0.129753,-0.127406,-0.081925
2,-0.14062,-0.140621,0.081252,-0.005055,0.074005,0.060353,0.006574,-0.033336,0.064106,0.109433,...,0.100955,-0.111113,-0.000679,0.157692,-0.097953,-0.138102,0.117098,0.093422,-0.091732,-0.058986
3,-0.124995,-0.124997,0.072224,-0.004493,0.065782,0.053647,0.005844,-0.029632,0.056983,0.097274,...,0.089738,-0.098768,-0.000603,0.14017,-0.087069,-0.122757,0.104087,0.083042,-0.08154,-0.052432
4,-0.093747,-0.093748,0.054168,-0.00337,0.049337,0.040236,0.004383,-0.022224,0.042737,0.072956,...,0.067303,-0.074076,-0.000452,0.105128,-0.065302,-0.092068,0.078065,0.062281,-0.061155,-0.039324


In [16]:
#split data %80 training %20 test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(messages_bow2, df['spam'],test_size=0.20, random_state=0)

In [17]:
#build a neural network model and use the train data to fit the model
from sklearn.neural_network import MLPClassifier
model = MLPClassifier()
model.fit(X_train,y_train)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(100,), learning_rate='constant',
              learning_rate_init=0.001, max_iter=200, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=None, shuffle=True, solver='adam', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)

In [18]:
#use the model to predict the results of train data
pred_train=dict()
pred_train = model.predict(X_train)

In [19]:
#compare the real value and the results using the model
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
print(classification_report(y_train,pred_train))
print('Accuracy: ', accuracy_score(y_train,pred_train))

              precision    recall  f1-score   support

           0       0.76      1.00      0.86      3457
           1       0.47      0.01      0.01      1099

    accuracy                           0.76      4556
   macro avg       0.61      0.50      0.44      4556
weighted avg       0.69      0.76      0.66      4556

Accuracy:  0.7585601404741001


In [20]:
#use the model to predict the results of test data
prediction = dict()
prediction["NN"] = model.predict(X_test)

In [21]:
#compare the real value and the results of test data using the model
print(classification_report(y_test,prediction["NN"]))
accuracy_score(y_test,prediction["NN"])

print('Confusion Matrix: /n', confusion_matrix(y_test,prediction["NN"]))

              precision    recall  f1-score   support

           0       0.76      1.00      0.87       870
           1       1.00      0.00      0.01       269

    accuracy                           0.76      1139
   macro avg       0.88      0.50      0.44      1139
weighted avg       0.82      0.76      0.66      1139

Confusion Matrix: /n [[870   0]
 [268   1]]


In [22]:
#get accuracy using Cross Validation
from sklearn.model_selection import KFold, cross_val_score
kfold = KFold(n_splits=5,shuffle=True)
print("Accuracy using Cross Validation is :",np.mean(cross_val_score(model,messages_bow2,df['spam'],cv=kfold,scoring="accuracy"))*100," %")


Accuracy using Cross Validation is : 75.9964881474978  %
