<a href="https://colab.research.google.com/github/sahil3Vedi/Semi-Supervised-Mail-Classifier/blob/master/Spam_Classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
#mounting Google Drive
from google.colab import drive
drive.mount('/content/drive')

import os #for file handling
import numpy as np #for mathematical functions
import re #for regular expressions
import nltk  # for text processing
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('words')

main_dict = {} #main  dictionary that stores a count of all words across the training data.

stop_words = set(stopwords.words('english'))

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [6]:
#Generating indices for training and testing data

def segregateIndices(SPAMDIR,HAMDIR,train_percentage):
  length_spam = len(os.listdir(SPAMDIR))
  length_ham = len(os.listdir(HAMDIR))
  length_train_spam = int(length_spam*train_percentage/100)
  length_test_spam = length_spam - length_train_spam
  length_train_ham = int(length_ham*train_percentage/100)
  length_test_ham = length_ham - length_train_ham

  spam_indices = [x for x in range(length_spam)]
  ham_indices = [y for y in range(length_ham)]
  np_spam_indices = np.array(spam_indices)
  np_ham_indices = np.array(ham_indices)

  np.random.shuffle(np_spam_indices)
  np.random.shuffle(np_ham_indices)

  train_spam_indices = np_spam_indices[:length_train_spam]
  train_ham_indices = np_ham_indices[:length_train_ham]
  test_spam_indices = np_spam_indices[-length_test_spam:]
  test_ham_indices = np_ham_indices[-length_test_ham:]

  print("Total Spam: " + str(length_spam))
  print("Total Ham: " + str(length_ham))
  print("Total Train Spam: " + str(len(train_spam_indices)))
  print("Total Train Ham: " + str(len(train_ham_indices)))
  print("Total Test Spam: " + str(len(test_spam_indices)))
  print("Total Test Ham: " + str(len(test_ham_indices)))

  return_list = [train_spam_indices,train_ham_indices,test_spam_indices,test_ham_indices]
  return return_list

#scanning a mail to update the main dictionary
def scanMail(mail_string):
  res = re.findall(r'\w+', mail_string)
  res.pop(0) #Removing first word from the mail (Usually "subject")
  final_res = []
  for each_word in res:
    if ((each_word.isalpha()) and (each_word not in stop_words)) :
      final_res.append(each_word) #updating main dictionary
      if each_word in main_dict:
        main_dict[each_word] += 1
      else:
        main_dict[each_word] = 1

  return final_res #for debugging

In [7]:
#Segregating Training and Testing Data from Drive

ENRONSPAMDATASET = 'drive/My Drive/Enron Spam/spam'
ENRONNOSPAMDATASET = 'drive/My Drive/Enron Spam/ham'

indices = segregateIndices(ENRONSPAMDATASET,ENRONNOSPAMDATASET,70)

training_spam_indices = indices[0]
training_ham_indices = indices[1]
testing_spam_indices = indices[2]
testing_ham_indices = indices[3]
spam_files = os.listdir(ENRONSPAMDATASET)
ham_files = os.listdir(ENRONNOSPAMDATASET)

for each_index in training_spam_indices:
  each_file = spam_files[each_index]
  FILENAME = os.path.join(ENRONSPAMDATASET, each_file)
  with open(FILENAME, 'r',encoding='utf-8',errors='ignore') as myfile:
    data = myfile.read()
    word_list = scanMail(data)

for each_index in training_ham_indices:
  each_file = ham_files[each_index]
  FILENAME = os.path.join(ENRONNOSPAMDATASET, each_file)
  with open(FILENAME, 'r',encoding='utf-8',errors='ignore') as myfile:
    data = myfile.read()
    word_list = scanMail(data)

# Forming a new dictionary with the N most frequent words from the main dictionary

def getCommon(new_dict, n):
  temp_dict = new_dict
  ret_dict = {}
  for i in range(n):
    Keymax = max(temp_dict, key=temp_dict.get)
    ret_dict[Keymax]=temp_dict[Keymax]
    del temp_dict[Keymax]
  return ret_dict

Total Spam: 1518
Total Ham: 3681
Total Train Spam: 1062
Total Train Ham: 2576
Total Test Spam: 456
Total Test Ham: 1105


In [10]:
final_dict = getCommon(main_dict,1000)

#saving Bag of Words as a CSV File
import csv
with open('drive/My Drive/Enron Spam/bagofwords.csv', 'w') as f:
    for key in final_dict.keys():
        f.write("%s,%s\n"%(key,final_dict[key]))

#exporting training and testing indices as an NPY file
np.save('drive/My Drive/Enron Spam/spam_classifier_indices.npy', indices)

In [11]:
from sklearn.utils import shuffle #for shuffling datasets
import pandas as pd #for manipulating dataframes

#generating feature vector from a mail
def generateVector(mail_string,bagofwords):
  mail_vector = np.zeros(1000)
  res = re.findall(r'\w+', mail_string)
  res.pop(0)
  final_res = []
  for each_word in res:
    if ((each_word.isalpha()) and (each_word not in stop_words)) :
      final_res.append(each_word)
  vector_counter = 0
  for each_word in bagofwords:
    if each_word in final_res:
      mail_vector[vector_counter]=1
    vector_counter+=1

  return mail_vector

BAGOFWORDS = 'drive/My Drive/Enron Spam/bagofwords.csv'
bagofwords_data = pd.read_csv(BAGOFWORDS)
bagofwords = bagofwords_data[bagofwords_data.columns[0]]

SPAM_CLASSIFIER_INDICES = 'drive/My Drive/Enron Spam/spam_classifier_indices.npy'
spam_classifier_indices = np.load(SPAM_CLASSIFIER_INDICES,allow_pickle=True)

X_train = []
Y_train = []
X_test = []
Y_test = []

In [12]:
spam_training_indices = spam_classifier_indices[0]
ham_training_indices = spam_classifier_indices[1]
spam_testing_indices = spam_classifier_indices[2]
ham_testing_indices = spam_classifier_indices[3]

#Adding Spam Mails to X_test and Y_test
print("Adding Spam Mails to X_test and Y_test, Please be Patient...")
file_list = os.listdir(ENRONSPAMDATASET)
for each_index in spam_testing_indices:
  each_file = file_list[each_index]
  FILENAME = os.path.join(ENRONSPAMDATASET, each_file)
  with open(FILENAME, 'r',encoding='utf-8',errors='ignore') as myfile:
    data = myfile.read()
    mail_vector = generateVector(data,bagofwords)
    X_test.append(mail_vector)
    Y_test.append(1)


#Adding Non Spam Mails to X_test and Y_test
print("Adding Non Spam Mails to X_test and Y_test, Please be Patient...")
file_list = os.listdir(ENRONNOSPAMDATASET)
for each_index in ham_testing_indices:
  each_file = file_list[each_index]
  FILENAME = os.path.join(ENRONNOSPAMDATASET, each_file)
  with open(FILENAME, 'r',encoding='utf-8',errors='ignore') as myfile:
    data = myfile.read()
    mail_vector = generateVector(data,bagofwords)
    X_test.append(mail_vector)
    Y_test.append(0)

#Adding Spam Mails to X_train and Y_train
print("Adding Spam Mails to X_train and Y_train, Please be Patient...")
file_list = os.listdir(ENRONSPAMDATASET)
for each_index in spam_training_indices:
  each_file = file_list[each_index]
  FILENAME = os.path.join(ENRONSPAMDATASET, each_file)
  with open(FILENAME, 'r',encoding='utf-8',errors='ignore') as myfile:
    data = myfile.read()
    mail_vector = generateVector(data,bagofwords)
    X_train.append(mail_vector)
    Y_train.append(1)

#Adding Non Spam Mails to X_train and Y_train
print("Adding Non Spam Mails to X_train and Y_train, Please be Patient...")
file_list = os.listdir(ENRONNOSPAMDATASET)
for each_index in ham_training_indices:
  each_file = file_list[each_index]
  FILENAME = os.path.join(ENRONNOSPAMDATASET, each_file)
  with open(FILENAME, 'r',encoding='utf-8',errors='ignore') as myfile:
    data = myfile.read()
    mail_vector = generateVector(data,bagofwords)
    X_train.append(mail_vector)
    Y_train.append(0)

#saving numpy arrays
XTRAIN = 'drive/My Drive/Enron Spam/spam_classifier_xtrain.npy'
YTRAIN = 'drive/My Drive/Enron Spam/spam_classifier_ytrain.npy'
XTEST = 'drive/My Drive/Enron Spam/spam_classifier_xtest.npy'
YTEST = 'drive/My Drive/Enron Spam/spam_classifier_ytest.npy'

np.save(XTRAIN,X_train)
np.save(YTRAIN,Y_train)
np.save(XTEST, X_test)
np.save(YTEST, Y_test)

Adding Spam Mails to X_test and Y_test, Please be Patient...
Adding Non Spam Mails to X_test and Y_test, Please be Patient...
Adding Spam Mails to X_train and Y_train, Please be Patient...
Adding Non Spam Mails to X_train and Y_train, Please be Patient...


In [15]:
from sklearn.metrics import confusion_matrix #to describe performance
from sklearn.naive_bayes import MultinomialNB #enables Multinomial Naive Bayes Classifier
from sklearn.svm import LinearSVC #enables Support Vector Classifier
from sklearn.metrics import accuracy_score #to describe performance

XTRAIN_LOC = 'drive/My Drive/Enron Spam/spam_classifier_xtrain.npy'
YTRAIN_LOC = 'drive/My Drive/Enron Spam/spam_classifier_ytrain.npy'
XTEST_LOC = 'drive/My Drive/Enron Spam/spam_classifier_xtest.npy'
YTEST_LOC = 'drive/My Drive/Enron Spam/spam_classifier_ytest.npy'

X_train = np.load(XTRAIN_LOC)
Y_train = np.load(YTRAIN_LOC)
X_test = np.load(XTEST_LOC)
Y_test = np.load(YTEST_LOC)

#Training SVM and Naive Bayes Classifier

model1 = MultinomialNB()
model2 = LinearSVC()
model1.fit(X_train, Y_train)
model2.fit(X_train, Y_train)

result1 = model1.predict(X_test)
result2 = model2.predict(X_test)

print(confusion_matrix(Y_test,result1))
print(accuracy_score(Y_test,result1))
print(confusion_matrix(Y_test,result2))
print(accuracy_score(Y_test,result2))

[[1063   42]
 [ 152  304]]
0.8757206918641897
[[1065   40]
 [ 151  305]]
0.877642536835362


In [16]:
# Libraries to train the Neural Network
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from keras.utils import to_categorical
import random

#converting binary classifications to categorical ones
Y_train_categorical = to_categorical(Y_train)
Y_test_categorical = to_categorical(Y_test)

#Initialising Neural Network
neuralNet = tf.keras.Sequential([
# Adds a densely-connected layer with 64 units to the model:
layers.Dense(64, activation='relu', input_shape=(1000,)),
# Add another:
layers.Dense(64, activation='relu'),
# Add an output layer with 2 output units:
layers.Dense(2, activation='softmax')])

neuralNet.compile(optimizer=tf.keras.optimizers.Adam(0.01),
              loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])

neuralNet.fit(X_train, Y_train_categorical, epochs=10, batch_size=32)
Y_pred = neuralNet.predict(X_test)

#converting Y_pred to multiclass predictions to single valued outputs
Y_pred_normalised = []
for each_output in Y_pred:
  if (each_output[0]>each_output[1]):
    Y_pred_normalised.append(0)
  else:
    Y_pred_normalised.append(1)
Y_pred_np = np.array(Y_pred_normalised)

#performance of the neural network
print(confusion_matrix(Y_test,Y_pred_np))
print(accuracy_score(Y_test,Y_pred_np))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
[[1064   41]
 [ 164  292]]
0.8686739269698911
