## E-Mail Spam *Filtering*

### Feature Selection

In [5]:
import os
import numpy as np
import warnings
import math
from scipy.special import factorial
from cvxopt import matrix, solvers
import string
import csv
import pandas as pd
import time
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.model_selection import train_test_split

In [30]:
warnings.filterwarnings("ignore")

In [13]:
class Utils:
  emailDirectory = "/content/drive/MyDrive/Sem 3/MLSec/lingspam_public/lemm_stop"
  spam = 0
  spam2 = -1
  legit = 1
  csvFile = "spam_precision_recall.csv"


  def isSpam(filename):
    if filename.startswith("spmsg"):
      return True
    else:
      return False

  def spamPrecision(Y_actual, Y_pred):
    trueSpam = 0
    predSpam = 0

    for y in Y_pred:
      if y == Utils.spam or y == Utils.spam2:
        predSpam += 1
    for i in range(0, Y_pred.shape[0]):
      if (Y_pred[i] == Utils.spam or Y_pred[i] == Utils.spam2) \
      and (Y_actual[i] == Utils.spam or Y_actual[i] == Utils.spam2):
        trueSpam += 1

    return trueSpam/predSpam

  def spamRecall(Y_actual, Y_pred):
    trueSpam = 0
    predSpam = 0

    for y in Y_actual:
      if y == Utils.spam or y == Utils.spam2:
        trueSpam += 1
    for i in range(0, Y_pred.shape[0]):
      if (Y_actual[i] == Utils.spam or Y_actual[i] == Utils.spam2) \
      and (Y_pred[i] == Utils.spam or Y_pred[i] == Utils.spam2):
        predSpam += 1

    return predSpam/trueSpam

  def accuracy(Y_actual, Y_pred):
    count = 0

    for i in range(0, Y_pred.shape[0]):
      if Y_actual[i] == Y_pred[i]:
        count += 1

    return (count/Y_pred.shape[0]) * 100


  def createSpamCSVFile():
    header = ["Model", "#Features", "Spam Precision", "Spam Recall", "Test Accuracy", "Latency"]
    with open(Utils.csvFile, mode='w', newline='') as csv_file:
      csvWriter = csv.writer(csv_file)
      csvWriter.writerow(header)

  def appendSpamCSVFile(model, numFeats, spamPrecision, spamRecall, accuracy, latency):
    data = [model, numFeats, spamPrecision, spamRecall, accuracy, latency]
    with open(Utils.csvFile, mode='a', newline='') as csv_file:
      csvWriter = csv.writer(csv_file)
      csvWriter.writerow(data)

  def displaySpamCSVFile():
    df = pd.read_csv(Utils.csvFile)
    return df

  #converts Y[spam,legit] symbol from [0,1] to [-1,1] for SVM
  def convertY(Y):
    for i in range(0, Y.shape[0]):
      if Y[i][0] == 0:
        Y[i][0] = -1

    return Y


# Dataset Class
1. Loads the data from the input files
2. Preprocess the emails like removal of punctutations
3. Calculate various metrics
4. Provide Training and Testing input features vector
5. Perform Feature Selection based on Information Gain(IG)
6. Provide top-N features

In [7]:
class Dataset:

  def __init__(self, datasetLocation, removePunctutations = False, size=10):
    self.location = datasetLocation
    self.featureIndexMap = dict()
    self.indexFeatureMap = dict()
    self.featureCount = []  # list of [#spam,#legit]
    self.spamCount = 0
    self.legitCount = 0
    self.totalCount = 0
    self.initialProcessing = False
    self.featureSelection = False
    self.featureSelectionSize = size
    self.removePunctutations = removePunctutations

  def initialFeaturesProcessing(self):
    featureIdx = 0
    for part in os.listdir(self.location):
      if part != "part10" and part != ".DS_Store":
        partPath = os.path.join(self.location, part)
        for filename in os.listdir(partPath):
          f = open(os.path.join(partPath, filename), "r")
          filedata = f.read()
          if self.removePunctutations:
            filedata = self._textCleanUp(filedata)
          filedata = filedata.split()
          currMsgFeatureSet = set()
          if Utils.isSpam(filename):
            self.spamCount += 1
          else:
            self.legitCount += 1

          for eachFeature in filedata:
            if eachFeature.casefold() == "Subject:".casefold() or eachFeature.casefold() == "Subject".casefold():
              continue
            if not eachFeature in self.featureIndexMap:
              self.featureIndexMap[eachFeature] = featureIdx
              self.indexFeatureMap[featureIdx] = eachFeature
              featureIdx = featureIdx + 1
              currMsgFeatureSet.add(eachFeature)
              if Utils.isSpam(filename):
                self.featureCount.append([1,0])
              else:
                self.featureCount.append([0,1])
            else:
              if not eachFeature in currMsgFeatureSet:
                currMsgFeatureSet.add(eachFeature)
                featIdx = self.featureIndexMap[eachFeature]
                if Utils.isSpam(filename):
                  self.featureCount[featIdx][0] = self.featureCount[featIdx][0] + 1
                else:
                  self.featureCount[featIdx][1] = self.featureCount[featIdx][1] + 1

    self.totalCount = self.spamCount + self.legitCount

  def getBooleanFeaturesVector(self):
    x = []
    y = []

    for part in os.listdir(self.location):
      if part != "part10" and part != ".DS_Store":
        partPath = os.path.join(self.location, part)
        for filename in os.listdir(partPath):
          f = open(os.path.join(partPath, filename), "r")
          filedata = f.read()
          if self.removePunctutations:
            filedata = self._textCleanUp(filedata)
          filedata = filedata.split()
          inputVector = np.zeros(len(self.featureIndexMap))

          for eachFeature in filedata:
            if eachFeature.casefold() == "Subject:".casefold() or eachFeature.casefold() == "Subject".casefold():
              continue
            featIdx = self.featureIndexMap[eachFeature]
            inputVector[featIdx] = 1

          if self.featureSelectionSize >= 0:
            topFeaturesIndex = self._featureSelection()
            x_top = [inputVector[i] for i in topFeaturesIndex]
            x.append(x_top)
          else:
            x.append(inputVector)

          if Utils.isSpam(filename):
            y.append(0)
          else:
            y.append(1)

    return np.vstack(x),np.vstack(y)

  def getTermFrequencyFeaturesVector(self):
    x = []
    y = []
    for part in os.listdir(self.location):
      if part != "part10" and part != ".DS_Store":
        partPath = os.path.join(self.location, part)
        for filename in os.listdir(partPath):
          f = open(os.path.join(partPath, filename), "r")
          filedata = f.read()
          if self.removePunctutations:
            filedata = self._textCleanUp(filedata)
          filedata = filedata.split()
          inputVector = np.zeros(len(self.featureIndexMap))

          for eachFeature in filedata:
            if eachFeature.casefold() == "Subject:".casefold() or eachFeature.casefold() == "Subject".casefold():
              continue
            featIdx = self.featureIndexMap[eachFeature]
            inputVector[featIdx] += 1

          if self.featureSelectionSize >= 0:
            topFeaturesIndex = self._featureSelection()
            x_top = [inputVector[i] for i in topFeaturesIndex]
            x.append(x_top)
          else:
            x.append(inputVector)

          if Utils.isSpam(filename):
            y.append(0)
          else:
            y.append(1)

    return np.vstack(x),np.vstack(y)

  def setFeatureSelectionSize(self, size):
    self.featureSelection = False
    self.featureSelectionSize = size

  def getTopFeatures(self):
    if self.featureSelectionSize < 0:
      print("Feature Selection is Disabled")
      exit()

    topFeaturesIndex = self._featureSelection()
    topFeatures = []

    for i in topFeaturesIndex:
      topFeatures.append(self.indexFeatureMap[i])

    return topFeatures

  def getTestBooleanFeaturesVector(self):
    testLocation = os.path.join(self.location, "part10")

    x = []
    y = []

    for filename in os.listdir(testLocation):
      f = open(os.path.join(testLocation, filename), "r")
      filedata = f.read()
      if self.removePunctutations:
        filedata = self._textCleanUp(filedata)
      filedata = filedata.split()
      inputVector = np.zeros(len(self.featureIndexMap))

      for eachFeature in filedata:
        if eachFeature.casefold() == "Subject:".casefold() or eachFeature.casefold() == "Subject".casefold():
          continue
        if eachFeature in self.featureIndexMap:
          featIdx = self.featureIndexMap[eachFeature]
          inputVector[featIdx] = 1

      if self.featureSelectionSize >= 0:
        topFeaturesIndex = self._featureSelection()
        x_top = [inputVector[i] for i in topFeaturesIndex]
        x.append(x_top)
      else:
        x.append(inputVector)


      if Utils.isSpam(filename):
        y.append(0)
      else:
        y.append(1)

    return np.vstack(x),np.vstack(y)

  def getTestTermFeaturesVector(self):
    testLocation = os.path.join(self.location, "part10")
    x = []
    y = []

    for filename in os.listdir(testLocation):
      f = open(os.path.join(testLocation, filename), "r")
      filedata = f.read()
      if self.removePunctutations:
        filedata = self._textCleanUp(filedata)
      filedata = filedata.split()
      inputVector = np.zeros(len(self.featureIndexMap))

      for eachFeature in filedata:
        if eachFeature.casefold() == "Subject:".casefold() or eachFeature.casefold() == "Subject".casefold():
          continue
        if eachFeature in self.featureIndexMap:
          featIdx = self.featureIndexMap[eachFeature]
          inputVector[featIdx] += 1

      if self.featureSelectionSize >= 0:
        topFeaturesIndex = self._featureSelection()
        x_top = [inputVector[i] for i in topFeaturesIndex]
        x.append(x_top)
      else:
        x.append(inputVector)

      if Utils.isSpam(filename):
        y.append(0)
      else:
        y.append(1)

    return np.vstack(x),np.vstack(y)


  def _textCleanUp(self, text) -> str:
    textWithoutPunctuation = [c for c in text if c not in string.punctuation]
    textWithoutPunctuation = ''.join(textWithoutPunctuation)
    return textWithoutPunctuation

  def _featureSelection(self, force=False):
    if self.featureSelection and not force:
      return self.topFeaturesIndex

    ig = self._informationGain()
    ig_sorted = dict(sorted(ig.items(), reverse=True))
    topFeaturesIndex = []
    b = False

    count = 0

    for items in ig_sorted.values():
      for idx in items:
        topFeaturesIndex.append(idx)
        count += 1
        if count == self.featureSelectionSize:
          b = True
          break
      if b:
        break

    self.featureSelection = True
    topFeaturesIndex.sort()
    self.topFeaturesIndex  = list(topFeaturesIndex)
    return self.topFeaturesIndex


  def _informationGain(self) -> dict():
    ig = dict()
    probSpam = self.spamCount/self.totalCount
    probLegit = self.legitCount/self.totalCount

    H_C = - (probSpam * np.log2(probSpam) + (probLegit) * np.log2(probLegit))

    featIdx = 0

    for feat in self.featureCount:
      featIdx += 1
      probXGivenSpam = feat[Utils.spam]/self.spamCount
      probXGivenLegit = feat[Utils.legit]/self.legitCount
      # probX =  (probXGivenSpam*probSpam) + (probXGivenLegit*probLegit)
      probX = (feat[Utils.spam]+feat[Utils.legit]) / self.totalCount
      probNotXGivenSpam = 1 - probXGivenSpam
      probNotXGivenLegit = 1 - probXGivenLegit
      probNotX = 1 - probX

      sum1 = probXGivenSpam*probSpam*(np.log2(probXGivenSpam*probSpam) - np.log2(probX))
      sum2 = probNotXGivenSpam*probSpam*(np.log2(probNotXGivenSpam*probSpam)- np.log2(probNotX))
      sum3 = probXGivenLegit*probLegit*(np.log2(probXGivenLegit*probLegit) - np.log2(probX))
      sum4 = probNotXGivenLegit*probLegit*(np.log2(probNotXGivenLegit*probLegit) - np.log2(probNotX))

      H_C_X = -(sum1+sum2+sum3+sum4)
      ig_x = H_C - H_C_X
      ig.setdefault(ig_x, []).append(featIdx)

    return ig



### Top 10 Feature when punctuations removal is disabled:

In [31]:
lingSpam = Dataset(Utils.emailDirectory, False, 10)
lingSpam.initialFeaturesProcessing()
print(lingSpam.getTopFeatures())

["'", 'phonetic', '(', ')', 'conference', 'hold', 'fujimura', 'department', 'speech', 'advertiser']


### Top 10 Feature when punctuations removal is enabled:

In [32]:
lingSpam = Dataset(Utils.emailDirectory, True, 10)
lingSpam.initialFeaturesProcessing()
print(lingSpam.getTopFeatures())

['98', 'phonetic', 'conference', 'hold', 'fujimura', 'department', 'speech', 'host', 'selfsustain', 'advertiser']


### For probabilistic models, we will use email text without punctuations removal.

In [33]:
class BernoulliNB:

  def __init__(self) -> None:
    self.spamCount = 0
    self.legitCount = 0

  def countLabels(self, Y_train):
    for label in Y_train:
      if label == Utils.spam:
        self.spamCount += 1
      else:
        self.legitCount += 1

  def fit(self, X_train, Y_train):
    self.numFeats = X_train.shape[1]
    self.numSamples = X_train.shape[0]
    self.alphaSpam = np.zeros(X_train.shape[1])
    self.alphaLegit = np.zeros(X_train.shape[1])

    self.countLabels(Y_train)

    self.probSpam = self.spamCount/self.numSamples
    self.probLegit = 1 - self.probSpam

    for i in range(0, self.numFeats):
      currSpamCount = 0
      currLegitCount = 0
      for j in range(0, self.numSamples):
        if Y_train[j] == Utils.spam:
          currSpamCount += X_train[j][i]
        else:
          currLegitCount += X_train[j][i]

      self.alphaSpam[i] = (currSpamCount+1)/(self.spamCount+2)
      self.alphaLegit[i] = (currLegitCount+1)/(self.legitCount+2)

  def evaluate(self, X_test):
    numSamples = X_test.shape[0]
    pred = []

    for sample in X_test:
      probXGivenSpam = 1
      probXGivenLegit = 1
      for i in range(0, self.numFeats):
        probXGivenSpam = probXGivenSpam * pow(self.alphaSpam[i], sample[i]) * pow(1 - self.alphaSpam[i], 1 - sample[i])
        probXGivenLegit = probXGivenLegit * pow(self.alphaLegit[i], sample[i]) * pow(1 - self.alphaLegit[i], 1 - sample[i])

      probX = (probXGivenSpam * self.probSpam) + (probXGivenLegit * self.probLegit)
      probSpamGivenX = np.log2(probXGivenSpam) + np.log2(self.probSpam) - np.log2(probX)
      probLegitGivenX = np.log2(probXGivenLegit) + np.log2(self.probLegit) - np.log2(probX)

      if probSpamGivenX > probLegitGivenX:
        pred.append(Utils.spam)
      else:
        pred.append(Utils.legit)

    return np.array(pred)

In [34]:
class MultinomialNB:

  def __init__(self) -> None:
    self.spamCount = 0
    self.legitCount = 0

  def countLabels(self, Y_train):
    for label in Y_train:
      if label == Utils.spam:
        self.spamCount += 1
      else:
        self.legitCount += 1

  def fit(self, X_train, Y_train):
    self.numFeats = X_train.shape[1]
    self.numSamples = X_train.shape[0]
    self.alphaSpam = np.zeros(X_train.shape[1])
    self.alphaLegit = np.zeros(X_train.shape[1])

    self.countLabels(Y_train)

    self.probSpam = self.spamCount/self.numSamples
    self.probLegit = 1 - self.probSpam

    spamFeatureOccurence = []
    legitFeatureOccurence = []

    for i in range(0, self.numFeats):
      currSpamCount = 0
      currLegitCount = 0
      for j in range(0, self.numSamples):
        if Y_train[j] == Utils.spam:
          currSpamCount += X_train[j][i]
        else:
          currLegitCount += X_train[j][i]

      spamFeatureOccurence.append(currSpamCount)
      legitFeatureOccurence.append(currLegitCount)


    N_I_S = sum(spamFeatureOccurence)
    N_I_L = sum(legitFeatureOccurence)

    for i in range(0, self.numFeats):
      self.alphaSpam[i] = (1+spamFeatureOccurence[i]) / (self.numFeats + N_I_S)
      self.alphaLegit[i] = (1+legitFeatureOccurence[i]) / (self.numFeats + N_I_L)


  def evaluate(self, X_test):
    numSamples = X_test.shape[0]
    pred = []

    for sample in X_test:
      # sampleLength = 0
      probXGivenSpam = 1
      probXGivenLegit = 1
      for i in range(0, self.numFeats):
        probXGivenSpam =  probXGivenSpam * pow(self.alphaSpam[i], sample[i])
        # probXGivenLegit = probXGivenLegit * (pow(self.alphaLegit[i], sample[i]) / factorial(int(sample[i])))
        probXGivenLegit = probXGivenLegit * pow(self.alphaLegit[i], sample[i])
        # sampleLength += sample[i]

      probSpamGivenX = probXGivenSpam * self.probSpam
      probLegitGivenX = probXGivenLegit * self.probLegit

      if probSpamGivenX > probLegitGivenX:
        pred.append(Utils.spam)
      else:
        pred.append(Utils.legit)

    return np.array(pred)


### Load data and pre-process it

In [35]:
lingSpam = Dataset(Utils.emailDirectory, False, 10)
lingSpam.initialFeaturesProcessing()

In [36]:
Utils.createSpamCSVFile()

# Bernoulli NB with binary features

### N = 10

In [37]:
X,Y = lingSpam.getBooleanFeaturesVector()

In [38]:
bernoulliClassifier = BernoulliNB()

#Training
bernoulliClassifier.fit(X,Y.ravel())

X_test, Y_test = lingSpam.getTestBooleanFeaturesVector()

startTime = time.time()

#Prediction
pred = bernoulliClassifier.evaluate(X_test)

endTime = time.time()

#Latency
elapsedTime = endTime - startTime

spamRecall = Utils.spamRecall(Y_test.ravel(), pred)
spamPrecision = Utils.spamPrecision(Y_test.ravel(), pred)
accuracy = Utils.accuracy(Y_test.ravel(), pred)

print("Bernoulli NB with 10 binary features")
print("Spam Recall: ", spamRecall, "Spam Precision: ", spamPrecision, "\nLatency: ", elapsedTime, "Accuracy: " , accuracy)

Utils.appendSpamCSVFile("Bernoulli NB with binary features", 10, spamPrecision, spamRecall, accuracy, elapsedTime)


Bernoulli NB with 10 binary features
Spam Recall:  0.4489795918367347 Spam Precision:  0.7096774193548387 
Latency:  0.008576154708862305 Accuracy:  87.62886597938144


### N = 100

In [39]:
lingSpam.setFeatureSelectionSize(100)
X,Y = lingSpam.getBooleanFeaturesVector()

In [40]:
bernoulliClassifier = BernoulliNB()
bernoulliClassifier.fit(X,Y.ravel())

X_test, Y_test = lingSpam.getTestBooleanFeaturesVector()

startTime = time.time()
pred = bernoulliClassifier.evaluate(X_test)
endTime = time.time()

#Latency
elapsedTime = endTime - startTime

spamRecall = Utils.spamRecall(Y_test.ravel(), pred)
spamPrecision = Utils.spamPrecision(Y_test.ravel(), pred)
accuracy = Utils.accuracy(Y_test.ravel(), pred)

print("Bernoulli NB with 100 binary features")
print("Spam Recall: ", spamRecall, "Spam Precision: ", spamPrecision, "\nLatency: ", elapsedTime, "Accuracy: " , accuracy)

Utils.appendSpamCSVFile("Bernoulli NB with binary features", 100, spamPrecision, spamRecall, accuracy, elapsedTime)

Bernoulli NB with 100 binary features
Spam Recall:  0.8367346938775511 Spam Precision:  0.8541666666666666 
Latency:  0.0713646411895752 Accuracy:  94.84536082474226


### N = 1000

In [41]:
lingSpam.setFeatureSelectionSize(1000)
X,Y = lingSpam.getBooleanFeaturesVector()

In [42]:
bernoulliClassifier = BernoulliNB()
bernoulliClassifier.fit(X,Y.ravel())

X_test, Y_test = lingSpam.getTestBooleanFeaturesVector()

startTime = time.time()
pred = bernoulliClassifier.evaluate(X_test)
endTime = time.time()

#Latency
elapsedTime = endTime - startTime

spamRecall = Utils.spamRecall(Y_test.ravel(), pred)
spamPrecision = Utils.spamPrecision(Y_test.ravel(), pred)
accuracy = Utils.accuracy(Y_test.ravel(), pred)

print("Bernoulli NB with 1000 binary features")
print("Spam Recall: ", spamRecall, "Spam Precision: ", spamPrecision, "\nLatency: ", elapsedTime, "Accuracy: " , accuracy)

Utils.appendSpamCSVFile("Bernoulli NB with binary features", 1000, spamPrecision, spamRecall, accuracy, elapsedTime)

Bernoulli NB with 1000 binary features
Spam Recall:  0.8979591836734694 Spam Precision:  0.9565217391304348 
Latency:  0.6290507316589355 Accuracy:  97.59450171821305


# Multinomial NB with binary features

### N = 10

In [43]:
lingSpam.setFeatureSelectionSize(10)
X_train,Y_train = lingSpam.getBooleanFeaturesVector()

In [44]:
multinomialClassifier = MultinomialNB()
multinomialClassifier.fit(X_train,Y_train.ravel())

X_test, Y_test = lingSpam.getTestBooleanFeaturesVector()

startTime = time.time()
pred = multinomialClassifier.evaluate(X_test)
endTime = time.time()

#Latency
elapsedTime = endTime - startTime

spamRecall = Utils.spamRecall(Y_test.ravel(), pred)
spamPrecision = Utils.spamPrecision(Y_test.ravel(), pred)
accuracy = Utils.accuracy(Y_test.ravel(), pred)

print("Multinomial NB with 10 binary features")
print("Spam Recall: ", spamRecall, "Spam Precision: ", spamPrecision, "\nLatency: ", elapsedTime, "Accuracy: " , accuracy)

Utils.appendSpamCSVFile("Multinomial NB with binary features", 10, spamPrecision, spamRecall, accuracy, elapsedTime)

Multinomial NB with 10 binary features
Spam Recall:  0.061224489795918366 Spam Precision:  1.0 
Latency:  0.0028810501098632812 Accuracy:  84.19243986254295


### N = 100

In [45]:
lingSpam.setFeatureSelectionSize(100)
X_train,Y_train = lingSpam.getBooleanFeaturesVector()

In [46]:
multinomialClassifier = MultinomialNB()
multinomialClassifier.fit(X_train,Y_train.ravel())

X_test, Y_test = lingSpam.getTestBooleanFeaturesVector()

startTime = time.time()
pred = multinomialClassifier.evaluate(X_test)
endTime = time.time()

#Latency
elapsedTime = endTime - startTime

spamRecall = Utils.spamRecall(Y_test.ravel(), pred)
spamPrecision = Utils.spamPrecision(Y_test.ravel(), pred)

accuracy = Utils.accuracy(Y_test.ravel(), pred)

print("Multinomial NB with 100 binary features")
print("Spam Recall: ", spamRecall, "Spam Precision: ", spamPrecision, "\nLatency: ", elapsedTime, "Accuracy: " , accuracy)

Utils.appendSpamCSVFile("Multinomial NB with binary features", 100, spamPrecision, spamRecall, accuracy, elapsedTime)

Multinomial NB with 100 binary features
Spam Recall:  0.4897959183673469 Spam Precision:  0.8888888888888888 
Latency:  0.02376389503479004 Accuracy:  90.37800687285224


### N = 1000

In [47]:
lingSpam.setFeatureSelectionSize(1000)
X_train,Y_train = lingSpam.getBooleanFeaturesVector()

In [48]:
multinomialClassifier = MultinomialNB()
multinomialClassifier.fit(X_train,Y_train.ravel())

X_test, Y_test = lingSpam.getTestBooleanFeaturesVector()

startTime = time.time()
pred = multinomialClassifier.evaluate(X_test)
endTime = time.time()

#Latency
elapsedTime = endTime - startTime

spamRecall = Utils.spamRecall(Y_test.ravel(), pred)
spamPrecision = Utils.spamPrecision(Y_test.ravel(), pred)

accuracy = Utils.accuracy(Y_test.ravel(), pred)

print("Multinomial NB with 1000 binary features")
print("Spam Recall: ", spamRecall, "Spam Precision: ", spamPrecision, "\nLatency: ", elapsedTime, "Accuracy: " , accuracy)

Utils.appendSpamCSVFile("Multinomial NB with binary features", 1000, spamPrecision, spamRecall, accuracy, elapsedTime)

Multinomial NB with 1000 binary features
Spam Recall:  0.5918367346938775 Spam Precision:  1.0 
Latency:  0.5207004547119141 Accuracy:  93.12714776632302


# Multinomial NB with term frequency (TF) features

### N = 10

In [49]:
lingSpam.setFeatureSelectionSize(10)
X_train,Y_train = lingSpam.getTermFrequencyFeaturesVector()

In [50]:
multinomialClassifier = MultinomialNB()
multinomialClassifier.fit(X_train,Y_train.ravel())

X_test, Y_test = lingSpam.getTestTermFeaturesVector()

startTime = time.time()
pred = multinomialClassifier.evaluate(X_test)
endTime = time.time()

#Latency
elapsedTime = endTime - startTime

spamRecall = Utils.spamRecall(Y_test.ravel(), pred)
spamPrecision = Utils.spamPrecision(Y_test.ravel(), pred)

accuracy = Utils.accuracy(Y_test.ravel(), pred)

print("Multinomial NB with 10 Term Frequency features")
print("Spam Recall: ", spamRecall, "Spam Precision: ", spamPrecision, "\nLatency: ", elapsedTime, "Accuracy: " , accuracy)

Utils.appendSpamCSVFile("Multinomial NB with Term Frequency features", 10, spamPrecision, spamRecall, accuracy, elapsedTime)

Multinomial NB with 10 Term Frequency features
Spam Recall:  0.20408163265306123 Spam Precision:  0.38461538461538464 
Latency:  0.0028467178344726562 Accuracy:  81.09965635738831


### N = 100

In [51]:
lingSpam.setFeatureSelectionSize(100)
X_train,Y_train = lingSpam.getTermFrequencyFeaturesVector()

In [52]:
multinomialClassifier = MultinomialNB()
multinomialClassifier.fit(X_train,Y_train.ravel())

X_test, Y_test = lingSpam.getTestTermFeaturesVector()

startTime = time.time()
pred = multinomialClassifier.evaluate(X_test)
endTime = time.time()

#Latency
elapsedTime = endTime - startTime

spamRecall = Utils.spamRecall(Y_test.ravel(), pred)
spamPrecision = Utils.spamPrecision(Y_test.ravel(), pred)

accuracy = Utils.accuracy(Y_test.ravel(), pred)

print("Multinomial NB with 100 Term Frequency features")
print("Spam Recall: ", spamRecall, "Spam Precision: ", spamPrecision, "\nLatency: ", elapsedTime, "Accuracy: " , accuracy)

Utils.appendSpamCSVFile("Multinomial NB with Term Frequency features", 100, spamPrecision, spamRecall, accuracy, elapsedTime)

Multinomial NB with 100 Term Frequency features
Spam Recall:  0.6938775510204082 Spam Precision:  0.85 
Latency:  0.023176908493041992 Accuracy:  92.78350515463917


### N = 1000

In [53]:
lingSpam.setFeatureSelectionSize(1000)
X_train,Y_train = lingSpam.getTermFrequencyFeaturesVector()

In [54]:
multinomialClassifier = MultinomialNB()
multinomialClassifier.fit(X_train,Y_train.ravel())

X_test, Y_test = lingSpam.getTestTermFeaturesVector()

startTime = time.time()
pred = multinomialClassifier.evaluate(X_test)
endTime = time.time()

#Latency
elapsedTime = endTime - startTime

spamRecall = Utils.spamRecall(Y_test.ravel(), pred)
spamPrecision = Utils.spamPrecision(Y_test.ravel(), pred)

accuracy = Utils.accuracy(Y_test.ravel(), pred)

print("Multinomial NB with 1000 Term Frequency features")
print("Spam Recall: ", spamRecall, "Spam Precision: ", spamPrecision, "\nLatency: ", elapsedTime, "Accuracy: " , accuracy)

Utils.appendSpamCSVFile("Multinomial NB with Term Frequency features", 1000, spamPrecision, spamRecall, accuracy, elapsedTime)


Multinomial NB with 1000 Term Frequency features
Spam Recall:  0.22448979591836735 Spam Precision:  0.7333333333333333 
Latency:  0.244551420211792 Accuracy:  85.56701030927834


In [55]:
Utils.displaySpamCSVFile()

Unnamed: 0,Model,#Features,Spam Precision,Spam Recall,Test Accuracy,Latency
0,Bernoulli NB with binary features,10,0.709677,0.44898,87.628866,0.008576
1,Bernoulli NB with binary features,100,0.854167,0.836735,94.845361,0.071365
2,Bernoulli NB with binary features,1000,0.956522,0.897959,97.594502,0.629051
3,Multinomial NB with binary features,10,1.0,0.061224,84.19244,0.002881
4,Multinomial NB with binary features,100,0.888889,0.489796,90.378007,0.023764
5,Multinomial NB with binary features,1000,1.0,0.591837,93.127148,0.5207
6,Multinomial NB with Term Frequency features,10,0.384615,0.204082,81.099656,0.002847
7,Multinomial NB with Term Frequency features,100,0.85,0.693878,92.783505,0.023177
8,Multinomial NB with Term Frequency features,1000,0.733333,0.22449,85.56701,0.244551


# SVM based Email Spam Filter

For SVM based classifier, we pre-processed the email text by removing punctuations.

We used **term frequency features** for **SVM model** training and testing.

We did not perform feature selection for SVM based classifier as we got very good test accuracy with full feature set without much latency.

In [29]:
class Kernel:
  #Input Size: NxD, NxD
  #Output Size: NxN

  def linearKernel(X, Y):
    return np.dot(X, Y.T)

  def polyKernel(X, Y, deg):
    return np.power(np.dot(X,Y.T) + 1, deg)

  def rbfKernel(X, Y, gamma):
    return rbf_kernel(X, Y, gamma)



In [18]:
class SVM:

  MIN_SUPPORT_VECTOR_MULTIPLIER = 1e-5

  def __init__(self, kernel = "linear", deg = 1, gamma = 1, C = 1) -> None:
    self.kernel = kernel
    self.deg = deg
    self.gamma = gamma
    self.C = C

  def _kernelMat(self, X, Y):
    if self.kernel == "poly":
      K = Kernel.polyKernel(X, Y, self.deg)
      return K

    elif self.kernel == "rbf":
      K = Kernel.rbfKernel(X, Y, self.gamma)
      return K

    else:
      K = Kernel.linearKernel(X, Y)
      return K


  def fit(self, X_train, Y_train):

    n = X_train.shape[0]

    P = matrix(np.dot(Y_train, Y_train.T) * self._kernelMat(X_train, X_train))

    q = matrix(np.ones((n, 1)) * -1)

    # G = matrix(np.eye(n) * -1)
    # h = matrix(np.zeros(n))

    G = matrix(np.vstack((np.eye(n) * -1, np.eye(n))))
    h = matrix(np.hstack((np.zeros(n), np.ones(n) * self.C)))

    A = matrix(Y_train.reshape(1, -1), (1,n), 'd')
    b = matrix(np.zeros(1))

    solution = solvers.qp(P, q, G, h, A, b)
    alphas = np.array(solution['x'])

    indices = (alphas > self.MIN_SUPPORT_VECTOR_MULTIPLIER).flatten()
    self.supportVectorMultipliers = alphas[indices]
    self.supportVectorX = X_train[indices]
    self.supportVectorY = Y_train[indices]

    W_X_i = self._kernelMat(self.supportVectorX, self.supportVectorX) * self.supportVectorY * self.supportVectorMultipliers
    W_X = np.sum(W_X_i, axis = 0)

    b_i = self.supportVectorY - W_X
    self.b = np.mean(b_i)


  def predict(self, X_test):
    W_U_i = self._kernelMat(self.supportVectorX, X_test) * self.supportVectorY * self.supportVectorMultipliers
    W_U = np.sum(W_U_i, axis = 0)
    pred = W_U + self.b
    return np.sign(pred).astype(int)

In [19]:
#Dataset for SVM based Classifier
lingSpam = Dataset(Utils.emailDirectory, True, -1)
lingSpam.initialFeaturesProcessing()

In [20]:
X,Y = lingSpam.getTermFrequencyFeaturesVector()
Y = Utils.convertY(Y)

In [21]:
# Set the number of folds for cross-validation
num_folds = 5

# Define a list of hyperparameters to search
C_values = [0.1, 1, 10, 100]
gamma_values = [0.001, 0.01, 0.1, 0.5, 1]

rbf_best_accuracy = 0.0
rbf_best_C = None
rbf_best_gamma = None
iter = 0

# Perform grid search cross-validation
for C in C_values:
    for gamma in gamma_values:
        accuracies = []
        for fold in range(num_folds):
            # Split the data into training and validation sets
            fold_size = len(X) // num_folds
            validation_indices = range(fold * fold_size, (fold + 1) * fold_size)
            training_indices = [i for i in range(len(X)) if i not in validation_indices]

            X_train, Y_train = X[training_indices], Y[training_indices]
            X_val, Y_val = X[validation_indices], Y[validation_indices]

            # Train an SVM model
            svm = SVM(kernel='rbf', C=C, gamma=gamma)
            svm.fit(X_train, Y_train)

            # Evaluate the model on the validation set
            pred = svm.predict(X_val)
            accuracy = Utils.accuracy(Y_val.ravel(), pred)
            accuracies.append(accuracy)

        iter += 1
        mean_accuracy = np.mean(accuracies)
        print("Iteration: ", iter, "Accuracy: ", mean_accuracy)
        if mean_accuracy > rbf_best_accuracy:
            rbf_best_accuracy = mean_accuracy
            rbf_best_C = C
            rbf_best_gamma = gamma

     pcost       dcost       gap    pres   dres
 0: -2.2274e+02 -4.3832e+02  1e+04  2e+01  1e-14
 1: -9.6742e+01 -3.8961e+02  1e+03  2e+00  9e-15
 2: -6.3744e+01 -2.0188e+02  1e+02  4e-15  4e-15
 3: -6.5349e+01 -9.5795e+01  3e+01  6e-15  3e-15
 4: -6.6577e+01 -7.7512e+01  1e+01  5e-15  3e-15
 5: -6.7839e+01 -7.1248e+01  3e+00  2e-15  2e-15
 6: -6.8354e+01 -6.9670e+01  1e+00  2e-15  3e-15
 7: -6.8613e+01 -6.9007e+01  4e-01  6e-15  3e-15
 8: -6.8717e+01 -6.8780e+01  6e-02  2e-15  3e-15
 9: -6.8739e+01 -6.8742e+01  3e-03  2e-15  3e-15
10: -6.8740e+01 -6.8740e+01  8e-05  2e-15  3e-15
11: -6.8740e+01 -6.8740e+01  2e-06  1e-14  3e-15
Optimal solution found.
     pcost       dcost       gap    pres   dres
 0: -2.0275e+02 -4.3898e+02  1e+04  2e+01  7e-15
 1: -8.9096e+01 -3.8975e+02  1e+03  2e+00  7e-15
 2: -5.6662e+01 -1.9795e+02  1e+02  1e-14  3e-15
 3: -5.8082e+01 -8.6947e+01  3e+01  5e-15  2e-15
 4: -5.8671e+01 -7.7593e+01  2e+01  3e-15  2e-15
 5: -5.9906e+01 -6.6064e+01  6e+00  1e-15  2e-1

In [22]:
# Set the number of folds for cross-validation
num_folds = 5

# Define a list of hyperparameters to search
C_values = [0.1, 1, 10, 100]

linear_best_accuracy = 0.0
linear_best_C = None

# Perform grid search cross-validation
for C in C_values:
    accuracies = []
    for fold in range(num_folds):
        # Split the data into training and validation sets
        fold_size = len(X) // num_folds
        validation_indices = range(fold * fold_size, (fold + 1) * fold_size)
        training_indices = [i for i in range(len(X)) if i not in validation_indices]

        X_train, Y_train = X[training_indices], Y[training_indices]
        X_val, Y_val = X[validation_indices], Y[validation_indices]

        # Train an SVM model
        svm = SVM(kernel='linear', C=C)
        svm.fit(X_train, Y_train)

        # Evaluate the model on the validation set
        pred = svm.predict(X_val)
        accuracy = Utils.accuracy(Y_val.ravel(), pred)
        accuracies.append(accuracy)

    iter += 1
    mean_accuracy = np.mean(accuracies)
    print("Iteration: ", iter, "Accuracy: ", mean_accuracy)
    if mean_accuracy > linear_best_accuracy:
        linear_best_accuracy = mean_accuracy
        linear_best_C = C

     pcost       dcost       gap    pres   dres
 0: -4.6662e+00 -2.5448e+02  6e+03  2e+01  4e-14
 1: -4.3494e+00 -2.2095e+02  5e+02  9e-01  4e-14
 2: -3.5053e+00 -1.3239e+02  2e+02  3e-01  3e-14
 3: -2.7761e+00 -6.6992e+01  1e+02  1e-01  3e-14
 4: -1.9955e+00 -4.7231e+01  7e+01  5e-02  2e-14
 5: -1.8344e+00 -3.1829e+01  4e+01  3e-02  1e-14
 6: -1.4332e+00 -1.8465e+01  2e+01  1e-02  6e-14
 7: -1.2604e+00 -9.9578e+00  1e+01  4e-03  2e-14
 8: -1.1320e+00 -3.9989e+00  3e+00  7e-04  1e-14
 9: -1.3397e+00 -2.2276e+00  1e+00  2e-04  1e-14
10: -1.4126e+00 -1.7364e+00  3e-01  2e-16  9e-15
11: -1.4873e+00 -1.5685e+00  8e-02  3e-16  6e-15
12: -1.5120e+00 -1.5221e+00  1e-02  5e-16  7e-15
13: -1.5160e+00 -1.5163e+00  3e-04  6e-16  6e-15
14: -1.5161e+00 -1.5161e+00  9e-06  3e-16  7e-15
15: -1.5161e+00 -1.5161e+00  2e-07  3e-16  8e-15
Optimal solution found.
     pcost       dcost       gap    pres   dres
 0: -3.8761e+00 -2.4788e+02  6e+03  2e+01  5e-14
 1: -3.5767e+00 -2.1393e+02  5e+02  7e-01  6e-1

## SVM Parameters

### For Kernel: RBF

In [27]:
print("C:", rbf_best_C) # Regularization parameter
print("Gamma:", rbf_best_gamma)

print("Accuracy:", rbf_best_accuracy)

C: 100
Gamma: 0.001
Accuracy: 96.96153846153847


### For Kernel: Linear

In [28]:
print("C:", linear_best_C) # Regularization parameter

print("Accuracy:", linear_best_accuracy)

C: 0.1
Accuracy: 98.34615384615384


In [23]:
best_model = None
best_C = None
best_gamma = 1

if rbf_best_accuracy > linear_best_accuracy:
  best_model = "rbf"
  best_C = rbf_best_C
  best_gamma = rbf_best_gamma
else:
  best_model = "linear"
  best_C = linear_best_C

print(best_model)

linear


In [25]:
svm = SVM(kernel=best_model, C=best_C, gamma=best_gamma)
svm.fit(X,Y)

     pcost       dcost       gap    pres   dres
 0: -5.6583e+00 -3.2954e+02  9e+03  2e+01  1e-13
 1: -5.1817e+00 -2.8738e+02  7e+02  9e-01  1e-13
 2: -4.1381e+00 -1.6189e+02  3e+02  3e-01  7e-14
 3: -3.2152e+00 -8.3297e+01  1e+02  1e-01  4e-14
 4: -2.0206e+00 -5.9462e+01  8e+01  5e-02  2e-14
 5: -1.6877e+00 -3.8637e+01  5e+01  2e-02  3e-14
 6: -1.5316e+00 -2.1715e+01  3e+01  1e-02  3e-14
 7: -1.3915e+00 -1.0324e+01  1e+01  3e-03  1e-14
 8: -1.2781e+00 -4.4021e+00  4e+00  6e-04  2e-14
 9: -1.4977e+00 -2.4210e+00  1e+00  1e-04  1e-14
10: -1.5953e+00 -1.8676e+00  3e-01  2e-16  1e-14
11: -1.6644e+00 -1.7285e+00  6e-02  2e-16  7e-15
12: -1.6830e+00 -1.6984e+00  2e-02  3e-16  7e-15
13: -1.6883e+00 -1.6901e+00  2e-03  2e-16  7e-15
14: -1.6891e+00 -1.6891e+00  4e-05  2e-16  7e-15
15: -1.6891e+00 -1.6891e+00  6e-07  2e-16  7e-15
Optimal solution found.


In [26]:
X_test, Y_test = lingSpam.getTestTermFeaturesVector()
Y_test = Utils.convertY(Y_test)

startTime = time.time()
pred = svm.predict(X_test)
endTime = time.time()

#Latency
elapsedTime = endTime - startTime

spamRecall = Utils.spamRecall(Y_test.ravel(), pred)
spamPrecision = Utils.spamPrecision(Y_test.ravel(), pred)
accuracy = Utils.accuracy(Y_test.ravel(), pred)

print("SVM with Term Frequency features")
print("Spam Recall: ", spamRecall, "Spam Precision: ", spamPrecision, "\nLatency: ", elapsedTime, "Accuracy: " , accuracy)

SVM with Term Frequency features
Spam Recall:  0.9591836734693877 Spam Precision:  0.8867924528301887 
Latency:  0.32692956924438477 Accuracy:  97.2508591065292


# END