In [None]:
import numpy as np
import pandas as pd
import nltk
from math import log
from nltk.corpus import stopwords
from nltk import PorterStemmer as Stemmer
import re

In [None]:
# Fetch the NLTK stop-word corpus that `stopwords.words('english')` reads in the classifier.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
import zipfile

# Unpack the dataset archive into /content/dataset.
# The `with` block closes the archive even when extraction raises, which the
# previous explicit open/close pair did not guarantee.
with zipfile.ZipFile('/content/drive/MyDrive/data.zip', 'r') as zip_ref:
  zip_ref.extractall('/content/dataset')

In [None]:
# Column positions inside each dataset row (rows are indexed positionally after `.to_numpy()`).
feature_index = 1  # position of the mail text that is tokenized and classified
target_index = 0  # position of the label; rows with 0 are treated as ham, 1 as spam

In [None]:
def count_words(message):
  """Tally how often each item occurs in `message`.

  Args:
    message: any iterable of hashable items (e.g. a list of tokens).

  Returns:
    A dict mapping each distinct item to its number of occurrences, with
    keys in order of first appearance.
  """
  tally = {}
  for item in message:
    # A missing key starts from 0, so first and repeat sightings share one path.
    tally[item] = tally.get(item, 0) + 1
  return tally

In [None]:
def process_kaggle(split):
  """Load emails.csv, shuffle its rows and cut them into train/test parts.

  Args:
    split: fraction of the rows placed in the training part.

  Returns:
    (train_data, test_data, total_prob_ham, total_prob_spam). The two
    probabilities are the share of rows whose `target_index` entry equals 0,
    and its complement, measured over all rows (train and test together).
  """
  rows = pd.read_csv('/content/dataset/emails.csv').to_numpy()
  np.random.shuffle(rows)  # in-place row shuffle using the global NumPy RNG

  n_rows = len(rows)
  cut = int(n_rows * split)
  train_rows = rows[:cut]
  test_rows = rows[cut:]

  ham_rows = sum(1 for row in rows if row[target_index] == 0)
  ham_share = ham_rows / n_rows
  return train_rows, test_rows, ham_share, 1 - ham_share

In [None]:
def process_enron(split):
  """Load the Enron spam CSV and split it into shuffled train/test arrays.

  Rows missing a subject, message or label are dropped. Subject and message
  are joined with a single space into one text column, and the 'ham'/'spam'
  labels are mapped to 0/1.

  Args:
    split: fraction of the rows placed in the training part.

  Returns:
    (train_data, test_data, total_prob_ham, total_prob_spam). The two
    probabilities are the share of rows whose `target_index` entry equals 0,
    and its complement, measured over all rows (train and test together).

  Raises:
    ValueError: if the label column holds anything other than 'ham' or 'spam'.
  """
  raw = pd.read_csv('/content/dataset/enron_spam_data.csv')
  # Keep only rows where subject, message and label are all present.
  complete = raw[['Subject', 'Message', 'Spam/Ham']].dropna()

  # An explicit mapping yields integer labels without relying on `replace`
  # downcasting an object column, and unknown labels surface as NaN.
  labels = complete['Spam/Ham'].map({'ham': 0, 'spam': 1})
  if labels.isna().any():
    unexpected = sorted(set(complete.loc[labels.isna(), 'Spam/Ham'].astype(str)))
    raise ValueError('unexpected Spam/Ham labels: {}'.format(unexpected))

  # Build the two-column frame in one step instead of assigning columns onto a
  # filtered slice, which triggers pandas' SettingWithCopyWarning.
  # NOTE(review): the resulting column order is [text, label]; the module-level
  # feature_index/target_index presumably need to be 0/1 for this dataset - confirm.
  data = pd.DataFrame({
      'Message': complete['Subject'] + ' ' + complete['Message'],
      'Spam/Ham': labels.astype(int),
  }).to_numpy()

  np.random.shuffle(data)  # in-place row shuffle using the global NumPy RNG
  data_split = int(len(data) * split)
  train_data, test_data = data[:data_split], data[data_split:]
  total_prob_ham = sum(1 for d in data if d[target_index] == 0) / len(data)
  total_prob_spam = 1 - total_prob_ham
  return train_data, test_data, total_prob_ham, total_prob_spam

In [None]:
def process_mix(split):
  """Load data.csv, shuffle its rows and cut them into train/test parts.

  Args:
    split: fraction of the rows placed in the training part.

  Returns:
    (train_data, test_data, total_prob_ham, total_prob_spam). The two
    probabilities are the share of rows whose `target_index` entry equals 0,
    and its complement, measured over all rows (train and test together).
  """
  samples = pd.read_csv('/content/dataset/data.csv').to_numpy()
  np.random.shuffle(samples)  # in-place row shuffle using the global NumPy RNG

  total = len(samples)
  boundary = int(total * split)

  ham_total = sum(1 for sample in samples if sample[target_index] == 0)
  prob_ham = ham_total / total
  prob_spam = 1 - prob_ham
  return samples[:boundary], samples[boundary:], prob_ham, prob_spam

In [None]:
class NB_Classifier(object):
    """Naive Bayes spam/ham classifier over the set of stemmed words in a mail.

    Labels are 0 for ham and 1 for spam. Every mail is reduced to its unique,
    stop-word-filtered, stemmed tokens. `train` estimates, per class, the
    add-one-smoothed fraction of training mails that contain each vocabulary
    word; `classify_mail` compares the summed log-probabilities of the
    vocabulary words present in a mail, starting from the log class priors.

    Attributes set by `__init__`:
        feature_index, target_index: column positions of text and label,
            copied from the module-level settings at construction time.
        hams, spams: training mail texts with label 0 and label 1.
        test_data: the held-out rows scored by `evaluate` / `test`.
        stop_words: set of English stop words removed before stemming.
        stemmer: the stemmer instance reused for every mail.
        words: dict mapping each vocabulary token to its column index.

    Attributes set by `train`:
        prior_ham, prior_spam: class shares among the training mails.
        ham_probs, spam_probs: per-word smoothed presence probabilities.
    """

    def __init__(self, train_data, test_data):
        """Split the training rows by label and index their vocabulary.

        Args:
            train_data: rows where `row[target_index]` is the 0/1 label and
                `row[feature_index]` is the mail text.
            test_data: rows with the same layout, kept for `evaluate`/`test`.
        """
        # Copy the column configuration so that training and evaluation use
        # the same layout even if the module-level values are edited later.
        self.feature_index = feature_index
        self.target_index = target_index
        self.hams = [row[self.feature_index] for row in train_data
                     if row[self.target_index] == 0]
        self.spams = [row[self.feature_index] for row in train_data
                      if row[self.target_index] == 1]
        # Previously the argument was dropped and `test` read a global.
        self.test_data = test_data
        # A set makes the per-token stop-word check a hash lookup.
        self.stop_words = set(stopwords.words('english'))
        # One stemmer for the whole run instead of one per tokenized mail.
        self.stemmer = Stemmer()
        self.words = {}
        self.get_words(self.hams + self.spams)

    def get_words(self, docs):
        """Add every unseen token of `docs` to the vocabulary `self.words`."""
        for doc in docs:
            for token in self.tokenize(doc):
                if token not in self.words:
                    # The next free column is the current vocabulary size, so
                    # calling this method again never reuses an index.
                    self.words[token] = len(self.words)

    def tfidf(self, docs):
        """Return a dense (len(docs), len(self.words)) 0/1 presence matrix.

        Despite the name, no tf-idf weighting is applied: entry [i, j] is 1
        when vocabulary word j occurs in docs[i] and 0 otherwise. Tokens that
        are not in the vocabulary are ignored.
        """
        F = np.zeros((len(docs), len(self.words)))
        for i, doc in enumerate(docs):
            # `tokenize` already returns each token once per mail.
            for token in self.tokenize(doc):
                j = self.words.get(token)
                if j is not None:
                    F[i, j] = 1
        return F

    def _document_frequencies(self, docs):
        """Count, for each vocabulary word, how many of `docs` contain it.

        Gives the same values as `self.tfidf(docs).sum(axis=0)` without
        allocating the docs-by-vocabulary matrix.
        """
        freqs = np.zeros(len(self.words))
        for doc in docs:
            for token in self.tokenize(doc):
                j = self.words.get(token)
                if j is not None:
                    freqs[j] += 1
        return freqs

    def train(self):
        """Estimate class priors and per-word presence probabilities.

        Raises:
            ValueError: if the training data has no ham or no spam mails,
                because a zero prior cannot be log-transformed.
        """
        ham_size = len(self.hams)
        spam_size = len(self.spams)
        if ham_size == 0 or spam_size == 0:
            raise ValueError(
                'training data must contain at least one ham and one spam mail')

        # Priors come from the training mails only, so held-out rows do not
        # influence the model and no module-level variables are needed.
        total = ham_size + spam_size
        self.prior_ham = ham_size / total
        self.prior_spam = spam_size / total

        # Add-one smoothing over the two outcomes (word present / absent).
        self.ham_probs = (self._document_frequencies(self.hams) + 1) / (ham_size + 2)
        self.spam_probs = (self._document_frequencies(self.spams) + 1) / (spam_size + 2)

    def evaluate(self):
        """Classify every row of `self.test_data`.

        Returns:
            A 2x2 integer array indexed as [prediction][target].
        """
        confusion_mat = np.zeros((2, 2), dtype=int)
        for mail in self.test_data:
            prediction = self.classify_mail(mail[self.feature_index])
            target = mail[self.target_index]
            confusion_mat[prediction, target] += 1
        return confusion_mat

    def test(self):
        """Print the confusion matrix of the stored test data."""
        print(self.evaluate())

    def classify_mail(self, mail):
        """Return 1 (spam) or 0 (ham) for the text `mail`.

        Only words present in the mail and known to the vocabulary contribute;
        ties are resolved as spam. `train` must have been called first.
        """
        prob_ham = log(self.prior_ham)
        prob_spam = log(self.prior_spam)

        for word in self.tokenize(mail):
            index = self.words.get(word)
            if index is not None:
                prob_ham += log(self.ham_probs[index])
                prob_spam += log(self.spam_probs[index])

        return 1 if prob_spam >= prob_ham else 0

    def tokenize(self, doc):
        """Return the sorted unique stemmed tokens of `doc` as a NumPy array.

        The text is lower-cased and split on word boundaries; stop words are
        removed before stemming. Stemming no longer depends on the stop-word
        collection being non-empty.
        """
        tokens = re.findall(r'\b\w+\b', doc.lower())
        stems = [self.stemmer.stem(t) for t in tokens if t not in self.stop_words]
        return np.unique(stems)



In [None]:
# Load the mixed dataset; 0.8 puts the first 80% of the shuffled rows into the training split.
train_data, test_data, total_prob_ham, total_prob_spam = process_mix(0.8)

In [None]:
# Construct the classifier from the two splits; the constructor indexes the vocabulary of the training mails.
model = NB_Classifier(train_data, test_data)

In [None]:
# Estimate the smoothed per-word probabilities for the ham and spam classes.
model.train()

In [None]:
# Print the 2x2 confusion matrix: rows are the predicted class, columns the true class (0 = ham, 1 = spam).
model.test()

[[7880 1418]
 [  25 7367]]


In [4]:
import os
# Switch the working directory to the notebooks folder.
# NOTE(review): the path is relative to the current working directory, so this
# presumably fails when the cell is re-run after the directory has changed - confirm.
os.chdir('Colab Notebooks')

In [5]:
# List the contents of the current working directory.
!ls

logistic-filter.ipynb	    spam-filter.ipynb  svm-spam.ipynb	Untitled1.ipynb
logistic-spam-filter.ipynb  svm-filter	       Untitled0.ipynb
