In [167]:
import numpy as np
import pandas as pd
import nltk
from math import log, sqrt
from nltk.corpus import stopwords
from nltk import PorterStemmer as Stemmer
import re
import pickle
import os

In [168]:
# Make sure the NLTK stop-word corpus read by stopwords.words('english') is available locally.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [169]:
# import zipfile

# zip_ref = zipfile.ZipFile('/content/drive/MyDrive/data.zip', 'r')
# zip_ref.extractall('/content/dataset')
# zip_ref.close()

In [170]:
def process_kaggle(split, path='/content/dataset/emails.csv', seed=None):
  """Load the Kaggle e-mail CSV, shuffle its rows and split them into train/test.

  Args:
    split: Fraction of the rows (between 0 and 1) that goes into the training part.
    path: CSV file to read. Defaults to the location that used to be hard-coded.
    seed: Optional shuffle seed. When given, rows are permuted with a dedicated
      ``np.random.default_rng(seed)`` generator so the split is reproducible.
      When ``None`` the global NumPy RNG is used, exactly as before.

  Returns:
    Tuple ``(train_data, test_data)`` of NumPy arrays holding the shuffled rows.

  Raises:
    ValueError: If ``split`` lies outside ``[0, 1]``.
  """
  if not 0 <= split <= 1:
    raise ValueError(f"split must be between 0 and 1, got {split!r}")

  data = pd.read_csv(path).to_numpy()
  if seed is None:
    # Historic behaviour: in-place shuffle driven by the global RNG state.
    np.random.shuffle(data)
  else:
    # Index permutation keeps whole rows together and works for any dtype.
    data = data[np.random.default_rng(seed).permutation(len(data))]

  data_split = int(len(data)*split)
  return data[:data_split], data[data_split:]

In [171]:
def process_enron(split, path='/content/dataset/enron_spam_data.csv', seed=None):
  """Load the Enron spam CSV and return shuffled train/test arrays.

  Rows with a missing 'Subject', 'Message' or 'Spam/Ham' value are dropped.
  The subject and the message body are joined with a single space, and the
  textual labels 'ham' / 'spam' are converted to 0 / 1 (any other label value
  is passed through unchanged).

  Args:
    split: Fraction of the rows (between 0 and 1) that goes into the training part.
    path: CSV file to read. Defaults to the location that used to be hard-coded.
    seed: Optional shuffle seed. When given, rows are permuted with a dedicated
      ``np.random.default_rng(seed)`` generator so the split is reproducible.
      When ``None`` the global NumPy RNG is used, exactly as before.

  Returns:
    Tuple ``(train_data, test_data)``; each is a two-column NumPy array whose
    rows are ``[mail_text, label]``.

  Raises:
    ValueError: If ``split`` lies outside ``[0, 1]``.
  """
  if not 0 <= split <= 1:
    raise ValueError(f"split must be between 0 and 1, got {split!r}")

  raw = pd.read_csv(path)
  # dropna() yields a fresh frame; building a new DataFrame from it (instead of
  # assigning a column onto a filtered slice) avoids SettingWithCopyWarning.
  complete = raw[['Subject', 'Message', 'Spam/Ham']].dropna()
  label_codes = {'ham': 0, 'spam': 1}
  prepared = pd.DataFrame({
    'Message': complete['Subject'] + ' ' + complete['Message'],
    'Spam/Ham': complete['Spam/Ham'].map(lambda label: label_codes.get(label, label)),
  })

  data = prepared.to_numpy()
  if seed is None:
    # Historic behaviour: in-place shuffle driven by the global RNG state.
    np.random.shuffle(data)
  else:
    # Index permutation keeps whole rows together and works for object arrays.
    data = data[np.random.default_rng(seed).permutation(len(data))]

  data_split = int(len(data)*split)
  return data[:data_split], data[data_split:]

In [172]:
class LogisticRegression():
  """Bag-of-words logistic-regression classifier trained by gradient ascent.

  Every mail is turned into a binary presence vector over the vocabulary
  collected from the training mails. The model maximises the L2-regularised
  log-likelihood with either full-batch or mini-batch gradient ascent.

  Attributes set by ``__init__``:
    stop_words: English stop words removed during tokenisation.
    words: Mapping ``token -> column index`` of the vocabulary.

  Attributes set by ``train``:
    train_mails: 2-D float array, one presence vector per training mail.
    train_targets: Labels (0 or 1) of the training mails.
    train_size: Number of training mails.
    reg_param: L2 regularisation strength.
    weights, bias: Model parameters.
    mail_batches, target_batches: Mini-batches (only when ``batch_size`` is given).
  """

  def __init__(self):
    self.stop_words = stopwords.words('english')
    self.words = {}

  def tokenize(self, doc):
    """Return the sorted unique tokens of ``doc``.

    The text is lower-cased and split into ``\\w+`` words. When ``stop_words``
    is non-empty, stop words are removed and the remaining words are stemmed;
    when it is empty the raw words are returned unstemmed (kept as-is from the
    original behaviour).
    """
    tokens = re.findall(r'\b\w+\b', doc.lower())
    if self.stop_words:
      stemmer = Stemmer()
      ignored = set(self.stop_words)  # set lookup instead of a list scan per token
      tokens = [stemmer.stem(t) for t in tokens if t not in ignored]
    return np.unique(tokens)

  def get_words(self, docs):
    """Add every unseen token of ``docs`` to the vocabulary ``self.words``.

    New tokens get the next free index, so calling this more than once extends
    the vocabulary instead of handing out indices that are already in use.
    """
    for doc in docs:
      for token in self.tokenize(doc):
        if token not in self.words:
          self.words[str(token)] = len(self.words)

  def vectorize(self, x):
    """Return the binary presence vector of text ``x`` over the vocabulary."""
    vector = np.zeros(len(self.words))
    for token in self.tokenize(x):
      if token in self.words:
        vector[self.words[token]] = 1
    return vector

  def train(self, train_data, epochs, rate, reg_param, batch_size=None):
    """Fit the model.

    Args:
      train_data: 2-D array whose rows are ``[mail_text, label]`` with label 0 or 1.
      epochs: Number of passes over the training data.
      rate: Learning rate.
      reg_param: L2 regularisation strength.
      batch_size: When truthy, train with mini-batches of this size; otherwise
        each epoch performs one full-batch update.

    Raises:
      ValueError: If ``train_data`` has no rows.
    """
    mails = train_data[:, 0]
    if len(mails) == 0:
      raise ValueError("train_data must contain at least one row")

    self.get_words(mails)
    vocab_size = len(self.words)
    self.train_mails = np.array(
      [self.vectorize(mail) for mail in mails], dtype=float
    ).reshape(len(mails), vocab_size)
    self.train_targets = train_data[:, 1]
    self.train_size = len(self.train_targets)
    self.reg_param = reg_param
    self.weights = np.zeros(vocab_size, dtype=float)
    self.bias = 0.0

    if batch_size:
      starts = range(0, self.train_size, batch_size)
      self.mail_batches = [self.train_mails[k:k+batch_size] for k in starts]
      self.target_batches = [self.train_targets[k:k+batch_size] for k in starts]

    # Report roughly ten times per run; never 0, which used to raise
    # ZeroDivisionError whenever epochs < 10.
    report_every = max(1, epochs // 10)
    for e in range(epochs):
      if batch_size:
        self.Stochastic_Gradient_Ascent(rate)
      else:
        self.Gradient_Ascent(rate)

      if e % report_every == 0:
        likelihood = self.total_likelihood()
        print("epoch: ", e, "\tlikelihood: ", likelihood, "\tL2 likelihood: ", likelihood + self._l2_penalty())

    print(self.weights)
    print(self.bias)

  def test(self, test_data):
    """Classify ``test_data`` and print the confusion matrix and summary metrics.

    Args:
      test_data: 2-D array whose rows are ``[mail_text, label]`` with label 0 or 1.

    The confusion matrix is indexed ``[prediction][target]``.
    """
    confusion_mat = np.zeros((2, 2), dtype=int)
    for mail, target in zip(test_data[:, 0], test_data[:, 1]):
      prediction = 0 if self.predict(mail) < 0.5 else 1
      confusion_mat[prediction][int(target)] += 1

    print("Confusion Matrix:")
    print(confusion_mat)

    # Calculate statistics
    true_positive = confusion_mat[1][1]
    true_negative = confusion_mat[0][0]
    false_positive = confusion_mat[1][0]
    false_negative = confusion_mat[0][1]

    total = np.sum(confusion_mat)
    # Guard against 0/0 on an empty test set, like the other metrics do.
    accuracy = (true_positive + true_negative) / total if total > 0 else 0
    precision = true_positive / (true_positive + false_positive) if (true_positive + false_positive) > 0 else 0
    recall = true_positive / (true_positive + false_negative) if (true_positive + false_negative) > 0 else 0
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1_score:.4f}")

  def _gradient(self, mails, targets):
    """Return ``(dll_dw, dll_db)``, the log-likelihood gradient summed over a batch.

    For the logistic model the per-sample derivative
    ``(y/f)*f' - ((1-y)/(1-f))*f'`` with ``f' = f*(1-f)`` simplifies to ``y - f``.
    Using that form avoids dividing by a saturated ``f`` and makes sure each
    sample's own error (not a running sum) multiplies its feature vector.
    """
    features = np.asarray(mails, dtype=float)
    labels = np.asarray(targets, dtype=float)
    errors = labels - self.sigmoid(features @ self.weights + self.bias)
    return errors @ features, errors.sum()

  def _l2_penalty(self):
    """Return the L2 term added to the averaged negative log-likelihood."""
    squared_norm = np.dot(self.weights, self.weights) + self.bias*self.bias
    return (self.reg_param/2)*squared_norm/self.train_size

  def Gradient_Ascent(self, rate):
    """Perform one full-batch gradient-ascent update of weights and bias."""
    dll_dw, dll_db = self._gradient(self.train_mails, self.train_targets)
    size = float(self.train_size)
    self.weights += rate*(dll_dw - self.reg_param*self.weights)/size
    self.bias += rate*(dll_db - self.reg_param*self.bias)/size

  def Stochastic_Gradient_Ascent(self, rate):
    """Perform one epoch of mini-batch updates over the prepared batches."""
    for mail_batch, target_batch in zip(self.mail_batches, self.target_batches):
      dll_dw, dll_db = self._gradient(mail_batch, target_batch)
      batch_len = float(len(mail_batch))
      self.weights += rate*(dll_dw - self.reg_param*self.weights)/batch_len
      self.bias += rate*(dll_db - self.reg_param*self.bias)/batch_len

  def total_likelihood(self):
    """Return the negative log-likelihood averaged over the training mails."""
    likelihood = sum(
      self.log_likelihood(mail, target)
      for mail, target in zip(self.train_mails, self.train_targets)
    )
    return -likelihood/self.train_size

  def total_likelihood_l2(self):
    """Return ``total_likelihood()`` plus the L2 penalty on weights and bias."""
    return self.total_likelihood() + self._l2_penalty()

  def log_likelihood(self, x, y):
    """Return the log-likelihood of label ``y`` for feature vector ``x``.

    Uses ``log(sigmoid(z)) = -logaddexp(0, -z)`` and
    ``log(1 - sigmoid(z)) = -logaddexp(0, z)`` so that saturated predictions
    no longer evaluate ``log(0)``.
    """
    z = np.dot(x, self.weights) + self.bias
    return float(-(y*np.logaddexp(0, -z) + (1 - y)*np.logaddexp(0, z)))

  def predict(self, x):
    """Return the predicted probability of class 1 for raw mail text ``x``."""
    z = np.dot(self.vectorize(x), self.weights) + self.bias
    return self.sigmoid(z)

  def sigmoid(self, z):
    """Logistic function of a scalar or array ``z``, evaluated without overflow."""
    z = np.asarray(z, dtype=float)
    # exp(-|z|) lies in (0, 1], so neither branch can overflow. For z >= 0
    # this is the textbook 1/(1+exp(-z)).
    decay = np.exp(-np.abs(z))
    # [()] turns a 0-d result back into a scalar and leaves arrays untouched.
    return np.where(z >= 0, 1/(1 + decay), decay/(1 + decay))[()]

  def save(self, path):
    """Pickle the whole model object to ``path``."""
    with open(path, 'wb') as f:
      pickle.dump(self, f)

  @classmethod
  def load(cls, path):
    """Return the model pickled at ``path``, or ``None`` when the file is missing.

    SECURITY: ``pickle.load`` can execute arbitrary code contained in the
    file; only load model files from a trusted source.
    """
    if not os.path.exists(path):
      print("Model parameters not saved: Cannot load the file, ", path)
      return None
    with open(path, 'rb') as f:
      return pickle.load(f)


In [173]:
# train_data, test_data = process_kaggle(0.8)
# Path of the pickled model file (the commented-out load cell further down passes it to LogisticRegression.load).
model_path = '/content/drive/MyDrive/Colab Notebooks/saved-models/logistic-kaggle-model.pkl'

In [174]:
# model = LogisticRegression()
# model.train(train_data, epochs=50, rate=0.01, reg_param=0.1, batch_size = 30)
# model.test(test_data)

In [175]:
# model = LogisticRegression.load(model_path)

0.31774501554554435