# Naive Bayes and Logistic Regression

## Creating the Datasets

In [1]:
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.model_selection import train_test_split

import random
import os

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /Users/raaed/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/raaed/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /Users/raaed/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

bow_vectorizer = CountVectorizer()
brnli_vectorizer = CountVectorizer(binary = True)

add_one = 1

In [3]:
def load_dataset(filepath) -> list[tuple]:
    ham_data = []
    spam_data = []

    for dirpath, dirnames, filenames in os.walk(filepath):
        for filename in filenames:
            with open(os.path.join(dirpath,filename), encoding = 'iso-8859-1') as f:
                if 'ham' in filename:
                    ham_data.append(f.read())
                elif 'spam' in filename:
                    spam_data.append(f.read())
    
    # assign 1/0 to samples and combine data
    positive_samples = [(email, 1) for email in ham_data]
    negative_samples = [(email, 0) for email in spam_data]

    all_samples = positive_samples + negative_samples
    random.shuffle(all_samples)

    #remove stop words and lemmatize dataset
    filtered_dataset = []
    for text, y in all_samples:
        words = nltk.word_tokenize(text)
        words = [word for word in words if word.lower() not in stop_words]
        words = [lemmatizer.lemmatize(word) for word in words]
        fil_text = " ".join(words)
        filtered_dataset.append((fil_text, y))
    return filtered_dataset# [(text, 0), (text, 1)...] 


In [4]:
# Specify the path each of the datasets. Point to the directory containing the ham, spam directories - 
enron1_test = load_dataset('project1_datasets/enron1/test')
enron1_train = load_dataset('project1_datasets/enron1 2/train')

enron2_test = load_dataset('project1_datasets/test')
enron2_train = load_dataset('project1_datasets/train')

enron4_test = load_dataset('project1_datasets/enron4/test')
enron4_train = load_dataset('project1_datasets/enron4 2/train')

In [5]:
#convert the text data to bernoulli and bow models
def convert_to_bow(dataset):
    emails = [email for email, y in dataset]
    c = [y for email, y in dataset]

    bow_matrix = bow_vectorizer.fit_transform(emails)
    return bow_matrix, np.array(c)


def convert_to_bernoulli(dataset):
    emails = [email for email, y in dataset]
    c = [y for email, y in dataset]

    brnli_matrix = brnli_vectorizer.fit_transform(emails)
    return brnli_matrix, np.array(c)

In [6]:
# Bag Of Words datasets
e1_Xtrain_bow, e1_ytrain_bow = convert_to_bow(enron1_train)
e1_Xtest_bow, e1_ytest_bow = convert_to_bow(enron1_test)

e2_Xtrain_bow, e2_ytrain_bow = convert_to_bow(enron2_train)
e2_Xtest_bow, e2_ytest_bow = convert_to_bow(enron2_test)

e4_Xtrain_bow, e4_ytrain_bow = convert_to_bow(enron4_train)
e4_Xtest_bow, e4_ytest_bow = convert_to_bow(enron4_test)

# Bernoulli datasets
e1_Xtrain_bli, e1_ytrain_bli = convert_to_bernoulli(enron1_train)
e1_Xtest_bli, e1_ytest_bli = convert_to_bernoulli(enron1_test)

e2_Xtrain_bli, e2_ytrain_bli = convert_to_bernoulli(enron2_train)
e2_Xtest_bli, e2_ytest_bli = convert_to_bernoulli(enron2_test)

e4_Xtrain_bli, e4_ytrain_bli = convert_to_bernoulli(enron4_train)
e4_Xtest_bli, e4_ytest_bli = convert_to_bernoulli(enron4_test)


## Implementing Multinomial Naive Bayes

In [7]:
class MultinomialNaiveBayes:
    def train(self, X, y):
        self.classes = np.unique(y)
        self.parameters = []
        self.vocab = np.sum(X, axis = 0)
        self.features = X.shape[1]

        for c in self.classes:
            X_c = X[y == c]
            param_c = {}
            param_c['prior'] = np.log(X_c.shape[0] / float(X.shape[0]))
            param_c["word_count"] = np.sum(X_c, axis = 0)
            param_c['word_prob'] = np.log((param_c["word_count"] + add_one) / (np.sum(param_c["word_count"]) + X.shape[1]))
            self.parameters.append(param_c)

            
    def predict(self, X):
        results = []
        for x in X:
            class_scores = []
            for c in self.classes:
                word_prob = self.parameters[c]['word_prob']
                log_prob = self.parameters[c]['prior']
                
                common_words = np.isin(word_prob.nonzero()[1], x.nonzero()[1])
                
                x = x[:, common_words]
                word_prob_common = word_prob[:, common_words]
            
                log_prob += np.sum(np.sum(np.multiply(x, word_prob_common)))
                class_scores.append(log_prob)
            results.append(self.classes[np.argmax(class_scores)]) 
        return np.array(results)


### Results on Bag of Words Datasets

In [8]:
def mnb_evaluate(X_train, y_train, X_test, y_test, name):
    mbn = MultinomialNaiveBayes()
    mbn.train(X_train, y_train)
    predictions = mbn.predict(X_test)

    acc = accuracy_score(y_test, predictions)
    f1 = f1_score(y_test, predictions)
    recall = recall_score(y_test, predictions)
    precision = precision_score(y_test, predictions)

    print(f'Multinomial Naive Bayes Results on {name} Dataset')
    print(f'Accuracy = {acc}')
    print(f'Precision = {precision}')
    print(f'Recall = {recall}')
    print(f'F1 score = {f1}')

In [9]:
mnb_evaluate(e1_Xtrain_bow, e1_ytrain_bow, e1_Xtest_bow, e1_ytest_bow, 'Enron 1') # approx 3 secs

Multinomial Naive Bayes Results on Enron 1 Dataset
Accuracy = 0.6776315789473685
Precision = 0.7010050251256281
Recall = 0.9087947882736156
F1 score = 0.7914893617021277


In [10]:
mnb_evaluate(e2_Xtrain_bow, e2_ytrain_bow, e2_Xtest_bow, e2_ytest_bow, 'Enron 2') # approx 3 secs 

Multinomial Naive Bayes Results on Enron 2 Dataset
Accuracy = 0.7238493723849372
Precision = 0.7523364485981309
Recall = 0.9252873563218391
F1 score = 0.8298969072164949


In [11]:
mnb_evaluate(e4_Xtrain_bow, e4_ytrain_bow, e4_Xtest_bow, e4_ytest_bow, 'Enron 4') # approx 4 secs

Multinomial Naive Bayes Results on Enron 4 Dataset
Accuracy = 0.27992633517495397
Precision = 0.27992633517495397
Recall = 1.0
F1 score = 0.4374100719424461


## Implement Discrete Naive Bayes

In [12]:
class DiscreteNaiveBayes:
    def train(self, X, y):
        self.classes = np.unique(y)
        self.probabilities = {}
        self.prior = {}
        self.features = X.shape[1]
        
        for c in self.classes:
            X_c = X[y == c]
            self.prior[c] = np.log(X_c.shape[0] / X.shape[0])
            
            p = np.log((X_c.sum(axis = 0) + add_one )/ (X_c.sum() + 2))
            self.probabilities[c] = p

    def predict(self, X):
        results = []
        X = X[:, : self.features]
        X = X.toarray()
        for x in X: 
            class_scores = []
            for c in self.classes:
                p = self.probabilities[c]
                prior = self.prior[c]
                p = np.asarray(p).flatten()
                if len(p) > len(x):
                    diff = len(p) - len(x)
                    x = np.pad(x, (0, diff), 'constant')
                posterior = prior + ((x * p) + ((1 - x) * (1 - p))).sum()
                class_scores.append(posterior)
                     
            results.append(self.classes[np.argmax(class_scores)])
        return results

### Results on Bernoulli Datasets

In [13]:
def dnb_evaluate(X_train, y_train, X_test, y_test, name):
    dnb = DiscreteNaiveBayes()
    dnb.train(X_train, y_train)
    predictions = dnb.predict(X_test)

    acc = accuracy_score(y_test, predictions)
    f1 = f1_score(y_test, predictions)
    recall = recall_score(y_test, predictions)
    precision = precision_score(y_test, predictions)

    print(f'Discrete Naive Bayes Results on {name} Dataset')
    print(f'Accuracy = {acc}')
    print(f'Precision = {precision}')
    print(f'Recall = {recall}')
    print(f'F1 score = {f1}')

In [14]:
dnb_evaluate(e1_Xtrain_bli, e1_ytrain_bli, e1_Xtest_bli, e1_ytest_bli, 'Enron 1') 

Discrete Naive Bayes Results on Enron 1 Dataset
Accuracy = 0.6732456140350878
Precision = 0.6732456140350878
Recall = 1.0
F1 score = 0.8047182175622543


In [15]:
dnb_evaluate(e2_Xtrain_bli, e2_ytrain_bli, e2_Xtest_bli, e2_ytest_bli, 'Enron 2')

Discrete Naive Bayes Results on Enron 2 Dataset
Accuracy = 0.7280334728033473
Precision = 0.7280334728033473
Recall = 1.0
F1 score = 0.8426150121065376


In [16]:
dnb_evaluate(e4_Xtrain_bli, e4_ytrain_bli, e4_Xtest_bli, e4_ytest_bli, 'Enron 4')

Discrete Naive Bayes Results on Enron 4 Dataset
Accuracy = 0.7200736648250461
Precision = 0.0
Recall = 0.0
F1 score = 0.0


  _warn_prf(average, modifier, msg_start, len(result))


## Logistic Regression

In [17]:
class LogisticRegression:
    def __init__(self, l2 = 0.1):
        self.weights = None
        self.num_features = None
        self.lambda_ = l2

    def sigmoid(self, x):
        return 1 / (1 + np.exp(-x))

    def gradient(self, X, y, h):
        m = len(y)
        return (np.dot(X.T, (y - h)) / m) - ((self.lambda_ * np.concatenate([[0], self.weights[1:]])) / m)

    def loss(self, y, h, weights):
        m = len(y)
        return np.mean(y * np.log(h + 10**-9) + (1 - y) * np.log(1 - h + 10**-9)) - (self.lambda_ * np.sum(weights[1:] ** 2) / 2)

    def train(self, X, y, learning_rate = 0.01, max_iter = 10000):
        X = X.toarray()
        self.num_features = X.shape[1]
        X = np.hstack((np.ones((X.shape[0],1)), X))
        m, n = X.shape
        self.weights = np.zeros(n)

        for i in range(max_iter):
            z = np.dot(X, self.weights)
            pred = self.sigmoid(z)
            loss = self.loss(y, pred, self.weights)
            grad = self.gradient(X, y, pred)
            self.weights += learning_rate * grad # gradient ascent 
    
    def predict(self, X):
        X = X.toarray()
        X = X[:,:self.num_features]

        if X.shape[1] < len(self.weights):
            self.weights = self.weights[:X.shape[1]+1]
        z = np.dot(X, self.weights[1:]) + self.weights[0]
        y_pred = self.sigmoid(z)
        return np.round(y_pred)


In [18]:
grid = [0.001, 0.01, 0.1, 1]
def cross_validation(X, y, grid):
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.3, random_state = 42) # 70/30 train validation split
    best_f1 = 0 
    lambda_ = None
    
    for l in grid: # train LR using each value of lambda
        lr = LogisticRegression(l2 = l) 
        lr.train(X_train, y_train, max_iter = 5000)
        pred = lr.predict(X_val)

        f1 = f1_score(y_val, pred)
        if f1 > best_f1:
            best_f1 = f1
            lambda_ = l

    return lambda_
    

### Results

In [19]:
def lr_evaluate(X_train, y_train, X_test, y_test, name):
    lambda_ = cross_validation(X_train, y_train, grid)

    lr = LogisticRegression(l2 = lambda_)
    lr.train(X_train, y_train, max_iter = 10000)
    predictions = lr.predict(X_test)

    acc = accuracy_score(y_test, predictions)
    f1 = f1_score(y_test, predictions)
    recall = recall_score(y_test, predictions)
    precision = precision_score(y_test, predictions)

    print(f'Logistic Regresstion Results on {name} Dataset')
    print(f'Accuracy = {acc}')
    print(f'Precision = {precision}')
    print(f'Recall = {recall}')
    print(f'F1 score = {f1}')
    

In [20]:
lr_evaluate(e1_Xtrain_bow, e1_ytrain_bow, e1_Xtest_bow, e1_ytest_bow, 'Enron 1 BOW') # approx 30 s

Logistic Regresstion Results on Enron 1 BOW Dataset
Accuracy = 0.43201754385964913
Precision = 0.7727272727272727
Recall = 0.22149837133550487
F1 score = 0.34430379746835443


In [21]:
lr_evaluate(e1_Xtrain_bli, e1_ytrain_bli, e1_Xtest_bli, e1_ytest_bli, 'Enron 1 Bernoulli') # approx 30 s

Logistic Regresstion Results on Enron 1 Bernoulli Dataset
Accuracy = 0.41228070175438597
Precision = 0.7407407407407407
Recall = 0.19543973941368079
F1 score = 0.3092783505154639


In [22]:
lr_evaluate(e2_Xtrain_bow, e2_ytrain_bow, e2_Xtest_bow, e2_ytest_bow, 'Enron 2 BOW') # approx 35 s

Logistic Regresstion Results on Enron 2 BOW Dataset
Accuracy = 0.4497907949790795
Precision = 0.8102189781021898
Recall = 0.31896551724137934
F1 score = 0.45773195876288664


In [23]:
lr_evaluate(e2_Xtrain_bli, e2_ytrain_bli, e2_Xtest_bli, e2_ytest_bli, 'Enron 2 Bernoulli') # approx 35 s

Logistic Regresstion Results on Enron 2 Bernoulli Dataset
Accuracy = 0.5209205020920502
Precision = 0.7741935483870968
Recall = 0.4827586206896552
F1 score = 0.5946902654867257


In [24]:
lr_evaluate(e4_Xtrain_bow, e4_ytrain_bow, e4_Xtest_bow, e4_ytest_bow, 'Enron 4 BOW') # approx 1 min

Logistic Regresstion Results on Enron 4 BOW Dataset
Accuracy = 0.7182320441988951
Precision = 0.0
Recall = 0.0
F1 score = 0.0


In [25]:
lr_evaluate(e4_Xtrain_bli, e4_ytrain_bli, e4_Xtest_bli, e4_ytest_bli, 'Enron 4 Bernoulli') # approx 1 min

Logistic Regresstion Results on Enron 4 Bernoulli Dataset
Accuracy = 0.7200736648250461
Precision = 0.0
Recall = 0.0
F1 score = 0.0


  _warn_prf(average, modifier, msg_start, len(result))


## SGDClassifer from scikit-learn

### Results on datasets

In [26]:
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV

def sgd_eval(X_train, y_train, X_test, y_test, name):
    sgd = SGDClassifier(max_iter=5000)

    param_grid = {
        'loss': ['hinge', 'log_loss', 'modified_huber'],
        'penalty': ['l1', 'l2', 'elasticnet'],
        'alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100]
    }

    grid_search = GridSearchCV(sgd, param_grid, cv = 5)
    grid_search.fit(X_train, y_train)

    best_estimator = grid_search.best_estimator_

    X_test.resize((X_test.shape[0], X_train.shape[1]))

    predictions = best_estimator.predict(X_test)

    acc = accuracy_score(y_test, predictions)
    f1 = f1_score(y_test, predictions)
    recall = recall_score(y_test, predictions)
    precision = precision_score(y_test, predictions)

    print(f'Logistic Regresstion Results on {name} Dataset')
    print(f'Accuracy = {acc}')
    print(f'Precision = {precision}')
    print(f'Recall = {recall}')
    print(f'F1 score = {f1}')



In [27]:
sgd_eval(e1_Xtrain_bow, e1_ytrain_bow, e1_Xtest_bow, e1_ytest_bow, 'Enron 1 BOW') # all are approx 1 sec

Logistic Regresstion Results on Enron 1 BOW Dataset
Accuracy = 0.5087719298245614
Precision = 0.6666666666666666
Recall = 0.5407166123778502
F1 score = 0.5971223021582733


In [28]:
sgd_eval(e1_Xtrain_bli, e1_ytrain_bli, e1_Xtest_bli, e1_ytest_bli, 'Enron 1 Bernoulli')

Logistic Regresstion Results on Enron 1 Bernoulli Dataset
Accuracy = 0.5109649122807017
Precision = 0.7164948453608248
Recall = 0.4527687296416938
F1 score = 0.5548902195608783


In [29]:
sgd_eval(e2_Xtrain_bow, e2_ytrain_bow, e2_Xtest_bow, e2_ytest_bow, 'Enron 2 BOW')

Logistic Regresstion Results on Enron 2 BOW Dataset
Accuracy = 0.4100418410041841
Precision = 0.7704918032786885
Recall = 0.27011494252873564
F1 score = 0.4


In [30]:
sgd_eval(e2_Xtrain_bli, e2_ytrain_bli, e2_Xtest_bli, e2_ytest_bli, 'Enron 2 Bernoulli') 

Logistic Regresstion Results on Enron 2 Bernoulli Dataset
Accuracy = 0.3619246861924686
Precision = 0.7415730337078652
Recall = 0.1896551724137931
F1 score = 0.3020594965675057


In [31]:
sgd_eval(e4_Xtrain_bow, e4_ytrain_bow, e4_Xtest_bow, e4_ytest_bow, 'Enron 4 BOW') # approx 3 secs



Logistic Regresstion Results on Enron 4 BOW Dataset
Accuracy = 0.6703499079189686
Precision = 0.23529411764705882
Recall = 0.07894736842105263
F1 score = 0.11822660098522168


In [32]:
sgd_eval(e4_Xtrain_bli, e4_ytrain_bli, e4_Xtest_bli, e4_ytest_bli, 'Enron 4 Bernoulli') 

Logistic Regresstion Results on Enron 4 Bernoulli Dataset
Accuracy = 0.7090239410681399
Precision = 0.4858490566037736
Recall = 0.6776315789473685
F1 score = 0.565934065934066
