# Naive Bayes and Logistic Regression

### Creating the Datasets

In [128]:
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

import random
import os

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /Users/raaed/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/raaed/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /Users/raaed/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [31]:
import numpy as np
arr = np.array([[0, 1, 0], [2, 0, 3]])
result = arr.nonzero()
print(result)

(array([0, 1, 1]), array([1, 0, 2]))


In [2]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
bow_vectorizer = CountVectorizer()
brnli_vectorizer = CountVectorizer(binary = True)
add_one = 1

In [3]:
def load_dataset(filepath) -> list[tuple]:
    ham_data = []
    spam_data = []

    for dirpath, dirnames, filenames in os.walk(filepath):
        for filename in filenames:
            with open(os.path.join(dirpath,filename), encoding = 'iso-8859-1') as f:
                if 'ham' in filename:
                    ham_data.append(f.read())
                elif 'spam' in filename:
                    spam_data.append(f.read())
    
    # assign 1/0 to samples and combine data
    positive_samples = [(email, 1) for email in ham_data]
    negative_samples = [(email, 0) for email in spam_data]

    all_samples = positive_samples + negative_samples
    random.shuffle(all_samples)

    #remove stop words and lemmatize dataset
    filtered_dataset = []
    for text, y in all_samples:
        words = nltk.word_tokenize(text)
        words = [word for word in words if word.lower() not in stop_words]
        words = [lemmatizer.lemmatize(word) for word in words]
        fil_text = " ".join(words)
        filtered_dataset.append((fil_text, y))
    return filtered_dataset# [(text, 0), (text, 1)...] 

# Specify the path each of the datasets. Point to the directory containing the ham, spam directories - 
enron1_test = load_dataset('project1_datasets/enron1/test')
enron1_train = load_dataset('project1_datasets/enron1 2/train')

enron2_test = load_dataset('project1_datasets/test')
enron2_train = load_dataset('project1_datasets/train')

enron4_test = load_dataset('project1_datasets/enron4/test')
enron4_train = load_dataset('project1_datasets/enron4 2/train')

In [4]:
#convert the text data to bernoulli and bow models
def convert_to_bow(dataset):
    emails = [email for email, y in dataset]
    c = [y for email, y in dataset]

    bow_matrix = bow_vectorizer.fit_transform(emails)
    return bow_matrix, np.array(c)


def convert_to_bernoulli(dataset):
    emails = [email for email, y in dataset]
    c = [y for email, y in dataset]

    brnli_matrix = brnli_vectorizer.fit_transform(emails)
    return brnli_matrix, np.array(c)
    
# Bag Of Words datasets
e1_Xtrain_bow, e1_ytrain_bow = convert_to_bow(enron1_train)
e1_Xtest_bow, e1_ytest_bow = convert_to_bow(enron1_test)

e2_Xtrain_bow, e2_ytrain_bow = convert_to_bow(enron2_train)
e2_Xtest_bow, e2_ytest_bow = convert_to_bow(enron2_test)

e4_Xtrain_bow, e4_ytrain_bow = convert_to_bow(enron4_train)
e4_Xtest_bow, e4_ytest_bow = convert_to_bow(enron4_test)

# Bernoulli datasets
e1_Xtrain_bli, e1_ytrain_bli = convert_to_bernoulli(enron1_train)
e1_Xtest_bli, e1_ytest_bli = convert_to_bernoulli(enron1_test)

e2_Xtrain_bli, e2_ytrain_bli = convert_to_bernoulli(enron2_train)
e2_Xtest_bli, e2_ytest_bli = convert_to_bernoulli(enron2_test)

e4_Xtrain_bli, e4_ytrain_bli = convert_to_bernoulli(enron4_train)
e4_Xtest_bli, e4_ytest_bli = convert_to_bernoulli(enron4_test)


In [5]:
print(e1_Xtrain_bow.shape)
print(e1_Xtest_bow.shape)
print(len(e1_ytrain_bow))

(450, 9370)
(456, 10822)
450


### Implementing Multinomial Naive Bayes

In [105]:
class MultinomialNaiveBayes:
    def train(self, X, y):
        self.classes = np.unique(y)
        self.parameters = []
        self.vocab = np.sum(X, axis = 0)

        for c in self.classes:
            X_c = X[y == c]
            param_c = {}
            param_c['prior'] = np.log(X_c.shape[0] / float(X.shape[0]))
            param_c["word_count"] = np.sum(X_c, axis = 0)
            # print(param_c["word_count"].shape)
            param_c['word_prob'] = np.log((param_c["word_count"] + add_one) / (np.sum(param_c["word_count"]) + X.shape[1]))
            self.parameters.append(param_c)

            
    def predict(self, X):
        results = []
        for x in X:
            class_scores = []
            # x = x.toarray()
            for c in self.classes:
                word_prob = self.parameters[c]['word_prob']
                log_prob = self.parameters[c]['prior']
                
                common_words = np.isin(word_prob.nonzero()[1], x.nonzero()[1])
                x = x[:, common_words]
                word_prob_common = word_prob[:, common_words]
            
                log_prob += np.sum(np.sum(np.multiply(x, word_prob_common)))
                class_scores.append(log_prob)
            results.append(self.classes[np.argmax(class_scores)]) 
        return np.array(results)

mbn = MultinomialNaiveBayes()
mbn.train(e1_Xtrain_bow, e1_ytrain_bow)
predictions = mbn.predict(e1_Xtest_bow)


In [130]:
print(type(predictions.tolist()), type(e1_ytest_bow.tolist()))
matrix = confusion_matrix(e1_ytest_bow, predictions)
matrix

<class 'list'> <class 'list'>


array([[ 30, 119],
       [ 28, 279]])

### Implement Discrete Naive Bayes