# Naive Bayes and Logistic Regression

### Creating the Datasets

In [50]:
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.model_selection import train_test_split

import random
import os

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /Users/raaed/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/raaed/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /Users/raaed/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
bow_vectorizer = CountVectorizer()
brnli_vectorizer = CountVectorizer(binary = True)
add_one = 1

In [3]:
def load_dataset(filepath) -> list[tuple]:
    ham_data = []
    spam_data = []

    for dirpath, dirnames, filenames in os.walk(filepath):
        for filename in filenames:
            with open(os.path.join(dirpath,filename), encoding = 'iso-8859-1') as f:
                if 'ham' in filename:
                    ham_data.append(f.read())
                elif 'spam' in filename:
                    spam_data.append(f.read())
    
    # assign 1/0 to samples and combine data
    positive_samples = [(email, 1) for email in ham_data]
    negative_samples = [(email, 0) for email in spam_data]

    all_samples = positive_samples + negative_samples
    random.shuffle(all_samples)

    #remove stop words and lemmatize dataset
    filtered_dataset = []
    for text, y in all_samples:
        words = nltk.word_tokenize(text)
        words = [word for word in words if word.lower() not in stop_words]
        words = [lemmatizer.lemmatize(word) for word in words]
        fil_text = " ".join(words)
        filtered_dataset.append((fil_text, y))
    return filtered_dataset# [(text, 0), (text, 1)...] 

# Specify the path each of the datasets. Point to the directory containing the ham, spam directories - 
enron1_test = load_dataset('project1_datasets/enron1/test')
enron1_train = load_dataset('project1_datasets/enron1 2/train')

enron2_test = load_dataset('project1_datasets/test')
enron2_train = load_dataset('project1_datasets/train')

enron4_test = load_dataset('project1_datasets/enron4/test')
enron4_train = load_dataset('project1_datasets/enron4 2/train')

In [4]:
#convert the text data to bernoulli and bow models
def convert_to_bow(dataset):
    emails = [email for email, y in dataset]
    c = [y for email, y in dataset]

    bow_matrix = bow_vectorizer.fit_transform(emails)
    return bow_matrix, np.array(c)


def convert_to_bernoulli(dataset):
    emails = [email for email, y in dataset]
    c = [y for email, y in dataset]

    brnli_matrix = brnli_vectorizer.fit_transform(emails)
    return brnli_matrix, np.array(c)
    
# Bag Of Words datasets
e1_Xtrain_bow, e1_ytrain_bow = convert_to_bow(enron1_train)
e1_Xtest_bow, e1_ytest_bow = convert_to_bow(enron1_test)

e2_Xtrain_bow, e2_ytrain_bow = convert_to_bow(enron2_train)
e2_Xtest_bow, e2_ytest_bow = convert_to_bow(enron2_test)

e4_Xtrain_bow, e4_ytrain_bow = convert_to_bow(enron4_train)
e4_Xtest_bow, e4_ytest_bow = convert_to_bow(enron4_test)

# Bernoulli datasets
e1_Xtrain_bli, e1_ytrain_bli = convert_to_bernoulli(enron1_train)
e1_Xtest_bli, e1_ytest_bli = convert_to_bernoulli(enron1_test)

e2_Xtrain_bli, e2_ytrain_bli = convert_to_bernoulli(enron2_train)
e2_Xtest_bli, e2_ytest_bli = convert_to_bernoulli(enron2_test)

e4_Xtrain_bli, e4_ytrain_bli = convert_to_bernoulli(enron4_train)
e4_Xtest_bli, e4_ytest_bli = convert_to_bernoulli(enron4_test)


In [5]:
print(e1_Xtrain_bow.shape)
print(e1_Xtest_bow.shape)
print(len(e1_ytrain_bow))

(450, 9370)
(456, 10822)
450


### Implementing Multinomial Naive Bayes

In [132]:
class MultinomialNaiveBayes:
    def train(self, X, y):
        self.classes = np.unique(y)
        self.parameters = []
        self.vocab = np.sum(X, axis = 0)
        self.features = X.shape[1]

        for c in self.classes:
            X_c = X[y == c]
            param_c = {}
            param_c['prior'] = np.log(X_c.shape[0] / float(X.shape[0]))
            param_c["word_count"] = np.sum(X_c, axis = 0)
            # print(param_c["word_count"].shape)
            # param_c['word_prob'] = np.log((param_c["word_count"] + add_one) / (np.sum(param_c["word_count"]) + X.shape[1]))
            param_c['word_prob'] = (param_c["word_count"] + add_one) / (np.sum(param_c["word_count"]) + X.shape[1])
            self.parameters.append(param_c)

            
    def predict(self, X):
        results = []
        X = X[:, :self.features]
        for x in X:
            class_scores = []
            for c in self.classes:
                word_prob = self.parameters[c]['word_prob']
                log_prob = self.parameters[c]['prior']
                
                # common_words = np.isin(word_prob.nonzero()[1], x.nonzero()[1])
                
                # x = x[:, common_words]
                # word_prob_common = word_prob[:, common_words]
            
                # log_prob += np.sum(np.sum(np.multiply(x, word_prob_common)))
                
                log_prob += np.sum(x * word_prob)
                class_scores.append(log_prob)
            results.append(self.classes[np.argmax(class_scores)]) 
        return np.array(results)

mbn = MultinomialNaiveBayes()
mbn.train(e1_Xtrain_bow, e1_ytrain_bow)
predictions = mbn.predict(e1_Xtest_bow)


KeyboardInterrupt: 

In [127]:
matrix = confusion_matrix(e1_ytest_bow, predictions)
matrix

array([[ 30, 119],
       [ 28, 279]])

### Implement Discrete Naive Bayes

In [48]:
class DiscreteNaiveBayes:
    def train(self, X, y):
        self.classes = np.unique(y)
        self.probabilities = {}
        self.prior = {}
        self.features = X.shape[1]
        
        for c in self.classes:
            X_c = X[y == c]
            self.prior[c] = np.log(X_c.shape[0] / X.shape[0])
            
            p = np.log((X_c.sum(axis = 0) + add_one )/ (X_c.shape[0]))
            self.probabilities[c] = p

    def predict(self, X):
        results = []
        X = X[:, : self.features]
        X = X.toarray()
        for x in X: 
            class_scores = []

            for c in self.classes:
                p = self.probabilities[c]
                prior = self.prior[c]

                posterior = prior 
                posterior += ((x * p[0, c]) + (1-x) * (1 - p[0, c])).sum()
                class_scores.append(posterior)
            
            results.append(self.classes[np.argmax(posterior)])
        return results

        
dnm = DiscreteNaiveBayes()
dnm.train(e2_Xtrain_bli, e2_ytrain_bli)
pred = dnm.predict(e2_Xtest_bli)
    

In [49]:
matrix = confusion_matrix(e2_ytest_bli, pred)
matrix

array([[130,   0],
       [348,   0]])

In [53]:
print(type(e1_Xtest_bli))
print(type(e1_Xtest_bow))

<class 'scipy.sparse._csr.csr_matrix'>
<class 'scipy.sparse._csr.csr_matrix'>


In [62]:
arr = np.array([1,2,3])
np.concatenate(([0], arr))

array([0, 1, 2, 3])

### Logistic Regression

In [97]:
class LogisticRegression:
    def __init__(self, l2 = 0.1):
        self.weights = None
        self.num_features = None
        self.lambda_ = l2

    def sigmoid(self, x):
        return 1 / (1 + np.exp(-x))

    def gradient(self, X, y, h):
        m = len(y)
        return (np.dot(X.T, (y - h)) / m) - ((self.lambda_ * np.concatenate([[0], self.weights[1:]])) / m)

    def loss(self, y, h, weights):
        m = len(y)
        return np.mean(y * np.log(h + 10**-9) + (1 - y) * np.log(1 - h + 10**-9)) - (self.lambda_ * np.sum(weights[1:] ** 2) / 2)

    def train(self, X, y, learning_rate = 0.3, num_iter = 10000):
        X = X.toarray()
        self.num_features = X.shape[1]
        X = np.hstack((np.ones((X.shape[0],1)), X))
        m, n = X.shape
        self.weights = np.zeros(n)

        for i in range(num_iter):
            z = np.dot(X, self.weights)
            pred = self.sigmoid(z)
            loss = self.loss(y, pred, self.weights)
            grad = self.gradient(X, y, pred)
            self.weights += learning_rate * grad # gradient ascent 

            if (i + 1) % 1000 == 0: 
                # acc = accuracy_score(y, np.round(pred))
                print(f'Loss after {i + 1} iterations: {loss}')
    
    def predict(self, X):
        X = X.toarray()
        X = X[:,:self.num_features]
        # X = np.hstack((np.ones((X.shape[0],1)), X))

        z = np.dot(X, self.weights[1:]) + self.weights[0]
        y_pred = self.sigmoid(z)
        return np.round(y_pred)


In [120]:
lr = LogisticRegression(l2 = 0.01)
lr.train(e2_Xtrain_bow, e2_ytrain_bow, learning_rate = 0.03, num_iter= 10000)
kk = lr.predict(e2_Xtest_bow)

Loss after 1000 iterations: -0.13104265062750026
Loss after 2000 iterations: -0.12899023940837484
Loss after 3000 iterations: -0.13885696580260642
Loss after 4000 iterations: -0.15122087911857335
Loss after 5000 iterations: -0.16404688133829948
Loss after 6000 iterations: -0.17671563198597148
Loss after 7000 iterations: -0.1890174546993992
Loss after 8000 iterations: -0.2008871306307122
Loss after 9000 iterations: -0.21231576842991626
Loss after 10000 iterations: -0.2233171663516595


In [121]:

acc = accuracy_score(e2_ytest_bow, kk)
print(acc)

0.42887029288702927


In [115]:
# Train Test splitting and learning lambda

### SGDClassifer from scikit-learn

In [124]:
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV

sgd = SGDClassifier(max_iter=10000)

param_grid = {
    'loss': ['hinge', 'log_loss', 'modified_huber'],
    'penalty': ['l1', 'l2', 'elasticnet'],
    'alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100]
}

grid_search = GridSearchCV(sgd, param_grid, cv = 5)
grid_search.fit(e1_Xtrain_bow, e1_ytrain_bow)

best_estimator = grid_search.best_estimator_

test_score = best_estimator.score(e1_Xtest_bow[:, :e1_Xtrain_bow.shape[1]], e1_ytest_bow)




In [125]:
test_score

0.47149122807017546