<a href="https://colab.research.google.com/github/riccardocappi/Machine-Learning-From-Scratch/blob/main/Naive_Bayes_Text_Classifier_from_scratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Lec 16 - Naive Bayes Text Classifier from scratch

Implementation from scratch of a Multinomial Naive Bayes text classifier. The model is trained and tested on the 20 Newsgroups dataset (loaded with sklearn's `fetch_20newsgroups`) to perform text classification.



### Importing

In [1]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.datasets import fetch_20newsgroups
from sklearn.metrics import accuracy_score

### Useful function

In [2]:
def group_data_by_target(targets):
    """
    Build an index of example positions keyed by target label.

    @param targets: a one-dimensional numpy array containing the target labels of each example
    return value: a dictionary mapping each distinct label to the list of positions
                  (in increasing order) at which that label occurs in `targets`;
                  labels appear as keys in order of first occurrence
    """
    indices_by_label = {}
    for position, label in enumerate(targets):
        # setdefault creates the bucket the first time a label is seen.
        indices_by_label.setdefault(label, []).append(position)
    return indices_by_label

### My implementation of Naive Bayes Text Classifier


In [3]:
class NBTextClassifier:
    """
    Multinomial Naive Bayes classifier with additive (Laplace/Lidstone) smoothing.

    After `fit` has been called the instance exposes:
      - classes:        list of the distinct labels, in order of first appearance in `y`
      - priors:         array of shape (n_classes,), fraction of training examples per class
      - features_probs: array of shape (n_classes, n_features), smoothed estimate of
                        P(feature_j | class_i); each row sums to 1
    """

    def __init__(self, alpha=1):
        """
        @param alpha: additive smoothing strength added to every per-class feature total
                      (alpha=1 is Laplace smoothing)
        """
        self.classes = None
        self.features_probs = None
        self.priors = None
        self.alpha = alpha

    def fit(self, X, y):
        """
        Estimate the class priors and the smoothed per-class feature probabilities.

        @param X: a scipy.sparse.csr_matrix (a dense 2-D numpy array also works) of shape
                  (n_examples, n_features) holding non-negative feature weights
        @param y: a one-dimensional numpy array with the label of each row of X
        """
        n_examples, n_features = X.shape
        grouped_data = group_data_by_target(y)
        self.classes = list(grouped_data.keys())
        self.priors = np.zeros(shape=len(self.classes))
        self.features_probs = np.zeros(shape=(len(self.classes), n_features))

        for i, class_i in enumerate(self.classes):
            data_class_i = X[grouped_data[class_i]]
            self.priors[i] = data_class_i.shape[0] / n_examples
            tot_features_count = data_class_i.sum()   # total weight of all features in class_i
            # Per-feature totals in class_i. Sparse matrices return a (1, n_features)
            # np.matrix while dense/sparse *arrays* return a 1-D vector; ravel() handles both.
            features_count = np.asarray(data_class_i.sum(axis=0)).ravel()
            # Smoothed estimate for the whole row at once (no per-feature Python loop).
            self.features_probs[i] = (self.alpha + features_count) / (
                tot_features_count + self.alpha * n_features
            )

    def predict(self, X):
        """
        Assign each row of X to the class with the highest log-posterior score
        log P(class) + sum_j x_j * log P(feature_j | class).

        @param X: a scipy.sparse.csr_matrix (a dense 2-D numpy array also works) with the
                  same number of columns as the matrix passed to `fit`
        return value: a numpy array containing the predicted class for each test example
        """
        log_features_probs = np.log(self.features_probs)
        log_priors = np.log(self.priors)
        # One sparse-dense product yields the (n_examples, n_classes) score matrix;
        # only the stored (non-zero) entries of X contribute to each sum.
        scores = np.asarray(X @ log_features_probs.T) + log_priors
        best_class_idx = np.argmax(scores, axis=1)   # first maximum wins on ties
        return np.asarray(self.classes)[best_class_idx]


### Testing the model

In [4]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.metrics import accuracy_score

# Load the predefined train and test splits of the 20 Newsgroups corpus.
newsgroups_train, newsgroups_test = (
    fetch_20newsgroups(subset=split_name) for split_name in ('train', 'test')
)

In [5]:
# Raw documents and their integer labels for each split.
X_train, y_train = newsgroups_train['data'], newsgroups_train['target']
X_test, y_test = newsgroups_test['data'], newsgroups_test['target']

In [6]:
# Vectorize the training documents. NOTE: despite the "*_token_counts" names,
# TfidfVectorizer produces TF-IDF weights, not raw token counts.
vectorizer = TfidfVectorizer(stop_words='english')
X_train_token_counts = vectorizer.fit_transform(X_train)

model = NBTextClassifier()
print("Start training")
model.fit(X_train_token_counts, y_train)

# The test split is transformed with the vocabulary fitted on the training split.
X_test_token_counts = vectorizer.transform(X_test)
y_pred = model.predict(X_test_token_counts)
print(
    "Accuracy of Naive Bayes text classifier: "
    + str(accuracy_score(y_test, y_pred))
)


Start training
Accuracy of Naive Bayes text classifier: 0.8169144981412639
