# Naive Bayes (Document Classification)

## Importing Libraries

In [1]:
import string
import numpy as np
 
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score

### Loading and preprocessing data into Term Frequency Vector

In [2]:
X_train, y_train = fetch_20newsgroups(subset="train", return_X_y=True)
X_test, y_test = fetch_20newsgroups(subset="test", return_X_y=True)

n_classes = 20
max_features = 10000

vectorizer = CountVectorizer(ngram_range=(1, 1), max_features=max_features)
vectorizer.fit(X_train)

X_train = vectorizer.transform(X_train)
X_test = vectorizer.transform(X_test)

X_train = X_train.toarray()
X_test = X_test.toarray()

y_train = np.array(y_train)
y_test = np.array(y_test)

### Building Prior Probability and Word Probability Matrices

In [3]:
probs = np.ones((n_classes, max_features))
for X, y in zip(X_train, y_train):
    probs[y] += X
probs_sum = np.sum(probs, axis=1, keepdims=True)
probs = probs / probs_sum

prior = np.array([np.sum(y_train == c) for c in range(n_classes)])
prior = prior / prior.sum()

### Function to Predict data using prior and probability matrix

In [4]:
def predict(X, probs, prior):
    y_likelihood = np.zeros((X.shape[0], probs.shape[0]))
    for i in range(X.shape[0]):
        y_likelihood[i] = np.sum(X[i] * np.log(probs), axis=1)
    y_pred_prob = y_likelihood +  prior
    y_pred = np.argmax(y_pred_prob, axis=1)
    return y_pred

### Training and Testing Prediction Accuracy

In [5]:
y_pred = predict(X_train, probs, prior)

print("Training Accuracy : {:.2f}%".format(accuracy_score(y_train, y_pred) * 100))

y_pred = predict(X_test, probs, prior)

print("Testing Accuracy : {:.2f}%".format(accuracy_score(y_test, y_pred) * 100))

Training Accuracy : 89.39%
Testing Accuracy : 77.27%


## Using Sklearn 

In [6]:
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()
nb.fit(X_train, y_train)
nb.score(X_test, y_test)

0.772437599575146