In [1]:
import os
from tqdm.notebook import tqdm
import json
from spacy.lang.en import English
import numpy as np

In [2]:
# Tokenizer taken from spaCy's `English` language class; the train and test
# BoW cells below both call it on the raw text of each document.
tokenizer = English().tokenizer

### Generate BoW Representations for train and test data

In [3]:
data_dir = "../20news-bydate/20news-bydate-train/"

# Both maps are keyed by docid ("<class dir>/<file name>"):
#   bbow[docid] -> {token: 1}      (binary bag-of-words: presence only)
#   cbow[docid] -> {token: count}  (count bag-of-words: term frequency)
bbow = dict()
cbow = dict()
vocab = set()  # every lower-cased alphanumeric token seen in the training split

for cls in tqdm(os.listdir(data_dir)):
    for doc in os.listdir(os.path.join(data_dir, cls)):
        docid = "/".join([cls, doc])

        # Read through a context manager so the handle is closed right away;
        # the previous bare open(...).read() left one open handle per document.
        with open(os.path.join(data_dir, cls, doc)) as f:
            tokens = tokenizer(f.read())

        doc_cbow = dict()
        for token in tokens:
            tok = token.text.lower()  # using the token text directly; the lemma could be used instead
            if tok.isalnum():  # only consider alphanumeric tokens
                doc_cbow[tok] = doc_cbow.get(tok, 0) + 1

        # The binary representation is the count representation with every count clamped to 1.
        doc_bbow = {tok: 1 for tok in doc_cbow}

        vocab.update(doc_cbow)
        bbow[docid] = doc_bbow
        cbow[docid] = doc_cbow

HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))




In [4]:
test_dir = "../20news-bydate/20news-bydate-test/"

# Same layout as the training maps: docid ("<class dir>/<file name>") -> per-document BoW.
test_bbow = dict()
test_cbow = dict()

# Class names are the sub-directory names of the test split; the experiment
# cells further down pass this same list to `train`.
classes = os.listdir(test_dir)
for cls in tqdm(classes):
    for doc in os.listdir(os.path.join(test_dir, cls)):
        docid = "/".join([cls, doc])

        # Read through a context manager so the handle is closed right away;
        # the previous bare open(...).read() left one open handle per document.
        with open(os.path.join(test_dir, cls, doc)) as f:
            tokens = tokenizer(f.read())

        doc_cbow = dict()
        for token in tokens:
            tok = token.text.lower()
            if tok.isalnum():  # only consider alphanumeric tokens
                doc_cbow[tok] = doc_cbow.get(tok, 0) + 1

        # The binary representation is the count representation with every count clamped to 1.
        doc_bbow = {tok: 1 for tok in doc_cbow}

        test_bbow[docid] = doc_bbow
        test_cbow[docid] = doc_cbow

HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))




#### Caching the BoW representations

In [5]:
# Create the cache directory if it is missing so this cell also works on a
# fresh checkout (open(..., 'w') does not create parent directories).
os.makedirs('../data', exist_ok=True)

# Dump each BoW representation to its own JSON file.
for cache_path, representation in (
    ('../data/bbow.json', bbow),
    ('../data/cbow.json', cbow),
    ('../data/test_bbow.json', test_bbow),
    ('../data/test_cbow.json', test_cbow),
):
    with open(cache_path, 'w') as f:
        json.dump(representation, f)

### Define the Naive Bayes Classifier for News Classification (with both Multinomial and Poisson models)

In [6]:
class NaiveBayesNewsClassifier():
    """Naive Bayes classifier over per-document bag-of-words dictionaries.

    Documents are identified by a docid of the form "<class>/<file name>" and
    represented as ``{token: count}`` dictionaries. Training data is laid out
    on disk as ``data_dir/<class>/<file name>``; the directory listing is used
    to enumerate the documents of each class.
    """

    def __init__(self, model='multinomial', alpha=0.8, k=0.1):
        """Initializes the chosen type of Naive Bayes Classifier with given hyperparameters.

        Args:
            model (str, optional): Choose from ['poisson', 'multinomial']. Defaults to 'multinomial'.
            alpha (float, optional): Trade-off weight for length normalization factor (only pertinent to poisson model). Defaults to 0.8.
            k (float, optional): Laplace smoothing parameter. Defaults to 0.1.
        """
        assert model in ['poisson', 'multinomial']
        self.k = k
        self.alpha = alpha
        self.model = model

    @staticmethod
    def _class_docids(bow, y, data_dir=None):
        """Returns the docids ("<class>/<file name>") belonging to class `y`.

        With a `data_dir` the ids come from listing ``data_dir/y``; without one
        they are the keys of `bow` whose class prefix equals `y`.
        """
        if data_dir is None:
            return [docid for docid in bow if docid.split('/')[0] == y]
        return ["/".join([y, doc]) for doc in os.listdir(os.path.join(data_dir, y))]

    def _estimate_likelihoods_poisson(self, bow, classes, vocab, data_dir):
        """
        Calculates values proportional to the marginal probability of each additional occurence of a
        particular word in a particular class document. Assumes word counts are drawn from poisson distribution.
        The poisson parameter estimation scales for document length to account for length variation among
        articles belonging to the same class.

        Stores the result in ``self.word_likelihoods[class][word]`` as ``log(lambda) - lambda``.
        """
        vocab_size = len(vocab)
        lambdas = dict()

        for y in tqdm(classes):
            docids = self._class_docids(bow, y, data_dir)
            n_docs = len(docids)

            # Per-document length (sum of counts) and total length of the class.
            dlen = {docid: np.sum(list(bow[docid].values())) for docid in docids}
            tlen = np.sum(list(dlen.values()))

            # lambda[v] = sum_d w_d * (count_d[v] + k)
            #           = k * sum_d w_d  +  sum_d w_d * count_d[v]
            # The first term is identical for every word, and the second is
            # non-zero only for words that occur in the document, so we only
            # touch the words of each document instead of the whole vocabulary.
            # (Equal to the dense double loop up to floating-point rounding.)
            weight_total = 0.0
            weighted_counts = dict()
            for docid in tqdm(docids, leave=False):
                dlj = dlen[docid]
                # Mix a uniform per-document weight with a length-proportional one.
                gdc = self.alpha*(1/n_docs) + (1 - self.alpha)*dlj/tlen
                normed_weight = gdc / (dlj + self.k * vocab_size)
                weight_total += normed_weight
                for v, cnt in bow[docid].items():
                    weighted_counts[v] = weighted_counts.get(v, 0.0) + normed_weight * cnt

            smoothing = self.k * weight_total
            # Restricting to `vocab` ignores any document word outside the vocabulary.
            lambdas[y] = {v: weighted_counts.get(v, 0.0) + smoothing for v in vocab}

        self.word_likelihoods = {
            y: {v: np.log(lam) - lam for v, lam in lambdas[y].items()}
            for y in classes
        }

    def _estimate_likelihoods_multinomial(self, bow, classes, vocab, data_dir=None):
        """
        Estimates likelihood of a particular word belonging to a document of a particular class.
        Assumes words are drawn from a multinomial distribution.

        `data_dir` is the training directory used to enumerate each class's
        documents. It used to be read from a notebook-level global of the same
        name, which silently ignored the directory given to `train`. When it is
        omitted, documents are taken from the keys of `bow` instead.

        Stores log-probabilities in ``self.word_likelihoods[class][word]``.
        """
        # Every vocabulary word starts at k (Laplace smoothing).
        smoothed_counts = {y: {v: self.k for v in vocab} for y in classes}
        for y in classes:
            for docid in self._class_docids(bow, y, data_dir):
                for v, cnt in bow[docid].items():
                    smoothed_counts[y][v] += cnt

        self.word_likelihoods = {y: dict() for y in classes}

        for y in tqdm(classes):
            den = np.log(np.sum(list(smoothed_counts[y].values())))
            for v in vocab:
                self.word_likelihoods[y][v] = np.log(smoothed_counts[y][v]) - den

    def train(self, bow, vocab, classes, data_dir):
        """Fits the Naive Bayes model to training data by estimating distribution parameters.

        Args:
            bow (dict): docid -> {token: count} for every training document.
            vocab (iterable): Words for which likelihoods are estimated.
            classes (list): Class names; each must be a sub-directory of `data_dir`.
            data_dir (str): Training directory laid out as ``data_dir/<class>/<file name>``.

        Sets ``self.classes``, ``self.word_likelihoods`` and ``self.priors``
        (log class priors), then prints the accuracy on `bow`.
        """
        print('Training Beginning ...')
        self.classes = classes

        print('    Estimating class-wise likelihoods of words...')
        if self.model == 'multinomial':
            self._estimate_likelihoods_multinomial(bow, classes, vocab, data_dir)
        elif self.model == 'poisson':
            self._estimate_likelihoods_poisson(bow, classes, vocab, data_dir)
        print('    Class-wise word likelihood estimation complete!\n')

        print('    Estimating class priors ...')
        # Prior of a class = its share of the training documents, stored in log space.
        self.priors = {y: len(os.listdir(os.path.join(data_dir, y))) for y in classes}
        total = np.sum(list(self.priors.values()))
        for y in classes:
            self.priors[y] = np.log(self.priors[y]/total)
        print('    Class prior estimation complete!\n')
        print('Training Complete!\n')

        # Fixed two-decimal formatting avoids float artefacts such as 82.78999999999999.
        print(f'\nAccuracy on train data: {100*self._accuracy_on_bowlist(bow):.2f}%\n')

    def predict(self, doc_bow):
        """Function to predict class of a particular document/news article.

        Args:
            doc_bow (dict): {token: count} representation of the document.

        Returns:
            The class with the highest log-posterior score; the first such
            class in ``self.classes`` order on ties.
        """
        posteriors = dict()
        for y in self.classes:
            score = self.priors[y]
            likelihoods = self.word_likelihoods[y]
            for v, cnt in doc_bow.items():
                # Words without an estimated likelihood contribute 0 to every class.
                score += cnt*likelihoods.get(v, 0)
            posteriors[y] = score

        return list(posteriors.keys())[np.argmax(list(posteriors.values()))]

    def _accuracy_on_bowlist(self, bow_list):
        """Returns prediction accuracy over a set of documents.

        `bow_list` maps docid -> document BoW; the true class is the part of
        the docid before the first '/'. The result is a fraction rounded to 4 decimals.
        """
        total_cnt, correct_cnt = 0, 0
        for doc, doc_bow in bow_list.items():
            total_cnt += 1
            cls = doc.split('/')[0]
            pred = self.predict(doc_bow)
            if pred == cls: correct_cnt += 1
        return np.round(correct_cnt/total_cnt, 4)

    def test(self, bow):
        """Member function for allowing users to evaluate on test datasets."""
        print('Evaluating test data ...')
        print(f'\nEvaluation complete!\nAccuracy on test data: {100*self._accuracy_on_bowlist(bow):.2f}%\n')

### Using Binary-BoW representation and Multinomial model for News Classification using Naive Bayes

In [7]:
# Multinomial Naive Bayes fitted on the binary (presence-only) representation.
nb_bbow_mul = NaiveBayesNewsClassifier(model='multinomial', k=0.1)
nb_bbow_mul.train(bow=bbow, vocab=vocab, classes=classes, data_dir=data_dir)

# Evaluate on the held-out split using the matching representation.
nb_bbow_mul.test(bow=test_bbow)

Training Beginning ...
    Estimating class-wise likelihoods of words...


HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))


    Class-wise word likelihood estimation complete!

    Estimating class priors ...
    Class prior estimation complete!

Training Complete!


Accuracy on train data: 99.09%

Evaluating test data ...

Evaluation complete!
Accuracy on test data: 82.46%



### Using Count-BoW representation and Multinomial model for News Classification using Naive Bayes

In [8]:
# Multinomial Naive Bayes fitted on the count (term-frequency) representation.
nb_cbow_mul = NaiveBayesNewsClassifier(model='multinomial', k=0.1)
nb_cbow_mul.train(bow=cbow, vocab=vocab, classes=classes, data_dir=data_dir)

# Evaluate on the held-out split using the matching representation.
nb_cbow_mul.test(bow=test_cbow)

Training Beginning ...
    Estimating class-wise likelihoods of words...


HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))


    Class-wise word likelihood estimation complete!

    Estimating class priors ...
    Class prior estimation complete!

Training Complete!


Accuracy on train data: 98.66%

Evaluating test data ...

Evaluation complete!
Accuracy on test data: 82.78999999999999%



### Using Count-BoW representation and Poisson model for News Classification using Naive Bayes

In [9]:
# Poisson Naive Bayes on the count representation; alpha weights the
# length-normalization term and k is the smoothing parameter.
nb_cbow_poi = NaiveBayesNewsClassifier(model='poisson', alpha=0.9, k=1e-5)
nb_cbow_poi.train(bow=cbow, vocab=vocab, classes=classes, data_dir=data_dir)

# Evaluate on the held-out split using the matching representation.
nb_cbow_poi.test(bow=test_cbow)

Training Beginning ...
    Estimating class-wise likelihoods of words...


HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=480.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=584.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=591.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=590.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=578.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=593.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=585.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=594.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=598.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=597.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=600.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=595.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=591.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=594.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=593.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=599.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=546.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=564.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=465.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=377.0), HTML(value='')))


    Class-wise word likelihood estimation complete!

    Estimating class priors ...
    Class prior estimation complete!

Training Complete!


Accuracy on train data: 99.69%

Evaluating test data ...

Evaluation complete!
Accuracy on test data: 83.19%

