In [1]:
"""
Text classification using multinomial Naive Bayes.

Data: http://qwone.com/~jason/20Newsgroups/
"""

from __future__  import division
from collections import defaultdict
import numpy     as np
import math

class DocumentClassifier:
    
    def __init__(self, train_data_file, train_label_file, 
                 test_data_file, test_label_file, vocab_file):
        
        # Reading train data.
        with open(train_data_file, "r") as f:
            fn = lambda x: map(int, x.split(" "))
            train_data = map(fn, filter(None, f.read().split("\n")))
        with open(train_label_file, "r") as f:
            train_label = map(int, filter(None, f.read().split("\n")))

        # Reading test data.
        with open(test_data_file, "r") as f:
            fn = lambda x: map(int, x.split(" "))
            test_data = defaultdict(lambda : list())
            for data in filter(None, f.read().split("\n")):
                data = map(int, data.split(" "))
                test_data[data[0]].append([data[1], data[2]])
        with open(test_label_file, "r") as f:
            test_label = map(int, filter(None, f.read().split("\n")))

        with open(vocab_file, "r") as f:
            vocab = f.read().split("\n")
        
        # Instance data.
        self.train_data = train_data
        self.train_label = train_label
        self.test_data = test_data
        self.test_label = test_label
        self.vocab = vocab
        self.class_doc_prob, self.class_word_prob = \
            self.get_smooth_probabilities_from_train_data()
        self.classes = self.class_doc_prob.keys()
        
    def get_smooth_probabilities_from_train_data(self):
        """
        # Calculate smooth probabilities.
        # i ) πj , the fraction of documents that belong to that class;
        # ii) Pj , a probability distribution over V that models the documents of that class.
        """
        class_doc_count = defaultdict(int)
        for cls in self.train_label:
            class_doc_count[cls] += 1

        class_word_count = defaultdict(lambda : defaultdict(int))
        for data in self.train_data:
            docid, wordid, word_count = data
            cls = self.train_label[docid-1]
            class_word_count[cls][wordid] += word_count

        class_doc_prob = defaultdict(float)
        class_word_prob = defaultdict(lambda : defaultdict(float))

        # Calculating pi for each newspaper(class).
        no_docs = sum(class_doc_count.values())
        for cls, doc_count in class_doc_count.items():
            class_doc_prob[cls] = doc_count / no_docs

        # Calculating pi for each word belonging to a particular newspaper(class).
        for cls in class_word_count:
            no_of_words_cls = sum(class_word_count[cls].values())
            for wordid in range(1, len(self.vocab) + 1):
                class_word_prob[cls][wordid] = (class_word_count[cls][wordid] + 1) / ( no_of_words_cls + 2)
        return class_doc_prob, class_word_prob
    
    def get_probability_of_doc(self, cls, doc_details):
        """
        Get the probability of a document 
        belonging to a given class.
        """
        prob = math.log(self.class_doc_prob[cls]) 
        for wordid, count in doc_details:
            prob += count * math.log(self.class_word_prob[cls][wordid])
        return prob

    def predict_class_of_doc(self, doc_details):
        """
        Predicts the class using unigram model given a doc details.
        """
        probs = np.array([self.get_probability_of_doc(cls, doc_details) 
                          for cls in self.classes])
        predicted_class = self.classes[probs.argmax()]
        return predicted_class

    def test_accuracy(self):
        error_count = 0
        for docid, doc_details in self.test_data.items():
            predicted_class = self.predict_class_of_doc(doc_details)
            true_class = self.test_label[docid-1]
            if predicted_class != true_class:
                error_count += 1

        print "Error is %.2f " %(error_count * 100 /len(self.test_label))

In [2]:
classifier = DocumentClassifier("train.data", "train.label", 
                                "test.data", "test.label",
                                "vocabulary.txt")
classifier.test_accuracy()

Error is 22.24 
