Sulution from https://datumguy.wordpress.com/2017/08/14/hackerranks-stack-exchange-question-classifier-review/

In [1]:
import json
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer
import random
import re
import string

In [2]:
import nltk
nltk.download

<bound method Downloader.download of <nltk.downloader.Downloader object at 0x7f6c8f47d7c0>>

In [3]:
class TFIDFreqClasifier:
    def __init__(self):
        self.stemmer = LancasterStemmer()
        self.label_freq = {}
        self.corpus = {}
        self.vocabulary = {}
        self.vocabulary_count = {}
        self.label_probability = {}

        self.punctuation_regex = re.compile('[{0}]'.format(re.escape(string.punctuation)))
        # !"#$%&'()*+, -./:;<=>?@[\]^_`{|}~
    
    stemmer = LancasterStemmer()
    punctuation_regex = re.compile('[{0}]'.format(re.escape(string.punctuation)))
 
    def compute_doc_label_probability(self, label, doc):
        probability = self.label_probability[label]
        for term in doc.split():
            probability *= (self.vocabulary[label][term] if term in self.vocabulary[label] else 0) + 1
            probability /= self.vocabulary_count[label] + len(self.vocabulary[label])
        return probability
 
    def pre_process(self, doc):
        pre_processed_doc = ''
        for term in doc.lower().split():
            term = self.punctuation_regex.sub('', term)
            if term and term not in stopwords.words('english'):
                pre_processed_doc += self.stemmer.stem(term) + ' '
        return pre_processed_doc.rstrip()
 
    def classify(self, doc):
        most_probable_label = None
        max_probability = 0
        pre_processed_doc = self.pre_process(doc)
        if pre_processed_doc:
            for label in self.vocabulary.keys():
                probability = self.compute_doc_label_probability(label, pre_processed_doc)
                if probability > max_probability:
                    most_probable_label = label
                    max_probability = probability
        return most_probable_label
       
        
    def train(self, training_set):
        label_probability_unit = 1/len(training_set)
        for doc, label in training_set:
            pre_processed_doc = self.pre_process(doc)
            if pre_processed_doc:
                # Update label probability
                if label not in self.label_probability:
                    self.label_probability[label] = 0
                self.label_probability[label] += label_probability_unit

                # Update terms frequency
                for term in pre_processed_doc.split():
                    if term not in self.corpus:
                        self.corpus[term] = 0
                    if label not in self.vocabulary:
                        self.vocabulary[label] = {}
                        self.vocabulary_count[label] = 0
                    if term not in self.vocabulary[label]:
                        self.vocabulary[label][term] = 0
                    self.corpus[term] += 1
                    self.vocabulary[label][term] += 1
                    self.vocabulary_count[label] += 1


    def test(self, labeled_docs):
        matched = 0
        for doc, expected_label in labeled_docs:
            label = self.classify(doc)
            if label == expected_label:
                matched += 1
        return matched

In [None]:
# if __name__ == '__main__':
labeled_set = []
with open('training.json', encoding='utf-8') as f:
    N = int(f.readline())
    for line in f:
        json_obj = json.loads(line)
        labeled_set.append((json_obj['question'] + ' ' + json_obj['excerpt'], json_obj['topic']))

random.shuffle(labeled_set)
training_portion = N//2
training_set = labeled_set[:training_portion]
test_set = labeled_set[training_portion:]

model = TFIDFreqClasifier()
model.train(training_set)
matched = model.test(test_set)

print('Accuracy', '{0:.2%}'.format(matched/len(test_set)))