In [72]:
import re
from collections import defaultdict

In [73]:
  class NaiveBayes:
    
    def __init__(self, classes):
        self.classes = classes
        self.vocab = set()
        self.class_word_counts = defaultdict(lambda: defaultdict(int))
        self.class_doc_counts = defaultdict(int)
        self.num_docs = 0
        
    def preprocess(self, text):
        # Remove punctuations and convert to lowercase
        text = re.sub(r'[^\w\s]', '', text).lower()
        # Remove stop words
        stop_words = set(['a', 'an', 'the', 'in', 'on', 'at', 'of', 'to', 'for', 'by', 'with', 'from', 'and'])
        tokens = text.split()
        tokens = [token for token in tokens if token not in stop_words]
        return tokens
        
    def train(self, documents):
        for document, category in documents:
            tokens = self.preprocess(document)
            self.vocab.update(tokens)
            self.class_doc_counts[category] += 1
            self.num_docs += 1
            for word in tokens:
                self.class_word_counts[category][word] += 1
        
    def predict(self, documents):
        lst = []
        for document in documents:
            print(document)
            tokens = self.preprocess(document)
            posteriors = {category: 0 for category in self.classes}
            for category in self.classes:
                prior = self.class_doc_counts[category] / self.num_docs
                posterior = prior
                for word in tokens:
                    word_count = self.class_word_counts[category][word]
                    total_count = sum(self.class_word_counts[category].values())
                    conditional = word_count / total_count
                    posterior *= conditional
                posteriors[category] = posterior
            lst.append(max(posteriors, key=posteriors.get))
        return lst

In [74]:
docs = [
    ('Its hot outside', 'weather'),
    ('flight is at 6', 'flight'),
    ('The news is good', 'politics'),
    ('The economy is decreasing', 'economy'),
    ('The movie was great', 'entertainment'),
    ('I love pizza', 'food'),
    ('The game was exciting', 'sports'),
    ('The team played poorly', 'sports'),
    ('The election is coming up', 'politics'),O
]

nb = NaiveBayes(['weather', 'flight', 'politics', 'economy', 'entertainment', 'food','sports'])
nb.train(docs)

# Predict the category of a new document
new_doc = ['hot flight is at 6']
category = nb.predict(new_doc)
category

hot flight is at 6


['weather']