# Text Classification and Naive Bayes
# Multinomial Naive Bayes

In [57]:
from collections import Counter
from functools import reduce

In [58]:
# Training corpus: class label -> list of example sentences for that class.
data = {
    "c": ["Chinese Beijing Chinese", "Chinese Chinese Shanghai", "Chinese Macao"],
    "j": ["Tokyo Japan Chinese"],
}

# Unlabelled sentence to classify.
test_data = "Chinese Chinese Chinese Tokyo Japan"

In [59]:
# Bare expression: shows the training data as the notebook cell's output.
data

{'c': ['Chinese Beijing Chinese', 'Chinese Chinese Shanghai', 'Chinese Macao'],
 'j': ['Tokyo Japan Chinese']}

## Collecting all the words of each class into a single list

In [60]:
def all_words_in_a_class(data):
    """Flatten the sentences of every class into one list of words.

    Args:
        data: Mapping of class label to a list of sentences (strings).

    Returns:
        A dict mapping each class label to the whitespace-separated words
        of all its sentences, in order of appearance (duplicates kept).
    """
    words_by_class = {}
    for label, sentences in data.items():
        # str.split() with no argument splits on any run of whitespace
        words_by_class[label] = [
            word for sentence in sentences for word in sentence.split()
        ]
    return words_by_class
    
# Preview the per-class word lists built from the training data.
all_words_in_a_class(data)

{'c': ['Chinese',
  'Beijing',
  'Chinese',
  'Chinese',
  'Chinese',
  'Shanghai',
  'Chinese',
  'Macao'],
 'j': ['Tokyo', 'Japan', 'Chinese']}

## Count the words in a class

In [61]:
def count_each_words_in_a_class(a):
    """Count how often each distinct word occurs within every class.

    Args:
        a: Mapping of class label to a list of words, as produced by
            ``all_words_in_a_class``.

    Returns:
        A dict mapping each class label to a plain dict of
        ``{word: occurrences}`` for that class.
    """
    # Counter tallies in a single pass; the previous list.count() per
    # distinct word rescanned the whole list each time. Convert back to a
    # plain dict so callers keep getting ordinary dict objects.
    return {label: dict(Counter(words)) for label, words in a.items()}
    
# Preview the per-class word frequencies for the training data.
count_each_words_in_a_class(all_words_in_a_class(data))

{'c': {'Shanghai': 1, 'Chinese': 5, 'Beijing': 1, 'Macao': 1},
 'j': {'Japan': 1, 'Chinese': 1, 'Tokyo': 1}}

## Calculating the Prior Probabilities

In [62]:
def priors(data):
    """Estimate the prior probability of every class from document counts.

    Args:
        data: Mapping of class label to the list of training documents
            belonging to that class.

    Returns:
        A dict mapping each class label to the fraction of all training
        documents that carry that label. An empty mapping yields ``{}``.

    Raises:
        ZeroDivisionError: If classes are present but none of them has
            any documents.
    """
    # number of training documents per class
    doc_counts = {label: len(docs) for label, docs in data.items()}

    # sum() starts from 0, so an empty mapping is handled instead of
    # failing the way reduce() without an initial value does
    total_docs = sum(doc_counts.values())

    return {label: count / total_docs for label, count in doc_counts.items()}

# Class "c" has 3 of the 4 training sentences, class "j" has 1 of 4.
priors(data)

{'c': 0.75, 'j': 0.25}

## Calculating the conditional probability

In [63]:
def conditional_prob(word, class_, data):
    """Return the add-one smoothed estimate of P(word | class_).

    The estimate is ``(count(word, class_) + 1) / (N_class + V)`` where
    ``N_class`` is the total number of word occurrences in ``class_`` and
    ``V`` is the number of distinct words across all classes.

    Args:
        word: The word whose likelihood is wanted.
        class_: Label of the class to condition on; must be a key of ``data``.
        data: Mapping of class label to a list of training sentences.

    Returns:
        The smoothed conditional probability as a float.
    """
    counts_by_class = count_each_words_in_a_class(all_words_in_a_class(data))
    class_counts = counts_by_class.get(class_)

    # occurrences of `word` in the class; a word never seen there counts as 0
    word_count = class_counts.get(word) or 0

    # total number of word occurrences in the class
    class_total = sum(class_counts.values())

    # vocabulary = distinct words over every class, not just `class_`
    vocabulary = set()
    for word_counts in counts_by_class.values():
        vocabulary.update(word_counts)
    vocabulary_size = len(vocabulary)

    return (word_count + 1) / (class_total + vocabulary_size)

# "Chinese" occurs 5 times among the 8 words of class "c" and the
# vocabulary has 6 distinct words, so this is (5 + 1) / (8 + 6) = 3/7.
conditional_prob("Chinese", "c", data)

0.42857142857142855

## Predict any test data

In [64]:
def predict(test_data, data):
    """Score a sentence against every class with multinomial Naive Bayes.

    For each class the score is the class prior multiplied by the smoothed
    conditional probability of every word in ``test_data`` (repeated words
    are multiplied in once per occurrence). Scores are not normalised.

    Args:
        test_data: The sentence to classify; split on whitespace.
        data: Mapping of class label to a list of training sentences.

    Returns:
        A dict mapping each class label to its score. Empty ``data``
        yields an empty dict.
    """
    labels = list(data.keys())
    if not labels:
        # nothing to score; keeps the empty-input result without computing priors
        return {}

    # Both values are the same for every class, so compute them once
    # instead of once per class inside the loop.
    class_priors = priors(data)
    words = test_data.split()

    result = {}
    for label in labels:
        prob = class_priors.get(label)  # prior probability of the class
        for word in words:
            prob *= conditional_prob(word, label, data)
        result[label] = prob
    return result

## Determining which class the test data belongs to

In [65]:
def class_label(predicted_dict):
    """Return the label with the highest score in ``predicted_dict``.

    Args:
        predicted_dict: Non-empty mapping of class label to score.

    Returns:
        The key whose value is largest; on ties, the first such key in
        the mapping's iteration order.

    Raises:
        ValueError: If ``predicted_dict`` is empty.
    """
    best_label, _best_score = max(
        predicted_dict.items(), key=lambda item: item[1]
    )
    return best_label

In [66]:
# Score the test sentence against every class; the bare name on the next
# line shows the resulting {label: score} dict as the cell's output.
p = predict(test_data, data)
p

{'c': 0.00030121377997263036, 'j': 0.00013548070246744226}

In [67]:
# Pick the label with the highest score from the prediction above.
class_label(p)

'c'

### Test data belongs to class `c`