In [44]:
from functools import reduce

In [45]:
# training data
data = dict(
    c = ["Chinese Beijing Chinese", "Chinese Chinese Shanghai", "Chinese Macao"],
    j = ["Tokyo Japan Chinese"]
)

# test data
test_data = "Chinese Chinese Chinese Tokyo Japan"

In [46]:
data

{'c': ['Chinese Beijing Chinese', 'Chinese Chinese Shanghai', 'Chinese Macao'],
 'j': ['Tokyo Japan Chinese']}

In [47]:
test_data

'Chinese Chinese Chinese Tokyo Japan'

## Removing duplicate values from the document

In [48]:
def remove_duplicate_values(data):
    for key, value in data.items():
        data[key] = [" ".join(list(set(val.split()))) for val in value]
    return data

In [49]:
data = remove_duplicate_values(data)
data

{'c': ['Chinese Beijing', 'Chinese Shanghai', 'Chinese Macao'],
 'j': ['Chinese Japan Tokyo']}

In [50]:
test_data = " ".join(list(set(test_data.split())))

test_data

'Chinese Japan Tokyo'

In [51]:
def all_words_in_a_class(data):
    a = dict()
    for key, value in data.items():
        l = []
        for val in value:
            l.extend(val.split()) # split the sentence and store each words in a list
        a.setdefault(key, []).extend(l) # store the list of words in regarding class
    return a
    
all_words_in_a_class(data)

{'c': ['Chinese', 'Beijing', 'Chinese', 'Shanghai', 'Chinese', 'Macao'],
 'j': ['Chinese', 'Japan', 'Tokyo']}

In [52]:
def count_each_words_in_a_class(a):
    b = dict()
    for key, value in a.items():
        z = dict()
        for i in set(value): # all the distinct values in a class
            z.setdefault(i, value.count(i)) # store the occurence of the word 
        b.setdefault(key, z) # store the dictionary in the regarding class
    return b
    
count_each_words_in_a_class(all_words_in_a_class(data))

{'c': {'Chinese': 3, 'Beijing': 1, 'Macao': 1, 'Shanghai': 1},
 'j': {'Chinese': 1, 'Tokyo': 1, 'Japan': 1}}

In [53]:
def priors(data):
    # count the number of each class in the data
    j = {key: len(value) for key, value in data.items()}
    
    # total number of data
    s = reduce(lambda x, y: x + y, j.values())
    
    # get the probability of each class
    for key in j.keys():
        j[key] /= s

    return j

priors(data)

{'c': 0.75, 'j': 0.25}

In [54]:
def conditional_prob(word, class_, data):
    b = count_each_words_in_a_class(all_words_in_a_class(data))
    
    # count of "word" in a class
    count_w_c = b.get(class_).get(word) or 0 

    # count of total words in a class
    count_c = sum(b.get(class_).values())

    # count of distinct words
    c = set()
    for value in b.values():
        for i in value.keys():
            c.add(i)
    V = len(c)

    return (count_w_c + 1) / (count_c + V)

# example
conditional_prob("Chinese", "c", data)

0.3333333333333333

In [55]:
def predict(test_data, data):
    result = dict()
    for key in data.keys():
        prob = priors(data).get(key) # prior probability of the class
        for word in test_data.split():
            prob *= conditional_prob(word, key, data) 
        result.setdefault(key, prob)
    return result

In [56]:
def class_label(predicted_dict):
    return max(predicted_dict, key=predicted_dict.get)

In [57]:
p = predict(test_data, data)
p

{'c': 0.001736111111111111, 'j': 0.0027434842249657062}

In [58]:
class_label(p)

'j'