In [162]:
# Character-level language model: score how likely a given word is under each labelled language

class MarkovChain:
    """First-order, character-level Markov chain for scoring words per language.

    ``labels[i]`` names the language of the text file ``corpus[i]``.  From each
    file the class collects two tables of lower-cased alphabetic characters:

    * unigram counts   -- how often each character occurs;
    * transition counts -- how often character ``b`` directly follows ``a``
      on the same line.

    A word is then scored as ``P(first char) * product of P(next | current)``.
    No smoothing is applied: a character or transition that never occurred in
    a language's text makes the word's probability 0 for that language.
    """

    def __init__(self, labels, corpus, encoding=None):
        """Store the language labels and their index-aligned corpus paths.

        Args:
            labels: sequence of language identifiers.
            corpus: sequence of text file paths; ``corpus[i]`` belongs to
                ``labels[i]``.
            encoding: text encoding passed to ``open``.  ``None`` (default)
                keeps the platform default, i.e. the previous behaviour.
        """
        self.corpus = corpus
        self.labels = labels
        self.encoding = encoding

    def _read_lines(self, index):
        """Return the lines of ``corpus[index]``, closing the file afterwards."""
        with open(self.corpus[index], 'r', encoding=self.encoding) as handle:
            return handle.readlines()

    def get_language_counts(self, index):
        """Count alphabetic characters in one corpus file.

        Returns:
            ``{label: {char: count}}`` with a single label entry; characters
            are lower-cased one at a time and non-alphabetic ones are skipped.
        """
        chars = {}
        for line in self._read_lines(index):
            for char in line:
                char = char.lower()
                if char.isalpha():
                    chars[char] = chars.get(char, 0) + 1
        return {self.labels[index]: chars}

    def get_all_language_counts(self):
        """Return ``{label: {char: count}}`` covering every label."""
        all_languages = {}
        for i in range(len(self.labels)):
            all_languages.update(self.get_language_counts(i))
        return all_languages

    def get_conditional_counts(self, l):
        """Count character-to-next-character transitions in ``corpus[l]``.

        Only pairs where both characters are alphabetic are counted, and pairs
        never span a line break.

        Returns:
            ``{char: {next_char: count}}``.
        """
        conditional = {}
        for line in self._read_lines(l):
            # Lower-case per character (not the whole line) so the sequence
            # length and positions stay aligned with the original text.
            lowered = [ch.lower() for ch in line]
            for char, next_char in zip(lowered, lowered[1:]):
                if char.isalpha() and next_char.isalpha():
                    followers = conditional.setdefault(char, {})
                    followers[next_char] = followers.get(next_char, 0) + 1
        return conditional

    def get_all_conditional_counts(self):
        """Return ``{label: {char: {next_char: count}}}`` for every label."""
        conditionals = {}
        for l in range(len(self.labels)):
            conditionals[self.labels[l]] = self.get_conditional_counts(l)
        return conditionals

    def get_joint_counts(self):
        """Return ``(unigram_counts, transition_counts)``, both keyed by label."""
        counts = self.get_all_language_counts()
        cond = self.get_all_conditional_counts()
        return (counts, cond)

    @staticmethod
    def _char_probability(joint, char, lang):
        """P(char) under ``lang``; 0.0 if the character was never seen."""
        counts = joint[0][lang]
        total = sum(counts.values())
        if total == 0:
            return 0.0
        return counts.get(char, 0) / total

    @staticmethod
    def _transition_probability(joint, c1, c2, lang):
        """P(c2 | c1) under ``lang``; 0.0 if ``c1`` has no recorded followers."""
        followers = joint[1][lang].get(c1)
        if not followers:
            return 0.0
        return followers.get(c2, 0) / sum(followers.values())

    def get_probability(self, joint, c1, c2, lang):
        """Probability of the adjacent pair ``c1 c2`` under ``lang``.

        Computed as ``P(c1) * P(c2 | c1)``.  Both characters are lower-cased
        to match how the counts were collected.  Returns 0.0 (instead of
        raising ``KeyError``) for characters or transitions never observed.

        Args:
            joint: tuple returned by ``get_joint_counts``.
            c1: first character of the pair.
            c2: character that follows ``c1``.
            lang: label to look the counts up under.
        """
        c1 = c1.lower()
        c2 = c2.lower()
        first = self._char_probability(joint, c1, lang)
        return first * self._transition_probability(joint, c1, c2, lang)

    def get_conditional_word(self, joint, word, lang):
        """Markov-chain probability of ``word`` under ``lang``.

        ``P(word) = P(w[0]) * prod_i P(w[i+1] | w[i])``.  The word is
        lower-cased per character first.  An empty word yields 1.0 (empty
        product); any unseen character or transition yields 0.0.
        """
        chars = [ch.lower() for ch in word]
        if not chars:
            return 1.0
        # Chance of starting with the first character ...
        word_prob = self._char_probability(joint, chars[0], lang)
        # ... times the chance of each following character given its predecessor.
        for c1, c2 in zip(chars, chars[1:]):
            word_prob *= self._transition_probability(joint, c1, c2, lang)
        return word_prob

    def get_all_conditional_words(self, joint, word):
        """Score ``word`` under every label and normalise the scores to sum to 1.

        Returns:
            ``{label: relative probability}``.  If no language can produce the
            word at all (every raw score is 0), every label maps to 0.0 rather
            than raising ``ZeroDivisionError``.
        """
        chances = {
            lang: self.get_conditional_word(joint, word, lang)
            for lang in self.labels
        }
        total = sum(chances.values())
        if total == 0:
            return {lang: 0.0 for lang in self.labels}
        return {lang: chances[lang] / total for lang in self.labels}
        

In [163]:
# Language codes; labels[i] is the language of corpus[i].
labels = [
    "rw",
    "sw",
    "en",
    "fr",
]
# One text file per language, listed in the same order as `labels`.
corpus = [
    "rw_book.txt",
    "sw_book.txt",
    "eng_book.txt",
    "fr_book.txt",
]
# Despite its name, this chain covers all four languages above.
EngChain = MarkovChain(labels=labels, corpus=corpus)


In [164]:
# Read every corpus file and build the (character counts, next-character
# counts) tuple, each keyed by language label.
joint = EngChain.get_joint_counts()

# Score the word "tree" against every language and print the per-language
# scores after normalising them to sum to 1.
# NOTE(review): the recorded output below starts with a tuple of per-label
# character counts, which looks like `joint` rather than `prob` -- the saved
# output may be stale; re-run on a fresh kernel to confirm.
prob = EngChain.get_all_conditional_words(joint, "tree")
print(prob)

({'rw': {'u': 24410, 'b': 16564, 's': 13129, 'h': 10008, 'a': 43735, 'k': 13380, 't': 9257, 'i': 28876, 'm': 12224, 'e': 19461, 'n': 19817, 'y': 10976, 'չ': 115, 'r': 19135, 'ձ': 55, 'v': 1538, 'o': 16067, 'd': 4200, 'l': 2088, 'w': 7028, 'x': 259, 'p': 1527, 'c': 3428, 'f': 1470, 'q': 194, 'g': 11371, 'z': 4923, 'թ': 1535, 'յ': 1176, 'օ': 67, 'j': 1158, 'ա': 778, 'ҫ': 23, 'ө': 15, 'р': 245, 'б': 47, 'в': 36, 'ҕ': 10, 'ӯ': 70, 'ӡ': 202, 'ҏ': 9, 'ү': 12, 'ҵ': 39, 'ғ': 6, 'ӻ': 6, 'ұ': 8, 'ҡ': 18, 'փ': 6, 'ӱ': 6, 'ՠ': 14, 'ҍ': 11, 'ҁ': 1, 'ҩ': 63, 'ҥ': 8, 'ӭ': 29, 'ҭ': 40, 'ճ': 4, 'ե': 23, 'ւ': 29, 'ҳ': 16, 'ң': 12, 'ҧ': 12, 'ӷ': 2, 'ӹ': 2, 'ծ': 5, 'ղ': 2, 'ɠ': 7, 'ԫ': 3, 'ӿ': 1, 'ӳ': 6, 'ɝ': 5, 'τ': 1, 'վ': 7, 'ը': 6, 'ո': 3, 'կ': 5, 'ԉ': 1, 'ɕ': 1, 'ɮ': 2, 'ԯ': 4, 'ї': 1, 'ҋ': 4, 'ɩ': 2, 'ӎ': 4, 'ԩ': 1, 'ҹ': 4, 'ѕ': 5, 'е': 9, 'д': 13, 'ԑ': 1, 'щ': 1, 'и': 19, 'з': 8, 'ж': 2, 'ǡ': 1, 'ȯ': 1, 'й': 10, 'а': 26, 'г': 24, 'җ': 12, 'ґ': 1, 'դ': 1, 'ӈ': 1, 'ӊ': 1, 'լ': 2, 'շ': 4, 'ҷ': 1}, 'sw