In [None]:
import numpy as np
import os
import glob
import random

In [None]:
data_folder = "languageID"
data_files = os.listdir(data_folder)

languages = ['e', 's', 'j']

# Training set: documents 0-9 of every language, kept only when the file is
# actually present in the data folder.
candidate_names = [f"{lang}{doc_idx}.txt" for lang in languages for doc_idx in range(10)]
training_files = [
    name for name in candidate_names
    if os.path.exists(os.path.join(data_folder, name))
]

# Number of training documents per language, in the same order as `languages`.
file_language_counts = [
    sum(1 for name in training_files if name.startswith(lang))
    for lang in languages
]

# Class priors with additive smoothing: (count + alpha) / (total + alpha * K).
alpha = 0.5
prior_denominator = sum(file_language_counts) + alpha * len(languages)
smooth_prior_probs = [
    (n_docs + alpha) / prior_denominator for n_docs in file_language_counts
]

In [None]:
# Report the smoothed prior of each class, one line per language code.
for idx, lang_code in enumerate('esj'):
    print('p̂(y=' + lang_code + ') = ' + str(smooth_prior_probs[idx]))

In [None]:
def get_class_cond_prob(lang_files, language, data_dir="languageID/", alpha=0.5):
    """Estimate smoothed per-character probabilities from a set of files.

    Only the 26 lowercase letters and the space character are counted; every
    other character in the files is ignored. Each estimate is also printed.

    Parameters
    ----------
    lang_files : iterable of str
        File names, relative to ``data_dir``, to pool the counts from.
    language : str
        Label used only in the printed output.
    data_dir : str, optional
        Folder that contains the files. Defaults to ``"languageID/"``.
    alpha : float, optional
        Additive smoothing constant. Defaults to ``0.5``.

    Returns
    -------
    list of float
        One probability per vocabulary character, ordered ``a``..``z`` then
        space: ``(count + alpha) / (total + V * alpha)`` with ``V`` the
        vocabulary size.

    Raises
    ------
    ZeroDivisionError
        If ``alpha`` is 0 and no vocabulary character was found.
    """
    vocabulary = "abcdefghijklmnopqrstuvwxyz "
    char_counts = dict.fromkeys(vocabulary, 0)
    for file in lang_files:
        with open(os.path.join(data_dir, file), "r") as f:
            for char in f.read():
                if char in char_counts:
                    char_counts[char] += 1
    total_chars = sum(char_counts.values())

    # The smoothing mass is tied to the vocabulary size instead of a literal 27
    # so the two can never drift apart.
    denominator = total_chars + len(vocabulary) * alpha
    char_cond_probs = []
    for char in vocabulary:
        cond_prob = (char_counts[char] + alpha) / denominator
        char_cond_probs.append(cond_prob)
        print(f"θ{char},{language} = {cond_prob:.5f}")
    return char_cond_probs

In [None]:
# Print and display the smoothed character probabilities estimated from the
# training files whose names start with 'e'.
get_class_cond_prob([file for file in training_files if file.startswith('e')], 'e')

In [None]:
# Print and display the smoothed character probabilities estimated from the
# training files whose names start with 's'.
get_class_cond_prob([file for file in training_files if file.startswith('s')], 's')

In [None]:
# Print and display the smoothed character probabilities estimated from the
# training files whose names start with 'j'.
get_class_cond_prob([file for file in training_files if file.startswith('j')], 'j')

In [None]:
def get_class_count(lang_files, data_dir="languageID/"):
    """Count vocabulary characters pooled over a set of files.

    Only the 26 lowercase letters and the space character are counted; every
    other character in the files is ignored. Each count is also printed.

    Parameters
    ----------
    lang_files : iterable of str
        File names, relative to ``data_dir``, to pool the counts from.
    data_dir : str, optional
        Folder that contains the files. Defaults to ``"languageID/"``.

    Returns
    -------
    list of int
        One count per vocabulary character, ordered ``a``..``z`` then space.
    """
    vocabulary = "abcdefghijklmnopqrstuvwxyz "
    char_counts = dict.fromkeys(vocabulary, 0)
    for file in lang_files:
        with open(os.path.join(data_dir, file), "r") as f:
            for char in f.read():
                if char in char_counts:
                    char_counts[char] += 1

    char_counts_array = []
    for char in vocabulary:
        N_c = char_counts[char]
        char_counts_array.append(N_c)
        print(f"x_{char} = {N_c:.0f}")
    return char_counts_array

In [None]:
# Print and display the character counts of the data file(s) whose name starts
# with 'e10'.
get_class_count([file for file in data_files if file.startswith('e10')])

In [None]:
def get_log_likelihood(char_counts, class_cond_prob, vocab_size=27):
    """Return the sum of ``count * log10(probability)`` over the vocabulary.

    Only the first ``vocab_size`` positions of ``char_counts`` and
    ``class_cond_prob`` are read; the two sequences are matched by index.
    """
    log_terms = (
        char_counts[idx] * np.log10(class_cond_prob[idx])
        for idx in range(vocab_size)
    )
    total = 0
    for term in log_terms:
        total += term
    return total

In [None]:
# Character counts of the query document (file name starting with 'e10'). They
# do not depend on the candidate language, so the file is read and counted once
# instead of once per language.
query_char_counts = get_class_count([file for file in data_files if file.startswith('e10')])

# log10-likelihood of the query document under each language, in the order of
# `languages`.
all_log_likelihoods = [
    get_log_likelihood(
        query_char_counts,
        get_class_cond_prob(
            [file for file in training_files if file.startswith(lang_code)],
            lang_code,
        ),
    )
    for lang_code in languages
]

In [None]:
# Display the log10-likelihoods, one per entry of `languages`.
all_log_likelihoods

In [None]:
# One line per language: the log10-likelihood of the query document.
for idx, lang_code in enumerate(languages[:3]):
    print(f'log(p̂(x|y={lang_code})) = {all_log_likelihoods[idx]}')

In [None]:
# Convert the first log10-likelihood back to a plain probability.
# NOTE(review): for a long document this presumably underflows to 0.0 in
# floating point, which would explain the switch to mpmath below - confirm.
10**all_log_likelihoods[0]

In [None]:
!pip install mpmath


In [None]:
import mpmath

# Likelihoods on the linear scale, computed as 10 ** log10-likelihood with
# mpmath's arbitrary-precision power.
exp_values = [mpmath.power(10, all_log_likelihoods[idx]) for idx in range(3)]

In [None]:
# One line per language: the likelihood of the query document.
for idx, lang_code in enumerate(languages[:3]):
    print(f'p̂(x|y={lang_code}) = {exp_values[idx]}')

In [None]:
#net_count_vocab = get_class_count([file for file in training_files])

In [None]:
#net_count_vocab_prob = [i/sum(net_count_vocab) for i in net_count_vocab]

In [None]:
def get_posterior_denom(char_count, net_count_vocab_prob, vocab_size=27):
    """Return the sum of ``count * log10(probability)`` over the vocabulary.

    Only the first ``vocab_size`` positions of ``char_count`` and
    ``net_count_vocab_prob`` are read; the two sequences are matched by index.
    """
    log_terms = (
        char_count[idx] * np.log10(net_count_vocab_prob[idx])
        for idx in range(vocab_size)
    )
    accumulated = 0
    for term in log_terms:
        accumulated += term
    return accumulated

In [None]:
# Character distribution pooled over all training files. These two names are
# otherwise only present in commented-out cells, so they are built here to keep
# this cell runnable on a fresh kernel.
# NOTE(review): the estimate is unsmoothed, so a character that never occurs in
# the training files gets probability 0 and log10 yields -inf - confirm that
# this is acceptable.
net_count_vocab = get_class_count(training_files)
net_count_vocab_total = sum(net_count_vocab)
net_count_vocab_prob = [count / net_count_vocab_total for count in net_count_vocab]

constant_denom = get_posterior_denom(
    get_class_count([file for file in data_files if file.startswith('e10')]),
    net_count_vocab_prob,
)

In [None]:
# Unnormalised log10-posterior per language: log prior + log likelihood.
# The constant denominator (constant_denom) is deliberately not subtracted.
all_posteriors = [
    np.log10(smooth_prior_probs[idx]) + all_log_likelihoods[idx]
    for idx in range(3)
]

In [None]:
# One line per language: the unnormalised log10-posterior.
for idx, lang_code in enumerate(languages[:3]):
    print(f'log(p̂(y={lang_code}|x)) = {all_posteriors[idx]}')

In [None]:
# Unnormalised posteriors on the linear scale, via mpmath's
# arbitrary-precision power.
exp_posteriors = [mpmath.power(10, all_posteriors[idx]) for idx in range(3)]

In [None]:
# One line per language: the unnormalised posterior.
for idx, lang_code in enumerate(languages[:3]):
    print(f'p̂(y={lang_code}|x) = {exp_posteriors[idx]}')

In [None]:
# Normalise the three unnormalised posteriors by their sum and display them.
posterior_total = sum(exp_posteriors)
tuple(exp_posteriors[idx] / posterior_total for idx in range(3))

In [None]:
def get_prediction(file, languages, training_files, smooth_prior_probs):
    """Predict the class label of one document.

    For every label the class-conditional character probabilities are
    re-estimated from the matching training files, the document's
    log10-likelihood is added to the log10 prior, and the label with the
    largest sum is returned.

    Parameters
    ----------
    file : str
        Name of the document to classify; passed to ``get_class_count``.
    languages : sequence of str
        Candidate labels. Each label is also the file-name prefix that selects
        its training files. Any number of labels is supported.
    training_files : iterable of str
        Names of all training files.
    smooth_prior_probs : sequence of float
        Prior probability of each label, aligned with ``languages``.

    Returns
    -------
    str
        The entry of ``languages`` with the largest log-posterior (the first
        one in case of a tie).
    """
    char_count_file = get_class_count([file])

    log_posteriors = []
    for idx, language in enumerate(languages):
        language_files = [name for name in training_files if name.startswith(language)]
        class_cond_prob = get_class_cond_prob(language_files, language)
        log_likelihood = get_log_likelihood(char_count_file, class_cond_prob)
        log_posteriors.append(np.log10(smooth_prior_probs[idx]) + log_likelihood)

    # log10 is strictly increasing, so the largest log-posterior identifies the
    # largest posterior; exponentiating first is unnecessary.
    return languages[int(np.argmax(log_posteriors))]

In [None]:
# Held-out documents: every "<language><number>.txt" file that was not used for
# training. Other files in the folder (for example the shuffled copy written
# later in this notebook) are skipped, because the ground-truth label is read
# from the first character of the file name.
test_files = [
    file for file in data_files
    if file not in training_files
    and file.endswith('.txt')
    and file[:1] in languages
    and file[1:-4].isdigit()
]

In [None]:
# Predicted label for every held-out document, in the order of `test_files`.
predictions_all_testfiles = [
    get_prediction(name, languages, training_files, smooth_prior_probs)
    for name in test_files
]

In [None]:
# The true label of a test document is the first character of its file name.
groundtruth = [name[0] for name in test_files]

In [None]:
# Zero-initialised counter for every ordered pair of language codes.
confusion_mat = {
    (first, second): 0
    for first in languages
    for second in languages
}

In [None]:
# Tally each test document under its (predicted, true) label pair.
for predicted, actual in zip(predictions_all_testfiles, groundtruth):
    confusion_mat[predicted, actual] += 1

In [None]:
from tabulate import tabulate

# Rows are the predicted label and columns the true label, following the
# (prediction, groundtruth) key order used when the matrix was filled.
data = [
    [confusion_mat[predicted, actual] for actual in languages]
    for predicted in languages
]

# Take the axis labels from `languages` so that they always match the order in
# which the cells were laid out above. The leading empty header sits above the
# index column.
column_headers = ["", *languages]
row_headers = list(languages)

table = tabulate(data, headers=column_headers, showindex=row_headers, tablefmt="grid")

In [None]:
# Two blank lines, then the rendered confusion table.
print("\n", table, sep="\n")

In [None]:
# Write a copy of e11.txt with its characters in random order, presumably to
# check that the prediction does not depend on character order.
input_file = os.path.join(data_folder, 'e11.txt')
with open(input_file, 'r') as source:
    text = source.read()

all_chars = list(text)
# A dedicated, seeded generator makes the shuffled file identical on every run
# and leaves the global random state untouched.
random.Random(0).shuffle(all_chars)

text_shuffled = ''.join(all_chars)
with open(os.path.join(data_folder, 'shuffled_e11.txt'), 'w') as target:
    target.write(text_shuffled)

In [None]:
# Classify the shuffled copy of e11.txt written by the previous cell.
get_prediction('shuffled_e11.txt', languages, training_files, smooth_prior_probs)