In [1]:
import math
import pickle
from nltk import word_tokenize
from nltk.util import ngrams

In [2]:
def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``.

    The file is opened in binary mode and closed again before returning.
    """
    # NOTE(review): unpickling can execute arbitrary code embedded in the
    # file, so only load pickles that come from a trusted source.
    with open(path, "rb") as handle:
        return pickle.load(handle)


# One unigram table and one bigram table per language. The later cells look
# entries up with `in` / `[]`, i.e. they are used as mappings to counts.
english_uni = _load_pickle('english_uni.pickle')
english_bi = _load_pickle('english_bi.pickle')

french_uni = _load_pickle('french_uni.pickle')
french_bi = _load_pickle('french_bi.pickle')

italian_uni = _load_pickle('italian_uni.pickle')
italian_bi = _load_pickle('italian_bi.pickle')

In [3]:
# Vocabulary size used by the smoothing denominator: the entry counts of the
# three unigram tables added together (an entry that appears in several
# tables is counted once per table).
V = sum(len(table) for table in (english_uni, french_uni, italian_uni))

In [4]:
def compute_prob(text, unigram_dict, bigram_dict, V, log_space=False):
    """Score ``text`` under an add-one (Laplace) smoothed bigram model.

    Every adjacent token pair ``(w1, w2)`` contributes the factor
    ``(count(w1, w2) + 1) / (count(w1) + V)``; counts missing from the
    tables are treated as 0.

    Args:
        text: Raw text to score. Newlines are treated as whitespace.
        unigram_dict: Mapping from token to its count.
        bigram_dict: Mapping from ``(token, token)`` tuple to its count.
        V: Vocabulary size added to every denominator.
        log_space: When true, return the sum of the natural logs of the
            factors instead of their product. The product underflows to
            0.0 on long inputs, which makes different models
            indistinguishable; the log sum does not.

    Returns:
        The product of the per-bigram factors (``1`` when the text has
        fewer than two tokens), or their log sum (``0.0`` in that case)
        when ``log_space`` is true.
    """
    # A newline separates words just like a space does. Deleting it outright
    # would glue the last word of one line onto the first word of the next.
    tokens = word_tokenize(text.replace("\n", " "))

    p_laplace = 1
    log_total = 0.0

    for bigram in ngrams(tokens, 2):
        n = bigram_dict.get(bigram, 0)
        d = unigram_dict.get(bigram[0], 0)
        factor = (n + 1) / (d + V)

        if log_space:
            log_total += math.log(factor)
        else:
            p_laplace *= factor

    return log_total if log_space else p_laplace


In [5]:
def language_probability(text):
    """Score ``text`` with each language's bigram model.

    Returns a dict keyed by "English", "French" and "Italian" (in that
    order) whose values are the corresponding ``compute_prob`` results,
    all computed with the shared vocabulary size ``V``.
    """
    models = (
        ("English", english_uni, english_bi),
        ("French", french_uni, french_bi),
        ("Italian", italian_uni, italian_bi),
    )
    return {
        language: compute_prob(text, uni_counts, bi_counts, V)
        for language, uni_counts, bi_counts in models
    }

In [6]:
# Predict a language for every line of the test file: the label whose model
# gives the line the highest score wins.
with open("./data/LangId.test.txt", "r") as f:
    texts = list(f)

predictions = []
for text in texts:
    probs = language_probability(text)
    # max() keeps the first best key it meets, so ties go to the earliest
    # key in the dict's order.
    pred = max(probs, key=lambda language: probs[language])
    predictions.append(pred)

# Reference labels: the second space-separated field of each solution line.
with open("./data/LangId.sol.txt", "r") as f:
    sol = list(f)

true = [entry.split(" ")[1].strip() for entry in sol]

In [7]:
def calculate_accuracy(true, predictions):
    """Compare predicted labels with reference labels.

    Args:
        true: Sequence of reference labels.
        predictions: Sequence of predicted labels, aligned with ``true``.

    Returns:
        A ``(accuracy, incorrect)`` tuple: the percentage of positions where
        the two sequences agree, rounded to two decimals, and the number of
        positions where they differ.

    Raises:
        ValueError: If the sequences differ in length, or if they are empty
            (an accuracy over zero items is undefined).
    """
    if len(true) != len(predictions):
        raise ValueError("The length of predictions and true values must be same")

    total = len(predictions)
    if total == 0:
        # Without this guard the percentage below divides by zero.
        raise ValueError("Cannot calculate accuracy for empty inputs")

    correct_preds = sum(p == t for p, t in zip(predictions, true))
    incorrect_preds = total - correct_preds
    accuracy = 100 * correct_preds / total

    # Round through a two-decimal string so the result is a plain float.
    return float("{:0.2f}".format(accuracy)), incorrect_preds

In [8]:
# Score the predictions against the reference labels and report both numbers,
# one per line.
acc, incorrect_pred = calculate_accuracy(true, predictions)
print(
    f"Accuracy of Ngram Model: {acc}",
    f"Number of incorrect predictions: {incorrect_pred}",
    sep="\n",
)

96.67