In [1]:
import math
import nltk
from nltk.corpus import udhr

In [2]:
# Load the raw UDHR text of each language used in this notebook
english, french, italian, spanish = (
    udhr.raw(fileid)
    for fileid in (
        'English-Latin1',
        'French_Francais-Latin1',
        'Italian_Italiano-Latin1',
        'Spanish_Espanol-Latin1',
    )
)

In [3]:
# Split each raw text by character position:
# the first 1000 characters are the training sample, the next 100 the development sample.
english_train = english[:1000]
english_dev = english[1000:1100]

french_train = french[:1000]
french_dev = french[1000:1100]

italian_train = italian[:1000]
italian_dev = italian[1000:1100]

spanish_train = spanish[:1000]
spanish_dev = spanish[1000:1100]

In [4]:
# Test samples: the first 1000 word tokens of each language's UDHR text.
# NOTE(review): these tokens start at the beginning of the same documents the
# training slices are cut from, so the test text overlaps the training text.
english_test, french_test, italian_test, spanish_test = (
    udhr.words(fileid)[:1000]
    for fileid in (
        'English-Latin1',
        'French_Francais-Latin1',
        'Italian_Italiano-Latin1',
        'Spanish_Espanol-Latin1',
    )
)

In [5]:
# Function to select only alpha-numeric characters
def process_text(text):
    """Return the alphanumeric characters of ``text`` as a list.

    Every character for which ``str.isalnum()`` is false (whitespace,
    punctuation, symbols) is dropped; the remaining characters keep their
    original order. Case is not changed here.

    Args:
        text: The string to filter.

    Returns:
        A list of single-character strings.
    """
    # Single pass over the text; avoids one full-string str.replace() call
    # per non-alphanumeric character.
    return [char for char in text if char.isalnum()]

In [6]:
# Function to create and return unigram, bigram, trigram
def ngram_creation(text, no):
    """Build the list of ``no``-grams over ``text``.

    Args:
        text: Sequence of items to window over (the notebook passes a list
            of characters).
        no: Size of each n-gram (1 = unigram, 2 = bigram, 3 = trigram).

    Returns:
        A list holding every tuple yielded by ``nltk.ngrams(text, no)``.
    """
    return [gram for gram in nltk.ngrams(text, no)]

In [7]:
# Function to calculate probability
def prob_calc(sample, values):
    """Return the natural logarithm of ``sample / values``.

    Args:
        sample: Numerator (a count in this notebook).
        values: Denominator (a total or context count in this notebook).

    Returns:
        ``math.log(sample / values)`` as a float.
    """
    ratio = sample / values
    return math.log(ratio)

In [8]:
# Creates a dictionary of unigram probabilities
def uni_probability(freq_dist):
    """Map every sample of a frequency distribution to its log relative frequency.

    Args:
        freq_dist: Mapping from sample to count (an ``nltk.FreqDist`` in this
            notebook).

    Returns:
        A dict ``{sample: prob_calc(count, total)}`` where ``total`` is the
        sum of all counts in ``freq_dist``.
    """
    total = sum(freq_dist.values())
    return {item: prob_calc(count, total) for item, count in freq_dist.items()}

In [9]:
# Creates a dictionary of bigram probabilities
def bi_probability(freq_dist, bigram):
    """Compute conditional log-probabilities for every distinct bigram.

    For each distinct bigram ``(a, b)`` the value stored is
    ``prob_calc(count(a, b), count(a))``, i.e. the log of the bigram count
    divided by the unigram count of its first element.

    Args:
        freq_dist: Mapping from single item to its count (unigram counts).
        bigram: Iterable of 2-tuples to count.

    Returns:
        A tuple ``(bi_prob, bi_fd)``: the dict of log-probabilities keyed by
        bigram, and the ``nltk.FreqDist`` of bigram counts it was built from.
    """
    bi_fd = nltk.FreqDist(bigram)
    bi_prob = {}
    for (first, second), pair_count in bi_fd.items():
        # Direct membership test instead of scanning every unigram key;
        # bigrams whose first element has no unigram count are skipped.
        if first in freq_dist:
            bi_prob[first, second] = prob_calc(pair_count, freq_dist[first])
    return bi_prob, bi_fd

In [10]:
# Creates a dictionary of trigram probabilities
def tri_probabilty(bi_fd, trigram):
    """Compute conditional log-probabilities for every distinct trigram.

    For each distinct trigram ``(a, b, c)`` the value stored is
    ``prob_calc(count(a, b, c), count(a, b))``, i.e. the log of the trigram
    count divided by the count of its leading bigram.

    Args:
        bi_fd: Mapping from 2-tuple to its count (bigram counts).
        trigram: Iterable of 3-tuples to count.

    Returns:
        A dict of log-probabilities keyed by trigram.
    """
    tri_fd = nltk.FreqDist(trigram)
    tri_prob = {}
    for (first, second, third), tri_count in tri_fd.items():
        context = (first, second)
        # Direct lookup of the leading bigram instead of scanning every
        # bigram key; trigrams with an uncounted context are skipped.
        if context in bi_fd:
            tri_prob[first, second, third] = prob_calc(tri_count, bi_fd[context])
    return tri_prob

In [11]:
# Calculating accuracies of both the unigram language models
def compare_uni(language1_test, lang1_uni_prob, lang2_uni_prob):
    """Count how many test tokens each unigram model scores higher.

    Each alphanumeric token is lower-cased and scored under both models by
    summing the stored per-character log-probabilities. A character missing
    from a table adds 0 to that model's score (there is no smoothing).
    Tokens for which ``str.isalnum()`` is false are skipped and counted for
    neither model.

    Args:
        language1_test: Iterable of string tokens to classify.
        lang1_uni_prob: Dict mapping character to log-probability, model 1.
        lang2_uni_prob: Dict mapping character to log-probability, model 2.

    Returns:
        A tuple ``(language1_acc, language2_acc)``: the number of tokens where
        model 1 scored at least as high as model 2 (ties go to model 1), and
        the number where model 2 scored strictly higher.
    """
    language1_acc = 0
    language2_acc = 0
    for sample in language1_test:
        if not sample.isalnum():
            continue
        score1 = 0.0
        score2 = 0.0
        for character in sample.lower():
            # Dict lookup instead of scanning every key for an equal one.
            score1 += lang1_uni_prob.get(character, 0)
            score2 += lang2_uni_prob.get(character, 0)
        # Compare the log-scores directly: exp() is monotonic, so the ordering
        # is the same, and very negative sums no longer underflow to 0.0 on
        # both sides (which would force a tie).
        if score2 > score1:
            language2_acc += 1
        else:
            language1_acc += 1
    return language1_acc, language2_acc

In [12]:
#  Calculating accuracies of both the bigram language models
def compare_bi(language1_test, lang1_bi_prob, lang2_bi_prob):
    """Count how many test tokens each bigram model scores higher.

    Each token longer than one character is lower-cased, split into
    character bigrams with ``nltk.ngrams`` and scored under both models by
    summing the stored log-probabilities. A bigram missing from a table adds
    0 to that model's score (there is no smoothing). No alphanumeric filter
    is applied here, so punctuation tokens of two or more characters are
    scored as well. Shorter tokens are skipped and counted for neither model.

    Args:
        language1_test: Iterable of string tokens to classify.
        lang1_bi_prob: Dict mapping a 2-tuple of characters to
            log-probability, model 1.
        lang2_bi_prob: Dict mapping a 2-tuple of characters to
            log-probability, model 2.

    Returns:
        A tuple ``(language1_acc, language2_acc)``: the number of tokens where
        model 1 scored at least as high as model 2 (ties go to model 1), and
        the number where model 2 scored strictly higher.
    """
    language1_acc = 0
    language2_acc = 0
    for sample in language1_test:
        if len(sample) <= 1:
            continue
        score1 = 0.0
        score2 = 0.0
        for pair in nltk.ngrams(sample.lower(), 2):
            # Dict lookup instead of scanning every key for an equal one.
            score1 += lang1_bi_prob.get(pair, 0)
            score2 += lang2_bi_prob.get(pair, 0)
        # Compare the log-scores directly: exp() is monotonic, so the ordering
        # is the same, and very negative sums no longer underflow to 0.0 on
        # both sides (which would force a tie).
        if score2 > score1:
            language2_acc += 1
        else:
            language1_acc += 1
    return language1_acc, language2_acc

In [13]:
#  Calculating accuracies of both the Trigram language models
def compare_tri(language1_test, lang1_tri_prob, lang2_tri_prob):
    """Count how many test tokens each trigram model scores higher.

    Each token longer than two characters is lower-cased, split into
    character trigrams with ``nltk.ngrams`` and scored under both models by
    summing the stored log-probabilities. A trigram missing from a table adds
    0 to that model's score (there is no smoothing). No alphanumeric filter
    is applied here. Shorter tokens are skipped and counted for neither model.

    Args:
        language1_test: Iterable of string tokens to classify.
        lang1_tri_prob: Dict mapping a 3-tuple of characters to
            log-probability, model 1.
        lang2_tri_prob: Dict mapping a 3-tuple of characters to
            log-probability, model 2.

    Returns:
        A tuple ``(language1_acc, language2_acc)``: the number of tokens where
        model 1 scored at least as high as model 2 (ties go to model 1), and
        the number where model 2 scored strictly higher.
    """
    language1_acc = 0
    language2_acc = 0
    for sample in language1_test:
        if len(sample) <= 2:
            continue
        score1 = 0.0
        score2 = 0.0
        for triple in nltk.ngrams(sample.lower(), 3):
            # Dict lookup instead of scanning every key for an equal one.
            score1 += lang1_tri_prob.get(triple, 0)
            score2 += lang2_tri_prob.get(triple, 0)
        # Compare the log-scores directly: exp() is monotonic, so the ordering
        # is the same, and very negative sums no longer underflow to 0.0 on
        # both sides (which would force a tie).
        if score2 > score1:
            language2_acc += 1
        else:
            language1_acc += 1
    return language1_acc, language2_acc

In [14]:
# English: character-level unigram, bigram and trigram models from the training slice
english_lowered = english_train.lower()
eng_trainset_proc = process_text(english_lowered)  # alphanumeric characters only

# Raw n-gram sequences of order 1, 2 and 3
eng_uni, eng_bi, eng_tri = (
    ngram_creation(eng_trainset_proc, order) for order in (1, 2, 3)
)

# Character counts, then the log-probability table of each order
english_freq_dist = nltk.FreqDist(eng_trainset_proc)
eng_uni_prob = uni_probability(english_freq_dist)
eng_bi_prob, eng_bi_fd = bi_probability(english_freq_dist, eng_bi)
eng_tri_prob = tri_probabilty(eng_bi_fd, eng_tri)

In [15]:
# Spanish: character-level unigram, bigram and trigram models from the training slice
spanish_lowered = spanish_train.lower()
spa_trainset_proc = process_text(spanish_lowered)  # alphanumeric characters only

# Raw n-gram sequences of order 1, 2 and 3
spa_uni, spa_bi, spa_tri = (
    ngram_creation(spa_trainset_proc, order) for order in (1, 2, 3)
)

# Character counts, then the log-probability table of each order
spa_freq_dist = nltk.FreqDist(spa_trainset_proc)
spa_uni_prob = uni_probability(spa_freq_dist)
spa_bi_prob, spa_bi_fd = bi_probability(spa_freq_dist, spa_bi)
spa_tri_prob = tri_probabilty(spa_bi_fd, spa_tri)

In [16]:
# Italian: character-level unigram, bigram and trigram models from the training slice
italian_lowered = italian_train.lower()
it_trainset_proc = process_text(italian_lowered)  # alphanumeric characters only

# Raw n-gram sequences of order 1, 2 and 3
it_uni, it_bi, it_tri = (
    ngram_creation(it_trainset_proc, order) for order in (1, 2, 3)
)

# Character counts, then the log-probability table of each order
it_freq_dist = nltk.FreqDist(it_trainset_proc)
it_uni_prob = uni_probability(it_freq_dist)
it_bi_prob, it_bi_fd = bi_probability(it_freq_dist, it_bi)
it_tri_prob = tri_probabilty(it_bi_fd, it_tri)

In [17]:
# French: character-level unigram, bigram and trigram models from the training slice
french_lowered = french_train.lower()
french_trainset_proc = process_text(french_lowered)  # alphanumeric characters only

# Raw n-gram sequences of order 1, 2 and 3
french_uni, french_bi, french_tri = (
    ngram_creation(french_trainset_proc, order) for order in (1, 2, 3)
)

# Character counts, then the log-probability table of each order
french_freq_dist = nltk.FreqDist(french_trainset_proc)
french_uni_prob = uni_probability(french_freq_dist)
french_bi_prob, french_bi_fd = bi_probability(french_freq_dist, french_bi)
french_tri_prob = tri_probabilty(french_bi_fd, french_tri)

In [18]:
# English VS French: unigram models scored on the English test tokens.
# acc1 = tokens where the English model scored at least as high,
# acc2 = tokens where the French model scored strictly higher.
acc1, acc2 = compare_uni(english_test, eng_uni_prob, french_uni_prob)
# Both counts come from english_test, so both percentages are taken over its
# length. Tokens skipped by compare_uni still count in the denominator, so the
# two percentages need not sum to 100.
eng_uni_accuracy = str(round(((acc1 / len(english_test)) * 100), 2))
french_uni_accuracy = str(round(((acc2 / len(english_test)) * 100), 2))
print("\nProblem 1 ===> \n")
print("English VS French: ")
print("English Unigram =>")
print(eng_uni)
print("French Unigram =>")
print(french_uni)
print("Unigram accuracy of English model: ", eng_uni_accuracy)
print("Unigram accuracy of French model: ", french_uni_accuracy)
print("---------------------------------------------------------------------------------------")


Problem 1 ===> 

English VS French: 
English Unigram =>
[('u',), ('n',), ('i',), ('v',), ('e',), ('r',), ('s',), ('a',), ('l',), ('d',), ('e',), ('c',), ('l',), ('a',), ('r',), ('a',), ('t',), ('i',), ('o',), ('n',), ('o',), ('f',), ('h',), ('u',), ('m',), ('a',), ('n',), ('r',), ('i',), ('g',), ('h',), ('t',), ('s',), ('p',), ('r',), ('e',), ('a',), ('m',), ('b',), ('l',), ('e',), ('w',), ('h',), ('e',), ('r',), ('e',), ('a',), ('s',), ('r',), ('e',), ('c',), ('o',), ('g',), ('n',), ('i',), ('t',), ('i',), ('o',), ('n',), ('o',), ('f',), ('t',), ('h',), ('e',), ('i',), ('n',), ('h',), ('e',), ('r',), ('e',), ('n',), ('t',), ('d',), ('i',), ('g',), ('n',), ('i',), ('t',), ('y',), ('a',), ('n',), ('d',), ('o',), ('f',), ('t',), ('h',), ('e',), ('e',), ('q',), ('u',), ('a',), ('l',), ('a',), ('n',), ('d',), ('i',), ('n',), ('a',), ('l',), ('i',), ('e',), ('n',), ('a',), ('b',), ('l',), ('e',), ('r',), ('i',), ('g',), ('h',), ('t',), ('s',), ('o',), ('f',), ('a',), ('l',), ('l',), ('m',)

In [19]:
# English VS French: bigram models scored on the English test tokens.
# acc1 = tokens where the English model scored at least as high,
# acc2 = tokens where the French model scored strictly higher.
acc1, acc2 = compare_bi(english_test, eng_bi_prob, french_bi_prob)
# Both counts come from english_test, so both percentages are taken over its
# length. Tokens skipped by compare_bi still count in the denominator, so the
# two percentages need not sum to 100.
eng_bi_accuracy = str(round(((acc1 / len(english_test)) * 100), 2))
french_bi_accuracy = str(round(((acc2 / len(english_test)) * 100), 2))
print("English Bigram =>")
print(eng_bi)
print("French Bigram =>")
print(french_bi)
print("Bigram accuracy of English model: ", eng_bi_accuracy)
print("Bigram accuracy of French model: ", french_bi_accuracy)
print("---------------------------------------------------------------------------------------")

English Bigram =>
[('u', 'n'), ('n', 'i'), ('i', 'v'), ('v', 'e'), ('e', 'r'), ('r', 's'), ('s', 'a'), ('a', 'l'), ('l', 'd'), ('d', 'e'), ('e', 'c'), ('c', 'l'), ('l', 'a'), ('a', 'r'), ('r', 'a'), ('a', 't'), ('t', 'i'), ('i', 'o'), ('o', 'n'), ('n', 'o'), ('o', 'f'), ('f', 'h'), ('h', 'u'), ('u', 'm'), ('m', 'a'), ('a', 'n'), ('n', 'r'), ('r', 'i'), ('i', 'g'), ('g', 'h'), ('h', 't'), ('t', 's'), ('s', 'p'), ('p', 'r'), ('r', 'e'), ('e', 'a'), ('a', 'm'), ('m', 'b'), ('b', 'l'), ('l', 'e'), ('e', 'w'), ('w', 'h'), ('h', 'e'), ('e', 'r'), ('r', 'e'), ('e', 'a'), ('a', 's'), ('s', 'r'), ('r', 'e'), ('e', 'c'), ('c', 'o'), ('o', 'g'), ('g', 'n'), ('n', 'i'), ('i', 't'), ('t', 'i'), ('i', 'o'), ('o', 'n'), ('n', 'o'), ('o', 'f'), ('f', 't'), ('t', 'h'), ('h', 'e'), ('e', 'i'), ('i', 'n'), ('n', 'h'), ('h', 'e'), ('e', 'r'), ('r', 'e'), ('e', 'n'), ('n', 't'), ('t', 'd'), ('d', 'i'), ('i', 'g'), ('g', 'n'), ('n', 'i'), ('i', 't'), ('t', 'y'), ('y', 'a'), ('a', 'n'), ('n', 'd'), ('d', 'o'

In [20]:
# English VS French: trigram models scored on the English test tokens.
# acc1 = tokens where the English model scored at least as high,
# acc2 = tokens where the French model scored strictly higher.
acc1, acc2 = compare_tri(english_test, eng_tri_prob, french_tri_prob)
# Both counts come from english_test, so both percentages are taken over its
# length. Tokens skipped by compare_tri still count in the denominator, so the
# two percentages need not sum to 100.
eng_tri_accuracy = str(round(((acc1 / len(english_test)) * 100), 2))
french_tri_accuracy = str(round(((acc2 / len(english_test)) * 100), 2))
print("English Trigram =>")
print(eng_tri)
print("French Trigram =>")
print(french_tri)
print("Trigram accuracy of English model: ", eng_tri_accuracy)
print("Trigram accuracy of French model: ", french_tri_accuracy)
print("---------------------------------------------------------------------------------------")

English Trigram =>
[('u', 'n', 'i'), ('n', 'i', 'v'), ('i', 'v', 'e'), ('v', 'e', 'r'), ('e', 'r', 's'), ('r', 's', 'a'), ('s', 'a', 'l'), ('a', 'l', 'd'), ('l', 'd', 'e'), ('d', 'e', 'c'), ('e', 'c', 'l'), ('c', 'l', 'a'), ('l', 'a', 'r'), ('a', 'r', 'a'), ('r', 'a', 't'), ('a', 't', 'i'), ('t', 'i', 'o'), ('i', 'o', 'n'), ('o', 'n', 'o'), ('n', 'o', 'f'), ('o', 'f', 'h'), ('f', 'h', 'u'), ('h', 'u', 'm'), ('u', 'm', 'a'), ('m', 'a', 'n'), ('a', 'n', 'r'), ('n', 'r', 'i'), ('r', 'i', 'g'), ('i', 'g', 'h'), ('g', 'h', 't'), ('h', 't', 's'), ('t', 's', 'p'), ('s', 'p', 'r'), ('p', 'r', 'e'), ('r', 'e', 'a'), ('e', 'a', 'm'), ('a', 'm', 'b'), ('m', 'b', 'l'), ('b', 'l', 'e'), ('l', 'e', 'w'), ('e', 'w', 'h'), ('w', 'h', 'e'), ('h', 'e', 'r'), ('e', 'r', 'e'), ('r', 'e', 'a'), ('e', 'a', 's'), ('a', 's', 'r'), ('s', 'r', 'e'), ('r', 'e', 'c'), ('e', 'c', 'o'), ('c', 'o', 'g'), ('o', 'g', 'n'), ('g', 'n', 'i'), ('n', 'i', 't'), ('i', 't', 'i'), ('t', 'i', 'o'), ('i', 'o', 'n'), ('o', 'n', 

In [21]:
# Spanish VS Italian: unigram models scored on the Spanish test tokens.
# acc1 = tokens where the Spanish model scored at least as high,
# acc2 = tokens where the Italian model scored strictly higher.
acc1, acc2 = compare_uni(spanish_test, spa_uni_prob, it_uni_prob)
# Both counts come from spanish_test, so both percentages are taken over its
# length. Tokens skipped by compare_uni still count in the denominator, so the
# two percentages need not sum to 100.
spa_uni_accuracy = str(round(((acc1 / len(spanish_test)) * 100), 2))
it_uni_accuracy = str(round(((acc2 / len(spanish_test)) * 100), 2))
print("\nProblem 2 ===> \n")
print("Spanish VS Italian: ")
print("Spanish Unigram =>")
print(spa_uni)
print("Italian Unigram =>")
print(it_uni)
print("Unigram accuracy of Spanish model: ", spa_uni_accuracy)
print("Unigram accuracy of Italian model: ", it_uni_accuracy)
print("---------------------------------------------------------------------------------------")


Problem 2 ===> 

Spanish VS Italian: 
Spanish Unigram =>
[('d',), ('e',), ('c',), ('l',), ('a',), ('r',), ('a',), ('c',), ('i',), ('ó',), ('n',), ('u',), ('n',), ('i',), ('v',), ('e',), ('r',), ('s',), ('a',), ('l',), ('d',), ('e',), ('d',), ('e',), ('r',), ('e',), ('c',), ('h',), ('o',), ('s',), ('h',), ('u',), ('m',), ('a',), ('n',), ('o',), ('s',), ('a',), ('d',), ('o',), ('p',), ('t',), ('a',), ('d',), ('a',), ('y',), ('p',), ('r',), ('o',), ('c',), ('l',), ('a',), ('m',), ('a',), ('d',), ('a',), ('p',), ('o',), ('r',), ('l',), ('a',), ('a',), ('s',), ('a',), ('m',), ('b',), ('l',), ('e',), ('a',), ('g',), ('e',), ('n',), ('e',), ('r',), ('a',), ('l',), ('e',), ('n',), ('s',), ('u',), ('r',), ('e',), ('s',), ('o',), ('l',), ('u',), ('c',), ('i',), ('ó',), ('n',), ('2',), ('1',), ('7',), ('a',), ('i',), ('i',), ('i',), ('d',), ('e',), ('1',), ('0',), ('d',), ('e',), ('d',), ('i',), ('c',), ('i',), ('e',), ('m',), ('b',), ('r',), ('e',), ('d',), ('e',), ('1',), ('9',), ('4',), ('8',

In [22]:
# Spanish VS Italian: bigram models scored on the Spanish test tokens.
# acc1 = tokens where the Spanish model scored at least as high,
# acc2 = tokens where the Italian model scored strictly higher.
acc1, acc2 = compare_bi(spanish_test, spa_bi_prob, it_bi_prob)
# Both counts come from spanish_test, so both percentages are taken over its
# length. Tokens skipped by compare_bi still count in the denominator, so the
# two percentages need not sum to 100.
spa_bi_accuracy = str(round(((acc1 / len(spanish_test)) * 100), 2))
it_bi_accuracy = str(round(((acc2 / len(spanish_test)) * 100), 2))
print("Spanish Bigram =>")
print(spa_bi)
print("Italian Bigram =>")
print(it_bi)
print("Bigram accuracy of Spanish model: ", spa_bi_accuracy)
print("Bigram accuracy of Italian model: ", it_bi_accuracy)
print("---------------------------------------------------------------------------------------")

Spanish Bigram =>
[('d', 'e'), ('e', 'c'), ('c', 'l'), ('l', 'a'), ('a', 'r'), ('r', 'a'), ('a', 'c'), ('c', 'i'), ('i', 'ó'), ('ó', 'n'), ('n', 'u'), ('u', 'n'), ('n', 'i'), ('i', 'v'), ('v', 'e'), ('e', 'r'), ('r', 's'), ('s', 'a'), ('a', 'l'), ('l', 'd'), ('d', 'e'), ('e', 'd'), ('d', 'e'), ('e', 'r'), ('r', 'e'), ('e', 'c'), ('c', 'h'), ('h', 'o'), ('o', 's'), ('s', 'h'), ('h', 'u'), ('u', 'm'), ('m', 'a'), ('a', 'n'), ('n', 'o'), ('o', 's'), ('s', 'a'), ('a', 'd'), ('d', 'o'), ('o', 'p'), ('p', 't'), ('t', 'a'), ('a', 'd'), ('d', 'a'), ('a', 'y'), ('y', 'p'), ('p', 'r'), ('r', 'o'), ('o', 'c'), ('c', 'l'), ('l', 'a'), ('a', 'm'), ('m', 'a'), ('a', 'd'), ('d', 'a'), ('a', 'p'), ('p', 'o'), ('o', 'r'), ('r', 'l'), ('l', 'a'), ('a', 'a'), ('a', 's'), ('s', 'a'), ('a', 'm'), ('m', 'b'), ('b', 'l'), ('l', 'e'), ('e', 'a'), ('a', 'g'), ('g', 'e'), ('e', 'n'), ('n', 'e'), ('e', 'r'), ('r', 'a'), ('a', 'l'), ('l', 'e'), ('e', 'n'), ('n', 's'), ('s', 'u'), ('u', 'r'), ('r', 'e'), ('e', 's'

In [23]:
# Spanish VS Italian: trigram models scored on the Spanish test tokens.
# acc1 = tokens where the Spanish model scored at least as high,
# acc2 = tokens where the Italian model scored strictly higher.
acc1, acc2 = compare_tri(spanish_test, spa_tri_prob, it_tri_prob)
# Both counts come from spanish_test, so both percentages are taken over its
# length. Tokens skipped by compare_tri still count in the denominator, so the
# two percentages need not sum to 100.
spa_tri_accuracy = str(round(((acc1 / len(spanish_test)) * 100), 2))
it_tri_accuracy = str(round(((acc2 / len(spanish_test)) * 100), 2))
print("Spanish Trigram =>")
print(spa_tri)
print("Italian Trigram =>")
print(it_tri)
print("Trigram accuracy of Spanish model: ", spa_tri_accuracy)
print("Trigram accuracy of Italian model: ", it_tri_accuracy)
print("---------------------------------------------------------------------------------------")
print("---------------------------------------------------------------------------------------")

Spanish Trigram =>
[('d', 'e', 'c'), ('e', 'c', 'l'), ('c', 'l', 'a'), ('l', 'a', 'r'), ('a', 'r', 'a'), ('r', 'a', 'c'), ('a', 'c', 'i'), ('c', 'i', 'ó'), ('i', 'ó', 'n'), ('ó', 'n', 'u'), ('n', 'u', 'n'), ('u', 'n', 'i'), ('n', 'i', 'v'), ('i', 'v', 'e'), ('v', 'e', 'r'), ('e', 'r', 's'), ('r', 's', 'a'), ('s', 'a', 'l'), ('a', 'l', 'd'), ('l', 'd', 'e'), ('d', 'e', 'd'), ('e', 'd', 'e'), ('d', 'e', 'r'), ('e', 'r', 'e'), ('r', 'e', 'c'), ('e', 'c', 'h'), ('c', 'h', 'o'), ('h', 'o', 's'), ('o', 's', 'h'), ('s', 'h', 'u'), ('h', 'u', 'm'), ('u', 'm', 'a'), ('m', 'a', 'n'), ('a', 'n', 'o'), ('n', 'o', 's'), ('o', 's', 'a'), ('s', 'a', 'd'), ('a', 'd', 'o'), ('d', 'o', 'p'), ('o', 'p', 't'), ('p', 't', 'a'), ('t', 'a', 'd'), ('a', 'd', 'a'), ('d', 'a', 'y'), ('a', 'y', 'p'), ('y', 'p', 'r'), ('p', 'r', 'o'), ('r', 'o', 'c'), ('o', 'c', 'l'), ('c', 'l', 'a'), ('l', 'a', 'm'), ('a', 'm', 'a'), ('m', 'a', 'd'), ('a', 'd', 'a'), ('d', 'a', 'p'), ('a', 'p', 'o'), ('p', 'o', 'r'), ('o', 'r', 