In [54]:
import math
import os
import re
from collections import defaultdict
from math import log

import numpy as np

In [55]:
data_directory = os.path.join(os.getcwd(), 'data')

# Collect the absolute paths of all training files found in the data directory,
# i.e. every entry whose name starts with 'training' (training.de, training.en,
# training.es). The listing is sorted because os.listdir returns entries in
# arbitrary order, which would otherwise make the output differ between runs.
training_files = [
    os.path.join(data_directory, file_name)
    for file_name in sorted(os.listdir(data_directory))
    if file_name.startswith('training')
]
print(training_files)

['/afs/inf.ed.ac.uk/user/s24/s2446690/Desktop/anlp/assignment1/data/training.es', '/afs/inf.ed.ac.uk/user/s24/s2446690/Desktop/anlp/assignment1/data/training.en', '/afs/inf.ed.ac.uk/user/s24/s2446690/Desktop/anlp/assignment1/data/training.de']


In [56]:
 # Character vocabulary of the language models: the lowercase letters a-z,
 # '0' (preprocess_line maps every digit to it), the full stop and the space.
 vocabulary = list('abcdefghijklmnopqrstuvwxyz0. ')

In [57]:
# Question 1
def preprocess_line(line):
    """Normalise one line of text to the model's character set and add sentence markers.

    The line is lowercased, every digit is replaced by '0', and every character
    other than a-z, '0', '.' and the plain space is removed. The result is then
    wrapped in the sentence markers: '##' at the beginning and '#' at the end.

    Args:
        line (str): the raw line of text.

    Returns:
        str: '##' + cleaned line + '#'. An empty (or fully filtered) line gives '###'.
    """
    lowered_with_zeros = re.sub(r'\d', '0', line.lower())
    # Keep exactly the characters of the vocabulary. A whitespace class must not
    # be used here: it would also keep tabs, newlines and other whitespace that
    # are not in the vocabulary and therefore form trigrams unknown to the models.
    cleaned_line = re.sub(r'[^a-z0. ]+', '', lowered_with_zeros)
    return '##' + cleaned_line + '#'

In [68]:
# Question 3 a.
def generate_trigrams_from_vocabulary():
    """Enumerate every trigram that the model treats as valid over `vocabulary`.

    The '#' character stands for both the beginning (<s>) and the end (</s>) of
    a sentence. Four families of trigrams are produced, in this order:
    '##a', '#ab', 'ab#' and 'abc', where a, b and c are vocabulary characters.

    Returns:
        list: the trigram strings, in generation order.
    """
    # Every ordered pair of vocabulary characters (first character varies slowest)
    character_pairs = [first + second for first in vocabulary for second in vocabulary]

    trigrams = []
    # Sentence start: both beginning-of-sentence markers followed by a character
    trigrams += ['##' + character for character in vocabulary]
    # Read from the second sentence marker onwards: marker followed by a pair
    trigrams += ['#' + pair for pair in character_pairs]
    # Sentence end: a pair followed by the end-of-sentence marker
    trigrams += [pair + '#' for pair in character_pairs]
    # Sentence interior: three vocabulary characters
    trigrams += [pair + character for pair in character_pairs for character in vocabulary]
    return trigrams

In [None]:
# Question 3 b.
def model_trigrams_from_training_data(training_file):
    """Estimate an add-one smoothed character trigram model and write it to a file.

    Trigram counts are collected from the preprocessed lines of `training_file`,
    restricted to the trigrams returned by generate_trigrams_from_vocabulary().
    For every bigram history the counts are turned into a probability
    distribution over the next character using add-one smoothing, and each
    trigram is written as 'trigram<TAB>probability' (probability in '{:.3e}'
    format) to '<data_directory>/<xx>_trained_trigram_model', where <xx> is the
    last two characters of `training_file` (the language suffix en, es or de).

    Args:
        training_file (str): path of the training file; its last two characters
            are used as the language suffix of the model file name.

    Returns:
        str: path of the model file that was written.
    """
    # Map every bigram history to {next_character: count}, with a zero count for
    # each valid trigram (dict within dict model)
    trigram_counts = {}
    for trigram in generate_trigrams_from_vocabulary():
        trigram_counts.setdefault(trigram[0:2], {})[trigram[2]] = 0

    # Read the training file and count all trigrams, line by line
    with open(training_file, 'r') as file:
        for line in file:
            preprocessed_line = preprocess_line(line.rstrip())
            for index in range(len(preprocessed_line) - 2):
                bigram_history = preprocessed_line[index:index + 2]
                next_character = preprocessed_line[index + 2]
                # Prevents inclusion of invalid trigrams like #{char}# and {char}#{char}
                next_character_counts = trigram_counts.get(bigram_history)
                if next_character_counts is not None and next_character in next_character_counts:
                    next_character_counts[next_character] += 1

    # Extract the language suffix from the training file (i.e. en, es or de) and
    # derive the name of the file that receives the trained model probabilities
    trigram_model = os.path.join(data_directory, '{}_trained_trigram_model'.format(training_file[-2:]))

    with open(trigram_model, 'w') as file:
        for bigram_history, next_character_counts in trigram_counts.items():
            # Total number of trigram instances that share this bigram history
            history_total = sum(next_character_counts.values())
            # Add-one smoothing must add one pseudo-count per *possible outcome of
            # this history*. Histories inside a sentence can also be followed by
            # '#', so they have one more outcome than len(vocabulary); dividing by
            # the vocabulary size would make their probabilities sum to more than 1.
            number_of_outcomes = len(next_character_counts)
            for next_character, count in next_character_counts.items():
                probability = (count + 1) / (history_total + number_of_outcomes)
                file.write(bigram_history + next_character + '\t' + "{:.3e}".format(probability) + '\n')

    return trigram_model

In [None]:
# Train one trigram model per training file found above
for training_file in training_files:
    model_trigrams_from_training_data(training_file)

In [59]:
# Question 4 a.
def generate_from_LM(language_model, n_characters):
    """Generate a random character sequence from a trigram language model file.

    Starting from the history '##', the next character is repeatedly sampled
    from the model's distribution for the current bigram history. Whenever the
    sentence marker '#' is sampled, the history is reset to '##' so that a new
    sentence starts.

    Args:
        language_model (str): path of a model file with one tab-separated
            'trigram probability' entry per line.
        n_characters (int): number of characters to sample; sampled '#'
            markers count towards this number.

    Returns:
        str: '##' followed by the n_characters sampled characters.

    Raises:
        KeyError: if the model has no entry for a bigram history that is
            reached during generation.
        ValueError: if the probabilities of a reached history do not have a
            positive sum.
    """
    # Read the language model into {bigram_history: {next_character: probability}}
    lm_trigram_probabilities = {}
    with open(language_model, 'r') as file:
        for line in file:
            if not line.strip():
                continue  # tolerate blank lines, e.g. at the end of the file
            # Split the line to extract the trigram and its probability, respectively
            trigram_and_its_probability = line.rstrip().split('\t')
            trigram = trigram_and_its_probability[0]
            trigram_probability = float(trigram_and_its_probability[1])
            # If a trigram is listed more than once, the first entry wins
            lm_trigram_probabilities.setdefault(trigram[0:2], {}).setdefault(trigram[2], trigram_probability)

    # Per-history sampling tables, built on first use and then reused
    sampling_tables = {}
    bigram_history_lookup = '##'
    generated_characters = []

    for _ in range(n_characters):
        if bigram_history_lookup not in sampling_tables:
            distribution = lm_trigram_probabilities[bigram_history_lookup]
            probabilities = np.array(list(distribution.values()), dtype=float)
            total_probability = probabilities.sum()
            if not total_probability > 0:
                raise ValueError('probabilities for history {!r} do not have a positive sum'.format(bigram_history_lookup))
            # The probabilities in a model file are rounded, so they rarely sum to
            # exactly 1. Renormalising guarantees that the sampled index is always
            # valid (an unnormalised cumulative sum can be exceeded by the draw).
            sampling_tables[bigram_history_lookup] = (list(distribution.keys()), probabilities / total_probability)

        candidate_characters, probabilities = sampling_tables[bigram_history_lookup]
        chosen_next_character = candidate_characters[np.random.choice(len(candidate_characters), p=probabilities)]

        if chosen_next_character == '#':
            # End of sentence: the next character starts a new sentence
            bigram_history_lookup = '##'
        else:
            bigram_history_lookup = bigram_history_lookup[1] + chosen_next_character
        generated_characters.append(chosen_next_character)

    return '##' + ''.join(generated_characters)

In [60]:
# Question 4 b.
def write_generated_sequence_to_file(sequence, language_model):
    """Write a generated sequence, without its '#' sentence markers, to a file.

    The output file is named after the language model: its path is
    `language_model` followed by '_generated_sequence'.

    Args:
        sequence: the generated characters, iterated one at a time.
        language_model (str): path of the model the sequence was generated from.
    """
    output_path = '{}_generated_sequence'.format(language_model)
    with open(output_path, 'w+') as output_file:
        # Drop the sentence markers and write everything else unchanged
        output_file.write(''.join(symbol for symbol in sequence if symbol != '#'))

In [None]:
# Question 4 c.
# Generate 300 characters from each English model: first the model trained by us,
# then the pre-trained model model-br.en. Each generated sequence is written to a
# file named after its model.
for model_file_name in ('en_trained_trigram_model', 'model-br.en'):
    language_model = os.path.join(data_directory, model_file_name)
    generated_sequence = generate_from_LM(language_model, 300)
    write_generated_sequence_to_file(generated_sequence, language_model)

In [61]:
# Question 5 a.
def compute_perplexity(language_model, test_document):
    """Compute the perplexity of a test document under a trigram language model.

    Every line of the test document is preprocessed with preprocess_line and
    scored trigram by trigram. With N the number of scored characters (all
    characters of the preprocessed lines except the two leading '#' markers),
    the result is 2 ** (-(1 / N) * sum(log2 P(c | history))).

    Args:
        language_model (str): path of a model file with one tab-separated
            'trigram probability' entry per line.
        test_document (str): path of the text file to score.

    Returns:
        float: the perplexity of the document under the model.

    Raises:
        KeyError: if the document contains a trigram that the model does not list.
        ValueError: if the document contains no lines, or if the model assigns
            probability 0 to a trigram of the document (log2 of 0 is undefined).
    """
    # Read the language model into {bigram_history: {next_character: probability}}
    lm_trigram_probabilities = {}
    with open(language_model, 'r') as file:
        for line in file:
            if not line.strip():
                continue  # tolerate blank lines, e.g. at the end of the file
            # Split the line to extract the trigram and its probability, respectively
            trigram_and_its_probability = line.rstrip().split('\t')
            trigram = trigram_and_its_probability[0]
            trigram_probability = float(trigram_and_its_probability[1])
            # If a trigram is listed more than once, the first entry wins
            lm_trigram_probabilities.setdefault(trigram[0:2], {}).setdefault(trigram[2], trigram_probability)

    sum_of_log_probabilities = 0.0
    N = 0

    with open(test_document, 'r') as file:
        for line in file:
            preprocessed_line = preprocess_line(line.rstrip())
            # Exclude the first two '##' in each line for calculating the total number of character tokens in the file
            N += len(preprocessed_line) - 2
            for index in range(len(preprocessed_line) - 2):
                trigram = preprocessed_line[index:index + 3]
                try:
                    trigram_probability = lm_trigram_probabilities[trigram[0:2]][trigram[2]]
                except KeyError:
                    # Same exception type as a plain lookup, but naming the trigram
                    raise KeyError('trigram {!r} is not covered by the language model'.format(trigram)) from None
                sum_of_log_probabilities += math.log2(trigram_probability)

    if N == 0:
        raise ValueError('cannot compute the perplexity of an empty test document')

    cross_entropy = (-1.0 / N) * sum_of_log_probabilities
    return math.pow(2, cross_entropy)

In [None]:
# Question 5 b.
# Compute the perplexity of the test document under each of the three trained
# models. Every result is stored and printed: a notebook cell only displays its
# last expression, so bare calls would silently discard all but the final value.
test_data = os.path.join(data_directory, 'test')

perplexities = {}
for language in ('en', 'es', 'de'):
    language_model = os.path.join(data_directory, '{}_trained_trigram_model'.format(language))
    perplexities[language] = compute_perplexity(language_model, test_data)
    print(language, perplexities[language])