# Language Modelling Lab (week 4)

This notebook provides the "starter" code in the week 4 lab


In [1]:
# mounting
try:
    from google.colab import drive
    drive.mount('/content/drive', force_remount=True)
    IN_COLAB = True
except:
    IN_COLAB = False

1.  We need to get the names of files in the training directory and split them into training and testing 50:50.


In [2]:
import os
import random
import math
import operator
import nltk
from nltk import word_tokenize as tokenize
nltk.download('punkt')

[nltk_data] Error loading punkt: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1006)>


False

In [3]:

# this needs to be the parent directory for the training corpus
TRAINING_DIR = "./sentence-completion/Holmes_Training_Data"

if IN_COLAB:
    TRAINING_DIR = "/content/drive/MyDrive/modules/2.2/2.2-Language P-2/week4-NN_bigram_unigram/lab4resources_full/sentence-completion/Holmes_Training_Data"


def get_training_testing(training_dir=TRAINING_DIR, split=0.5):

    filenames = os.listdir(training_dir)
    n = len(filenames)
    print("There are {} files in the training directory: {}".format(n, training_dir))
    random.seed(53)  # if you want the same random split every time
    random.shuffle(filenames)
    index = int(n*split)
    return (filenames[:index], filenames[index:])


trainingfiles, heldoutfiles = get_training_testing()

There are 526 files in the training directory: ./sentence-completion/Holmes_Training_Data


In [4]:
len(trainingfiles)
print(trainingfiles)

['TARZ210.TXT', 'WHRT110.TXT', 'FALK10.TXT', 'BCPTV10.TXT', 'CHUZZ10.TXT', 'LCONT10.TXT', 'TMONK10.TXT', 'LTPRN10.TXT', 'MYANT10.TXT', 'CUBRK10.TXT', 'SSACC10.TXT', 'SILAP10.TXT', 'HPPRT10.TXT', 'MORLL10.TXT', 'MSTON10.TXT', 'HFINN10.TXT', 'JBALL10.TXT', 'DLANG10.TXT', 'FNHER10.TXT', 'SNOWI10.TXT', 'LADDI10.TXT', 'SCARP10.TXT', 'PORAP10.TXT', 'SLGRL10.TXT', 'HLDRM10.TXT', 'ALEXB10.TXT', 'FANFB10.TXT', 'MRMEN10.TXT', 'CONFI10.TXT', 'TDITW10.TXT', 'PCLP310.TXT', '19TOM10.TXT', 'ISLNI10.TXT', 'TONOB10.TXT', 'BADGE10.TXT', 'ACRDI10.TXT', '2RBNH10.TXT', 'RDFRY10.TXT', 'WMARS12.TXT', 'PLGRM10.TXT', 'BADAM10.TXT', 'LOSTW10.TXT', 'PALIN10.TXT', 'HBOOK10.TXT', 'PRSIT10.TXT', 'RDDSK10.TXT', 'ILAND10.TXT', 'LORNA10.TXT', '09WOZ10.TXT', 'DGRAY10.TXT', 'THEAM10.TXT', 'HFDTR10.TXT', 'RRLMP10.TXT', 'TOM2010.TXT', 'TBTLP10.TXT', 'MDMAR10.TXT', 'LMYER10.TXT', 'SCRLT12.TXT', 'BUNNR10.TXT', 'WAYMN10.TXT', 'SKYGM10.TXT', 'MAIDM10.TXT', 'PFIRS11.TXT', 'SANDB10.TXT', 'OLIVR10.TXT', 'AGENT10.TXT', 'FRHNT10.T

## 1 Building a unigram model


In [5]:
class language_model_both():

    def __init__(self, trainingdir=TRAINING_DIR, files=[]):
        # store the names of the files containing training data and run the training method
        self.training_dir = trainingdir
        self.files = files

        self.train()

    def train(self):
        # initialise an empty dictionary which will be the unigram model {w:P(w)} when training is complete
        self.trigrams = {}
        self.unigram = {}
        self.bigrams = {}
        # process all of the training data, accumulating counts of events
        self._processfiles()
        # convert the accumulated counts to probabilities
        # self._convert_to_probs()
        self._store_sorted_unigrams()

    def _store_sorted_unigrams(self):
        sorted_unigrams = dict(
            sorted(self.unigram.items(), key=lambda item: item[1], reverse=True))
        self.sorted_unigrams = sorted_unigrams

    def _processfiles(self):
        # process each file in turn
        for afile in self.files:
            # print("Processing {}".format(afile))
            try:
                with open(os.path.join(self.training_dir, afile)) as instream:
                    for line in instream:
                        line = line.rstrip()
                        if len(line) > 0:
                            self._processline(line)
            except UnicodeDecodeError:
                print("UnicodeDecodeError processing {}: ignoring file".format(afile))

    def _processline(self, line):
        # process each line of a file
        # each line is tokenized and has a special start and end token added
        # counts of tokens are added to the self.unigram count model
        tokens = ["_START1", "__START2"]+tokenize(line)+["__END1", "___END2"]



        for i in range(len(tokens))
          self.unigram[tokens[i]] = self.unigram.get(tokens[i], 0)+1

          if i != len(tokens)-1:
            if tokens[i] not in self.bigrams.keys():
              self.bigrams[tokens[i]] = {}
            self.bigrams[tokens[i]][tokens[i+1]] = self.bigrams[tokens[i]].get(tokens[i+1], 0) + 1

          if i < len(tokens) - 2:
            if tokens[i] not in self.trigrams.keys:
              self.trigrams[tokens[i]] = {}
            if tokens[i+1] not in self.trigrams[tokens[i]].keys():
              self.trigrams[tokens[i]][tokens[i+1]] = {}
            self.trigrams[tokens[i]][tokens[i+1]][tokens[i+2]] = self.trigrams[tokens[i]][tokens[i+1]].get(tokens[i+2], 0) + 1

    def _convert_to_probs(self):
        # self.unigram initially counts counts for each token {token:freq(token)}
        # sum all of the frequencies and divide each frequency by that sum to get probabilities

        self.unigram = {k: v/sum(self.unigram.values())
                        for (k, v) in self.unigram.items()}

        for target_word, context_word_dict in self.bigrams.items():
            total_count = sum(context_word_dict.values())
            for context_word, context_word_count in context_word_dict.items():
                self.bigrams[target_word][context_word] = context_word_count / total_count

    def get_prob(self, token, method="unigram"):
        # simple look up method
        if method == "unigram":
            return self.unigram.get(token, 0)
        else:
            print("Not implemented: {}".format(method))
            return 0

    def get_top_n_words(self, n):
        return list(self.sorted_unigrams)[:n]

    def make_sentence(self, method, words_to_pick_from =50)
        sentence = []
        prev_token = None
        token_to_add = "__START"

        if method =="unigram":
            while token_to_add != "." and len(sentence) < 75:
                sentence.append(token_to_add)
                prev_token = token_to_add
                new_token = random.choice(self.get_top_n_words(words_to_pick_from))
                while new_token == prev_token or new_token == "__END" or new_token == "__START":
                    new_token = random.choice(
                        self.get_top_n_words(words_to_pick_from))
                token_to_add = new_token

        elif method == "bigram":
            while token_to_add != "." and len(sentence) < 75:
                sentence.append(token_to_add)
                new_token = random.choices(list(self.bigrams[token_to_add].keys()), list(self.bigrams[token_to_add].values()))
                token_to_add = new_token[0]
        else:
            print("no method")
            return 0

        return " ".join(sentence)

    def get_perplexity(self, string, method="bigram"):

        all_log_probs = []
        n = 0
        tokens = ["__START"]+tokenize(string)+["__END"]
        print(f"{tokens=}")

        if method == "unigram":
            for token in tokens:
                all_log_probs.append(
                    math.log2(self.unigram[token]))
                n += 1
            perplex = math.exp((-1*sum(all_log_probs))/n)
        elif method == "bigram":
            for i in range(0, len(tokens)-1):
                target_word = tokens[i]
                context_word = tokens[i+1]
                print(f"{target_word=}")
                print(f"{context_word=}")
                all_log_probs.append(
                    math.log2(self.bigrams[target_word][context_word]))
                n += 1
            perplex = math.exp((-1*sum(all_log_probs))/n)
            return perplex
        else:
            print("unknown method")
            return 0

SyntaxError: expected ':' (3794603791.py, line 47)

In [None]:
# mylm.unigram

for MAX_FILES in [5, 50, 100, 200]:

    mylm = language_model(files=trainingfiles[:MAX_FILES])
    print("***************")
    print(f"N = {MAX_FILES}")
    print(f"the: p={mylm.get_prob('the'):.5F}")
    # print(f"or: p={mylm.get_prob('or'):.5F}")
    # print(f"and: p={mylm.get_prob('and'):.5F}")
    # print(f"big: p={mylm.get_prob('big'):.5F}")
    print(f"bully: p={mylm.get_prob('bully'):.5F}")
    print(f"dog: p={mylm.get_prob('dog'):.5F}")
    # print(f"log: p={mylm.get_prob('log'):.5F}")
    print(f"mischief: p={mylm.get_prob('mischief'):.5F}")
    print(f"fdikhsdkhdfs: p={mylm.get_prob('fdikhsdkhdfs'):.5F}")

Make sure you look up some probabilities of words in your model. Pick some words which you would expect to have high probabilities and some words which you would expect to have low probabilities.

As an extension, see how these change if you use a bigger portion of the training data to train your model.


In [None]:
mylm = language_model(files=trainingfiles[:100])
mylm.unigram
mylm.sorted_unigrams
mylm.make_sentence(30)

In [None]:
# SAMPLE FROM THE DISTRIBUTION

# As an extension, you could try to sample from the distribution. Assign each of the words a range
# of numbers - the size of which is proportional to its probability. You can do this by considering the
# cumulative probability distribution (iterate through the words in the distribution, adding the probability
# of the current word to the sum of all probabilities seen so far). Then you just need to pick a random
# number and select the word which has this number in its assigned range. Of course, there are library
# methods which can do this for you to — check out random.choices() at https://www.kite.com/python/
# answers/how-to-sample-a-random-number-from-a-probability-distribution-in-python

In [None]:
# BIGRAM
class language_model_bi():

    def __init__(self, trainingdir=TRAINING_DIR, files=[]):
        # store the names of the files containing training data and run the training method
        self.training_dir = trainingdir
        self.files = files
        self.train()

    def train(self):
        # initialise an empty dictionary which will be the unigram model {w:P(w)} when training is complete
        self.bigrams = {}
        # process all of the training data, accumulating counts of events
        self._processfiles()
        # convert the accumulated counts to probabilities
        self._convert_to_probs()

    def _processfiles(self):
        # process each file in turn
        for afile in self.files:
            # print("Processing {}".format(afile))
            try:
                with open(os.path.join(self.training_dir, afile)) as instream:
                    for line in instream:
                        line = line.rstrip()
                        if len(line) > 0:
                            self._processline(line)
            except UnicodeDecodeError:
                print("UnicodeDecodeError processing {}: ignoring file".format(afile))

    def _processline(self, line):
        # process each line of a file
        # each line is tokenized and has a special start and end token added
        # counts of tokens are added to the self.unigram count model
        tokens = ["__START"]+tokenize(line)+["__END"]

        for i in range(0, len(tokens)-1):

            target_word = tokens[i]
            context_word = tokens[i+1]

            if target_word not in self.bigrams.keys():
                self.bigrams[target_word] = {}

            self.bigrams[target_word][context_word] = self.bigrams[target_word].get(
                context_word, 0) + 1

    def _convert_to_probs(self):
        # self.unigram initially counts counts for each token {token:freq(token)}
        # sum all of the frequencies and divide each frequency by that sum to get probabilities
        for target_word, context_word_dict in self.bigrams.items():
            total_count = sum(context_word_dict.values())
            for context_word, context_word_count in context_word_dict.items():
                self.bigrams[target_word][context_word] = context_word_count / total_count

    def get_likely_next_word(self, target_token):
        targets_context_words = self.bigrams[target_token]
        most_likely_next_word = max(
            targets_context_words, key=targets_context_words.get)
        return most_likely_next_word

    def get_n_most_likely_words(self, target_token, n_words):
        targets_context_words = self.bigrams[target_token]

        sorted_context_words = dict(
            sorted(targets_context_words.items(), key=lambda item: item[1], reverse=True))

        return list(sorted_context_words.keys())[:n_words]

    def make_sentence(self, n_words_to_consider=10, starter="__START"):
        sentence = []
        # prev_token = None
        current_token = starter

        while current_token != "." and current_token != "__END" and len(sentence) < 75:

            sentence.append(current_token)
            # prev_token = current_token

            new_token = random.choice(
                self.get_n_most_likely_words(current_token, n_words_to_consider))
            # while new_token == prev_token and new_token != "__START":
            #     new_token = random.choice(
            #         self.get_top_n_words(current_token))

            current_token = new_token
        return " ".join(sentence)

    def make_sentence_sample_from_dist(self, starter="__START"):
        sentence = []
        # prev_token = None
        current_token = starter

        while current_token != "." and current_token != "__END" and len(sentence) < 75:
            sentence.append(current_token)
            new_token = random.choices(list(self.bigrams[current_token].keys()), list(
                self.bigrams[current_token].values()))
            # print(new_token)
            current_token = new_token[0]
        print(f"{current_token=}")
        return " ".join(sentence)

    def get_perplexity(self, string):
        all_log_probs = []
        n = 0
        tokens = ["__START"]+tokenize(string)+["__END"]
        print(f"{tokens=}")
        for i in range(0, len(tokens)-1):
            target_word = tokens[i]
            context_word = tokens[i+1]
            print(f"{target_word=}")
            print(f"{context_word=}")
            all_log_probs.append(
                math.log2(self.bigrams[target_word][context_word]))
            n += 1
        perplex = math.exp((-1*sum(all_log_probs))/n)
        return perplex

    # def perplexity(self, unseen_text):
    #     # takes in unseen text and calculates the bigram probability for it using formula

In [None]:
MAX_FILES = 100
bigrammer = language_model_bi(files=trainingfiles[-MAX_FILES:])

In [None]:
bigrammer.make_sentence_sample_from_dist()

In [None]:
bigrammer.getcomplexity("Many geese lift weights")

In [None]:
bigrammer.bigrams
for i in range(5):
    print(bigrammer.make_sentence(25, "The"))
for i in range(5):
    print(bigrammer.make_sentence_sample_from_dist("The"))

In [None]:
import math
def evaluate_perplexity(lang_model_dict, test_corpus_dir, test_files, n_gram_lookback_depth):

    all_log_probs = []
    n = 0
    for afile in test_files:
        # print("Processing {}".format(afile))
        try:
            with open(os.path.join(test_corpus_dir, afile)) as instream:
                for line in instream:
                    line = line.rstrip()
                    if len(line) > 0:

                        tokens = ["__START"]+tokenize(line)+["__END"]

                        for i in range(0, len(tokens)-n_gram_lookback_depth):
                            print(f"{i=}")
                            print(f"{i+n_gram_lookback_depth=}")
                            print(f"{tokens=}")
                            target_word = tokens[i]
                            context_word = tokens[i+n_gram_lookback_depth]
                            print(f"{target_word=}")
                            print(f"{context_word=}")
                            if n_gram_lookback_depth == 0:
                                all_log_probs.append(
                                    math.log2(lang_model_dict[target_word]))
                                n += 1
                            else:
                                all_log_probs.append(
                                    math.log2(lang_model_dict[target_word][context_word]))
                                n += 1
            perplex = math.exp((-1*sum(all_log_probs))/n)

            return perplex

        except UnicodeDecodeError:
            print("UnicodeDecodeError processing {}: ignoring file".format(afile))

In [None]:
evaluate_perplexity(bigrammer.bigrams, TRAINING_DIR,
                    trainingfiles[-MAX_FILES:], 1)

In [None]:
evaluate_perplexity(mylm.unigram, TRAINING_DIR, trainingfiles[-MAX_FILES:], 0)

In [None]:
class language_model_both():

    def __init__(self, trainingdir=TRAINING_DIR, files=[]):
        # store the names of the files containing training data and run the training method
        self.training_dir = trainingdir
        self.files = files

        self.train()

    def train(self):
        # initialise an empty dictionary which will be the unigram model {w:P(w)} when training is complete
        self.unigram = {}
        self.bigrams = {}
        # process all of the training data, accumulating counts of events
        self._processfiles()
        # convert the accumulated counts to probabilities
        self.convert_low_to_unknown(2)
        self._convert_to_probs()
        self._store_sorted_unigrams()

    def _store_sorted_unigrams(self):
        sorted_unigrams = dict(
            sorted(self.unigram.items(), key=lambda item: item[1], reverse=True))
        self.sorted_unigrams = sorted_unigrams

    def _processfiles(self):
        # process each file in turn
        for afile in self.files:
            # print("Processing {}".format(afile))
            try:
                with open(os.path.join(self.training_dir, afile)) as instream:
                    for line in instream:
                        line = line.rstrip()
                        if len(line) > 0:
                            self._processline(line)
            except UnicodeDecodeError:
                print("UnicodeDecodeError processing {}: ignoring file".format(afile))

    def _processline(self, line):
        # process each line of a file
        # each line is tokenized and has a special start and end token added
        # counts of tokens are added to the self.unigram count model
        tokens = ["__START"]+tokenize(line)+["__END"]
        # print(tokens)
        for i, token in enumerate(tokens):

            self.unigram[token] = self.unigram.get(token, 0)+1

            if token not in self.bigrams.keys():
                self.bigrams[token] = {}
            if i != len(tokens)-1:
                self.bigrams[token][tokens[i+1]
                                    ] = self.bigrams[token].get(tokens[i+1], 0) + 1

    def _convert_to_probs(self):
        # self.unigram initially counts counts for each token {token:freq(token)}
        # sum all of the frequencies and divide each frequency by that sum to get probabilities

        self.unigram = {k: v/sum(self.unigram.values())
                        for (k, v) in self.unigram.items()}

        for target_word, context_word_dict in self.bigrams.items():
            total_count = sum(context_word_dict.values())
            for context_word, context_word_count in context_word_dict.items():
                self.bigrams[target_word][context_word] = context_word_count / total_count

    def get_prob(self, token, method="unigram"):
        # simple look up method
        if method == "unigram":
            return self.unigram.get(token, 0)
        else:
            print("Not implemented: {}".format(method))
            return 0

    def get_top_n_words(self, n):
        return list(self.sorted_unigrams)[:n]

    def make_sentence(self, method, start_word="__START", words_to_pick_from=50):
        sentence = []
        prev_token = None
        token_to_add = start_word

        if method == "unigram":
            while token_to_add != "." and len(sentence) < 75:
                sentence.append(token_to_add)
                prev_token = token_to_add
                new_token = random.choice(
                    self.get_top_n_words(words_to_pick_from))
                while new_token == prev_token or new_token == "__END" or new_token == "__START":
                    new_token = random.choice(
                        self.get_top_n_words(words_to_pick_from))
                token_to_add = new_token

        elif method == "bigram":
            while token_to_add != "." and len(sentence) < 75:
                sentence.append(token_to_add)
                new_token = random.choices(list(self.bigrams[token_to_add].keys()), list(
                    self.bigrams[token_to_add].values()))
                print(f"{new_token=}")
                token_to_add = new_token[0]
        else:
            print("no method")
            return 0

        return " ".join(sentence)

    def convert_low_to_unknown(self, threshold):
        self.unigram = self.convert_dict(self.unigram, threshold=2)

        for target_word, context_dict in self.bigrams.items():
            self.bigrams[target_word] = self.convert_dict(
                context_dict, threshold)

    def convert_dict(self, dict_to_convert, threshold=2):
        to_delete = []
        dict_to_convert["__UNK"] = 0
        for word, count in dict_to_convert.items():
            if count < threshold:
                dict_to_convert["__UNK"] += count
                to_delete.append(word)

        for word in to_delete:
            del dict_to_convert[word]

        return dict_to_convert

    def get_perplexity(self, string, method="bigram"):

        all_log_probs = []
        n = 0
        tokens = ["__START"]+tokenize(string)+["__END"]
        print(f"{tokens=}")

        if method == "unigram":
            for token in tokens:
                all_log_probs.append(
                    math.log2(self.unigram[token]))
                n += 1
            perplex = math.exp((-1*sum(all_log_probs))/n)
        elif method == "bigram":
            for i in range(0, len(tokens)-1):
                target_word = tokens[i]
                context_word = tokens[i+1]
                print(f"{target_word=}")
                print(f"{context_word=}")
                all_log_probs.append(
                    math.log2(self.bigrams[target_word][context_word]))
                n += 1
            perplex = math.exp((-1*sum(all_log_probs))/n)
            return perplex
        else:
            print("unknown method")
            return 0

In [None]:
MAX_FILES = 2
both = language_model_both(files=trainingfiles[-MAX_FILES:])

In [None]:
bigrammer.bigrams
for i in range(5):
    print(both.make_sentence("unigram", start_word="The"))
for i in range(5):
    print(both.make_sentence("bigram", start_word="The"))

In [4]:
class language_model_all():
    def __init__(self, trainingdir=TRAINING_DIR, files=[]):
        # store the names of the files containing training data and run the training method
        self.training_dir = trainingdir
        self.files = files
        self.train()

    def train(self):
        # initialise an empty dictionary which will be the unigram model {w:P(w)} when training is complete
        self.trigrams = {}
        self.unigram = {}
        self.bigrams = {}
        # process all of the training data, accumulating counts of events
        self._processfiles()
        # convert the accumulated counts to probabilities
        self._convert_low_to_unknowns()
        # self._convert_to_probs()
        self._store_sorted_unigrams()

    def _store_sorted_unigrams(self):
        sorted_unigrams = dict(
            sorted(self.unigram.items(), key=lambda item: item[1], reverse=True))
        self.sorted_unigrams = sorted_unigrams

    def _processfiles(self):
        # process each file in turn
        for afile in self.files:
            # print("Processing {}".format(afile))
            try:
                with open(os.path.join(self.training_dir, afile)) as instream:
                    for line in instream:
                        line = line.rstrip()
                        if len(line) > 0:
                            self._processline(line)
            except UnicodeDecodeError:
                print("UnicodeDecodeError processing {}: ignoring file".format(afile))

    def _processline(self, line):
        # process each line of a file
        # each line is tokenized and has a special start and end token added
        # counts of tokens are added to the self.unigram count model
        tokens = ["__START1", "__START2"]+tokenize(line)+["__END1", "__END2"]

        for i in range(len(tokens)):
            self.unigram[tokens[i]] = self.unigram.get(tokens[i], 0)+1

            if i != len(tokens)-1:
                if tokens[i] not in self.bigrams.keys():
                    self.bigrams[tokens[i]] = {}
                self.bigrams[tokens[i]][tokens[i+1]
                                        ] = self.bigrams[tokens[i]].get(tokens[i+1], 0) + 1

            if i < len(tokens) - 2:
                if tokens[i] not in self.trigrams.keys():
                    self.trigrams[tokens[i]] = {}
                if tokens[i+1] not in self.trigrams[tokens[i]].keys():
                    self.trigrams[tokens[i]][tokens[i+1]] = {}
                self.trigrams[tokens[i]][tokens[i+1]][tokens[i+2]
                                                      ] = self.trigrams[tokens[i]][tokens[i+1]].get(tokens[i+2], 0) + 1

    def _convert_to_probs(self):
        # self.unigram initially counts counts for each token {token:freq(token)}
        # sum all of the frequencies and divide each frequency by that sum to get probabilities

        self.unigram = {k: v/sum(self.unigram.values())
                        for (k, v) in self.unigram.items()}

        for target_word, context_word_dict in self.bigrams.items():
            total_count = sum(context_word_dict.values())
            for context_word, context_word_count in context_word_dict.items():
                self.bigrams[target_word][context_word] = context_word_count / total_count

        for word1, inner_context_dict1 in self.trigrams.items():
            for word2, inner_context_dict2 in inner_context_dict1.items():
                total_count = sum(inner_context_dict2.values())
                for word3, word3_count in inner_context_dict2.items():
                    self.trigrams[word1][word2][word3] = word3_count/total_count

    def get_prob(self, token, method="unigram"):
        # simple look up method
        if method == "unigram":
            return self.unigram.get(token, 0)
        else:
            print("Not implemented: {}".format(method))
            return 0

    def get_top_n_words(self, n):
        return list(self.sorted_unigrams)[:n]

    def make_sentence(self, method, start_word=None, words_to_pick_from=50):
        ends = ["__END1", "__END2"]
        starts = ["__START1", "__START2"]

        if method == "unigram":
            sentence = []
            prev_token = None
            token_to_add = "__START2"
            while token_to_add != "." and token_to_add not in ends and len(sentence) <= 75:
                sentence.append(token_to_add)
                prev_token = token_to_add
                new_token = random.choice(
                    self.get_top_n_words(words_to_pick_from))
                while new_token == prev_token or new_token in starts:
                    new_token = random.choice(
                        self.get_top_n_words(words_to_pick_from))
                token_to_add = new_token

        elif method == "bigram":
            sentence = []
            prev_token = None
            token_to_add = "__START2"
            while token_to_add != "." and token_to_add not in ends and len(sentence) <= 75:
                sentence.append(token_to_add)
                new_token = random.choices(list(self.bigrams[token_to_add].keys()), list(
                    self.bigrams[token_to_add].values()))
                token_to_add = new_token[0]

        elif method == "trigram":

            sentence = ["__START1", "__START2"]

            if start_word != None:
                sentence.append(start_word)
            i = len(sentence) - 2

            next_token = random.choices(list(self.trigrams[sentence[i]][sentence[i+1]].keys(
            )), list(self.trigrams[sentence[i]][sentence[i+1]].values()))

            # print("SENTENCE BEFORE LOOP", sentence)
            while next_token != "." and next_token[0] not in ends and len(sentence) <= 75:
                i += 1
                sentence.append(next_token[0])
                next_token = random.choices(list(self.trigrams[sentence[i]][sentence[i+1]].keys(
                )), list(self.trigrams[sentence[i]][sentence[i+1]].values()))
            sentence.append(next_token[0])

        else:
            print("no method")
            return 0

        return " ".join(sentence)

    def get_perplexity(self, string, method="bigram"):
        all_log_probs = []
        n = 0
        tokens = ["__START1", "__START2"]+tokenize(string)+["__END1", "__END2"]
        print(f"{tokens=}")

        if method == "unigram":
            for token in tokens:
                all_log_probs.append(
                    math.log2(self.unigram[token]))
                n += 1
            perplex = math.exp((-1*sum(all_log_probs))/n)
            
        elif method == "bigram":
            for i in range(0, len(tokens)-1):
                context_word = tokens[i]
                word_to_predict = tokens[i+1]
                all_log_probs.append(
                    math.log2(self.bigrams[context_word][word_to_predict]))
                n += 1
            perplex = math.exp((-1*sum(all_log_probs))/n)
            return perplex
        
        elif method == "trigram":
            for i in range(0, len(tokens)-2):
                context_word_1 = tokens[i]
                context_word_2 = tokens[i+1]
                word_to_predict = tokens[i+2]
                all_log_probs.append(
                    math.log2(self.trigrams[context_word_1][context_word_2][word_to_predict]))
                n += 1
            perplex = math.exp((-1*sum(all_log_probs))/n)
            return perplex
        else:
            print("unknown method")
            return 0

    def _convert_low_to_unknowns(self, known=2):
        unknown=0
        self.number_unknowns=0
        
        
        
        for (k,count) in list(self.unigram.items()):
            if count<known:
                del self.unigram[k]
                self.unigram["__UNK"]=self.unigram.get("__UNK",0)+count
                self.number_unknowns+=1
                
                
        for (k,dict1) in list(self.bigrams.items()):
            for (kk,count) in list(dict1.items()):
                isknown=self.unigram.get(kk,0)
                if isknown==0:
                    dict1["__UNK"]=dict1.get("__UNK",0)+count
                    del dict1[kk]
                    
            isknown=self.unigram.get(k,0)
            if isknown==0:
                del self.bigrams[k]
                current=self.bigrams.get("__UNK",{})
                current.update(dict1)
                self.bigrams["__UNK"]=current
            else:
                self.bigrams[k]=dict1
                
                
                
                
        for (k,dict1) in list(self.trigrams.items()):  
                      
            for (kk,dict2) in list(dict1.items()):
                for (kkk, count) in list(dict2.items()):
                    isknown = self.unigram.get(kkk,0)
                    if isknown == 0:
                        dict2["__UNK"] = dict2.get("__UNK", 0) + count
                        del dict2[kkk]
                isknown=self.unigram.get(kk,0)
                if isknown==0:
                    current = dict1.get("__UNK",{})
                    current.update(dict2)
                    del dict1[kk]                    
            isknown=self.unigram.get(k,0)
            if isknown==0:
                current=self.trigrams.get("__UNK",{})
                current.update(dict1)
                self.trigrams["__UNK"]=current
                del self.trigrams[k]
            else:
                self.trigrams[k]=dict1

                    
                
        
        # for target_word, context_dict in list(self.bigrams.items()):
        #     if target_word not in self.unigram.keys():
        #         print(target_word)
        #         print("YO", context_dict)
                
        #         self.bigrams["__UNK"] = self._merge_dicts(self.bigrams.get("__UNK", {}), context_dict)
        #         del self.bigrams[target_word]
        
        
        # for context_word_1, context_dict_1 in self.trigrams.items():
        #     for context_word_2, context_dict_2 in context_dict_1.items():
        #         self.trigrams[context_word_1][context_word_2] = self.convert_dict(
        #         context_dict_2, threshold)
                
                
        # for contextword1, contextdict1 in list(self.trigrams.items()):
        #     for contextword2, contextdict2 in list(contextdict1.items()):
        #         if contextword2 not in self.unigram.keys():
        #             self.trigrams[contextword1]["__UNK"] = self._merge_dicts(self.trigrams[contextword1].get("__UNK", {}), contextdict2)
        #             del self.trigrams[contextword1][contextword2]
        #     if contextword1 not in self.unigram.keys():
        #         self.trigrams["__UNK"] = self._merge_nested(self.trigrams.get("__UNK", {}), contextdict1) 
                



In [19]:
test_file = "/Users/rfd/Library/CloudStorage/GoogleDrive-raymondfdavey@gmail.com/My Drive/MSc/modules/2.2/2.2-Language P-2/week4-NN_bigram_unigram/lab4resources_full/tester.txt"
all = language_model_all(files=[test_file])

print("****BIGRAM*****")
for k, dict1 in all.bigrams.items():
    flag1 = False
    flag2 = False
    if k == "__UNK":
        flag1 = True
    for kk, count in dict1.items():
        if kk == "__UNK":
            flag2 = True
if flag2:
    print("__UNK PRESENT IN 2nd LAYER")
if flag1:
    print("__UNK PRESENT IN 1st LAYER")

                
print("****TRIGRAM*****")
                
for k, dict1 in all.trigrams.items():
    flag1 = False
    flag2 = False
    flag3 = False
    if k == "__UNK":
        flag1 = True
    for kk, dict2 in dict1.items():
        if kk == "__UNK":
            flag2 = True
        for kkk, count in dict2.items():
            if kkk == "__UNK":
                flag3 = True
if flag1:
    print("__UNK PRESENT IN 1st LAYER")
if flag2:
    print("__UNK PRESENT IN 2nd LAYER")
if flag3:
    print("__UNK PRESENT IN 3rd LAYER")
                
                

****BIGRAM*****
__UNK PRESENT IN 2nd LAYER
__UNK PRESENT IN 1st LAYER
****TRIGRAM*****
__UNK PRESENT IN 1st LAYER


In [6]:
MAX_FILES = 5
all_proper = language_model_all(files=trainingfiles[-MAX_FILES:])

In [18]:
print("****BIGRAM*****")
for k, dict1 in all_proper.bigrams.items():
    flag1 = False
    flag2 = False
    if k == "__UNK":
        flag1 = True
    for kk, count in dict1.items():
        if kk == "__UNK":
            flag2 = True
if flag2:
    print("__UNK PRESENT IN 2nd LAYER")
if flag1:
    print("__UNK PRESENT IN 1st LAYER")

                
print("****TRIGRAM*****")
                
for k, dict1 in all_proper.trigrams.items():
    flag1 = False
    flag2 = False
    flag3 = False
    if k == "__UNK":
        flag1 = True
    for kk, dict2 in dict1.items():
        if kk == "__UNK":
            flag2 = True
        for kkk, count in dict2.items():
            if kkk == "__UNK":
                flag3 = True
if flag1:
    print("__UNK PRESENT IN 1st LAYER")
if flag2:
    print("__UNK PRESENT IN 2nd LAYER")
if flag3:
    print("__UNK PRESENT IN 3rd LAYER")
                


****BIGRAM*****
__UNK PRESENT IN 2nd LAYER
__UNK PRESENT IN 1st LAYER
****TRIGRAM*****
__UNK PRESENT IN 1st LAYER
__UNK PRESENT IN 3rd LAYER


In [18]:
print(all_proper.trigrams)
# all_proper.bigrams["__UNK"]




In [19]:
print(all.make_sentence(method="unigram"))
print(all.make_sentence(method="bigram"))
print(all.make_sentence(method="trigram"))

__START2 test just it a endure divine ones patience
__START2 This is just simply a test ones patience
__START1 __START2 This is just simply a test , a test , a test , a test of patience . __END1


In [20]:
MAX_FILES = 100
all_proper = language_model_all(files=trainingfiles[-MAX_FILES:])

UnicodeDecodeError processing MFRND10.TXT: ignoring file
UnicodeDecodeError processing DTROY10.TXT: ignoring file
UnicodeDecodeError processing KRSON10.TXT: ignoring file


In [21]:
print(all_proper.make_sentence(method="unigram"))
print(all_proper.make_sentence(method="bigram"))
print(all_proper.make_sentence(method="trigram"))

__START2 `` which with have with be him he from and at for that ? in at were for The __UNK '
__START2 Vida Sherwin had
__START1 __START2 however humane he may harbour some regret , and found her face , so that it may be able , if the __END1


In [27]:
all_proper.trigrams["why"]["the"]["__END1"]

9

In [None]:
sentence_to_compare = "A whistle is for communication"

all_proper.get_perplexity(sentence_to_compare, method="unigram")
all_proper.get_perplexity(sentence_to_compare, method="bigram")
all_proper.get_perplexity(sentence_to_compare, method="triigram")