<a href="https://colab.research.google.com/github/prajaktakini/Language-Models/blob/main/text_generation_ngram.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


```
References:
1. Medium Article https://towardsdatascience.com/text-generation-using-n-gram-model-8d12d9802aa0
2. LinkedIn blog post: https://www.linkedin.com/pulse/evolution-language-models-my-notes-ajay-taneja/?trackingId=kp09TQv9RRqE2QTC%2FOLL%2FA%3D%3D

eBooks can be downloaded from Project Gutenberg: https://www.gutenberg.org/
# Book that is used in this implementation: https://www.gutenberg.org/ebooks/71747
```

In [67]:
# All imports
import string
import random
import time
from typing import List

In [68]:
# Below is the tokenizer function that tokenizes given text into tokens
def tokenize(text: str) -> List[str]:
  '''
  Break raw text into word and punctuation tokens.

  text -> the string to be tokenized
  returns the tokens in order of appearance; every character listed in
  string.punctuation becomes a token of its own
  '''
  # Surround each punctuation character with spaces in one pass, so that a
  # plain whitespace split detaches it from the neighbouring words
  spaced_out = {ord(mark): ' ' + mark + ' ' for mark in string.punctuation}
  return text.translate(spaced_out).split()

In [15]:
# Sample result returned from tokenize function
#print(tokenize("I am sorry, I made mistake"))

['I', 'am', 'sorry', ',', 'I', 'made', 'mistake']


In [69]:
def find_ngrams(n: int, tokens:list) -> list:
  '''
  Pair every token with the (n - 1) tokens that precede it.

  n -> defines size of n-gram
  tokens -> list of tokens (left untouched; a padded copy is used internally)
  returns a list of (context, target) pairs, where context is a tuple of the
  n - 1 preceding tokens, e.g. (('<START>', '<START>'), 'I'), (('<START>', 'I'), 'am') for n = 3
  '''
  history_size = n - 1

  # Pad the front so that the very first real token also has a full-length
  # context, e.g. "I love reading books" with 3-grams starts at (<START>, <START>, I)
  padded = ["<START>"] * history_size + tokens

  pairs = []
  for position in range(history_size, len(padded)):
    # Walk back from the furthest preceding token to the nearest one
    history = tuple(padded[position - back] for back in range(history_size, 0, -1))
    pairs.append((history, padded[position]))
  return pairs


In [55]:
# Sample result returned from find_ngrams function
#print(find_ngrams(3, ['I', 'am', 'sorry', ',', 'I', 'made', 'mistake']))

['<START>', '<START>', 'I', 'am', 'sorry', ',', 'I', 'made', 'mistake']
[(('<START>', '<START>'), 'I'), (('<START>', 'I'), 'am'), (('I', 'am'), 'sorry'), (('am', 'sorry'), ','), (('sorry', ','), 'I'), ((',', 'I'), 'made'), (('I', 'made'), 'mistake')]


In [71]:
class Ngram:
  '''
  Count-based n-gram language model.

  The model is trained one sentence at a time through update() and can then
  sample new text with generate_text().

  Attributes:
    n -> size of the n-grams (each token is conditioned on the n - 1 tokens before it)
    context -> maps a context tuple to the list of every token seen right after it
               (duplicates are kept, so the list length is the context's total count)
    ngram_counter -> maps (context, token) to how many times that n-gram was seen
  '''

  def __init__(self, n):
    # n defines n-gram count
    self.n = n

    # dictionary that keeps track of list of possible words given a context
    self.context = {}

    # counter that keeps track of how many times ngram has appeared in the corpus
    self.ngram_counter = {}


  # Updates the language model
  def update(self, sentence: str) -> None:
    '''
    sentence -> raw sentence text; it is tokenized and every n-gram found in it
                is added to the counters
    '''
    for ngram in find_ngrams(self.n, tokenize(sentence)):
      # Increment ngram counter in the dictionary
      self.ngram_counter[ngram] = self.ngram_counter.get(ngram, 0) + 1

      prev_words, target_word = ngram # prev_words -> ('I', 'love') and target_word -> 'reading'

      # Add the target word to the list of words seen after prev_words
      self.context.setdefault(prev_words, []).append(target_word)


  # Given a context dictionary, finds the probability of this token to be generated. Returns a conditional probability
  # https://www.linkedin.com/pulse/evolution-language-models-my-notes-ajay-taneja/?trackingId=kp09TQv9RRqE2QTC%2FOLL%2FA%3D%3D
  def find_token_probability(self, context, token) -> float:
    '''
    context -> tuple of preceding tokens
    token -> candidate next token
    returns count(context, token) / count(context), or 0.0 when the n-gram
    or the context has never been seen
    '''
    count_of_ngram = self.ngram_counter.get((context, token), 0) # frequency of this n-gram
    followers = self.context.get(context)
    if not count_of_ngram or not followers:
      return 0.0

    # followers keeps duplicates, so its length is the number of times the context occurred
    return count_of_ngram / len(followers)


  # Given a context, generate a next word to append in sequence semi-randomly
  def generate_random_token(self, context):
    '''
    context -> tuple of preceding tokens
    returns one token drawn according to the conditional probabilities of the
    tokens seen after context
    raises KeyError when no token was ever recorded after context
    '''
    r = random.random()

    possible_tokens = self.context.get(context)
    if not possible_tokens:
      raise KeyError(f'no continuation recorded for context {context!r}')

    # Walk the candidates in sorted order, accumulating probability mass until it exceeds r.
    # Each distinct token is visited once; its probability already accounts for repeats.
    cumulative = 0.0
    token = None
    for token in sorted(set(possible_tokens)):
      cumulative += self.find_token_probability(context, token)
      if cumulative > r:
        return token

    # Floating point rounding can leave the total a hair below 1.0 (ten probabilities of 0.1
    # add up to 0.9999999999999999), so r may never be exceeded. The draw then belongs to the
    # last candidate; returning it avoids handing None back to the caller.
    return token


  # generates text for the given count of tokens
  def generate_text(self, token_count: int) -> str:
    '''
    token_count -> defines number of tokens to be generated
    returns the generated tokens joined by single spaces
    raises KeyError when the model holds no continuation for the current context
    (for example when it has not been trained yet)
    '''

    n = self.n

    # provide some context to the system by prepending (n - 1) '<START>' tokens
    context_queue = (n - 1) * ['<START>']
    result = []

    for _ in range(token_count):
      obj = self.generate_random_token(tuple(context_queue))
      result.append(obj)

      if n > 1:
        if obj == '.':
          # Training contexts never continue past a full stop, so the model would not know
          # how to proceed from here; start the next sentence from a fresh context instead.
          context_queue = (n - 1) * ['<START>']
        else:
          # Slide the window: drop the oldest token, add the newest
          context_queue.pop(0)
          context_queue.append(obj)

    return ' '.join(result)


In [72]:
# Initialises the n-gram model and dictionaries
def create_ngram_model(n, path):
  '''
  Build an n-gram model from a plain-text file.

  n -> size of the n-grams the model is built on
  path -> location of the text file used as the corpus
  returns the Ngram model after it has been updated with every sentence

  The text is split on '.', and each non-empty fragment is passed to the
  model as one sentence with its full stop added back.
  '''
  model = Ngram(n)

  # Read everything first so the file is closed before the model is built
  with open(path, 'r') as f:
    text = f.read()

  for sentence in text.split('.'): # split sentences
    # split() yields an empty or whitespace-only fragment after the last full stop and
    # between consecutive ones ("..."). Adding the full stop back to those would feed
    # the model sentences made of nothing but ".", so they are skipped.
    if not sentence.strip():
      continue
    model.update(sentence + '.') # Add back fullstop and update the dictionary
  return model


# Main function
if __name__ == "__main__":
  # Step 1: build a 3-gram model from the book and report how long that took
  start = time.time()
  model = create_ngram_model(n=3, path='/content/sample_data/the_film.txt')
  print(f'Time taken to create ngram model: {time.time() - start}')

  # Step 2: seed the generator so that the sampled text is repeatable
  random.seed(5)
  # The timer is restarted here, although nothing below reads it
  start = time.time()

  # Step 3: sample 20 tokens and print them under a banner
  print(f'{"="* 50}\nGenerated text')
  print(model.generate_text(token_count=20))
  print(f'{"=" * 50}')



Time taken to create ngram model: 0.2583951950073242
Generated text
Take a novel as an indispensable ally of the nation makes of this agreement and help preserve free future access
