# n-gram Language Model
Here we will explore how to build an n-gram language model. n-gram language models are one of primitive types of language modelling done using conditional probablity approach. Watch this video to understand more: https://www.youtube.com/watch?v=iWea12EAu6U&list=PLoROMvodv4rOhcuXMZkNm7j3fVwBBY42z&index=6

# The idea
We will build a simple sentence completion model. This model will first read through a piece of novel then generate texts based on seed word(s) using a probablistic approach.

In [1]:
import re
import os
import string

# Method to read the text file

In [2]:
def read_file(file_path: str) -> str:
    """
    This function reads a text file and returns the string.

    Parameters
    ----------
    file_path : str
        The complete text file path

    Returns
    -------
    str
        The content of the text file

    """
    with open(file_path,'r',encoding='utf-8') as file:
        s = file.read()
    return s

In [3]:
file_path = './datasets/Harry Potter and the Sorcerer.txt'
s = read_file(file_path)
s



# Method to clean text
Text cleaning is a major challenge in NLP tasks

In [4]:
def clean_text(text: str) -> str:
    '''
    Cleans the text by
    1. Changing the end of sentence tokens to add space between them and the words.
    2. Other special characters to be removed

    Parameters
    ----------
    text : str
        Unclean text

    Returns
    -------
    str
        Cleaned text

    '''
    # replace '...' with token
    text = re.sub('\.\.\.',' eosstop', text)
    # separate end of sentence tokens
    transform_dict = {'.':'  eosstop',
                      '!': ' eosexclamation',
                      '?': ' eosquestion',
                     }
    text = text.translate(str.maketrans(transform_dict))
    
    # replace open and closing quotes with a particular token
    text = re.sub(' \"',' startquote ', text)
    text = re.sub('\" ', ' endquote ', text)
    
    # expand contractions
    def decontracted(phrase):
        # specific
        phrase = re.sub(r"won\'t", "will not", phrase)
        phrase = re.sub(r"can\'t", "can not", phrase)

        # general
        phrase = re.sub(r"n\'t", " not", phrase)
        phrase = re.sub(r"\'re", " are", phrase)
        phrase = re.sub(r"\'s", " is", phrase)
        phrase = re.sub(r"\'d", " would", phrase)
        phrase = re.sub(r"\'ll", " will", phrase)
        phrase = re.sub(r"\'t", " not", phrase)
        phrase = re.sub(r"\'ve", " have", phrase)
        phrase = re.sub(r"\'m", " am", phrase)
        return phrase
    
    text = decontracted(text)
    # replace new line separations
    text = re.sub('\n+',' ', text)
    # lower case everything
    text = text.lower()
    # replace all other special characters
    text = text.translate(str.maketrans('','',string.punctuation))
    # replace multiple spaces with a single space
    text = re.sub(' +',' ', text)
    return text
    

In [5]:
text = clean_text(text=s)
text



# Idea
The idea will be as follows:
1. User will enter a seed word or series of words and model will predict the next word.
1. If its a series of words it should count the occurence of last 4-gram, if not found it will calculate occurance of 3-gram, 2-gram, 1-gram and so on.

One such E.g.:<br>
Input: harry potter <br>
Output: harry potter is ... <br>
In other words we have to find the below probabilities:<br>
P(is|harry potter) = p(is ∩ (harry, potter))/p(harry, potter)<br>
If p(harry, potter) = 0 <br>
Find p(is ∩ (potter))/p(potter)

# Generate vocabulary
This method will generate the vocabulary out of the given text

In [6]:
def generate_vocab(
    text_corpus: str
              ) -> list:
    """
    Generate the vocabulary

    Parameters
    ----------
    text_corpus : str
        The whole text

    Returns
    -------
    list
        Sorted list of unique words that appeared in our corpus.

    """
    words = text_corpus.split(' ')
    words = list(set(words))
    words.sort()
    return words

In [7]:
words = generate_vocab(text_corpus='this is a test is it a test')
words

['a', 'is', 'it', 'test', 'this']

# Search count of phrase occurances in corpus
Below method will calculate the number of times the given text sequence appeared in our corpus.
1. First split the corpus words {this|is|a|test|is|it}
1. Split the search phrase words {is|a}
1. scan each word in the given corpus and with each word searched check whether it matches the first position of search phrase word list i.e. 'is'.
1. If a match is found start a loop within the words given in search phrase.
1. With each word in the search phrase check whether the same words sequence appear together in corpus words list

In [8]:
def count_gram(
    search_phrase: str,
    text_corpus: str
    ) -> int:
    '''
    This method will count number of times the seed text appeared together in the text

    Parameters
    ----------
    search_phrase : str
        The text to search.
    text_corpus : str
        The cleaned text corpus where to search the seed text.

    Returns
    -------
    float
        DESCRIPTION.

    '''
    corpus_words = text_corpus.split(' ')
    search_phrase_words = search_phrase.split(' ')
    count = 0
    for i,word in enumerate(corpus_words):
        found: False
        # search only if first word from the seed_text_words matches the given corpus word scan
        if search_phrase_words[0] == word:
            # search for the rest of the seed text words whether it is appearing in corpus words
            # at the same sequence
            for j, seed_word in enumerate(search_phrase_words):
                if corpus_words[i+j] == seed_word:
                    found = True
                else:
                    # if the corpus word mismatches the given sequence word break the loop
                    found = False
                    break
            if found:
#                 print('Sequence found in positions: {}'.format(i))
                count += 1
    return count

In [9]:
count_of_text = count_gram(search_phrase='harry potter', text_corpus=text)
count_of_text

30

# Preprocess the seed text
This method will make sure we are looking at a maximum of 4 gram text sequence. <br>
Input: this is a beautifully constructed long text which we do not want <br>
Output: we do not want

In [14]:
def preprocess_seed_text(
    seed_text: str,
    n_gram: int = 4
)-> str:
    '''
    This method will make sure the we are looking at maximum of a 4-gram search

    Parameters
    ----------
    seed_text : str
        The seed_text as entered by user
    n_gram : int (optional)
        N-grams to process.
        Default value 4

    Returns
    -------
    str
        The truncated 4-gram seed text

    '''
    words = seed_text.split(' ')
    words = words[-n_gram:]
    return ' '.join(words)

In [16]:
preprocess_seed_text(seed_text='this is a beautifully constructed long text which we do not want',
                     n_gram = 2
                    )

'not want'

# Computation of probabilities
Here we will compute the following probabilities<br>
Suppose we want to generate a text as follows: 'harry potter is a wizard'. We need to go on computing the probabilities as follows:
1. p(is ∩ (harry, potter))/p(harry, potter)<br>
   = count(harry potter is)/count(harry potter)
1. p(a ∩ (harry, potter, is))/p(harry, potter, is)
1. p(wizard ∩ (harry, potter, is, a))/p(harry, potter, is, a)<br>

And so on. Now it may so happen that we want to generate using an n-gram model with n=2. The below computations are to be done.
1. p(is ∩ (harry, potter))/p(harry, potter)<br>
   = count(harry potter is)/count(harry potter)
1. p(a ∩ (potter, is))/p(potter, is)
1. p(wizard ∩ (his, a))/p(is, a)

In [25]:
def generate_next_word(
    seed_text: str,
    text_corpus: str,
    n_gram: int = 4
) -> str:
    '''
    Find the probabilities of each word in our vocabulary to appear given the seed texts

    Parameters
    ----------
    seed_text : str
        DESCRIPTION.
    text_corpus : str
        DESCRIPTION.
    n_gram : int (optional)
        N-grams to process.
        Default value 4

    Returns
    -------
    None.

    '''
    # truncate the seed text
    seed_text_truncated = preprocess_seed_text(seed_text, n_gram)
    # get the vocabulary of words
    next_word = ''
    # for storing the probabilities
    probs = []
    # search with the whole truncated seed_text
    denominator_count = count_gram(search_phrase=seed_text_truncated,text_corpus=text_corpus)
    if denominator_count == 0:
        # remove first word from seed text
        seed_text = ' '.join(seed_text.split()[1:])
        return seed_text
    else:
        # compute the count of seed text of vocabulary word appearing after the given seed text
        for word in vocabulary:
            new_search_phrase = seed_text_truncated + ' ' + word
            # compute the numerator count
            numerator_count = count_gram(search_phrase=new_search_phrase,text_corpus=text_corpus)
            # compute the probability of the given word
            prob = numerator_count/denominator_count
            probs.append(prob)
        # find the position where the maximum probability is occuring
        maxpos = probs.index(max(probs))
        # give the next word with the highest probability
        next_word = vocabulary[maxpos]
        return seed_text + ' ' + next_word

In [None]:
generate_next_word('harry potter is',text,2)

In [26]:
def continue_text_generation(seed_text: str,
                             text_corpus: str,
                             n_gram:int = 4
                            ):
    '''
    Find the probabilities of each word in our vocabulary to appear given the seed texts

    Parameters
    ----------
    seed_text : str
        DESCRIPTION.
    text_corpus : str
        DESCRIPTION.
    n_gram : int (optional)
        N-grams to process.
        Default value 4

    Returns
    -------
    None.

    '''
    # continue the text generation process till seed_text is present and end of sentence is not reached
    while seed_text != '' and 'eos' not in seed_text.split()[-1]:
        seed_text = generate_next_word(seed_text, text_corpus, n_gram)
    print(seed_text)

# Main script
Take user input of word sequences.

In [30]:
seed_text = 'the wizard'
n_gram=2
file_path = './datasets/Harry Potter and the Sorcerer.txt'
file_text = read_file(file_path)
cleaned_file_text = clean_text(file_text)
vocabulary = generate_vocab(cleaned_file_text)
continue_text_generation(seed_text=seed_text, text_corpus=cleaned_file_text, n_gram=n_gram)

the wizard coins and looking at the end of the way eosstop


In [44]:
import pandas as pd
s = pd.Series(data=cleaned_file_text.split())

In [52]:
pd.set_option("display.max_rows", None, "display.max_columns", None)
display(s.value_counts())

the                 57
and                 44
eosstop             44
of                  29
a                   28
that                24
to                  23
in                  23
i                   17
war                 12
fight               10
we                   8
their                8
he                   8
is                   8
our                  7
opposed              7
not                  7
who                  7
all                  6
without              6
this                 5
those                5
wars                 5
would                5
own                  5
you                  5
president            5
us                   5
what                 5
oppose               4
am                   4
on                   4
eosquestion          4
up                   4
let’s                4
want                 4
bush                 4
with                 4
but                  4
down                 4
know                 4
an                   4
from       