## Language model for text generation

In [23]:
from nltk.tokenize import word_tokenize, sent_tokenize
from collections import defaultdict
from nltk import bigrams, trigrams
from tqdm import tqdm
from collections import Counter
import random
import os 
import re

### Get sentences of corpus

In [24]:
corpus = ""
for doc in os.listdir("../data/definitions/prediction/"):
    if doc.endswith(".txt"):
        text = open("../data/definitions/prediction/"+doc, "r").read()
        corpus += text

sents = sent_tokenize(corpus)

### Create dict that holds frequencies of words following a bigram
<ol>
    <li>Iterate words of each sentence and remove punctuation</li>
    <li>Get trigrams of sentence with padding left and right</li>
    <li>Save first two words as key and last word as follower and increase count</li>
</ol>
<br/>
<b>Parameters 'pad_left' and 'pad_right' in Trigrams are used to get the following:</b> <br/>
<b>Example sentence:</b> $$"Clustering\:is\:nice"$$<br/>
<b>Without params:</b> $$[("Clustering", "is", "nice")]$$
<b>With padding:</b> $$[(None,\:None,\:"Clustering"), (None,\:"Clustering", "is"), ... , ("is", "nice",\:None), ("nice",\:None,\:None)]$$
<br/>
--> This allows to determine the frequency of words at the beginning of the sentence

In [25]:
model = defaultdict(lambda: defaultdict(lambda: 0))
for sent in tqdm(sents):
    words = word_tokenize(sent)
    words = [w.lower() for w in words if re.match("[a-z]+", w.lower()) is not None]
    tgs = trigrams(words, pad_left=True, pad_right=True)
    for w1, w2, w3 in tgs:
        model[(w1, w2)][w3] += 1

100%|██████████| 179/179 [00:00<00:00, 1929.34it/s]


In [26]:
# Amount of sentences starting with "The"
print(model[None, None]["the"])

29


### Calculate the conditional Probabilities
For each stored trigram the probability $P(w3|w1, w2)$ is calculated

In [27]:
for w1_w2 in tqdm(model):
    total_count = float(sum(model[w1_w2].values()))
    for w3 in model[w1_w2]:
        model[w1_w2][w3] /= total_count

100%|██████████| 3025/3025 [00:00<00:00, 279392.44it/s]


In [28]:
# Probability of a sentence starting with "the"
print(model[(None, None)]["the"])

0.16201117318435754


### Generate new sentences
Using a random number to ensure that not only the words with highest probability gets assigned as next word, because this will result in almost identical sentences.

In [29]:
import random
for i in range(20):
    text = [None, None]
    prob = 1.0
    sentence_finished = False

    while not sentence_finished:
        # r = random.random()
        r = random.uniform(0.5, 1)
        # print("Random: {}".format(r))
        accumulator = .0
        # print("Possible words: {}".format(len(model[tuple(text[-2:])].keys())))
        for word in model[tuple(text[-2:])].keys():
            accumulator += model[tuple(text[-2:])][word]
            # print("Accumulator: {}".format(accumulator))
            # print("=> {}".format( accumulator >= r))
            prob *= model[tuple(text[-2:])][word]  
            if accumulator >= r:
                text.append(word)
                break

        if text[-2:] == [None, None] or len(text) == 30:
            sentence_finished = True
    if len(text) > 6:
        if not os.path.exists("../data/sentence_generation_outputs"):
            os.mkdir("../data/sentence_generation_outputs/")
        file = open("../data/sentence_generation_outputs/sentence_{}.txt".format(i+1), "w")
        file.write(' '.join([t for t in text if t]))
        
        print("Probability of text={}".format(prob))
        print(' '.join([t for t in text if t]))
        print("\n")


Probability of text=4.6319015258922035e-127
when there is no precise definition of an outlier outliers are observations which do not imply order


Probability of text=1.0573121510623575e-142
like all forms of regression analysis it is a category of regression analysis can be applied to the discrete response variables used in finance investing and other is


Probability of text=1.1303796588322627e-191
predicting values outside of the explanatory variables are predicted rather than on the joint probability distribution of the response


Probability of text=1.2045457987219775e-71
using the least squares least absolute deviations minimizing the sum of absolute values of residuals and the y-axis would represent the child iq score would be.regression analysis is


Probability of text=3.0320700648629e-126
under this hypothesis the accuracy of classification rules can be accurately expressed by the other independent variables are collected without an accompanying response value the fitted line