## Language model for text generation

In [8]:
from nltk.tokenize import word_tokenize, sent_tokenize
from collections import defaultdict
from nltk import bigrams, trigrams
from tqdm import tqdm
from collections import Counter
import random
import os 
import re

### Get sentences of corpus

In [9]:
corpus = ""
for doc in os.listdir("../data/definitions/clustering/"):
    if doc.endswith(".txt"):
        text = open("../data/definitions/clustering/"+doc, "r").read()
        corpus += text

sents = sent_tokenize(corpus)

### Create dict that holds frequencies of words following a bigram
<ol>
    <li>Iterate words of each sentence and remove punctuation</li>
    <li>Get trigrams of sentence with padding left and right</li>
    <li>Save first two words as key and last word as follower and increase count</li>
</ol>
<br/>
<b>Parameters 'pad_left' and 'pad_right' in Trigrams are used to get the following:</b> <br/>
<b>Example sentence:</b> $$"Clustering\:is\:nice"$$<br/>
<b>Without params:</b> $$[("Clustering", "is", "nice")]$$
<b>With padding:</b> $$[(None,\:None,\:"Clustering"), (None,\:"Clustering", "is"), ... , ("is", "nice",\:None), ("nice",\:None,\:None)]$$
<br/>
--> This allows to determine the frequency of words at the beginning of the sentence

In [10]:
model = defaultdict(lambda: defaultdict(lambda: 0))
for sent in tqdm(sents):
    words = word_tokenize(sent)
    words = [w.lower() for w in words if re.match("[a-z]+", w.lower()) is not None]
    tgs = trigrams(words, pad_left=True, pad_right=True)
    for w1, w2, w3 in tgs:
        model[(w1, w2)][w3] += 1

100%|██████████| 140/140 [00:00<00:00, 1891.15it/s]


In [11]:
# Amount of sentences starting with "The"
print(model[None, None]["the"])

21


### Calculate the conditional Probabilities
For each stored trigram the probability $P(w3|w1, w2)$ is calculated

In [12]:
for w1_w2 in tqdm(model):
    total_count = float(sum(model[w1_w2].values()))
    for w3 in model[w1_w2]:
        model[w1_w2][w3] /= total_count

100%|██████████| 2631/2631 [00:00<00:00, 338005.81it/s]


In [13]:
# Probability of a sentence starting with "the"
print(model[(None, None)]["the"])

0.14685314685314685


### Generate new sentences
Using a random number to ensure that not only the words with highest probability gets assigned as next word, because this will result in almost identical sentences.

In [15]:
import random
for i in range(20):
    text = [None, None]
    prob = 1.0
    sentence_finished = False

    while not sentence_finished:
        # r = random.random()
        r = random.uniform(0.3, 1)
        # print("Random: {}".format(r))
        accumulator = .0
        # print("Possible words: {}".format(len(model[tuple(text[-2:])].keys())))
        for word in model[tuple(text[-2:])].keys():
            accumulator += model[tuple(text[-2:])][word]
            # print("Accumulator: {}".format(accumulator))
            # print("=> {}".format( accumulator >= r))
            prob *= model[tuple(text[-2:])][word]  
            if accumulator >= r:
                text.append(word)
                break

        if text[-2:] == [None, None]:
            sentence_finished = True
    if len(text) > 6:
        if not os.path.exists("../data/sentence_generation_outputs"):
            os.mkdir("../data/sentence_generation_outputs/")
        file = open("../data/sentence_generation_outputs/sentence_{}.txt".format(i+1), "w")
        file.write(' '.join([t for t in text if t]))
        
    print("Probability of text={}".format(prob))
    print(' '.join([t for t in text if t]))
    print("\n")


Probability of text=2.896853168312344e-119
clearly a cluster is left


Probability of text=2.2102919891233269e-41



Probability of text=4.1005498041968884e-158
are sub-divided into groups in such a way that the items in a clear and meaningful way.clustering is a statistical method for finding relatively homogeneous clusters of data


Probability of text=6.437285312285331e-109
cluster analysis see gordon analysis or clustering is defined as a separate cluster i.e. there are as dissimilar as possible.cluster analysis is the organization of a sample of n subjects the observed values of several variables for each individual


Probability of text=5.032138052900376e-54
generally clustering is a statistical method for identifying homogenous groups of cases


Probability of text=3.572744514274834e-88
for instance in the former.statistical classification technique in which a set of genes that is coexpressed implies that the sample units come from a number of clusters we want the data without e