# Generating poetry using Natural Language processing (Markov Principle and Bayes Theorem)

In [1]:
import numpy as np
import string

np.random.seed(1234)

In [2]:
initial = {} # Start of the phrase
first_order = {} # Second word only
second_order = {} # 2nd order transition probabilities

In [3]:
def remove_punctuations(s):
  return s.translate(str.maketrans('', '', string.punctuation))

In [4]:
!wget -nc https://raw.githubusercontent.com/lazyprogrammer/machine_learning_examples/master/hmm_class/robert_frost.txt

--2022-05-19 16:59:37--  https://raw.githubusercontent.com/lazyprogrammer/machine_learning_examples/master/hmm_class/robert_frost.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 56286 (55K) [text/plain]
Saving to: ‘robert_frost.txt’


2022-05-19 16:59:37 (5.10 MB/s) - ‘robert_frost.txt’ saved [56286/56286]



In [6]:
def add2dict(d,k,v): # dictionary, key, value(list storing all possible next words)
  if k not in d:
    d[k] =[]
    d[k].append(v)

In [7]:
for line in open('robert_frost.txt'):
  tokens = remove_punctuations(line.rstrip().lower()).split()

  T = len(tokens)
  for i in range(T):
    t = tokens[i]
    if i == 0: # indicates that i is the first word in a sentence
      # Measure disstribution of first word
      initial[t] = initial.get(t, 0.) + 1 # get method to ensure that count is 0 if t does not exist in the dictionary
    else:
        t_1 = tokens[i-1]
        if i == T -1: # end of a sentence 
          # Measure probability of ending the line
          add2dict(second_order,(t_1,t), 'END')
        if i == 1:
          # measure probability of second word in a sentence
          # Given only the first word
          add2dict(first_order,t_1,t)
        else:
          t_2= tokens[i-2]
          add2dict(second_order,(t_2,t_1),t)


In [9]:
# Normalize the distribution
initial_total = sum(initial.values()) # Only initial dict contains actual word counts,other dicts only contain probabilities of next words
for t,c in initial.items(): # c = counts, t = tokens
  initial[t] = c/initial_total 

In [10]:
# Create dictionary of counts for the otherdicts and normalise the counts by converting them into probabilities
def list2pdict(ts): # ts= list of tokens
  d={}
  n = len(ts)
  for t in ts:
    d[t] = d.get(t, 0.) + 1
  for t,c in d.items():
    d[t] = c/n
  return d

In [11]:
for t_1, ts in first_order.items():
  first_order[t_1] = list2pdict(ts)

In [12]:
for k, ts in second_order.items():
  second_order[k] = list2pdict(ts)

In [23]:
def sample_word(d):
  p0 = np.random.random()
  cumulative = 0
  for t,p in d.items():
    cumulative+= p
    if p0 < cumulative:
      return t
  assert(False) # should never get here

In [27]:
def generate():
  for i in range(4):
    sentence = []

    # initial word
    w0 = sample_word(initial)
    sentence.append(w0)

    # Sample second word
    w1 = sample_word(first_order[w0])
    sentence.append(w1)

    # Second order transitions until the EOL
    while True:
      w2 = sample_word(second_order[(w0,w1)])
      if w2== 'END':
        break
      sentence.append(w2)
      w0=w1
      w1=w2
    print(' '.join(sentence))

In [28]:
generate()

i doubted if i should ever come back
hes celebrating something strange
or been made master of the year
was wrought through trees without a farmhouse near


****