# $\text{Introduction}$

![](https://storage.googleapis.com/kaggle-datasets-images/34436/46048/ef20fcd937e23fe3062cd2b7ab48f212/data-original.jpg?t=2018-06-30-08-42-20)

In the following Kernel we will try to see whether we can get some reasonable poems using a simple bigram model and word conditional probabilities.
We all know that building a bombastic NLP model on such a small dataset will give us poems that resemble Poe's poems and stories. The question I will investigate in this Kernel is how good a poem a bigram model with some randomness can produce.

The logic behind the following code will be as follows:
1. Create a bigram model of all poem / story texts
2. Create helper functions to extract the probability and decide on the next word using the following probability:

Let $w_1$ be the current word. We will find all the bigrams that start with $w_1$ and calculate $P(w_2 \mid w_1) = \mathrm{count}(w_1 w_2) / \mathrm{count}(w_1)$.

We will fill a list with the 5 words that have the highest probabilities and use another variable, `alpha`, to decide which word we should take.
We will draw a random number from a standard normal distribution; if this number is larger than `alpha`, we will randomly select one of the less probable words, meaning words 1-4 (excluding the most probable word at index 0).
Otherwise we will return the most probable word at index 0.



In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')
import plotly.express as ex
import plotly.graph_objs as go
import plotly.figure_factory as ff
from wordcloud import WordCloud,STOPWORDS
import nltk as nlp
import string 
import re

In [None]:
# Load the preprocessed corpus; the cells below read its 'title' and 'text' columns.
p_data = pd.read_csv('/kaggle/input/poe-short-stories-corpuscsv/preprocessed_data.csv')
# Preview the first three rows.
p_data.head(3)

# Preprocessing And Construction Of Our Texts' Language 

In [None]:
# Vocabulary lists (all tokens first, de-duplicated at the end) and
# token -> occurrence-count tables, kept separately for titles and texts.
title_language, text_language = [], []
title_bow, text_bow = {}, {}

for raw_title, raw_text in zip(p_data['title'], p_data['text']):
    # Tokens are lower-cased and split on single spaces, so consecutive
    # spaces yield empty-string tokens.
    title_tokens = raw_title.lower().split(' ')
    text_tokens = raw_text.lower().split(' ')

    title_language.extend(title_tokens)
    text_language.extend(text_tokens)

    for token in text_tokens:
        text_bow[token] = text_bow.get(token, 0) + 1
    for token in title_tokens:
        title_bow[token] = title_bow.get(token, 0) + 1

# Keep each distinct token once.
title_language = list(set(title_language))
text_language = list(set(text_language))

    

In [None]:
# Concatenate the 'text' value of every row into one space-separated string.
all_texts = ' '.join(p_data.text.values)

# Creating Helper Functions For N-gram Model Construction And Probability Calculation

In [None]:
def generate_ngram(n,text):
    """Return the word n-grams of ``text`` as space-joined strings.

    The text is lower-cased and every character that is not an ASCII letter,
    a digit or whitespace is replaced by a space before tokenising.

    Args:
        n: Number of consecutive words per n-gram.
        text: Raw text to tokenise.

    Returns:
        A list of n-grams in order of appearance, each one the ``n`` words
        joined by a single space. Empty when the text has fewer than ``n``
        words.
    """
    cleaned = re.sub(r'[^a-zA-Z0-9\s]', ' ', text.lower())
    # split() without an argument splits on any run of whitespace. Splitting
    # on ' ' would emit empty tokens wherever punctuation was replaced or
    # spaces repeat, and would keep newlines/tabs inside tokens.
    tokens = cleaned.split()
    shifted = (tokens[offset:] for offset in range(n))
    return [" ".join(gram) for gram in zip(*shifted)]

# Bigrams of the whole corpus; get_nword_probs scans this list on every call.
bigram = generate_ngram(2,all_texts)
def get_nword_probs(word):
    """Estimate the distribution of words that follow ``word``.

    Scans the module-level ``bigram`` list for entries whose first token is
    ``word`` and whose second token is non-empty.

    Returns:
        A dict mapping each following word to its share of those entries
        (the values sum to 1). Empty when ``word`` starts no such bigram.
    """
    followers = []
    for pair in bigram:
        parts = pair.split(' ')
        if parts[0] == word and parts[1] != '':
            followers.append(parts[1])

    counts = {}
    for follower in followers:
        counts[follower] = counts.get(follower, 0) + 1

    total = len(followers)
    return {follower: count / total for follower, count in counts.items()}

def get_next_word(cur_word,alpha):
    """Pick a word to follow ``cur_word`` from its most probable successors.

    Args:
        cur_word: The word the next one is conditioned on.
        alpha: Threshold for a standard-normal draw. When the draw exceeds
            ``alpha`` and five candidates are available, one of the four
            less probable candidates is returned instead of the top one.

    Returns:
        The chosen successor. When ``cur_word`` has no known successor, a
        random entry of ``STOPWORDS`` is returned so generation can continue.
    """
    prob_dic = get_nword_probs(cur_word)
    top_words = sorted(prob_dic, key=prob_dic.get, reverse=True)[:5]

    if not top_words:
        stop_words = list(STOPWORDS)
        # randint gives every index the same probability and returns a scalar.
        # int(np.round(uniform(...))) halves the chance of the two end
        # indices and relies on the deprecated array-to-scalar conversion.
        return stop_words[np.random.randint(len(stop_words))]

    explore = np.random.normal(0, 1) > alpha
    if explore and len(top_words) > 4:
        # Uniform over indices 1..4 (the upper bound of randint is exclusive).
        return top_words[np.random.randint(1, 5)]
    return top_words[0]

def get_random_words(n_words):
    """Draw ``n_words`` random words from a random sample of corpus texts.

    ``int(sqrt(n_words))`` rows are sampled from ``p_data.text``, split on
    single spaces, and ``n_words`` words are drawn from the pooled tokens
    with replacement.

    Args:
        n_words: Number of words to return.

    Returns:
        A list of ``n_words`` words (empty when no tokens were sampled).
    """
    tsample = p_data.text.sample(int(np.sqrt(n_words)))
    words = []
    for passage in tsample:
        words.extend(passage.split(' '))
    if not words:
        return []
    # randint's upper bound is exclusive, so every index is valid. Rounding a
    # uniform draw over [0, len(words)] could yield len(words) and raise
    # IndexError, and it under-sampled index 0.
    picks = np.random.randint(0, len(words), n_words)
    return [words[i] for i in picks]

## Test Poem 1

In [None]:
# Generate an 80-word poem with alpha = 0.5.
words = get_random_words(1)
poem_length = 80
cur_word = words[0]  # random seed word; it is not itself written to the poem
pieces = []
for _ in range(poem_length):
    # Draw the successor once, write it, and advance to it, so the next word
    # is always conditioned on the word that was actually written.
    cur_word = get_next_word(cur_word, 0.5)
    pieces.append(' ' + cur_word)
    if np.random.normal(0, 1) > 0.8:
        pieces.append('\n')
    elif np.random.normal(0, 1) > 0.7:
        pieces.append(',')
    elif np.random.normal(0, 1) > 0.9:
        # Occasionally restart the chain from a random non-stopword.
        words = [word for word in get_random_words(5) if word not in STOPWORDS]
        if words:
            cur_word = words[0]
poem = ''.join(pieces)


In [None]:
# Display the poem built in the previous cell.
print(poem)

## Test Poem 2

In [None]:
# Generate an 80-word poem with alpha = 0.8.
words = get_random_words(1)
poem_length = 80
cur_word = words[0]  # random seed word; it is not itself written to the poem
pieces = []
for _ in range(poem_length):
    # Draw the successor once, write it, and advance to it, so the next word
    # is always conditioned on the word that was actually written.
    cur_word = get_next_word(cur_word, 0.8)
    pieces.append(' ' + cur_word)
    if np.random.normal(0, 1) > 0.8:
        pieces.append('\n')
    elif np.random.normal(0, 1) > 0.7:
        pieces.append(',')
    elif np.random.normal(0, 1) > 0.9:
        # Occasionally restart the chain from a random non-stopword.
        words = [word for word in get_random_words(5) if word not in STOPWORDS]
        if words:
            cur_word = words[0]
poem = ''.join(pieces)


In [None]:
# Display the poem built in the previous cell.
print(poem)

## Test Poem 3

In [None]:
# Generate a 120-word poem with alpha = 0.62.
words = get_random_words(1)
poem_length = 120
cur_word = words[0]  # random seed word; it is not itself written to the poem
pieces = []
for _ in range(poem_length):
    # Draw the successor once, write it, and advance to it, so the next word
    # is always conditioned on the word that was actually written.
    cur_word = get_next_word(cur_word, 0.62)
    pieces.append(' ' + cur_word)
    if np.random.normal(0, 1) > 0.8:
        pieces.append('\n')
    elif np.random.normal(0, 1) > 0.7:
        pieces.append(',')
    elif np.random.normal(0, 1) > 0.9:
        # Occasionally restart the chain from a random non-stopword.
        words = [word for word in get_random_words(5) if word not in STOPWORDS]
        if words:
            cur_word = words[0]
poem = ''.join(pieces)


In [None]:
# Display the poem built in the previous cell.
print(poem)