In [1]:
from ebooklib import epub
import os
import ebooklib
import random

<h1> Ingest all of the books and extract the text from them </h1>

In [2]:
# str(type(item)) values of EPUB items whose content is skipped when the
# book text is extracted.
ignore_list = [
    "<class 'ebooklib.epub.EpubImage'>",
    "<class 'ebooklib.epub.EpubItem'>",
]

In [3]:
def _strip_anchors(text):
    """Remove every '<a ...>...</a>' element, tag and link text, from text."""
    while '<a' in text:
        before, _, rest = text.partition('<a')
        _, closing, after = rest.partition('</a>')
        # An anchor with no closing tag discards everything that follows it.
        text = before + (after if closing else '')
    return text


def _paragraph_sentences(line):
    """Split one '<p>' line of book markup into cleaned, lower-cased sentences.

    Returns a list of strings; fragments of one character or less and bare
    '<br/>' fragments are left out.
    """
    text = line.replace('<p>', '').replace('</p>', '')
    # Anchors are removed before splitting: an href such as "notes.html#n1"
    # contains '.', so splitting first would cut the tag in half and keep its
    # second half as a bogus sentence.
    text = _strip_anchors(text)
    for tag in ('<i>', '</i>', '<b>', '</b>'):
        text = text.replace(tag, '')
    # ';', '!' and '?' end a sentence just like '.'.
    text = text.replace(';', '.').replace('!', '.').replace('?', '.')
    sentences = []
    for sent in text.split('.'):
        sent = sent.strip()
        if len(sent) > 1 and sent != '<br/>':
            sentences.append(sent.lower())
    return sentences


# Every sentence found in the .epub files under books/, lower-cased.
all_books = []
# Sorted so the sentence order does not depend on the directory listing order.
for filename in sorted(os.listdir('books/')):
    if not filename.endswith('.epub'):
        continue
    book = epub.read_epub(os.path.join('books', filename))
    # Concatenate the decoded content of every item whose type is not listed
    # in ignore_list (defined in an earlier cell).
    book_text = ''.join(
        doc.content.decode()
        for doc in book.get_items()
        if str(type(doc)) not in ignore_list
    )
    # Only lines that begin with a plain '<p>' tag are treated as book text.
    for line in book_text.split('\n'):
        if line.startswith('<p>'):
            all_books.extend(_paragraph_sentences(line))

<h1> Create corpus and reverse corpus </h1>

In [4]:
# corpus maps word -> integer id; reverse_corpus maps id -> word.
# Id 0 is reserved for the empty string.
corpus = {}
reverse_corpus = {0: ''}
counter = 1
# Characters treated as word separators. The vertical bar entry used to be a
# mis-encoded two-character string, so '|' was never stripped; both the ASCII
# bar and the broken bar are listed now.
punct = ['!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.',
         '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`',
         '{', '|', '¦', '}', '~']
for sent in all_books:
    for punc in punct:
        sent = sent.replace(punc, ' ')
    for word in sent.split(' '):
        # Consecutive separators leave empty strings behind; they get no id.
        if word and word not in corpus:
            corpus[word] = counter
            reverse_corpus[counter] = word
            counter += 1

<h1> Convert sentences to arrays using the corpus </h1>

In [5]:
# One list of corpus ids per sentence, in the same order as all_books.
all_books_indexed = []
for sent in all_books:
    for punc in punct:
        sent = sent.replace(punc, ' ')
    # Tokens without a corpus id are skipped by an explicit membership test
    # rather than by catching every exception.
    all_books_indexed.append(
        [corpus[word] for word in sent.split(' ') if word in corpus]
    )

<h1> Find the max sentence length </h1>

In [51]:
# Length of the longest indexed sentence; 0 when there are no sentences.
max_length = max(
    (len(indexed_sent) for indexed_sent in all_books_indexed),
    default=0,
)

<h1> Pad sentences to all be the same length </h1>

In [8]:
# Left-pad every sentence with id 0 up to max_length. The lists are changed in
# place, and each one gets a single slice assignment instead of one
# insert(0, 0) per missing slot.
for indexed_sent in all_books_indexed:
    missing = max_length - len(indexed_sent)
    if missing > 0:
        indexed_sent[:0] = [0] * missing

<h1> Create a dictionary for each position in the array containing each word that appears. </h1>

In [9]:
# position_dict[pos] is a three-item list:
#   [0] dict of word id -> number of sentences with that id at pos
#   [1] running (cumulative) share of each id, in the dict's order
#   [2] the ids themselves, in the same order
position_dict = {}
for indexed_sent in all_books_indexed:
    for pos in range(max_length):
        id_counts = position_dict.setdefault(pos, [{}])[0]
        word_id = indexed_sent[pos]
        id_counts[word_id] = id_counts.get(word_id, 0) + 1

for entry in position_dict.values():
    id_counts = entry[0]
    total = sum(id_counts.values())
    cumulative = []
    for count in id_counts.values():
        share = count / total
        if cumulative:
            share += cumulative[-1]
        cumulative.append(share)
    entry.append(cumulative)
    entry.append(list(id_counts))
# Total of the last position processed.
total_sents = total

<h1> Create dictionaries with look-backs. </h1>

In [None]:
# position_dict_2[pos] is a three-item list:
#   [0] dict of (previous id, id) -> count at pos; position 0 has no
#       predecessor and is counted under the key (0, 0)
#   [1] running (cumulative) share of each pair, in the dict's order
#   [2] the pairs themselves, in the same order
position_dict_2 = {}
for indexed_sent in all_books_indexed:
    for pos in range(max_length):
        pair_counts = position_dict_2.setdefault(pos, [{}])[0]
        if pos > 0:
            pair = (indexed_sent[pos - 1], indexed_sent[pos])
        else:
            pair = (0, 0)
        pair_counts[pair] = pair_counts.get(pair, 0) + 1

for entry in position_dict_2.values():
    pair_counts = entry[0]
    total = sum(pair_counts.values())
    cumulative = []
    for count in pair_counts.values():
        share = count / total
        if cumulative:
            share += cumulative[-1]
        cumulative.append(share)
    entry.append(cumulative)
    entry.append(list(pair_counts))
# Total of the last position processed.
total_sents = total

In [30]:
# position_dict_3[pos] is a three-item list:
#   [0] dict of (id two back, previous id, id) -> count at pos; positions 0
#       and 1 lack two predecessors and are counted under the key (0, 0, 0)
#   [1] running (cumulative) share of each triple, in the dict's order
#   [2] the triples themselves, in the same order
position_dict_3 = {}
for indexed_sent in all_books_indexed:
    for pos in range(max_length):
        triple_counts = position_dict_3.setdefault(pos, [{}])[0]
        if pos > 1:
            triple = (indexed_sent[pos - 2], indexed_sent[pos - 1], indexed_sent[pos])
        else:
            triple = (0, 0, 0)
        triple_counts[triple] = triple_counts.get(triple, 0) + 1

for entry in position_dict_3.values():
    triple_counts = entry[0]
    total = sum(triple_counts.values())
    cumulative = []
    for count in triple_counts.values():
        share = count / total
        if cumulative:
            share += cumulative[-1]
        cumulative.append(share)
    entry.append(cumulative)
    entry.append(list(triple_counts))
# Total of the last position processed.
total_sents = total

In [38]:
# position_dict_4[pos] is a three-item list:
#   [0] dict of (id three back, id two back, previous id, id) -> count at pos;
#       positions 0 to 2 lack three predecessors and are counted under the
#       key (0, 0, 0, 0)
#   [1] running (cumulative) share of each 4-tuple, in the dict's order
#   [2] the 4-tuples themselves, in the same order
position_dict_4 = {}
for indexed_sent in all_books_indexed:
    for pos in range(max_length):
        quad_counts = position_dict_4.setdefault(pos, [{}])[0]
        if pos > 2:
            quad = (
                indexed_sent[pos - 3],
                indexed_sent[pos - 2],
                indexed_sent[pos - 1],
                indexed_sent[pos],
            )
        else:
            quad = (0, 0, 0, 0)
        quad_counts[quad] = quad_counts.get(quad, 0) + 1

for entry in position_dict_4.values():
    quad_counts = entry[0]
    total = sum(quad_counts.values())
    cumulative = []
    for count in quad_counts.values():
        share = count / total
        if cumulative:
            share += cumulative[-1]
        cumulative.append(share)
    entry.append(cumulative)
    entry.append(list(quad_counts))
# Total of the last position processed.
total_sents = total

<h1>Define the function to generate the text</h1>

In [66]:
def _pick_index(cumulative, rand_num):
    """Return the first index whose cumulative probability is >= rand_num.

    Falls back to the last index when rand_num lies above every bound, which
    can happen through float rounding or once random_base has been raised.
    """
    for index, bound in enumerate(cumulative):
        if rand_num <= bound:
            return index
    return len(cumulative) - 1


def _next_word_id(ngram_counts, context, rand_num):
    """Pick the word id that follows `context` at one sentence position.

    ngram_counts maps tuples of word ids to counts. Only tuples whose leading
    ids equal `context` compete, each weighted by its count.

    Raises IndexError when no tuple continues `context`.
    """
    candidates = [
        (ngram[-1], count)
        for ngram, count in ngram_counts.items()
        if ngram[:-1] == context
    ]
    if not candidates:
        raise IndexError('no n-gram continues context %r' % (context,))
    total = sum(count for _, count in candidates)
    cumulative = []
    running = 0
    for _, count in candidates:
        # Integer running sum, so the final bound is exactly 1.0.
        running += count
        cumulative.append(running / total)
    return candidates[_pick_index(cumulative, rand_num)][0]


def make_sent():
    """Generate one random sentence and print it.

    Reads the notebook globals max_length, reverse_corpus, position_dict and
    position_dict_2 / position_dict_3 / position_dict_4. The word at position 0
    is drawn from position_dict; each later position is drawn from the table
    with the longest available look-back (up to three previous words),
    restricted to n-grams that continue the words chosen so far.

    A sentence made only of empty words is discarded and generated again with
    the lower bound of the random draw raised by 0.001.

    Returns None; the sentence is printed with a space after every word.
    Raises ValueError if max_length is less than 1.
    """
    if max_length < 1:
        # The retry loop below could never produce a word and would spin forever.
        raise ValueError('max_length must be at least 1 to generate a sentence')

    lookback_tables = (position_dict_2, position_dict_3, position_dict_4)
    random_base = 0
    while True:
        word_ids = []
        for i in range(max_length):
            rand_num = random.uniform(random_base, 1)
            if i == 0:
                first = position_dict[0]
                word_id = first[2][_pick_index(first[1], rand_num)]
            else:
                # Use as many previous words as exist, capped at three.
                lookback = min(i, len(lookback_tables))
                ngram_counts = lookback_tables[lookback - 1][i][0]
                context = tuple(word_ids[i - lookback:])
                word_id = _next_word_id(ngram_counts, context, rand_num)
            word_ids.append(word_id)

        full_sentence = [reverse_corpus[word_id] for word_id in word_ids]
        if any(full_sentence):
            break
        random_base += 0.001

    print(''.join(word + ' ' for word in full_sentence if word))


<h1>Test it out</h1>

In [74]:
# Generate one sentence; make_sent prints it.
make_sent()

afterward it was noticed that the wizard always performed his famous trick with eight piglets but it seemed to please the people just as well as if they had been from infancy accustomed 


In [75]:
# Generate another sentence; each call draws fresh random numbers.
make_sent()

folklore legends myths and fairy tales have followed childhood through the ages for every healthy youngster has a wholesome and instinctive love for stories fantastic marvelous and manifestly unreal 
