# Generating Shakespearean Text

1. Importing libraries

In [1]:
import numpy as np
import re #text cleaning
from urllib.request import urlopen #url handling
import threading 
from multiprocessing.pool import ThreadPool #for pool for multithreading
import spacy
import markovify
import nltk
from nltk.corpus import gutenberg #using gutenberg corpus (files contain dialogues only)
import warnings
# warnings.filterwarnings('ignore')
# nltk.download('gutenberg')
# !python3 -m spacy download en_core_web_sm #english setting

In [None]:
# inspect Gutenberg corpus
# print(gutenberg.fileids())

2. Store reference text files

In [2]:
# hamlet_raw = urlopen("https://www.gutenberg.org/cache/epub/1524/pg1524.txt").read().decode('utf8')
# Read the raw text of the three plays from NLTK's Gutenberg corpus.
# The corpus must already be available locally (see the commented-out
# nltk.download('gutenberg') call in the imports cell).
hamlet_raw, macbeth_raw, caesar_raw = (
    gutenberg.raw(file_id)
    for file_id in (
        'shakespeare-hamlet.txt',
        'shakespeare-macbeth.txt',
        'shakespeare-caesar.txt',
    )
)

In [None]:
# inspect reference text files

# print("H:",hamlet_raw[100:200])
# print("M:",macbeth_raw[100:200])
# print("C:",caesar_raw[100:200])

3. Clean reference text files

In [3]:
#utility (sub) function for text cleaning
def text_cleaner(text):
    """Strip markup-like noise from raw text and normalise its whitespace.

    Steps, in order:
      1. replace every double hyphen with a single space;
      2. drop square-bracketed spans (lazy match; '.' does not cross
         newlines, so only brackets closed on the same line are removed);
      3. drop standalone integers / decimals together with any whitespace
         (and optional minus sign) directly in front of them;
      4. collapse every run of whitespace, newlines included, to one space.

    Args:
        text: Raw text as a str.

    Returns:
        The cleaned text as a single-line str.
    """
    text = re.sub(r'--', ' ', text)
    # Raw string: the previous non-raw literal contained the invalid escape
    # sequences '\[' and '\]', which newer Python versions warn about.
    # The compiled pattern is unchanged.
    text = re.sub(r'[\[].*?[\]]', '', text)
    text = re.sub(r'(\b|\s+\-?|^\-?)(\d+|\d*\.\d+)\b', '', text)
    return ' '.join(text.split())

In [4]:
# Shared thread pool, created with ThreadPool's default worker count.
# NOTE(review): no pool.close()/pool.join() appears in the visible cells.
pool = ThreadPool() #create multithreading pool
# spaCy pipeline used for sentence segmentation; requires the
# 'en_core_web_sm' model to be installed (see the commented-out download
# command in the imports cell).
nlp = spacy.load('en_core_web_sm') #use English language

In [5]:
#main function for cleaning and parsing text
def cleaning(raw):
    """Clean one raw text and return its sentences joined into one string.

    Removes 'Chapter <number>' headings, passes the result through
    ``text_cleaner``, parses it with the spaCy pipeline ``nlp`` and joins
    every sentence longer than one character with single spaces.
    """
    without_headings = re.sub(r'Chapter \d+', '', raw)
    doc = nlp(text_cleaner(without_headings))

    kept = []
    for sentence in doc.sents:
        sentence_text = sentence.text
        # Skip empty / one-character fragments produced by the segmenter.
        if len(sentence_text) > 1:
            kept.append(sentence_text)
    return ' '.join(kept)

In [6]:
#store cleaned sentences in shakespeare_sents

# Clean the three plays on the thread pool; pool.map returns the results
# in the same order as the inputs.
raw_plays = [hamlet_raw, macbeth_raw, caesar_raw]
shakespeare_sents = pool.map(cleaning, raw_plays)
# shakespeare_sents

4. Markov model for sentence generation

In [7]:
# Build a state-size-3 Markov model over the cleaned plays, then pass it
# through markovify.combine as the only entry of `chains`.
mk_chain = markovify.Text(shakespeare_sents, state_size=3)
chains = [mk_chain]
model = markovify.combine(chains)

In [None]:
# Testing
# model.make_sentence()

In [8]:
lock = threading.Lock()  # guards every rebinding of the shared `arr`
arr = []  # global accumulator for all generated sentences


def create_text_array(n):
    """Generate ``n`` sentences with ``model`` and add them to the global ``arr``.

    Written to be used as a ``threading.Thread`` target: sentences are first
    collected in a thread-local list, and the shared list is only touched
    while holding ``lock``.

    Args:
        n: Number of sentences to generate.

    Returns:
        The global list as it stood right after this call's sentences were
        added to it.
    """
    global arr
    local_arr = []
    for _ in range(n):
        txt = model.make_sentence()
        # make_sentence() gave None: fall back to make_short_sentence(100)
        # until a sentence is produced.
        # NOTE(review): this retry loop has no upper bound.
        while txt is None:
            txt = model.make_short_sentence(100)
        local_arr.append(txt)

    # `with` releases the lock even if the update raises.
    with lock:
        arr = arr + local_arr
        merged = arr  # captured under the lock so the return value is stable
    return merged


In [9]:
#Run multithreading to store data into global arr

# Number of sentences each worker thread generates.
SENTENCES_PER_THREAD = 25

# Start from an empty list so that re-running this cell does not pile a new
# batch on top of sentences left over from a previous run.
arr = []

thread1, thread2 = (
    threading.Thread(target=create_text_array, args=[SENTENCES_PER_THREAD])
    for _ in range(2)
)

for worker in (thread1, thread2):
    worker.start()

# Block until both workers have merged their sentences into `arr`.
for worker in (thread1, thread2):
    worker.join()

# print(arr)

5. Store generated sentences into text file

In [10]:
# Write one generated sentence per line. The encoding is given explicitly so
# the output bytes do not depend on the platform's default locale encoding.
with open('new_file.txt', 'w', encoding='utf-8') as out_file:
    out_file.writelines("%s\n" % line for line in arr)