<a href="https://colab.research.google.com/github/soumyamalviya92-pixel/prodigytask3/blob/main/prodigy3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install nltk
!pip install spacy
!pip install markovify
!pip install -m spacy download en

In [None]:
import spacy
import re
import markovify
import nltk
from nltk.corpus import gutenberg
import warnings
warnings.filterwarnings('ignore')
nltk.download('gutenberg')
!python -m spacy download en_core_web_sm

In [None]:
#list the file ids available in the Gutenberg corpus
available_files = gutenberg.fileids()
print(available_files)

In [None]:
#import the three plays as raw text
hamlet = gutenberg.raw('shakespeare-hamlet.txt')
macbeth = gutenberg.raw('shakespeare-macbeth.txt')
caesar = gutenberg.raw('shakespeare-caesar.txt')
#print first 100 characters of each, under a "Raw:" header on its own line
for play in (hamlet, macbeth, caesar):
    print('\nRaw:\n', play[:100])

In [None]:
#utility function for text cleaning
def text_cleaner(text):
    """Return ``text`` with markup-like noise removed and whitespace normalised.

    Steps, in order:
      1. replace every double dash ``--`` with a space;
      2. drop bracketed spans such as ``[Aside]`` (non-greedy, single line);
      3. drop standalone numbers (integers and decimals), together with the
         whitespace and optional minus sign that precede them;
      4. collapse every run of whitespace into a single space and trim the ends.

    Args:
        text: the raw string to clean.

    Returns:
        The cleaned string.
    """
    text = re.sub(r'--', ' ', text)
    # Escaped brackets instead of the character classes '[[]' / '[]]', which
    # make `re` emit a "possible nested set" FutureWarning.
    text = re.sub(r'\[.*?\]', '', text)
    # \b, \s and \d must be escaped: without the backslashes the pattern
    # matches the literal letters b/s/d and never removes any number.
    text = re.sub(r'(\b|\s+-?|^-?)(\d+|\d*\.\d+)\b', '', text)
    text = ' '.join(text.split())
    return text

In [None]:
#remove chapter indicators such as "Chapter 12" (\d+ = one or more digits)
chapter_heading = re.compile(r'Chapter \d+')
hamlet = chapter_heading.sub('', hamlet)
macbeth = chapter_heading.sub('', macbeth)
caesar = chapter_heading.sub('', caesar)
#apply cleaning function to corpus
hamlet = text_cleaner(hamlet)
macbeth = text_cleaner(macbeth)
caesar = text_cleaner(caesar)

In [None]:
#load the spaCy pipeline once, then run it over each cleaned play in turn
nlp = spacy.load('en_core_web_sm')
hamlet_doc, macbeth_doc, caesar_doc = (
    nlp(play_text) for play_text in (hamlet, macbeth, caesar)
)

In [None]:
def join_sentences(doc):
    """Join the sentences of a parsed spaCy doc with single spaces.

    Sentences whose text is one character or shorter are skipped.
    """
    return ' '.join(sent.text for sent in doc.sents if len(sent.text) > 1)

hamlet_sents = join_sentences(hamlet_doc)
macbeth_sents = join_sentences(macbeth_doc)
caesar_sents = join_sentences(caesar_doc)
#separate the plays with a space so the last word of one play does not fuse
#with the first word of the next
shakespeare_sents = ' '.join([hamlet_sents, macbeth_sents, caesar_sents])
#inspect our text (preview only; the full corpus is too long to print usefully)
print(shakespeare_sents[:1000])

In [None]:
#build a markovify text model over the combined plays, using 3-word states
generator_1 = markovify.Text(
    shakespeare_sents,
    state_size=3,
)

In [None]:
#We will randomly generate three sentences.
#markovify returns None when it cannot build an acceptable sentence within
#`tries` attempts, so allow more attempts than the default and print an
#explicit placeholder rather than the word "None".
for _ in range(3):
    sentence = generator_1.make_sentence(tries=100)
    print(sentence if sentence is not None else '(no sentence generated)')
#We will randomly generate three more sentences of no more than 100 characters
for _ in range(3):
    sentence = generator_1.make_short_sentence(max_chars=100, tries=100)
    print(sentence if sentence is not None else '(no sentence generated)')

In [None]:
#next we will use spacy's part of speech to generate more legible text
class POSifiedText(markovify.Text):
    """markovify.Text subclass whose chain states are ``word::POS`` strings.

    Each token is paired with its spaCy ``pos_`` tag when the text is split,
    and the tag is stripped again when generated words are joined.
    """

    def word_split(self, sentence):
        """Tokenize ``sentence`` with the module-level ``nlp`` pipeline.

        Returns a list of ``'<token text>::<pos tag>'`` strings, one per token.
        """
        return ['::'.join((token.orth_, token.pos_)) for token in nlp(sentence)]

    def word_join(self, words):
        """Drop the ``::<pos tag>`` suffix from each word and join with spaces."""
        # Split on the LAST '::' only, so a token whose own text contains or
        # ends with ':' (e.g. 'a:' -> 'a:::X') is restored intact.
        return ' '.join(tagged.rsplit('::', 1)[0] for tagged in words)
#Call the class on our text
generator_2 = POSifiedText(shakespeare_sents, state_size=3)

#now we will use the above generator to generate sentences.
#markovify returns None when it cannot build an acceptable sentence within
#`tries` attempts, so allow more attempts than the default and print an
#explicit placeholder rather than the word "None".
for _ in range(5):
    sentence = generator_2.make_sentence(tries=100)
    print(sentence if sentence is not None else '(no sentence generated)')
#print 100 characters or less sentences
for _ in range(5):
    sentence = generator_2.make_short_sentence(max_chars=100, tries=100)
    print(sentence if sentence is not None else '(no sentence generated)')