In [1]:
from collections import defaultdict
import nltk
nltk.download('punkt')
import numpy as np
import spacy
import re
from nltk.corpus import gutenberg
import warnings
warnings.filterwarnings('ignore')
nltk.download('gutenberg')
import string

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rahul\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\rahul\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


In [10]:
#import novels as text objects
hamlet = gutenberg.raw('shakespeare-hamlet.txt')
macbeth = gutenberg.raw('shakespeare-macbeth.txt')
caesar = gutenberg.raw('shakespeare-caesar.txt')
#print first 100 characters of each
print('\nRaw:\n', hamlet[:100])
print('\nRaw:\n', macbeth[:100])
print('\nRaw:\n', caesar[:100])


Raw:
 [The Tragedie of Hamlet by William Shakespeare 1599]


Actus Primus. Scoena Prima.

Enter Barnardo a

Raw:
 [The Tragedie of Macbeth by William Shakespeare 1603]


Actus Primus. Scoena Prima.

Thunder and Lig

Raw:
 [The Tragedie of Julius Caesar by William Shakespeare 1599]


Actus Primus. Scoena Prima.

Enter Fla


In [3]:
#utility function for text cleaning
def text_cleaner(text):
    text = re.sub(r'--', ' ', text)
    text = re.sub('[\[].*?[\]]', '', text)
    text = re.sub(r'(\b|\s+\-?|^\-?)(\d+|\d*\.\d+)\b','', text)
    text = ' '.join(text.split())
    return text

In [4]:
#remove chapter indicator
hamlet = re.sub(r'Chapter \d+', '', hamlet)
macbeth = re.sub(r'Chapter \d+', '', macbeth)
caesar = re.sub(r'Chapter \d+', '', caesar)
#apply cleaning function to corpus
hamlet = text_cleaner(hamlet)
caesar = text_cleaner(caesar)
macbeth = text_cleaner(macbeth)

In [5]:
#parse cleaned novels
nlp = spacy.load('en_core_web_sm')
hamlet_doc = nlp(hamlet)
macbeth_doc = nlp(macbeth)
caesar_doc = nlp(caesar)

In [6]:
hamlet_sents = ' '.join([sent.text for sent in hamlet_doc.sents if len(sent.text) > 1])
macbeth_sents = ' '.join([sent.text for sent in macbeth_doc.sents if len(sent.text) > 1])
caesar_sents = ' '.join([sent.text for sent in caesar_doc.sents if len(sent.text) > 1])
shakespeare_sents = hamlet_sents + macbeth_sents + caesar_sents
#inspect our text
#print(shakespeare_sents)

In [7]:
#converting the entire corpus to upper case
shakespeare_sents=shakespeare_sents.upper()

#eliminating punctuation from the corpus
shakespeare=shakespeare_sents.translate(str.maketrans('', '', string.punctuation))

In [8]:
#initializing a blank dicitonary
transition_dict=defaultdict(int)

In [9]:
#defining the order of the markov chain
morder=3

In [11]:
#function to create the transition proabilities for given order using nested dictionaries
def create_transition_dict(text,morder):
    for i in range(len(text)-morder):
        x=str(text[i:i+morder])
        y=text[i+morder]
        if transition_dict[x]==0:
            transition_dict[x]={}
            transition_dict[x][y]=1
        else:
            if transition_dict[x].get(y) is None:
                transition_dict[x][y]=1
            else:
                transition_dict[x][y]+=1

In [12]:
#creating tokens for the words in the corpus
temp=nltk.word_tokenize(shakespeare)

In [13]:
#calling the function
create_transition_dict(temp,morder)

In [14]:
#printing the transition counts
transition_dict

defaultdict(int,
            {"['ACTUS', 'PRIMUS', 'SCOENA']": {'PRIMA': 1},
             "['PRIMUS', 'SCOENA', 'PRIMA']": {'ENTER': 2, 'THUNDER': 1},
             "['SCOENA', 'PRIMA', 'ENTER']": {'BARNARDO': 1, 'FLAUIUS': 1},
             "['PRIMA', 'ENTER', 'BARNARDO']": {'AND': 1},
             "['ENTER', 'BARNARDO', 'AND']": {'FRANCISCO': 1},
             "['BARNARDO', 'AND', 'FRANCISCO']": {'TWO': 1},
             "['AND', 'FRANCISCO', 'TWO']": {'CENTINELS': 1},
             "['FRANCISCO', 'TWO', 'CENTINELS']": {'BARNARDO': 1},
             "['TWO', 'CENTINELS', 'BARNARDO']": {'WHOS': 1},
             "['CENTINELS', 'BARNARDO', 'WHOS']": {'THERE': 1},
             "['BARNARDO', 'WHOS', 'THERE']": {'FRAN': 1},
             "['WHOS', 'THERE', 'FRAN']": {'NAY': 1},
             "['THERE', 'FRAN', 'NAY']": {'ANSWER': 1},
             "['FRAN', 'NAY', 'ANSWER']": {'ME': 1},
             "['NAY', 'ANSWER', 'ME']": {'STAND': 1},
             "['ANSWER', 'ME', 'STAND']": {'VNFOLD': 1},
  

In [15]:
#function to normalize the transition probaibility dictionary
def create_prob_dist():
    for i,_ in transition_dict.items():
        ans=float(sum(transition_dict[i].values()))
        for j in transition_dict[i].keys():
            transition_dict[i][j]=transition_dict[i][j]/ans

In [16]:
#function call
create_prob_dist()

In [17]:
#printing the normalized transition dictionary
transition_dict

defaultdict(int,
            {"['ACTUS', 'PRIMUS', 'SCOENA']": {'PRIMA': 1.0},
             "['PRIMUS', 'SCOENA', 'PRIMA']": {'ENTER': 0.6666666666666666,
              'THUNDER': 0.3333333333333333},
             "['SCOENA', 'PRIMA', 'ENTER']": {'BARNARDO': 0.5, 'FLAUIUS': 0.5},
             "['PRIMA', 'ENTER', 'BARNARDO']": {'AND': 1.0},
             "['ENTER', 'BARNARDO', 'AND']": {'FRANCISCO': 1.0},
             "['BARNARDO', 'AND', 'FRANCISCO']": {'TWO': 1.0},
             "['AND', 'FRANCISCO', 'TWO']": {'CENTINELS': 1.0},
             "['FRANCISCO', 'TWO', 'CENTINELS']": {'BARNARDO': 1.0},
             "['TWO', 'CENTINELS', 'BARNARDO']": {'WHOS': 1.0},
             "['CENTINELS', 'BARNARDO', 'WHOS']": {'THERE': 1.0},
             "['BARNARDO', 'WHOS', 'THERE']": {'FRAN': 1.0},
             "['WHOS', 'THERE', 'FRAN']": {'NAY': 1.0},
             "['THERE', 'FRAN', 'NAY']": {'ANSWER': 1.0},
             "['FRAN', 'NAY', 'ANSWER']": {'ME': 1.0},
             "['NAY', 'ANSWER', 'ME']

In [18]:
#function for prediction of next word
def create_text(sample):
    previous=nltk.word_tokenize(sample)
    print(previous)
    if(transition_dict[str(previous)]==0):
        return '?'
    
    words=list(transition_dict[str(previous)].keys())
    vals=list(transition_dict[str(previous)].values())
    
    return np.random.choice(words,p=vals)

In [19]:
#function to predict 100 words given the start sequence
def word_predict(start,morder,maxlen=100):
    sentence=start
    #print(sentence)
    prev=start
    #print(prev)
    for i in range(maxlen):
        pred=create_text(prev)
        sentence+=" "+pred
        #print(sentence)
        x=nltk.word_tokenize(sentence)
        prev=' '.join(x[-morder:])
    print(sentence)

In [None]:
#test string to demo the function for order 5(run only when morder=5)
tester="I AM SURE OF THAT"

In [None]:
#test string to demo the function for order 4(run only when morder=4)
tester="I AM SURE OF"

In [20]:
#test string to demo the function for order 3(run only when morder=3)
tester="I AM SURE"

In [None]:
#test string to demo the function for order 2(run only when morder=2)
tester="I AM"

In [None]:
#test string to demo the function for order 1(run only when morder=1)
tester="I"

In [22]:
word_predict(tester,morder)

['I', 'AM', 'SURE']
['AM', 'SURE', 'IT']
['SURE', 'IT', 'DID']
['IT', 'DID', 'NOT']
['DID', 'NOT', 'LYE']
['NOT', 'LYE', 'THERE']
['LYE', 'THERE', 'WHEN']
['THERE', 'WHEN', 'I']
['WHEN', 'I', 'WENT']
['I', 'WENT', 'TO']
['WENT', 'TO', 'BED']
['TO', 'BED', 'THAT']
['BED', 'THAT', 'YOU']
['THAT', 'YOU', 'DOE']
['YOU', 'DOE', 'LYE']
['DOE', 'LYE', 'SO']
['LYE', 'SO', 'LATE']
['SO', 'LATE', 'PORT']
['LATE', 'PORT', 'FAITH']
['PORT', 'FAITH', 'SIR']
['FAITH', 'SIR', 'WE']
['SIR', 'WE', 'WERE']
['WE', 'WERE', 'CAROWSING']
['WERE', 'CAROWSING', 'TILL']
['CAROWSING', 'TILL', 'THE']
['TILL', 'THE', 'SECOND']
['THE', 'SECOND', 'COCK']
['SECOND', 'COCK', 'AND']
['COCK', 'AND', 'DRINKE']
['AND', 'DRINKE', 'SIR']
['DRINKE', 'SIR', 'IS']
['SIR', 'IS', 'A']
['IS', 'A', 'GREAT']
['A', 'GREAT', 'OBSERUER']
['GREAT', 'OBSERUER', 'AND']
['OBSERUER', 'AND', 'HE']
['AND', 'HE', 'LOOKES']
['HE', 'LOOKES', 'QUITE']
['LOOKES', 'QUITE', 'THROUGH']
['QUITE', 'THROUGH', 'THE']
['THROUGH', 'THE', 'DEEDS']
['THE',