<a href="https://colab.research.google.com/github/sh-shrey/AI/blob/main/Markov_story_generation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import os
import re
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import random

In [2]:
story_path = "/content/drive/MyDrive/Colab Notebooks/sherlock/sherlock/"

def read_all_stories(story_path):
    txt = []
    for _, _, files in os.walk(story_path):
        for file in files:
            with open(story_path+file) as f:
                for line in f:
                    line = line.strip()
                    if line=='----------': break
                    if line!='':txt.append(line)
    return txt
        
stories = read_all_stories(story_path)
print("number of lines = ", len(stories))

number of lines =  215021


cleaning the text


In [None]:
def clean_txt(txt):
    cleaned_txt = []
    for line in txt:
        line = line.lower()
        line = re.sub(r"[,.\"\'!@#$%^&*(){}?/;`~:<>+=-\\]", "", line)
        tokens = word_tokenize(line)
        words = [word for word in tokens if word.isalpha()]
        cleaned_txt+=words
    return cleaned_txt

cleaned_stories = clean_txt(stories)
print("number of words = ", len(cleaned_stories))

number of words =  2332247


In [None]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
def make_markov_model(cleaned_stories, n_gram=2):
    markov_model = {}
    for i in range(len(cleaned_stories)-n_gram-1):
        curr_state, next_state = "", ""
        for j in range(n_gram):
            curr_state += cleaned_stories[i+j] + " "
            next_state += cleaned_stories[i+j+n_gram] + " "
        curr_state = curr_state[:-1]
        next_state = next_state[:-1]
        if curr_state not in markov_model:
            markov_model[curr_state] = {}
            markov_model[curr_state][next_state] = 1
        else:
            if next_state in markov_model[curr_state]:
                markov_model[curr_state][next_state] += 1
            else:
                markov_model[curr_state][next_state] = 1
    
    # calculating transition probabilities
    for curr_state, transition in markov_model.items():
        total = sum(transition.values())
        for state, count in transition.items():
            markov_model[curr_state][state] = count/total
    return markov_model 

In [None]:
markov_model = make_markov_model(cleaned_stories)

In [None]:
print("number of states = ", len(markov_model.keys()))

number of states =  208716


In [None]:
print("All possible transitions from 'the game' state: \n")
print(markov_model['the game'])

All possible transitions from 'the game' state: 

{'is afoot': 0.036036036036036036, 'was up': 0.09009009009009009, 'was in': 0.02702702702702703, 'is hardly': 0.02702702702702703, 'would have': 0.036036036036036036, 'is up': 0.06306306306306306, 'is and': 0.036036036036036036, 'in their': 0.036036036036036036, 'was whist': 0.036036036036036036, 'in that': 0.036036036036036036, 'the lack': 0.036036036036036036, 'for all': 0.06306306306306306, 'may wander': 0.02702702702702703, 'now a': 0.02702702702702703, 'my own': 0.02702702702702703, 'at any': 0.02702702702702703, 'mr holmes': 0.02702702702702703, 'ay whats': 0.02702702702702703, 'my friend': 0.02702702702702703, 'fairly by': 0.02702702702702703, 'is not': 0.02702702702702703, 'was not': 0.02702702702702703, 'was afoot': 0.036036036036036036, 'for the': 0.036036036036036036, 'your letter': 0.02702702702702703, 'worth it': 0.02702702702702703, 'you are': 0.02702702702702703, 'i am': 0.02702702702702703, 'now count': 0.027027027027027

In [None]:
def generate_story(markov_model, limit=100, start='my god'):
    n = 0
    curr_state = start
    next_state = None
    story = ""
    story+=curr_state+" "
    while n<limit:
        next_state = random.choices(list(markov_model[curr_state].keys()),
                                    list(markov_model[curr_state].values()))
        
        curr_state = next_state[0]
        story+=curr_state+" "
        n+=1
    return story

In [None]:
for i in range(20):
    print(str(i)+". ", generate_story(markov_model, start="the subject", limit=8))

0.  the subject for to do it i asked after we have heard well state your case to be 
1.  the subject i had formed from the newspaper reports were entirely erroneous conclusion which shows my dear watson 
2.  the subject very naturally mr ferguson but human nature is weak i wish you therefore to think that 
3.  the subject is very late and when he was a knife i glanced at my watch was dark 
4.  the subject of the accused disappears and yet no friends whom i suspected i read in his face 
5.  the subject for to do it so well now that such an insult again i have said on 
6.  the subject but there was much puzzled by something which brought him the coronet to someone in the 
7.  the subject of the jaw it is surely nothing in life more painful than his violence or his 
8.  the subject said holmes laying aside his lens not only my honour my gems and my heart set 
9.  the subject have you dropped from his cigar then suddenly came a gentle flow of soothing explanation from 
10.  the subject but

In [None]:
print(generate_story(markov_model,start='the case',limit=100))

the case and it was a pale man with sandy whiskers rose up from him and he walked in that dark grimy apartment which looked out with three barges in tow blundered in between us it was the cause of death my correspondence however is as i hope to get back to business watson would you mind touching the bell he handed over the back garden wall like the cowardly dogs follow me gentlemen i will look he whipped out his lens and then i stopped and waited their time until they could set him to remember that a stonemason named slater walking from forest row in a stately manner he departed for europe i made a note which made me walk in here as a single man should know it well we have established a considerable though not a married man and have it out of the profession which has become a singularly dark one the landlord pricked up his ears at every siding and they were talking a sudden cry of pain from the planks and his beady eyes gleaming out of the grate there was no one not miss it for six it 