# Imports

In [28]:
import requests
import nltk.tokenize
import ml.core.repo_paths
import ml.core.download
import dataclasses
from typing import Dict, Generic, TypeVar, Collection, Callable, Iterable
import itertools
import random

nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sophi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Getting corpus

## Downloading

In [14]:
artifact_dir_data_raw = ml.core.repo_paths.get_dir_artifacts_data_raw("markov_chains_sherlock_holmes_generator", create=True)
corpus_path = artifact_dir_data_raw / "cano.txt"
ml.core.download.download_http("https://sherlock-holm.es/stories/plain-text/cano.txt", corpus_path)

Downloading 'https://sherlock-holm.es/stories/plain-text/cano.txt' to c:\Users\sophi\Code\ml\artifacts\data\markov_chains_sherlock_holmes_generator\raw\cano.txt...


3.87MB [00:00, 8.92MB/s]

  Download complete.





## Tokenizing

In [29]:
words = []

with open(corpus_path, "r") as file:
    for line in file:
        words.extend(nltk.tokenize.word_tokenize(line.lower()))

print(words[:100])

['the', 'complete', 'sherlock', 'holmes', 'arthur', 'conan', 'doyle', 'table', 'of', 'contents', 'a', 'study', 'in', 'scarlet', 'the', 'sign', 'of', 'the', 'four', 'the', 'adventures', 'of', 'sherlock', 'holmes', 'a', 'scandal', 'in', 'bohemia', 'the', 'red-headed', 'league', 'a', 'case', 'of', 'identity', 'the', 'boscombe', 'valley', 'mystery', 'the', 'five', 'orange', 'pips', 'the', 'man', 'with', 'the', 'twisted', 'lip', 'the', 'adventure', 'of', 'the', 'blue', 'carbuncle', 'the', 'adventure', 'of', 'the', 'speckled', 'band', 'the', 'adventure', 'of', 'the', 'engineer', "'s", 'thumb', 'the', 'adventure', 'of', 'the', 'noble', 'bachelor', 'the', 'adventure', 'of', 'the', 'beryl', 'coronet', 'the', 'adventure', 'of', 'the', 'copper', 'beeches', 'the', 'memoirs', 'of', 'sherlock', 'holmes', 'silver', 'blaze', 'the', 'yellow', 'face', 'the', 'stock-broker', "'s", 'clerk']


# Creating Markov chain from scratch

In [30]:
TState = TypeVar("TState")

@dataclasses.dataclass
class MarkovChain(Generic[TState]):
    _states: Dict[TState, Dict[TState, float]] = dataclasses.field(default_factory=dict)

    def insert_state(self, state: TState) -> None:
        if state not in self._states:
            self._states[state] = {}
    
    def states(self) -> Collection[TState]:
        return self._states.keys()
    
    def insert_transition(self, source: TState, destination: TState, initial: float) -> None:
        assert source in self._states
        assert destination in self._states

        if destination not in self._states[source]:
            self._states[source][destination] = initial

    def update_transition_weight(self, source: TState, destination: TState, updater: Callable[[float], float]) -> None:
        assert source in self._states
        assert destination in self._states
        assert destination in self._states[source]

        self._states[source][destination] = updater(self._states[source][destination])
    
    def get_transition_weight(self, source: TState, destination: TState) -> float:
        assert source in self._states
        assert destination in self._states
        assert destination in self._states[source]

        return self._states[source][destination]

    def get_state_transitions(self, source: TState) -> Collection[TState]:
        assert source in self._states

        return self._states[source].keys()

    def sum_state_transition_weights(self, source: TState) -> float:
        assert source in self._states

        return sum(self._states[source].values())

## Creating Markov chain from the corpus

In [31]:
def create_unnormalized_markov_chain(words: Collection[str]) -> MarkovChain[str]:
    markov_chain = MarkovChain()

    for i in range(len(words) - 1):
        markov_chain.insert_state(words[i])
        markov_chain.insert_state(words[i + 1])
        markov_chain.insert_transition(words[i], words[i + 1], 0.0)
        markov_chain.update_transition_weight(words[i], words[i + 1], lambda weight: weight + 1.0)

    return markov_chain

def normalize_markov_chain(markov_chain: MarkovChain[str]) -> None:
    for state in markov_chain.states():
        weight_sum = markov_chain.sum_state_transition_weights(state)

        for next_state in markov_chain.get_state_transitions(state):
            markov_chain.update_transition_weight(state, next_state, lambda weight: weight / weight_sum)

def create_markov_chain(words: Collection[str]) -> MarkovChain[str]:
    markov_chain = create_unnormalized_markov_chain(words)
    normalize_markov_chain(markov_chain)
    return markov_chain

markov_chain = create_markov_chain(words)

for i in itertools.islice(markov_chain.states(), 5):
    print(f"State {i!r}:")

    for j in markov_chain.get_state_transitions(i):
        print(f"  {j!r}: {markov_chain.get_transition_weight(i, j):.3f}")

State 'the':
  'complete': 0.000
  'sign': 0.000
  'four': 0.001
  'adventures': 0.000
  'red-headed': 0.000
  'boscombe': 0.000
  'five': 0.000
  'man': 0.013
  'twisted': 0.000
  'adventure': 0.003
  'blue': 0.001
  'speckled': 0.000
  'engineer': 0.000
  'noble': 0.000
  'beryl': 0.000
  'copper': 0.000
  'memoirs': 0.000
  'yellow': 0.000
  'stock-broker': 0.000
  '``': 0.000
  'musgrave': 0.000
  'reigate': 0.000
  'crooked': 0.000
  'resident': 0.000
  'greek': 0.000
  'naval': 0.000
  'final': 0.000
  'return': 0.000
  'empty': 0.000
  'norwood': 0.000
  'dancing': 0.000
  'solitary': 0.000
  'priory': 0.000
  'six': 0.000
  'three': 0.002
  'golden': 0.000
  'missing': 0.001
  'abbey': 0.000
  'second': 0.003
  'hound': 0.001
  'baskervilles': 0.000
  'valley': 0.001
  'cardboard': 0.000
  'red': 0.001
  'bruce-partington': 0.000
  'dying': 0.000
  'disappearance': 0.001
  'devil': 0.001
  'case-book': 0.000
  'illustrious': 0.000
  'blanched': 0.000
  'mazarin': 0.000
  'susse

# Generate new text

In [40]:
def random_walk(markov_chain: MarkovChain[TState], initial: TState) -> Iterable[TState]:
    current = initial

    while True:
        yield current

        next_states = list(markov_chain.get_state_transitions(current))
        next_weights = [markov_chain.get_transition_weight(current, next_state) for next_state in next_states]
        current = random.choices(next_states, next_weights, k=1)[0]

for i in itertools.islice(random_walk(markov_chain, "the"), 10):
    print(i, end=" ")

print()

the door which led to you know before it 's 
