In [134]:
import re
import random

from collections import Counter

In [135]:
path = "./corpora/sherlock.txt"
text: str

with open(path, "r", encoding="utf-8") as file:
    text = file.read().lower()

text[:10_000]

'\n\n\n\n                               a study in scarlet\n\n                               arthur conan doyle\n\n\n\n\n\n\n\n                                table of contents\n\n         part i\n        mr. sherlock holmes\n        the science of deduction\n        the lauriston garden mystery\n        what john rance had to tell\n        our advertisement brings a visitor\n        tobias gregson shows what he can do\n        light in the darkness\n\n         part ii\n        on the great alkali plain\n        the flower of utah\n        john ferrier talks with the prophet\n        a flight for life\n        the avenging angels\n        a continuation of the reminiscences of john watson, m.d.\n        the conclusion\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n                                      part i\n\n                   (being a reprint from the reminiscences of\n                              john h. watson, m.d.,\n                      late of the army medical department.)\n\n\n\n\n

In [136]:
def extract_words(text: str) -> list[str]:
    word_re_pat = re.compile(r"\w+")
    return re.findall(word_re_pat, text)

words = extract_words(text)
words[:10]  # Display the first 10 words for verification

['a',
 'study',
 'in',
 'scarlet',
 'arthur',
 'conan',
 'doyle',
 'table',
 'of',
 'contents']

In [137]:
def find_successors(word: str, corpus: list[str]) -> dict[str, float]:
    counter = Counter()
    
    for i, w in enumerate(corpus[:-1]):
        if w == word:
            counter[corpus[i+1]] += 1

    total = counter.total()
    probabilities = {k:v/total for k, v in counter.items()}
            
    return probabilities

d = find_successors(words[123], words)
d

{'proceeded': 0.0014727540500736377,
 'before': 0.0029455081001472753,
 'was': 0.009572901325478646,
 'succeeded': 0.0014727540500736377,
 'at': 0.0022091310751104565,
 'promotion': 0.0007363770250368188,
 'disaster': 0.0007363770250368188,
 'attached': 0.0007363770250368188,
 'grazed': 0.0007363770250368188,
 'courage': 0.0007363770250368188,
 'weak': 0.0007363770250368188,
 'had': 0.008836524300441826,
 'even': 0.0022091310751104565,
 'when': 0.005891016200294551,
 'became': 0.0014727540500736377,
 'emaciated': 0.0007363770250368188,
 'landed': 0.0007363770250368188,
 'sixpence': 0.0007363770250368188,
 'idlers': 0.0007363770250368188,
 'spending': 0.0007363770250368188,
 'rusticate': 0.0007363770250368188,
 'to': 0.008836524300441826,
 'less': 0.0007363770250368188,
 'turning': 0.0014727540500736377,
 'he': 0.01914580265095729,
 'we': 0.012518409425625921,
 'as': 0.008100147275405008,
 'who': 0.004418262150220913,
 'which': 0.0066273932253313695,
 'the': 0.06553755522827688,
 'eccen

In [138]:
def train_model(corpus: list[str]) -> dict[str, dict[str, float]]:
    mem = {}
    for word in corpus:
        if not word in mem:
            mem[word] = find_successors(word, corpus)
    return mem

In [139]:
model = train_model(words)

In [140]:

def predict_next(word: str, model: dict[str, dict[str, float]], temperature: int = 10) -> str:

    assert 0 < temperature <= 10, f"Temeprature must be between 1 and 10"

    possibilities = model.get(word)

    if possibilities is None:
        return "EOF"

    top_words = sorted(possibilities.items(), key=lambda x: x[1], reverse=True)[:temperature]
    
    words, probs = zip(*top_words)
    total = sum(probs)
    normalized_probs = [p/total for p in probs]
    
    return random.choices(words, weights=normalized_probs, k=1)[0]

In [141]:
token = input("Start Chat: ").lower()
while token != "EOF":
    print(token)
    token = predict_next(token, model, temperature=10)


london
my
mind
the
same
far
from
the
case
of
the
house
was
to
the
door
to
the
young
mormons
before
i
have
had
been
no
sir
then
the
house
was
the
whole
matter
over
his
heart
of
the
room
and
i
am
a
great
wilderness
than
that
he
cried
gregson
lestrade
are
a
little
way
of
a
little
time
and
he
was
the
man
was
still
a
little
too
much
to
the
young
men
who
are
you
don
t
mind
said
sherlock
holmes
sprang
to
be
a
few
and
i
asked
the
same
cab
asked
in
her
father
she
had
been
the
door
i
have
to
me
that
he
said
sherlock
holmes
had
no
means
could
not
have
had
a
large
bundle
containing
a
little
mystery
what
is
no
more
than
the
door
of
the
room
he
was
not
have
you
say
was
the
door
was
that
the
two
officers
who
would
be
of
the
two
in
the
two
officers
as
the
house
what
do
you
to
me
for
the
man
s
private
hotel
he
was
the
door
and
the
man
and
i
was
that
they
will
have
a
small
knot
which
he
said
that
s
eyes
of
the
man
of
all
i
was
no
other
thing
which
i
have
the
room
where
the
room
the
room
i
was
so
i
have


KeyboardInterrupt: 