In [3]:
# Download the two poem corpora into the working directory.
#   -nc (no-clobber): skip the download when the file already exists locally.
#   -e https_proxy=...: send the request through the given HTTP proxy.
# NOTE(review): the proxy host `daicelproxy3` is hardcoded and presumably only
# resolves inside the originating network - remove or change it elsewhere.
!wget -nc https://raw.githubusercontent.com/lazyprogrammer/machine_learning_examples/master/hmm_class/edgar_allan_poe.txt -e https_proxy=http://daicelproxy3:80
!wget -nc https://raw.githubusercontent.com/lazyprogrammer/machine_learning_examples/master/hmm_class/robert_frost.txt -e https_proxy=http://daicelproxy3:80

--2023-06-01 13:36:48--  https://raw.githubusercontent.com/lazyprogrammer/machine_learning_examples/master/hmm_class/edgar_allan_poe.txt
Resolving daicelproxy3 (daicelproxy3)... 159.228.46.205
Connecting to daicelproxy3 (daicelproxy3)|159.228.46.205|:80... connected.
Proxy request sent, awaiting response... 200 OK
Length: 26622 (26K) [text/plain]
Saving to: 'edgar_allan_poe.txt'

     0K .......... .......... .....                           100%  330K=0.08s

2023-06-01 13:36:48 (330 KB/s) - 'edgar_allan_poe.txt' saved [26622/26622]

--2023-06-01 13:36:48--  https://raw.githubusercontent.com/lazyprogrammer/machine_learning_examples/master/hmm_class/robert_frost.txt
Resolving daicelproxy3 (daicelproxy3)... 159.228.46.205
Connecting to daicelproxy3 (daicelproxy3)|159.228.46.205|:80... connected.
Proxy request sent, awaiting response... 200 OK
Length: 56286 (55K) [text/plain]
Saving to: 'robert_frost.txt'

     0K .......... .......... .......... .......... .......... 90%  592K 0s
    50K 

In [4]:
import numpy as np
import matplotlib.pyplot as plt
import string
from sklearn.model_selection import train_test_split

In [5]:
# One text file per author; the position of a file in this list is used as
# its class label when the data is loaded.
input_files = ['edgar_allan_poe.txt', 'robert_frost.txt']

In [8]:
# Peek at the first lines of the Poe corpus (shell `head`).
!head edgar_allan_poe.txt

LO! Death hath rear'd himself a throne
In a strange city, all alone,
Far down within the dim west
Where the good, and the bad, and the worst, and the best,
Have gone to their eternal rest.
 
There shrines, and palaces, and towers
Are not like any thing of ours
Oh no! O no! ours never loom
To heaven with that ungodly gloom!


In [9]:
# Peek at the first lines of the Frost corpus (shell `head`).
!head robert_frost.txt

Two roads diverged in a yellow wood,
And sorry I could not travel both
And be one traveler, long I stood
And looked down one as far as I could
To where it bent in the undergrowth; 

Then took the other, as just as fair,
And having perhaps the better claim
Because it was grassy and wanted wear,
Though as for that the passing there


In [10]:
# collect data into lists
# input_texts[i] is one cleaned, non-blank line of a poem and labels[i] is the
# index of the file (within input_files) that the line came from.
input_texts = []
labels = []

# Translation table that deletes every ASCII punctuation character.
# It never changes, so build it once instead of once per line.
strip_punctuation = str.maketrans("", "", string.punctuation)

for label, f in enumerate(input_files):
    print(f"{f} corresponds to label {label}")

    # `with` guarantees the file handle is closed once the file has been read
    with open(f) as infile:
        for line in infile:
            line = line.rstrip().lower()
            if line:
                # remove punctuation
                line = line.translate(strip_punctuation)

                input_texts.append(line)
                labels.append(label)

edgar_allan_poe.txt corresponds to label 0
robert_frost.txt corresponds to label 1


In [19]:
# Hold out part of the lines for testing. The split is random, so pin the seed:
# without it every Restart & Run All produces a different split (and therefore
# a different vocabulary and different counts downstream).
SEED = 42
train_text, test_text, Ytrain, Ytest = train_test_split(
    input_texts, labels, random_state=SEED
)

In [20]:
# Sanity check: number of training labels and number of test labels.
len(Ytrain), len(Ytest)

(1615, 539)

In [21]:
# First five training lines (already lower-cased, punctuation removed).
train_text[:5]

['that they could hear tell of was scarified',
 'at the end of our path a liquescent',
 'to be a nova',
 'not much concerned for them i say',
 'wishing that night']

In [22]:
# First five held-out test lines.
test_text[:5]

['i sat up on the floor and shouted toffile',
 'the hands of men',
 'and then becoming reconciled',
 'when they plunged in the tyrant their steel',
 'something i must have learned riding in trains']

In [15]:
# Word -> integer index lookup. Index 0 is reserved for the "<unk>" entry.
word2idx = {"<unk>": 0}
# Next unused index; handed out whenever a new word is added to word2idx.
idx = len(word2idx)

In [25]:
# populate word2idx
# Only the training lines contribute to the vocabulary.
for sentence in train_text:
    for word in sentence.split():
        if word in word2idx:
            continue
        # first time this word is seen: give it the next free index
        word2idx[word] = idx
        idx += 1

In [26]:
# Show the word -> index mapping (note: this displays the whole dictionary).
word2idx

{'<unk>': 0,
 'that': 1,
 'they': 2,
 'could': 3,
 'hear': 4,
 'tell': 5,
 'of': 6,
 'was': 7,
 'scarified': 8,
 'at': 9,
 'the': 10,
 'end': 11,
 'our': 12,
 'path': 13,
 'a': 14,
 'liquescent': 15,
 'to': 16,
 'be': 17,
 'nova': 18,
 'not': 19,
 'much': 20,
 'concerned': 21,
 'for': 22,
 'them': 23,
 'i': 24,
 'say': 25,
 'wishing': 26,
 'night': 27,
 'but': 28,
 'skies': 29,
 'angel': 30,
 'trod': 31,
 'didst': 32,
 'glide': 33,
 'away': 34,
 'only': 35,
 'thine': 36,
 'eyes': 37,
 'remained': 38,
 'with': 39,
 'desperate': 40,
 'energy': 41,
 't': 42,
 'hath': 43,
 'beaten': 44,
 'down': 45,
 'john': 46,
 'threw': 47,
 'door': 48,
 'wide': 49,
 'he': 50,
 'didnt': 51,
 'enter': 52,
 'up': 53,
 'where': 54,
 'trees': 55,
 'grow': 56,
 'short': 57,
 'mosses': 58,
 'tall': 59,
 'upon': 60,
 'those': 61,
 'crystalline': 62,
 'celestial': 63,
 'spheres': 64,
 'banners': 65,
 'yellow': 66,
 'glorious': 67,
 'golden': 68,
 'out': 69,
 'in': 70,
 'plowed': 71,
 'ground': 72,
 'cold': 73,
 

In [27]:
# Vocabulary size, counting the "<unk>" entry.
len(word2idx)

2519

In [29]:
# convert data into integer format

# Training lines: direct lookup. Every training word should already be in
# word2idx, because the vocabulary was built from train_text.
train_text_int = [
    [word2idx[word] for word in sentence.split()]
    for sentence in train_text
]

# Test lines may contain words never seen in training; those fall back to
# index 0, the slot held by "<unk>".
test_text_int = [
    [word2idx.get(word, 0) for word in sentence.split()]
    for sentence in test_text
]

In [30]:
# Spot-check five integer-encoded training lines.
train_text_int[100:105]

[[418, 211, 419, 60, 191],
 [10, 420, 90, 6, 99, 421, 422, 423, 299],
 [24, 424, 10, 161, 425, 6, 151, 426],
 [2, 145, 427, 428, 10, 429, 430, 6, 151, 431],
 [432, 433, 434, 53, 435, 6, 436, 437]]

In [34]:
# Spot-check five integer-encoded test lines; a 0 marks a word that is not in
# word2idx (mapped to the "<unk>" index).
test_text_int[100:105]

[[28, 24, 0, 50, 7, 1596, 79, 0],
 [6, 10, 0, 1, 138, 134],
 [39, 10, 2018, 6, 125, 2397],
 [0, 163, 50, 1680, 16, 0],
 [24, 0, 185, 562, 17, 501, 10, 515, 122, 54]]

In [35]:
# initialize A and pi matrices - for both classes
# A<k>[i, j] will hold the count of word i being followed by word j in class k,
# pi<k>[i] the count of word i opening a line in class k.
V = len(word2idx)

# Start every count at 1 instead of 0 (add-one smoothing), so that no entry is
# exactly zero when the counts are later normalized and logged.
A0, pi0 = np.ones((V, V)), np.ones(V)
A1, pi1 = np.ones((V, V)), np.ones(V)

In [37]:
# compute counts for A and pi

def compute_counts(text_as_int, A, pi):
    """Accumulate first-word and word-to-word transition counts in place.

    Args:
        text_as_int: iterable of sentences, each an iterable of integer
            word indices.
        A: 2-D array indexed as A[previous_word, current_word]; incremented
            once for every adjacent pair of words inside a sentence.
        pi: 1-D array indexed by word; incremented once for the first word
            of every non-empty sentence.

    Returns:
        None. A and pi are modified in place; empty sentences add nothing.
    """
    for sentence in text_as_int:
        words = iter(sentence)

        # the first word of the sentence feeds the initial-state counts
        previous = next(words, None)
        if previous is None:
            # empty sentence: nothing to count
            continue
        pi[previous] += 1

        # every following word contributes one (previous -> current) transition
        for current in words:
            A[previous, current] += 1
            previous = current


# Split the encoded training lines by class label, then accumulate each
# class's counts into its own (A, pi) pair.
class0_train = [seq for seq, target in zip(train_text_int, Ytrain) if target == 0]
class1_train = [seq for seq, target in zip(train_text_int, Ytrain) if target == 1]

compute_counts(class0_train, A0, pi0)
compute_counts(class1_train, A1, pi1)

In [None]:
# normalize A and pi so they are valid probability matrices
# convince yourself that this is equivalent to the formulas shown before

# Each row of a transition matrix is divided by its own row sum, and each
# initial-state vector by its total, so rows of A and the whole of pi sum to 1.
# The division is done in place on the existing arrays.
for transition, initial in ((A0, pi0), (A1, pi1)):
    transition /= transition.sum(axis=1, keepdims=True)
    initial /= initial.sum()

In [42]:
# log A and pi since we dont need the actual probs

# Guard against hidden notebook state: the logs are only meaningful if the
# normalization cell has already been run. Fail loudly instead of silently
# taking the log of raw counts.
assert np.allclose(A0.sum(axis=1), 1) and np.allclose(A1.sum(axis=1), 1), \
    "rows of A0/A1 do not sum to 1 - run the normalization cell first"
assert np.isclose(pi0.sum(), 1) and np.isclose(pi1.sum(), 1), \
    "pi0/pi1 do not sum to 1 - run the normalization cell first"

logA0 = np.log(A0)
logpi0 = np.log(pi0)

logA1 = np.log(A1)
logpi1 = np.log(pi1)