In [2]:
import os

# Fetch each corpus only when it is not already present in the working directory
if not os.path.exists('edgar_allan_poe.txt'):
    # -nc (no-clobber) additionally stops wget from overwriting an existing file
    !wget -nc https://raw.githubusercontent.com/lazyprogrammer/machine_learning_examples/master/hmm_class/edgar_allan_poe.txt

if not os.path.exists('robert_frost.txt'):
    # Same guard for the second corpus
    !wget -nc https://raw.githubusercontent.com/lazyprogrammer/machine_learning_examples/master/hmm_class/robert_frost.txt



In [3]:
import numpy as np
import matplotlib.pyplot as plt
import string
from sklearn.model_selection import train_test_split

In [4]:
# Corpus files, one per author; order matters because the position in this
# list is what the rest of the notebook uses as the class label
input_files = ['edgar_allan_poe.txt', 'robert_frost.txt']

In [5]:
!head edgar_allan_poe.txt

LO! Death hath rear'd himself a throne
In a strange city, all alone,
Far down within the dim west
Where the good, and the bad, and the worst, and the best,
Have gone to their eternal rest.
 
There shrines, and palaces, and towers
Are not like any thing of ours
Oh no! O no! ours never loom
To heaven with that ungodly gloom!


In [6]:
!head robert_frost.txt

Two roads diverged in a yellow wood,
And sorry I could not travel both
And be one traveler, long I stood
And looked down one as far as I could
To where it bent in the undergrowth; 

Then took the other, as just as fair,
And having perhaps the better claim
Because it was grassy and wanted wear,
Though as for that the passing there


In [7]:
# Collect data into lists: one cleaned line of text per sample, with the index
# of its source file as the class label
input_texts = []
labels = []

# The punctuation-stripping table never changes, so build it once up front
# instead of once per line
punctuation_table = str.maketrans('', '', string.punctuation)

for label, f in enumerate(input_files):
    print(f"{f} corresponds to label {label}")

    # Use a context manager so the file handle is closed deterministically
    # as soon as the file has been read
    with open(f) as corpus_file:
        for line in corpus_file:
            line = line.rstrip().lower()
            # Skip blank lines (stanza separators)
            if line:
                # Remove punctuation
                line = line.translate(punctuation_table)
                input_texts.append(line)
                labels.append(label)

edgar_allan_poe.txt corresponds to label 0
robert_frost.txt corresponds to label 1


In [8]:
# Fix the shuffling seed so the split -- and therefore every vocabulary size,
# count and metric computed below -- is identical on each Restart & Run All.
# The split proportions remain the library defaults.
train_text, test_text, y_train, y_test = train_test_split(
    input_texts, labels, random_state=42
)

In [9]:
len(y_train), len(y_test)

(1615, 539)

In [10]:
train_text[:5]

['who wouldst not leave him in his wandering',
 'our flowers are merely flowers',
 'how daring an ambition yet how deep',
 'one level higher than the earth below',
 'to stand together on the craters verge']

In [11]:
y_train[:5]

[0, 0, 0, 1, 1]

In [12]:
# Index 0 is reserved for the unknown-word placeholder; real tokens are
# numbered starting from the next free index
word2idx = {'<unk>': 0}
idx = len(word2idx)

In [13]:
# Populate word2idx: every token seen for the first time in the training text
# gets the next free integer index

for sentence in train_text:
    for word in sentence.split():
        if word in word2idx:
            # Already in the vocabulary; keep its existing index
            continue
        word2idx[word] = idx
        idx += 1
        

In [14]:
word2idx

{'<unk>': 0,
 'who': 1,
 'wouldst': 2,
 'not': 3,
 'leave': 4,
 'him': 5,
 'in': 6,
 'his': 7,
 'wandering': 8,
 'our': 9,
 'flowers': 10,
 'are': 11,
 'merely': 12,
 'how': 13,
 'daring': 14,
 'an': 15,
 'ambition': 16,
 'yet': 17,
 'deep': 18,
 'one': 19,
 'level': 20,
 'higher': 21,
 'than': 22,
 'the': 23,
 'earth': 24,
 'below': 25,
 'to': 26,
 'stand': 27,
 'together': 28,
 'on': 29,
 'craters': 30,
 'verge': 31,
 'take': 32,
 'it': 33,
 'year': 34,
 'out': 35,
 'he': 36,
 'doesnt': 37,
 'make': 38,
 'much': 39,
 'up': 40,
 'attic': 41,
 'mother': 42,
 'because': 43,
 'was': 44,
 'grassy': 45,
 'and': 46,
 'wanted': 47,
 'wear': 48,
 'nor': 49,
 'grannys': 50,
 'surely': 51,
 'call': 52,
 'of': 53,
 'them': 54,
 'old': 55,
 'cellar': 56,
 'hole': 57,
 'a': 58,
 'byroad': 59,
 'off': 60,
 'here': 61,
 'hands': 62,
 'men': 63,
 'talk': 64,
 'had': 65,
 'been': 66,
 'serious': 67,
 'sober': 68,
 'luminary': 69,
 'clock': 70,
 'against': 71,
 'sky': 72,
 'at': 73,
 'least': 74,
 'pas

In [15]:
len(word2idx)

2557

In [16]:
# Convert data into integer format
# Training tokens were all added to the vocabulary above, so a direct lookup
# is safe for them
train_text_int = [
    [word2idx[word] for word in sentence.split()]
    for sentence in train_text
]

# Test tokens that never appeared in training fall back to 0, the '<unk>' index
test_text_int = [
    [word2idx.get(word, 0) for word in sentence.split()]
    for sentence in test_text
]
    

In [17]:
train_text_int[100:105]

[[378, 3, 23, 379, 380, 53, 381],
 [23, 382, 383, 384, 29, 23, 385, 386],
 [93, 387, 119, 123, 260, 160, 58, 388, 40],
 [143, 389, 343, 35, 23, 390, 53, 7, 391],
 [392, 122, 26, 393, 143, 360, 394]]

In [18]:
# Initialize A and pi matrices for both classes
# Every entry starts at one instead of zero (add-one smoothing), so word pairs
# that are never counted still end up with a non-zero probability
V = len(word2idx)

A0, pi0 = np.ones((V, V)), np.ones(V)
A1, pi1 = np.ones((V, V)), np.ones(V)

In [19]:
# Compute counts for A and pi
def compute_counts(text_as_int, A, pi):
    """Accumulate first-token and token-to-token counts in place.

    Parameters
    ----------
    text_as_int : iterable of iterables of int
        Sequences of token indices.
    A : 2-D array
        Updated in place: ``A[i, j]`` is incremented each time index ``j``
        directly follows index ``i`` within a sequence.
    pi : 1-D array
        Updated in place: ``pi[i]`` is incremented each time a sequence
        starts with index ``i``.

    Returns
    -------
    None
        Empty sequences contribute nothing.
    """
    for sequence in text_as_int:
        previous = None
        for position, current in enumerate(sequence):
            if position == 0:
                # First token of the sequence feeds the initial-state counts
                pi[current] += 1
            else:
                # Later tokens feed the transition counts
                A[previous, current] += 1
            previous = current

In [20]:
# Partition the training sequences by class label, then accumulate each
# class's counts into its own A and pi
class0_sequences = [t for t, y in zip(train_text_int, y_train) if y == 0]
class1_sequences = [t for t, y in zip(train_text_int, y_train) if y == 1]

compute_counts(class0_sequences, A0, pi0)
compute_counts(class1_sequences, A1, pi1)

In [21]:
# Normalize A and pi so they are valid probability matrices
# Convince yourself that this is equivalent to the formulas shown before
for transition_counts in (A0, A1):
    # Each row becomes a distribution over the next token (divides in place)
    transition_counts /= transition_counts.sum(axis=1, keepdims=True)

for initial_counts in (pi0, pi1):
    # The vector becomes a distribution over the first token (divides in place)
    initial_counts /= initial_counts.sum()

In [22]:
# Log A and pi since we don't need the actual probs:
# the classifier below only ever adds log-probabilities
log_A0, log_pi0 = np.log(A0), np.log(pi0)
log_A1, log_pi1 = np.log(A1), np.log(pi1)

In [23]:
# Compute priors: the fraction of training samples carrying each label,
# kept both as probabilities and as log-probabilities
total = len(y_train)
count0 = sum(y == 0 for y in y_train)
count1 = sum(y == 1 for y in y_train)

p0, p1 = count0 / total, count1 / total
log_p0, log_p1 = np.log(p0), np.log(p1)
p0, p1

(0.32755417956656346, 0.6724458204334365)

In [27]:
# Build classifier
class Classifier:
    """Pick the class whose Markov model plus prior best explains a sequence.

    Parameters
    ----------
    logAs : sequence of 2-D arrays
        One log transition matrix per class, indexed ``[previous, current]``.
    logpis : sequence of 1-D arrays
        One log initial-state vector per class.
    logpriors : sequence of float
        One log prior per class; its length defines the number of classes.
    """

    def __init__(self, logAs, logpis, logpriors):
        self.logAs = logAs
        self.logpis = logpis
        self.logpriors = logpriors
        self.K = len(logpriors)  # number of classes

    def _compute_log_likelihood(self, input_, class_):
        """Return the log-likelihood of ``input_`` under class ``class_``.

        The first index is scored with the class's initial-state vector and
        every later index with the transition matrix entry for
        (previous index, current index). An empty sequence scores 0.
        """
        transitions = self.logAs[class_]
        initial = self.logpis[class_]

        total = 0
        previous = None
        for position, current in enumerate(input_):
            if position == 0:
                total += initial[current]
            else:
                total += transitions[previous, current]
            previous = current

        return total

    def predict(self, inputs):
        """Return a float array holding the predicted class index per sequence.

        Each class is scored as log-likelihood plus log prior; the highest
        score wins, and ties go to the lowest class index.
        """
        predictions = np.zeros(len(inputs))
        for row, sequence in enumerate(inputs):
            scores = []
            for k in range(self.K):
                scores.append(
                    self._compute_log_likelihood(sequence, k) + self.logpriors[k]
                )
            predictions[row] = np.argmax(scores)
        return predictions
        
              
                
            
            

In [30]:
# Each argument list is ordered by class label: position 0 holds the
# parameters for label 0 and position 1 those for label 1
clf = Classifier([log_A0, log_A1], [log_pi0, log_pi1], [log_p0, log_p1])

In [31]:
# Training accuracy: the fraction of training sequences whose predicted
# label equals the true label
p_train = clf.predict(train_text_int)
print(f"Train acc: {np.mean(p_train == y_train)}")

Train acc: 0.9962848297213622


In [32]:
# Test accuracy: same measure on the held-out sequences, in which words
# unseen during training were encoded as index 0 ('<unk>')
p_test = clf.predict(test_text_int)
print(f"Test acc: {np.mean(p_test == y_test)}")

Test acc: 0.8237476808905381


In [33]:
from sklearn.metrics import confusion_matrix, f1_score

# Confusion matrix on the training set; scikit-learn puts the true labels on
# the rows and the predicted labels on the columns
cm = confusion_matrix(y_train, p_train)
cm

array([[ 523,    6],
       [   0, 1086]])

In [34]:
# Confusion matrix on the held-out test set (rows: true labels, columns:
# predicted labels, per scikit-learn's convention)
cm_test = confusion_matrix(y_test, p_test)
cm_test

array([[102,  87],
       [  8, 342]])

In [35]:
f1_score(y_train, p_train)

0.9972451790633609

In [36]:
f1_score(y_test, p_test)

0.8780487804878049