<a href="https://colab.research.google.com/github/ruchxr/NLP/blob/main/TextClassifierMarkovModel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [60]:
!wget -nc https://raw.githubusercontent.com/lazyprogrammer/machine_learning_examples/master/hmm_class/robert_frost.txt
!wget -nc https://raw.githubusercontent.com/lazyprogrammer/machine_learning_examples/master/hmm_class/edgar_allan_poe.txt

File ‘robert_frost.txt’ already there; not retrieving.

File ‘edgar_allan_poe.txt’ already there; not retrieving.



In [61]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import string

In [62]:
input_files = ['/content/edgar_allan_poe.txt', '/content/robert_frost.txt']

In [63]:
input_texts = []
labels = []

for label, text in enumerate(input_files):
  print(f"{text} corresponds to label {label}")

  for line in open(text):
    line = line.rstrip().lower()
    if line:
      line = line.translate(str.maketrans('', '', string.punctuation))

      input_texts.append(line)
      labels.append(label)

/content/edgar_allan_poe.txt corresponds to label 0
/content/robert_frost.txt corresponds to label 1


In [64]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(input_texts, labels, test_size = 0.2, shuffle=True)

In [65]:
X_train[:5], y_train[:5]

(['that with a quickening spell doth oer us pass',
  'with a strange sound as of a harpstring broken',
  'i can see we are going to be good friends',
  'nor need you mind the serial ordeal',
  'a troop of echoes whose sweet duty'],
 [0, 0, 1, 1, 0])

In [66]:
len(X_train), len(X_test)

(1723, 431)

In [67]:
idx = 1
word2idx = {'<unk>':0}

for text in X_train:
  tokens = text.split()
  for token in tokens:
    if token not in word2idx:
      word2idx[token] = idx
      idx += 1

In [68]:
len(word2idx)

2636

In [69]:
X_train_text = []
X_test_text = []

for text in X_train:
  tokens = text.split()
  line_idx = [word2idx[token] for token in tokens]
  X_train_text.append(line_idx)

for text in X_test:
  tokens = text.split()
  line_idx = [word2idx.get(token,0) for token in tokens]
  X_test_text.append(line_idx)

In [70]:
X_train_text[:5]

[[1, 2, 3, 4, 5, 6, 7, 8, 9],
 [2, 3, 10, 11, 12, 13, 3, 14, 15],
 [16, 17, 18, 19, 20, 21, 22, 23, 24, 25],
 [26, 27, 28, 29, 30, 31, 32],
 [3, 33, 13, 34, 35, 36, 37]]

In [71]:
V = len(word2idx)

A0 = np.ones((V,V))
pi0 = np.ones((V))

A1 = np.ones((V,V))
pi1 = np.ones((V))

In [72]:
def compute_counts(text_as_int, A, pi):
  for tokens in text_as_int:
    last_idx = None
    for idx in tokens:
      if last_idx is None:
        pi[idx] += 1
      else:
        A[last_idx][idx] += 1

      last_idx = idx

compute_counts([t for t, y in zip(X_train_text, y_train) if y == 0], A0, pi0)
compute_counts([t for t, y in zip(X_train_text, y_train) if y == 1], A1, pi1)

In [73]:
A0 /= A0.sum(axis=1, keepdims=True)
pi0 /= pi0.sum()

A1 /= A1.sum(axis=1, keepdims=True)
pi1 /= pi1.sum()

In [74]:
logA0 = np.log(A0)
logpi0 = np.log(pi0)

logA1 = np.log(A1)
logpi1 = np.log(pi1)

In [78]:
count0 = sum(y == 0 for y in y_train)
count1 = sum(y == 1 for y in y_train)
total = len(X_train)

p0 = count0 / total
p1 = count1 / total
log0 = np.log(p0)
log1 = np.log(p1)

log0, log1, total, count0, count1

(-1.0974521957305792, -0.4060456596484179, 1723, 575, 1148)

In [77]:
class Classifier:
  def __init__(self, logAs, logpis, logpriors):
    self.logAs = logAs
    self.logpis = logpis
    self.logpriors = logpriors
    self.K = len(logpriors) #number of classes

  def _compute_log_likelihood(self, input_, class_):
    logA = self.logAs[class_]
    logpi = self.logpis[class_]

    last_idx = None
    logprob = 0
    for idx in input_:
      if last_idx is None:
        logprob += logpi[idx]
      else:
        logprob += logA[last_idx][idx]

      last_idx = idx

    return logprob

  def predict(self, inputs):
    predictions = np.zeros(len(inputs))

    for i, input_ in enumerate(inputs):
      posteriors = [self._compute_log_likelihood(input_, c) + self.logpriors[c] for c in range(self.K)]
      pred = np.argmax(posteriors)
      predictions[i] = pred

    return predictions



In [79]:
clf = Classifier([logA0, logA1], [logpi0, logpi1], [log0, log1])

In [80]:
Ptrain = clf.predict(X_train_text)
print(f"Train accuracy: {np.mean(Ptrain == y_train)}")

Train accuracy: 0.995937318630296


In [81]:
Ptest = clf.predict(X_test_text)
print(f"Test accuracy: {np.mean(Ptest == y_test)}")

Test accuracy: 0.8259860788863109


In [82]:
from sklearn.metrics import classification_report, confusion_matrix

print(classification_report(Ptest, y_test))
print(confusion_matrix(Ptest, y_test))

              precision    recall  f1-score   support

         0.0       0.55      0.89      0.68        88
         1.0       0.97      0.81      0.88       343

    accuracy                           0.83       431
   macro avg       0.76      0.85      0.78       431
weighted avg       0.88      0.83      0.84       431

[[ 78  10]
 [ 65 278]]
