In [1]:
import numpy as np
from collections import defaultdict
import pandas as pd
import ast

In [2]:
class HMM:
    def __init__(self, states, observations):
        self.states = states
        self.observations = observations
        self.start_probabilities = np.zeros(len(states))
        self.transition_probabilities = np.zeros((len(states), len(states)))
        self.emission_probabilities = np.zeros((len(states), len(observations)))

        self.state_index = {state: i for i, state in enumerate(states)}
        self.observation_index = {obs: i for i, obs in enumerate(observations)}

    def compute_start_probabilities(self, sequences):
        for sequence in sequences:
            start_state = sequence[0][1]  # The tag of the first token
            self.start_probabilities[self.state_index[start_state]] += 1
        self.start_probabilities /= np.sum(self.start_probabilities)

    def compute_transition_probabilities(self, sequences):
        for sequence in sequences:
            for i in range(len(sequence) - 1):
                current_state = sequence[i][1]
                next_state = sequence[i + 1][1]
                self.transition_probabilities[self.state_index[current_state], self.state_index[next_state]] += 1

        for i in range(len(self.states)):
            if(np.sum(self.transition_probabilities[i])==0):
                continue
            self.transition_probabilities[i] /= np.sum(self.transition_probabilities[i])

    def compute_emission_probabilities(self, sequences):
        for sequence in sequences:
            for token, state in sequence:
                self.emission_probabilities[self.state_index[state], self.observation_index[token]] += 1

        for i in range(len(self.states)):
            if(np.sum(self.emission_probabilities[i])==0):
                continue
            self.emission_probabilities[i] /= np.sum(self.emission_probabilities[i])

    def viterbi_algorithm(self, obs):
        viterbi_table = [[0.0 for _ in range(len(self.states))] for _ in range(len(obs))]
        backpointer = [[0 for _ in range(len(self.states))] for _ in range(len(obs))]

        for t in range(len(obs)):
            for s in range(len(self.states)):
                if t == 0:
                    viterbi_table[t][s] = self.start_probabilities[s] * self.emission_probabilities[s][obs[t]]
                else:
                    max_prob = -1
                    max_backpointer = -1

                    for s_prime in range(len(self.states)):
                        prob = viterbi_table[t-1][s_prime] * self.transition_probabilities[s_prime][s] * self.emission_probabilities[s][obs[t]]
                        if prob > max_prob:
                            max_prob = prob
                            max_backpointer = s_prime

                    viterbi_table[t][s] = max_prob
                    backpointer[t][s] = max_backpointer

        best_path_prob = max(viterbi_table[-1])
        best_path_pointer = max(range(len(self.states)), key=lambda s: viterbi_table[-1][s])
        best_path = [best_path_pointer]
        for t in range(len(obs)-1, 0, -1):
            best_path.insert(0, backpointer[t][best_path[0]])

        return best_path

In [3]:
def threshold_data(sequences, threshold):
    word_counts = defaultdict(int)
    for sequence in sequences:
        for token, state in sequence:
            word_counts[token] += 1

    unk_sequences = []
    for sequence in sequences:
        unk_sequence = [(token if word_counts[token] > threshold else 'UNK', state) for token, state in sequence]
        unk_sequences.append(unk_sequence)

    return unk_sequences

In [4]:
train = pd.read_csv('./cleaned_data.csv')
test = pd.read_csv('./test_data.csv')

In [6]:
df_train = pd.read_csv('./cleaned_data.csv', converters={'tokens': ast.literal_eval, 'ner_tags': ast.literal_eval})

In [7]:
train_data = [(row['tokens'], row['ner_tags']) for _, row in df_train.iterrows()]
unique_tags = set(tag for _, tags in train_data for tag in tags)
states = list(unique_tags)

In [8]:
train_hmm_data = [list(zip(tokens, tags)) for tokens, tags in train_data]

In [9]:
unk_sequences = threshold_data(train_hmm_data, 5)

In [10]:
print(unk_sequences[0])

[('பைரவருக்கு', 1), ('தேய்பிறை', 0), ('UNK', 0), ('விசேஷ', 0), ('அபிஷேக', 0), ('ஆராதனைகள்', 0), ('நடைபெறுகின்றன', 0), ('.', 0)]


In [11]:
unique_tokens = set()
for sequence in unk_sequences:
    for token, _ in sequence:
        unique_tokens.add(token)

unique_tokens_list = list(unique_tokens)
num_observations = len(unique_tokens)

In [12]:
def train_hmm(hmm_model, sequences):
    hmm_model.compute_start_probabilities(sequences)
    hmm_model.compute_transition_probabilities(sequences)
    hmm_model.compute_emission_probabilities(sequences)

In [13]:
hmm_model = HMM(states, unique_tokens_list)

train_hmm(hmm_model, unk_sequences)

print("Start Probabilities:")
print(hmm_model.start_probabilities)
print("Transition Probabilities:")
print(hmm_model.transition_probabilities)
print("Emission Probabilities:")
print(hmm_model.emission_probabilities)

Start Probabilities:
[0.69395158 0.09176672 0.         0.09568934 0.         0.11859236
 0.        ]
Transition Probabilities:
[[8.70181685e-01 5.31512551e-02 0.00000000e+00 2.83013819e-02
  0.00000000e+00 4.83656784e-02 0.00000000e+00]
 [4.73991545e-01 1.63788780e-02 5.01533281e-01 2.76563731e-03
  2.04198610e-04 5.05839364e-03 6.80662033e-05]
 [6.36743036e-01 1.79804199e-02 3.28184296e-01 6.25756964e-03
  1.48869600e-03 9.11384740e-03 2.32135648e-04]
 [4.62780528e-01 2.87547003e-03 8.33716162e-04 7.54314623e-03
  5.23182414e-01 2.47279079e-03 3.11934619e-04]
 [3.55764544e-01 3.81564228e-03 5.19892865e-04 1.56756983e-02
  6.16588296e-01 7.53844654e-03 9.74799122e-05]
 [7.78107609e-01 4.52930923e-03 5.06903367e-04 1.01672416e-02
  2.63298008e-03 3.22339487e-02 1.71822008e-01]
 [5.34096742e-01 3.86969398e-03 5.92300099e-04 6.50213886e-03
  1.68476473e-03 3.81309641e-02 4.15123396e-01]]
Emission Probabilities:
[[4.67347989e-06 1.27458543e-06 8.49723617e-07 ... 0.00000000e+00
  8.28480526

In [16]:
test_tokens = test['tokens']
test_ner_tags = test['ner_tags']
misvals = []

N_test = len(test_tokens)

def compute_accuracy(hmm_model):
    count_correct_new = 0
    tot_new = 0
    correct = 0
    total = 0
    for idx in range(N_test):
        tags = eval(test_ner_tags[idx])
        observations = [obs for obs in eval(test_tokens[idx])]
        if(len(observations)==0):
            continue
        observed_tokens = [tok for tok in eval(test_ner_tags[idx])]
        observation_indices = [hmm_model.observation_index[obs] if obs in hmm_model.observation_index else hmm_model.observation_index['UNK'] for obs in observations]
        predicted_tags = hmm_model.viterbi_algorithm(observation_indices)
        for i,obs in enumerate(observations):
            if obs not in hmm_model.observation_index:
                tot_new += 1 
                if(predicted_tags[i] == tags[i]):
                    count_correct_new += 1

        for i in range(len(predicted_tags)):
            if(predicted_tags[i] == observed_tokens[i]):
                correct += 1
        total += len(observations)
    return correct / total, count_correct_new / tot_new


accuracy, unk_accuracy = compute_accuracy(hmm_model)
print("Accuracy:", accuracy)
print("UNK Accuracy:", unk_accuracy)

Accuracy: 0.8650293870696893
UNK Accuracy: 0.7225312934631433
