In [None]:
import nltk
# Download the 'averaged_perceptron_tagger' resource into the local nltk_data
# directory. Presumably required by the nltk.pos_tag reference evaluation run
# further down — TODO confirm.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/shashwat1225/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [None]:
import os
from sklearn import metrics
import numpy as np
from tqdm import tqdm
import pandas as pd
import matplotlib.pyplot as plt
from nltk.metrics import ConfusionMatrix


def evaluate(test_sentences, tagged_test_sentences, output_dict=False):
    """Compare predicted tags with gold tags and build a classification report.

    Both arguments are sequences of sentences, each sentence being a sequence
    of ``(token, tag)`` pairs. The sentences are flattened in order and every
    tag is converted with ``str`` before comparison, so the two inputs are
    expected to line up token for token.

    Args:
        test_sentences: gold-standard tagged sentences.
        tagged_test_sentences: predicted tagged sentences.
        output_dict: forwarded unchanged to ``metrics.classification_report``.

    Returns:
        Whatever ``metrics.classification_report`` returns for these labels.
    """

    def flatten_tags(sentences):
        # Collect the tag of every (token, tag) pair, sentence by sentence.
        labels = []
        for sentence in sentences:
            for _token, tag in sentence:
                labels.append(str(tag))
        return labels

    gold = flatten_tags(test_sentences)
    pred = flatten_tags(tagged_test_sentences)
    return metrics.classification_report(gold, pred, output_dict=output_dict)


def get_token_tag_tuples(sent):
    """Turn a whitespace-separated tagged sentence into a list of tuples.

    Every whitespace-delimited chunk of ``sent`` is converted with
    ``nltk.tag.str2tuple``; the results are returned in sentence order.
    """
    pairs = []
    for chunk in sent.split():
        pairs.append(nltk.tag.str2tuple(chunk))
    return pairs


def get_tagged_sentences(text):
    """Split the raw text of a ``.pos`` file into one string per sentence.

    The text is first cut at the ``======...`` separator lines and each piece
    is then cut at blank lines. Within a sentence, newlines and the ``[`` /
    ``]`` characters are removed (removed, not replaced by a space). Pieces
    that end up empty are dropped.

    Args:
        text: full contents of one ``.pos`` file.

    Returns:
        list[str]: the non-empty sentence strings, in file order.
    """
    sentences = []

    for block in text.split("======================================"):
        for sent in block.split("\n\n"):
            sent = sent.replace("\n", "").replace("[", "").replace("]", "")
            # Compare by value: ``is not ""`` tests object identity, which only
            # works by accident of string interning and raises a SyntaxWarning.
            if sent != "":
                sentences.append(sent)
    return sentences


def load_treebank_splits(datadir):
    """Walk ``datadir`` and gather tagged sentences into train/dev/test splits.

    Every file ending in ``.pos`` is read and parsed with
    ``get_tagged_sentences``. The split is chosen from the name of the
    directory holding the file, read as an integer section number:

    * 0-18  -> train
    * 19-21 -> dev
    * 22-24 -> test

    Directories whose name is not an integer, or whose number falls outside
    0-24, are skipped without reading their files. Directories and files are
    visited in sorted order so the resulting sentence order does not depend on
    the filesystem.

    Args:
        datadir: root directory to search.

    Returns:
        tuple[list[str], list[str], list[str]]: ``(train, dev, test)``.
    """
    train = []
    dev = []
    test = []

    print("Loading treebank data...")

    for subdir, dirs, files in os.walk(datadir):
        # Sorting ``dirs`` in place makes os.walk descend in a stable order.
        dirs.sort()

        pos_files = sorted(name for name in files if name.endswith(".pos"))
        if not pos_files:
            continue

        # Parse the section number once per directory rather than per file.
        try:
            section = int(os.path.basename(subdir))
        except ValueError:
            # Not a numbered section directory: nothing to assign it to.
            continue

        if 0 <= section < 19:
            target = train
        elif 19 <= section < 22:
            target = dev
        elif 22 <= section < 25:
            target = test
        else:
            continue

        for filename in pos_files:
            with open(os.path.join(subdir, filename), "r") as fh:
                target.extend(get_tagged_sentences(fh.read()))

    print("Train set size: ", len(train))
    print("Dev set size: ", len(dev))
    print("Test set size: ", len(test))

    return train, dev, test


def main():
    """Evaluate NLTK's default POS tagger on the treebank test split.

    Loads the splits from the hard-coded data directory, re-tags the tokens of
    every test sentence with NLTK and prints the classification report that
    compares those tags with the gold tags. Only the test split is used here.
    """
    # Set path for datadir
    datadir = os.path.join("data", "penn-treeban3-wsj", "wsj")

    _train, _dev, test = load_treebank_splits(datadir)

    ## For evaluation against the default NLTK POS tagger

    test_sentences = [get_token_tag_tuples(sent) for sent in test]
    # Tag all sentences in one call instead of calling nltk.pos_tag per sentence.
    tagged_test_sentences = nltk.pos_tag_sents(
        [[token for token, _tag in sentence] for sentence in test_sentences]
    )
    # The report used to be computed and discarded; print it so the run
    # actually shows the reference scores.
    print(evaluate(test_sentences, tagged_test_sentences))


# Run the NLTK reference evaluation when this code is executed directly.
if __name__ == "__main__":
    main()

  if sent is not "":


Loading treebank data...
Train set size:  51681
Dev set size:  7863
Test set size:  9046


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Sentinel placed at the beginning of every sentence (used as both token and tag).
START_TOKEN = "<START>"
# Sentinel placed at the end of every sentence (used as both token and tag).
STOP_TOKEN = "<STOP>"
# Unknown-word placeholder; not referenced by the code visible in this notebook.
UNK_TOKEN = "<UNK>"
def tokenize_text(sent):
    """Split ``sent`` on whitespace and wrap the words in boundary tokens.

    Returns a new list made of ``START_TOKEN``, the words of ``sent`` in
    order, and ``STOP_TOKEN``.
    """
    return [START_TOKEN, *sent.split(), STOP_TOKEN]


def get_labels(sent):
    """Parse a tagged sentence string and surround it with boundary pairs.

    The sentence is converted with ``get_token_tag_tuples``; a
    ``(START_TOKEN, START_TOKEN)`` pair is put in front and a
    ``(STOP_TOKEN, STOP_TOKEN)`` pair at the end.
    """
    start_pair = (START_TOKEN, START_TOKEN)
    stop_pair = (STOP_TOKEN, STOP_TOKEN)
    return [start_pair, *get_token_tag_tuples(sent), stop_pair]

def remove_tokens(sent):
    """Return ``sent`` without its first and last element.

    Used to strip the boundary entries from a sentence. Works on any sliceable
    sequence; inputs with fewer than three elements give an empty result.
    """
    interior = slice(1, -1)
    return sent[interior]


def get_words(sent):
    """Return the first element of every two-item pair in ``sent``, in order."""
    return [word for word, _tag in sent]


def get_tags(sent):
    """Return the second element of every two-item pair in ``sent``, in order."""
    tags = []
    for _word, tag in sent:
        tags.append(tag)
    return tags


def preprocess_corpus(corpus, preprocess):
    """Apply ``preprocess`` to every sentence of ``corpus``.

    Returns a new list holding one result per sentence, in corpus order.
    """
    return [preprocess(sent) for sent in corpus]


def preprocess_flatten_corpus(corpus, preprocess):
    """Apply ``preprocess`` to every sentence and concatenate the results.

    Returns a single flat list containing the items produced for each
    sentence, in corpus order.
    """
    flattened = []
    for sent in corpus:
        flattened.extend(preprocess(sent))
    return flattened

In [None]:
#Baseline Tagger
class BaselineTagger:
    """Most-frequent-tag baseline tagger.

    A word seen in training is tagged with the tag it carried most often; any
    other word gets the single most frequent tag of the training corpus.
    """

    def __init__(self):
        # Training sentences as lists of (word, tag) pairs, boundary pairs included.
        self.corpus = []
        # word -> tag observed most often with that word
        self.most_frequent_table = {}
        # fallback tag for words missing from ``most_frequent_table``
        self.most_common_tag = ""

    def train(self, corpus):
        """Fit the tagger on ``corpus``, an iterable of tagged sentence strings.

        Each sentence is parsed with ``get_labels`` (which adds the boundary
        pairs) and the frequency tables are rebuilt from scratch, so calling
        ``train`` again fully replaces the previous model.

        Raises:
            ValueError: if the corpus contains no tokens at all.
        """
        self.corpus = [get_labels(sent) for sent in corpus]
        self._fit(self.corpus)

    def _fit(self, tagged_corpus):
        """Rebuild the lookup tables from sentences of ``(word, tag)`` pairs."""
        word_tag_counts = {}
        tag_counts = {}

        for sent in tagged_corpus:
            for word, tag in sent:
                per_word = word_tag_counts.setdefault(word, {})
                per_word[tag] = per_word.get(tag, 0) + 1
                tag_counts[tag] = tag_counts.get(tag, 0) + 1

        if not tag_counts:
            # Previously surfaced as an opaque "max() arg is an empty sequence".
            raise ValueError("cannot train BaselineTagger on an empty corpus")

        # Assign a fresh dict instead of updating the old one so that words
        # from an earlier training run do not linger in the table.
        # max() keeps the first key on ties, i.e. the tag seen first.
        self.most_frequent_table = {
            word: max(counts, key=counts.get)
            for word, counts in word_tag_counts.items()
        }
        self.most_common_tag = max(tag_counts, key=tag_counts.get)

    def predict(self, corpus):
        """Tag every sentence of ``corpus`` (an iterable of word sequences).

        Returns:
            list[list[tuple]]: one list of ``(word, tag)`` pairs per sentence.
        """
        tags = []
        for sent in corpus:
            tag_seq = []
            for word in sent:
                tag = self.most_frequent_table.get(word, self.most_common_tag)
                tag_seq.append((word, tag))
            tags.append(tag_seq)
        return tags

In [None]:
class HMMTagger:
    """Bigram hidden-Markov-model tagger with add-alpha smoothing.

    Training builds log-probability tables for tag transitions and word
    emissions; ``viterbi_decode`` then finds the best tag sequence for a
    sentence that is already wrapped in ``START_TOKEN`` / ``STOP_TOKEN``.
    All scores, including the fallback scores for unknown words, are kept in
    log space so they can be added together.
    """

    def __init__(self):
        # Training sentences as lists of (word, tag) pairs, boundary pairs included.
        self.corpus = []
        # word -> occurrence count
        self.vocab = {}
        # tags in table order (row/column order of the tables below)
        self.tag_list = []
        # transition_table[prev][cur] = log P(cur | prev)
        self.transition_table = []
        # emission_table[tag][word] = log emission score of word under tag
        self.emission_table = []
        # tag -> log fallback emission score used for out-of-vocabulary words
        self.unknown_emission_prob = {}
        # index lookups, filled in by train()
        self.word2idx = {}
        self.tag2idx = {}
        self.idx2word = {}
        self.idx2tag = {}

    def train(self, corpus, alpha=1):
        """Fit the model on ``corpus`` (tagged sentence strings).

        Args:
            corpus: iterable of sentence strings understood by ``get_labels``.
            alpha: additive smoothing constant used for both tables.
        """
        self.corpus = [get_labels(sent) for sent in corpus]
        self.vocab = self.get_vocab(self.corpus)
        (
            self.transition_table,
            self.emission_table,
            self.unknown_emission_prob,
            self.tag_list,
        ) = self.build_tables(alpha)
        self.word2idx = {word: idx for idx, word in enumerate(self.vocab)}
        self.tag2idx = {tag: idx for idx, tag in enumerate(self.tag_list)}
        self.idx2word = {idx: word for idx, word in enumerate(self.vocab)}
        self.idx2tag = {idx: tag for idx, tag in enumerate(self.tag_list)}

    def build_tables(self, alpha=1):
        """Build all model tables from ``self.corpus`` and ``self.vocab``.

        Returns:
            tuple: ``(transition_table, emission_table,
            unknown_emission_prob, tag_list)``.
        """
        transitions = self.get_transitions(self.corpus)
        emissions = self.get_emissions(self.corpus)
        tag_dict = self.get_tag_freq(self.corpus)
        # Materialise the keys: a concrete list is indexable and is not a
        # live view onto ``tag_dict``.
        tag_list = list(tag_dict)
        transition_table = self.create_transition_table(
            transitions, tag_dict, tag_list, alpha
        )
        emission_table, unknown_emission_prob = self.create_emission_table(
            emissions, tag_dict, tag_list, self.vocab, alpha
        )
        return (transition_table, emission_table, unknown_emission_prob, tag_list)

    def get_vocab(self, sents):
        """Count how often each word (first item of every pair) occurs."""
        vocab = {}
        for sent in sents:
            for token in sent:
                word = token[0]
                vocab[word] = vocab.get(word, 0) + 1
        return vocab

    def get_transitions(self, sents):
        """Count adjacent tag pairs ``(previous_tag, tag)`` within each sentence."""
        transitions = {}
        for sent in sents:
            for i in range(1, len(sent)):
                bigram_tags = (sent[i - 1][1], sent[i][1])
                transitions[bigram_tags] = transitions.get(bigram_tags, 0) + 1
        return transitions

    def get_emissions(self, sents):
        """Count occurrences of every ``(word, tag)`` pair."""
        emissions = {}
        for sent in sents:
            for token_tag_pair in sent:
                emissions[token_tag_pair] = emissions.get(token_tag_pair, 0) + 1
        return emissions

    def get_tag_freq(self, sents):
        """Count how often each tag occurs."""
        tag_dict = {}
        for sent in sents:
            for _, tag in sent:
                tag_dict[tag] = tag_dict.get(tag, 0) + 1
        return tag_dict

    def create_transition_table(self, transitions, tag_dict, tags, alpha=1):
        """Return the smoothed log transition table.

        Entry ``[p][c]`` is ``log((count(p, c) + alpha) /
        (count(p) + alpha * len(tags)))`` with rows and columns in ``tags``
        order.
        """
        num_tags = len(tags)
        transition_table = []
        for prev_tag in tags:
            denominator = tag_dict.get(prev_tag, 0) + (alpha * num_tags)
            prob_list = []
            for current_tag in tags:
                bigram_count = transitions.get((prev_tag, current_tag), 0)
                prob_list.append(np.log((bigram_count + alpha) / denominator))
            transition_table.append(prob_list)
        return transition_table

    def create_emission_table(self, emissions, tag_dict, tags, vocab, alpha):
        """Return the smoothed log emission table and the unknown-word scores.

        Returns:
            tuple: ``(emission_table, unknown_emission_prob)`` where
            ``emission_table[t][w]`` is the log emission score of word ``w``
            under tag ``t`` and ``unknown_emission_prob[tag]`` is the log of
            the smoothed relative frequency of ``tag``, used for words that
            are not in ``vocab``.
        """
        num_tags = len(tags)
        emission_table = []
        unknown_emission_prob = {}
        total_tag_counts = sum(tag_dict.values())
        for tag in tags:
            tag_count = tag_dict.get(tag, 0)
            # NOTE(review): the denominator smooths over the number of tags,
            # not the vocabulary size, so these values do not sum to one over
            # the vocabulary. Kept unchanged to preserve existing results —
            # confirm whether this is intended.
            denominator = tag_count + (alpha * num_tags)
            prob_list = []
            for word in vocab.keys():
                word_tag_count = emissions.get((word, tag), 0)
                prob_list.append(np.log((word_tag_count + alpha) / denominator))
            emission_table.append(prob_list)
            # Stored in log space: the decoder adds this value to log
            # transition scores, so a raw probability would mix two scales.
            unknown_emission_prob[tag] = np.log(
                (tag_count + alpha) / (total_tag_counts + (alpha * num_tags))
            )
        return emission_table, unknown_emission_prob

    def viterbi_decode(self, sent):
        """Find the best tag sequence for ``sent`` with the Viterbi algorithm.

        Args:
            sent: sequence of words whose first element is the start marker
                and whose last element is the stop marker; it must hold at
                least two elements.

        Returns:
            list[tuple]: ``(word, tag)`` pairs, one per element of ``sent``.
            The final pair is always ``(STOP_TOKEN, <tag of STOP_TOKEN>)``.
        """
        start_idx = self.tag2idx[START_TOKEN]
        num_tags = len(self.tag_list)

        def emission_scores(token):
            # Log emission score of ``token`` under every tag, in table order;
            # out-of-vocabulary tokens fall back to the per-tag unknown score.
            column = self.word2idx.get(token)
            if column is None:
                return [self.unknown_emission_prob[tag] for tag in self.tag_list]
            return [row[column] for row in self.emission_table]

        # lattice[t][i] = (best previous tag index, best log score) for tag i
        # at position t. Position 0 is the start marker and carries no scores.
        lattice = [[]]

        first_emissions = emission_scores(sent[1])
        from_start = self.transition_table[start_idx]
        lattice.append(
            [(start_idx, from_start[i] + first_emissions[i]) for i in range(num_tags)]
        )

        for t in range(2, len(sent)):
            emissions = emission_scores(sent[t])
            previous = lattice[t - 1]
            scores = []
            for i in range(num_tags):
                emission = emissions[i]
                # Default to index 0 so the back-pointer is never None even if
                # every candidate score is -inf (possible with alpha == 0).
                best_prev = 0
                best_score = float("-inf")
                for j, (_, prev_score) in enumerate(previous):
                    candidate = emission + self.transition_table[j][i] + prev_score
                    if candidate > best_score:
                        best_score = candidate
                        best_prev = j
                scores.append((best_prev, best_score))
            lattice.append(scores)

        # The last element is the stop marker, so back-tracking starts from
        # the stop tag and follows the stored back-pointers to the front.
        current = self.tag2idx[STOP_TOKEN]
        tags = [(STOP_TOKEN, self.idx2tag[current])]
        for t in range(len(lattice) - 1, 0, -1):
            current = lattice[t][current][0]
            tags.append((sent[t - 1], self.idx2tag[current]))
        tags.reverse()
        return tags

    def predict(self, corpus):
        """Decode every sentence of ``corpus``, showing a progress bar.

        Returns:
            list[list[tuple]]: one ``viterbi_decode`` result per sentence.
        """
        all_tags = []
        for sent in tqdm(corpus):
            all_tags.append(self.viterbi_decode(sent))
        return all_tags

In [None]:
# Load the three splits as lists of raw tagged sentence strings.
datadir = os.path.join("data", "penn-treeban3-wsj", "wsj")
train, dev, test = load_treebank_splits(datadir)


# Parse dev/test into (word, tag) pairs wrapped in the boundary pairs.
gold_dev = preprocess_corpus(dev, get_labels)
gold_test = preprocess_corpus(test, get_labels)

# The taggers parse the training strings themselves, so train stays raw.
train_corpus = train
# Tagger inputs: word sequences that still include the boundary tokens.
dev_corpus = preprocess_corpus(gold_dev,get_words)
test_corpus = preprocess_corpus(gold_test,get_words)
# Gold references for scoring: boundary pairs stripped. These rebind
# gold_dev / gold_test, so they must run after the two lines above.
gold_dev = preprocess_corpus(gold_dev, remove_tokens)
gold_test = preprocess_corpus(gold_test, remove_tokens)


def evaluation(result, name=""):
    """Print ``name`` on one line followed by ``result`` on the next."""
    print(name, result, sep="\n")

Loading treebank data...
Train set size:  51681
Dev set size:  7863
Test set size:  9046


In [None]:
#RUNNING BASELINE
# Fit the most-frequent-tag baseline on the training split.
baseline_tagger = BaselineTagger()
baseline_tagger.train(train_corpus)

# Validation split: predict, strip the boundary entries, then report.
prediction_dev = preprocess_corpus(
    baseline_tagger.predict(dev_corpus), remove_tokens
)
evaluation(evaluate(gold_dev, prediction_dev), "Baseline_Val")

# Test split: same procedure.
prediction_test = preprocess_corpus(
    baseline_tagger.predict(test_corpus), remove_tokens
)
evaluation(evaluate(gold_test, prediction_test), "Baseline_Test")

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Baseline_Dev
              precision    recall  f1-score   support

           #       1.00      1.00      1.00        31
           $       1.00      1.00      1.00      1248
          ''       1.00      0.98      0.99      1168
           (       1.00      1.00      1.00       244
           )       1.00      1.00      1.00       244
           ,       1.00      1.00      1.00      7931
           .       1.00      1.00      1.00      6125
           :       1.00      1.00      1.00       775
          CC       0.99      0.99      0.99      3777
          CD       0.99      0.90      0.94      5766
          DT       0.99      0.98      0.99     12639
          EX       0.86      1.00      0.92       133
          FW       0.53      0.40      0.45        25
          IN       0.95      0.98      0.96     15497
       IN|RB       0.00      0.00      0.00         1
          JJ       0.88      0.84      0.86      9014
         JJR       0.72      0.92      0.81       506
         JJS  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Baseline_Test
              precision    recall  f1-score   support

           #       1.00      1.00      1.00        22
           $       1.00      1.00      1.00      1138
          ''       1.00      0.99      1.00      1423
           (       1.00      1.00      1.00       249
           )       1.00      1.00      1.00       252
           ,       1.00      1.00      1.00      9056
           .       1.00      1.00      1.00      7035
           :       1.00      1.00      1.00       983
          CC       1.00      1.00      1.00      4289
          CD       0.99      0.90      0.94      6023
          DT       0.99      0.99      0.99     14946
          EX       0.89      1.00      0.94       174
          FW       0.35      0.21      0.26        38
          IN       0.94      0.98      0.96     18147
          JJ       0.88      0.86      0.87     10704
         JJR       0.66      0.95      0.78       581
     JJR|RBR       0.00      0.00      0.00         4
         JJS 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
#RUNNING HMM
# Train the HMM tagger with add-one smoothing.
alpha = 1
hmm_tagger = HMMTagger()
hmm_tagger.train(train_corpus, alpha)

# Validation split: decode, strip the boundary entries, then report.
prediction_dev = preprocess_corpus(hmm_tagger.predict(dev_corpus), remove_tokens)
evaluation(evaluate(gold_dev, prediction_dev), f"HMM_Val with alpha={alpha}")

# Test split: same procedure.
prediction_test = preprocess_corpus(hmm_tagger.predict(test_corpus), remove_tokens)
evaluation(evaluate(gold_test, prediction_test), f"HMM_Test with alpha={alpha}")

# Confusion matrix over the flattened gold and predicted tag sequences.
evaluation(
    ConfusionMatrix(
        preprocess_flatten_corpus(gold_test, get_tags),
        preprocess_flatten_corpus(prediction_test, get_tags),
    ),
    f"Confusion Matrix with alpha={alpha}",
)



100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7863/7863 [05:32<00:00, 23.65it/s]
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


HMM_Dev with alpha=1
              precision    recall  f1-score   support

           #       0.26      1.00      0.41        31
           $       0.87      1.00      0.93      1248
          ''       0.87      0.98      0.92      1168
           (       0.86      0.97      0.91       244
           )       0.80      0.96      0.87       244
           ,       0.99      0.99      0.99      7931
           .       0.97      1.00      0.99      6125
           :       0.96      0.90      0.93       775
          CC       0.99      0.91      0.95      3777
          CD       0.98      0.85      0.91      5766
          DT       0.96      0.96      0.96     12639
          EX       0.45      0.98      0.62       133
          FW       0.00      0.40      0.00        25
          IN       0.97      0.90      0.94     15497
       IN|RB       0.00      0.00      0.00         1
          JJ       0.86      0.69      0.77      9014
         JJR       0.75      0.76      0.75       506
      

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9046/9046 [06:22<00:00, 23.68it/s]
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


HMM_Test with alpha=1
              precision    recall  f1-score   support

           #       0.24      1.00      0.38        22
           $       0.83      1.00      0.91      1138
          ''       0.89      0.98      0.93      1423
           (       0.86      0.95      0.90       249
           )       0.75      0.94      0.83       252
           ,       0.99      0.99      0.99      9056
           .       0.98      1.00      0.99      7035
           :       0.96      0.90      0.93       983
          CC       1.00      0.92      0.95      4289
          CD       0.98      0.83      0.90      6023
          DT       0.96      0.96      0.96     14946
          EX       0.44      0.99      0.61       174
          FW       0.00      0.37      0.00        38
          IN       0.97      0.91      0.94     18147
       IN|RB       0.00      0.00      0.00         0
          JJ       0.88      0.71      0.78     10704
         JJR       0.74      0.78      0.76       581
     

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
#RUNNING BEST CASE HMM 
# Retrain the HMM tagger with a much smaller smoothing constant.
alpha = 0.00001
hmm_tagger = HMMTagger()
hmm_tagger.train(train_corpus, alpha)

# Dev split: decode, strip the boundary entries, then report.
prediction_dev = preprocess_corpus(hmm_tagger.predict(dev_corpus), remove_tokens)
evaluation(evaluate(gold_dev, prediction_dev), f"HMM_Dev with alpha={alpha}")

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7863/7863 [05:27<00:00, 24.03it/s]
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


HMM_Dev with alpha=1e-09
              precision    recall  f1-score   support

           #       1.00      1.00      1.00        31
           $       0.98      1.00      0.99      1248
          ''       0.99      1.00      1.00      1168
           (       1.00      1.00      1.00       244
           )       1.00      1.00      1.00       244
           ,       0.99      1.00      1.00      7931
           .       0.99      1.00      1.00      6125
           :       1.00      0.99      1.00       775
          CC       1.00      0.99      1.00      3777
          CD       0.99      0.95      0.97      5766
          DT       0.96      0.99      0.97     12639
          EX       0.98      0.98      0.98       133
          FW       0.44      0.32      0.37        25
          IN       0.97      0.97      0.97     15497
       IN|RB       0.00      0.00      0.00         1
          JJ       0.88      0.89      0.89      9014
         JJR       0.86      0.86      0.86       506
  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
#TESTING BEST CASE HMM
# Uses the ``hmm_tagger`` and ``alpha`` left in scope by the preceding cell.
prediction_test = preprocess_corpus(hmm_tagger.predict(test_corpus), remove_tokens)
evaluation(evaluate(gold_test, prediction_test), f"HMM_Test with alpha={alpha}")

# Confusion matrix over the flattened gold and predicted tag sequences.
evaluation(
    ConfusionMatrix(
        preprocess_flatten_corpus(gold_test, get_tags),
        preprocess_flatten_corpus(prediction_test, get_tags),
    ),
    f"Confusion Matrix with alpha={alpha}",
)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9046/9046 [06:21<00:00, 23.71it/s]
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


HMM_Test with alpha=1e-09
              precision    recall  f1-score   support

           #       1.00      1.00      1.00        22
           $       0.97      1.00      0.98      1138
          ''       1.00      1.00      1.00      1423
           (       1.00      1.00      1.00       249
           )       1.00      1.00      1.00       252
           ,       0.99      1.00      1.00      9056
           .       0.99      1.00      1.00      7035
           :       1.00      1.00      1.00       983
          CC       1.00      1.00      1.00      4289
          CD       0.99      0.93      0.96      6023
          DT       0.96      0.99      0.97     14946
          EX       0.97      0.99      0.98       174
          FW       0.50      0.32      0.39        38
          IN       0.97      0.97      0.97     18147
       IN|RB       0.00      0.00      0.00         0
          JJ       0.89      0.90      0.90     10704
         JJR       0.84      0.88      0.86       581
 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
