# Part-of-Speech Tagging Mini Project

Treebank data sourced from [https://www.kaggle.com/nltkdata/penn-tree-bank](https://www.kaggle.com/nltkdata/penn-tree-bank)

In [1]:
import json
import os
from abc import ABC, abstractclassmethod, abstractmethod
from collections import defaultdict

## Parsing Treebank data

In [2]:
def parse_tagged_chunk(chunk):
    """
    Parse the raw text of one tagged file into sentences.

    Input: raw text from a .pos file from the Penn Treebank Project dataset.
    Output: A list of sentences, which are lists of (word, tag) pairs.

    An empty line ends the sentence in progress; runs of empty lines are
    ignored. A sentence still open when the text ends is also returned, so
    input that does not finish with a blank line does not lose its last
    sentence.

    Tokens are only kept when they contain a "/" separator and their tag
    consists of letters, optionally with "$" at either end. This drops
    tokens whose tag is punctuation (e.g. "," or ".") as well as anything
    that is not a word/tag pair at all.
    """
    sentences = []
    current_sentence = []
    for line in chunk.split("\n"):
        if line == "":
            # a blank line closes the current sentence (if there is one);
            # otherwise it is just boring whitespace
            if current_sentence:
                sentences.append(current_sentence)
                current_sentence = []
            continue
        # strip off groupings, then split the line into word/tag tokens
        for pair in line.strip("[] ").split():
            # the word itself can unfortunately include the / symbol, so
            # only the LAST slash separates word from tag
            word, separator, tag = pair.rpartition("/")
            if not separator:
                # no slash at all: not a word/tag pair, so there is no
                # word to record
                continue
            # require letters with optional $ at the end
            looks_like_a_real_tag = tag.strip("$").isalpha()
            if looks_like_a_real_tag:
                current_sentence.append((word, tag))
    # flush a final sentence that was not followed by a blank line
    if current_sentence:
        sentences.append(current_sentence)
    return sentences

In [6]:
# this code block will create the variable `chunks`,
# a list of chunks, where a chunk is a list of sentences, where a sentence is
# a list of (word, tag) pairs.

# if `UPDATE_REQUIRED` is set to true, data from data/treebank is accessed and
# parsed into chunks, and the result saved to data/parsed.json. Otherwise, the
# chunks are read straight from data/parsed.json
# NOTE: JSON has no tuple type, so chunks read back from data/parsed.json hold
# their (word, tag) pairs as 2-element lists rather than tuples.
UPDATE_REQUIRED = False

if not UPDATE_REQUIRED:
    try:
        with open("data/parsed.json") as file:
            chunks = json.load(file)
    except FileNotFoundError:
        # no cached parse available: force update
        print("You are wrong, update is in fact required")
        UPDATE_REQUIRED = True

if UPDATE_REQUIRED:
    def parse_all_tagged_chunks(path="data/treebank/tagged"):
        """
        Parse every .pos file directly inside `path`.

        Returns a list with one parsed chunk (a list of sentences) per file,
        ordered by filename.
        """
        # os.listdir returns names in arbitrary order; sorting keeps the chunk
        # order (and any positional split made from it) reproducible.
        filenames = sorted(x for x in os.listdir(path) if x.endswith(".pos"))
        chunks = []
        for fn in filenames:
            fp = os.path.join(path, fn)
            with open(fp) as file:
                data = file.read()
            chunks.append(parse_tagged_chunk(data))
        return chunks

    chunks = parse_all_tagged_chunks()
    # cache the parse so later runs can skip the treebank files
    with open("data/parsed.json", "w") as file:
        json.dump(chunks, file)

You are wrong, update is in fact required


## Preparing parsed data for learning

In [7]:
# intra-chunk sentences are related, but we aren't going to consider this context
# instead, chunks are flattened into one list of sentences to produce datasets
# (a single pass over the data; summing the lists would re-copy the growing
# result once per chunk)

all_sentences = [sentence for chunk in chunks for sentence in chunk]

def train_dev_test_split(data, dev_prop=0.1, test_prop=0.1):
    """
    Split `data` positionally into (train, dev, test) slices.

    The dev and test sizes are `len(data)` times the respective proportion,
    rounded down; the training slice takes everything that is left. The
    slices are contiguous and in order: train first, then dev, then test.
    No shuffling is performed.

    Raises ValueError if the proportions are negative or ask for more items
    than `data` contains.
    """
    n = len(data)
    dev_size = int(n * dev_prop)
    test_size = int(n * test_prop)
    train_size = n - (dev_size + test_size)
    if dev_size < 0 or test_size < 0 or train_size < 0:
        raise ValueError(
            "dev_prop and test_prop must be non-negative and sum to at most 1"
        )
    dev_end = train_size + dev_size
    # slice the test set from its start index: data[-test_size:] would
    # return ALL of data when test_size is 0
    return data[:train_size], data[train_size:dev_end], data[dev_end:]

# split all sentences using the default proportions (10% dev, 10% test,
# the remainder for training)
data_train, data_dev, data_test = train_dev_test_split(all_sentences)

In [8]:
class Tagger(ABC):
    """
    Abstract base class for part-of-speech taggers.

    Subclasses must implement `train` and `tag` as ordinary instance
    methods; `Tagger` itself cannot be instantiated.
    """

    def __init__(self):
        pass

    @abstractmethod
    def train(self, sentences):
        """
        Fit the tagger.

        Input: a list of sentences, which are lists of (word, tag) pairs.
        """

    @abstractmethod
    def tag(self, sentence):
        """
        Tag one sentence.

        Input: a list of words.
        Output: a list of (word, tag) pairs, one per input word.
        """

def test_tagger(tagger, test_data=data_dev, num_failures_to_print=0):
    """
    Measure the per-word tagging accuracy of `tagger` on `test_data`.

    Input:
        tagger: object whose `tag(words)` returns a list of (word, tag) pairs.
        test_data: list of sentences, which are lists of (word, true_tag)
            pairs. The default is the value `data_dev` had when this
            function was defined.
        num_failures_to_print: print a word-by-word comparison for at most
            this many sentences that contain a mistake.
    Output: fraction of words tagged correctly, or 0.0 when there are no
        words to score (instead of dividing by zero).
    """
    correct, n = 0, 0
    failures_printed = 0
    for sentence in test_data:
        # hide the gold tags from the tagger
        sentence_tags_hidden = [word for word, _ in sentence]
        tagged = tagger.tag(sentence_tags_hidden)
        mistake_made = False
        for (_, tag), (_, true_tag) in zip(tagged, sentence):
            n += 1
            if tag == true_tag:
                correct += 1
            else:
                mistake_made = True
        if failures_printed < num_failures_to_print and mistake_made:
            # a mistake implies the sentence is non-empty, so max() is safe
            longest_word_len = max(len(word) for word, _ in sentence)
            print("----- Example error ----")
            print("WORD".rjust(longest_word_len), "TRUE".rjust(5), "PRED".rjust(5))
            for (word, tag), (_, true_tag) in zip(tagged, sentence):
                # show the prediction only where it differs from the truth
                print(word.rjust(longest_word_len), true_tag.rjust(5), (tag if tag != true_tag else "-").rjust(5))
            failures_printed += 1
            print()
    if n == 0:
        # nothing was scored (empty data or only empty sentences)
        return 0.0
    return correct / n

In [9]:
class UnigramTagger(Tagger):
    """
    Tag words with tag they appeared with most often in training data.

    If word has not been seen before, tag it with the most common tag.

    Attributes set by `train`:
        model: dict mapping each training word to its most frequent tag.
        most_common_tag: the tag with the highest overall count.
        tag_frequency: word -> tag -> count, as nested defaultdicts.
        trained: False until `train` has completed, True afterwards.
    """

    def __init__(self):
        super().__init__()
        self.trained = False

    def train(self, sentences):
        """
        Count tag frequencies and build the word -> tag lookup.

        Input: a list of sentences, which are lists of (word, tag) pairs.
        Raises ValueError if `sentences` contains no (word, tag) pairs.
        """
        tag_frequency = defaultdict(lambda: defaultdict(int))
        overall_tag_counts = defaultdict(int)
        # we don't care about sentence context, so just walk every pair;
        # iterating directly avoids building one big concatenated list
        for sentence in sentences:
            for word, tag in sentence:
                tag_frequency[word][tag] += 1
                overall_tag_counts[tag] += 1
        if not overall_tag_counts:
            # without any tags there is no fallback tag to choose
            raise ValueError("cannot train UnigramTagger on empty data")
        self.model = {
            word: max(tag_freq_for_word, key=tag_freq_for_word.get)
            for word, tag_freq_for_word in tag_frequency.items()
        }
        self.most_common_tag = max(overall_tag_counts, key=overall_tag_counts.get)
        self.tag_frequency = tag_frequency
        self.trained = True

    def tag(self, sentence):
        """
        Tag each word of `sentence` (a list of words).

        Output: a list of (word, tag) pairs in the same order as the input.
        """
        fallback = self.most_common_tag
        return [(word, self.model.get(word, fallback)) for word in sentence]

In [10]:
# fit the baseline unigram tagger on the training split
T = UnigramTagger()
T.train(data_train)

In [12]:
# score the baseline on the development data (test_tagger's default) and on
# the data it was trained on
acc = test_tagger(T, num_failures_to_print=0)
train_acc = test_tagger(T, data_train)
# plain ".2f": the former " .2f" spec used the space sign option, which
# padded each positive number with an extra leading space
print(
    f"Baseline unigram tagger achieved {100 * acc:.2f}% accuracy on development data "
    f"({100 * train_acc:.2f}% on training data)"
)

Baseline unigram tagger achieved  85.98% accuracy on development data ( 95.09% on training data)
