In [173]:
from typing import List, Dict, Tuple
import os
import itertools

# Path of the word file that gets tagged further down; the TEST_WORDS_FILE
# environment variable overrides the default "WSJ_23.words".
TEST_WORDS_FILE: str = os.getenv("TEST_WORDS_FILE", "WSJ_23.words")

def glance(d, _f: int = 0, _t: int=3):
    """Return a small dict with the entries of ``d`` at positions ``_f`` to ``_t``.

    Positions follow the iteration order of ``d.items()``; ``_f`` is inclusive
    and ``_t`` is exclusive, so the defaults yield the first three entries.
    """
    preview = {}
    for key, value in itertools.islice(d.items(), _f, _t):
        preview[key] = value
    return preview

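# Handling OOV (out-of-vocabulary) words
* Does the word contain a digit?
* Is it capitalized?
* Does it contain a symbol such as '-' or '/'?
* Does it end with a common suffix?
* Does it start with a common prefix?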

In [176]:
# Load the training corpus: each kept line holds one "word tag" pair.
# Empty and whitespace-only lines are dropped here; comparing against "\n"
# alone would let a line such as "  \n" through, and split() would then
# return no fields and break the two-way unpacking below.
with open("merged.pos") as f:
    lines: List[str] = [l for l in f if l.strip()]
record_length: int = len(lines)
words: List[str] = [""] * record_length
tags: List[str] = [""] * record_length
for i, line in enumerate(lines):
    # Raises ValueError when a line does not hold exactly two
    # whitespace-separated fields.
    words[i], tags[i] = line.split()

def classify(w: str):
    """Map a word to a coarse pseudo-word class label of the form ``"U_..."``.

    The checks run in this order and the first match wins:

    1. contains a digit                    -> ``"U_CD"``
    2. first character is uppercase        -> ``"U_up"``
    3. contains ``"-"``                    -> ``"U_hyp"``
    4. contains ``"/"``                    -> ``"U_slash"``
    5. ends with a listed suffix           -> ``"U_" + suffix``
    6. starts with a listed prefix         -> ``"U_" + prefix``
    7. is a single character               -> ``"U_sym"``
    8. anything else                       -> ``"U_Unknown"``

    Suffix and prefix matching is case-insensitive. An empty string is
    reported as ``"U_Unknown"`` instead of failing on ``w[0]``.

    Args:
        w: The word to classify.

    Returns:
        The class label as a string.
    """
    if not w:
        return "U_Unknown"
    if any(c.isdigit() for c in w):
        return "U_CD"
    if w[0].isupper():
        return "U_up"
    if "-" in w:
        return "U_hyp"
    if "/" in w:
        return "U_slash"

    lowered: str = w.lower()
    # "es" must be tested before "s": every word ending in "es" also ends in
    # "s", so with "s" first the "U_es" class could never be produced.
    suffixes: List[str] = ["ly", "ed", "ble", "ous", "es", "s", "ing",
                "ion", "ism", "ist", "al", "um", "er",
                "ent", "ize", "ful", "ive"]
    for s in suffixes:
        if lowered.endswith(s):
            return "U_" + s
    prefixes: List[str] = ["extra", "hetero", "homo", "ir", "in",
                "im", "macro", "micro", "non", "omni"]
    for p in prefixes:
        if lowered.startswith(p):
            return "U_" + p
    if len(w) == 1:
        return "U_sym"
    return "U_Unknown"

from collections import Counter
# Swap every word seen exactly once in the corpus for its pseudo-word class,
# so the counting that follows works on the class label instead of the word.
words_occur: Dict[str, int] = Counter(words)
for idx, token in enumerate(words):
    if words_occur[token] == 1:
        words[idx] = classify(token)

# Raw counts, keyed as tag -> {next tag: count} and word -> {tag: count}.
tag_tag_e: Dict[str, Dict[str, float]] = {}
word_tag_e: Dict[str, Dict[str, float]] = {}
# Filled later with (key, value) pairs sorted by key.
tag_tag: Dict[str, List[Tuple[str, float]]] = {}
word_tag: Dict[str, List[Tuple[str, float]]] = {}
for i in range(record_length):
    from_tag: str = tags[i]

    # A transition needs a following record, so the last record adds none.
    if i + 1 < record_length:
        to_tag: str = tags[i + 1]
        next_tag_counts = tag_tag_e.setdefault(from_tag, {})
        next_tag_counts[to_tag] = next_tag_counts.get(to_tag, 0) + 1

    # Every record contributes an emission, including the last one; looping
    # over range(record_length - 1) for both counts would drop it.
    from_word: str = words[i]
    tag_counts_for_word = word_tag_e.setdefault(from_word, {})
    tag_counts_for_word[from_tag] = tag_counts_for_word.get(from_tag, 0) + 1

def _normalize_sorted(counts: Dict[str, float]) -> List[Tuple[str, float]]:
    """Normalise ``counts`` in place and return its items sorted by key.

    Each value is divided by the sum of all values, so the values add up to 1
    afterwards. The dict itself is modified; the returned list holds the same
    ``(key, value)`` pairs in ascending key order.
    """
    total = sum(counts.values())
    for name in counts:
        counts[name] /= total
    return sorted(counts.items(), key=lambda pair: pair[0])


# Both tables get the same treatment: turn counts into fractions and keep a
# key-sorted list per entry for the tagging step.
for key, to_dict_tag in tag_tag_e.items():
    tag_tag[key] = _normalize_sorted(to_dict_tag)

for key, to_dict_word in word_tag_e.items():
    word_tag[key] = _normalize_sorted(to_dict_word)

glance(word_tag)

{'In': [('IN', 0.9972283813747228),
  ('NNP', 0.0016629711751662971),
  ('RB', 0.0005543237250554324),
  ('RBR', 0.0005543237250554324)],
 'an': [(',', 0.00030950170225936243), ('DT', 0.9996904982977406)],
 'Oct.': [('NN', 0.003003003003003003), ('NNP', 0.996996996996997)]}

# Main algorithm
A standard HMM: each tag decision depends only on the previous tag.

In [177]:
with open(TEST_WORDS_FILE) as f:
    test_words: List[str] = [line.rstrip() for line in f]

test_length = len(test_words)


def _best_tag(transitions, emissions):
    """Pick the tag with the largest transition * emission product.

    Both arguments are lists of ``(tag, probability)`` pairs; ``emissions`` is
    walked in its given order and a later tag only wins with a strictly larger
    product. Returns ``""`` when the two lists share no tag or every product
    is 0.
    """
    transition_p = dict(transitions)
    best_tag: str = ""
    best_p: float = 0
    for tag, emission_p in emissions:
        if tag in transition_p:
            p = transition_p[tag] * emission_p
            if p > best_p:
                best_tag, best_p = tag, p
    return best_tag


# out[i] is the tag carried forward as "previous tag" for position i + 1;
# ans[i] is the tag written to the submission ("" for blank lines).
ans: List[str] = [""] * test_length
out: List[str] = [""] * test_length
for i in range(test_length):
    current_word: str = test_words[i]
    # The first position has no predecessor and is treated as following ".".
    # Reading out[i - 1] at i == 0 would wrap around to the still-empty last slot.
    prev_tag: str = out[i - 1] if i else "."
    if current_word == "":
        # Blank line: emit nothing, but keep the previous tag as context.
        out[i] = prev_tag
        continue
    if current_word not in word_tag_e:
        current_word = classify(current_word)
    emissions = word_tag.get(current_word)
    if not emissions:
        # Neither the word nor its pseudo-word class was seen in training.
        ans[i] = out[i] = "NNP"
        continue
    # A previous tag without recorded transitions yields an empty list, which
    # falls through to the emission-only choice below instead of a KeyError.
    transitions = tag_tag.get(prev_tag, [])
    chosen: str = _best_tag(transitions, emissions)
    if not chosen:
        # No tag is supported by both tables: take the most likely tag for the word.
        chosen = max(emissions, key=lambda x: x[1])[0]
    ans[i] = out[i] = chosen
lst_out = [(w + "\t" + t) if len(t) else "" for (w, t) in zip(test_words, ans)]
with open("submission.pos", "w") as f:
    f.write("\n".join(lst_out) + "\n")