### Creating a vocabulary

In [1]:
#Import the libraries
import string
from collections import Counter, defaultdict

In [2]:
# Load the tagged corpus into a list of raw lines.
# Iterating a text file yields one string per line with its trailing newline
# kept, which is exactly what readlines() returns.
with open("WSJ_02-21.pos") as corpus_file:
    lines = list(corpus_file)
lines

['In\tIN\n',
 'an\tDT\n',
 'Oct.\tNNP\n',
 '19\tCD\n',
 'review\tNN\n',
 'of\tIN\n',
 '``\t``\n',
 'The\tDT\n',
 'Misanthrope\tNN\n',
 "''\t''\n",
 'at\tIN\n',
 'Chicago\tNNP\n',
 "'s\tPOS\n",
 'Goodman\tNNP\n',
 'Theatre\tNNP\n',
 '(\t(\n',
 '``\t``\n',
 'Revitalized\tVBN\n',
 'Classics\tNNS\n',
 'Take\tVBP\n',
 'the\tDT\n',
 'Stage\tNN\n',
 'in\tIN\n',
 'Windy\tNNP\n',
 'City\tNNP\n',
 ',\t,\n',
 "''\t''\n",
 'Leisure\tNN\n',
 '&\tCC\n',
 'Arts\tNNS\n',
 ')\t)\n',
 ',\t,\n',
 'the\tDT\n',
 'role\tNN\n',
 'of\tIN\n',
 'Celimene\tNNP\n',
 ',\t,\n',
 'played\tVBN\n',
 'by\tIN\n',
 'Kim\tNNP\n',
 'Cattrall\tNNP\n',
 ',\t,\n',
 'was\tVBD\n',
 'mistakenly\tRB\n',
 'attributed\tVBN\n',
 'to\tTO\n',
 'Christina\tNNP\n',
 'Haag\tNNP\n',
 '.\t.\n',
 '\n',
 'Ms.\tNNP\n',
 'Haag\tNNP\n',
 'plays\tVBZ\n',
 'Elianti\tNNP\n',
 '.\t.\n',
 '\n',
 'Rolls-Royce\tNNP\n',
 'Motor\tNNP\n',
 'Cars\tNNPS\n',
 'Inc.\tNNP\n',
 'said\tVBD\n',
 'it\tPRP\n',
 'expects\tVBZ\n',
 'its\tPRP$\n',
 'U.S.\tNNP\n',
 's

In [3]:
# Show the first five raw entries under a Word/Tag header.
# Each entry still ends with its own newline, so print() leaves a blank line
# after every row.
print("\t\tWord", "\tTag\n")
for line_number in range(1, 6):
    print(f'line number {line_number}: {lines[line_number - 1]}')

		Word 	Tag

line number 1: In	IN

line number 2: an	DT

line number 3: Oct.	NNP

line number 4: 19	CD

line number 5: review	NN



In [4]:
# Extract the word column: everything before the first tab of each line.
# A blank separator line has no tab, so it contributes the string '\n' itself.
words = [entry.partition('\t')[0] for entry in lines]
words[:5]

['In', 'an', 'Oct.', '19', 'review']

In [6]:
#Build a dictionary to get the counts
# Counter tallies every word in a single pass. Wrapping the result in
# defaultdict(int) keeps `freq` the same type and contents as the hand-written
# counting loop produced (keys in first-seen order, missing words read as 0).
freq = defaultdict(int, Counter(words))
freq

defaultdict(int,
            {'In': 1740,
             'an': 3143,
             'Oct.': 318,
             '19': 100,
             'review': 58,
             'of': 22929,
             '``': 6967,
             'The': 6833,
             'Misanthrope': 3,
             "''": 6787,
             'at': 4362,
             'Chicago': 197,
             "'s": 9311,
             'Goodman': 7,
             'Theatre': 5,
             '(': 1153,
             'Revitalized': 1,
             'Classics': 1,
             'Take': 9,
             'the': 41107,
             'Stage': 3,
             'in': 15186,
             'Windy': 1,
             'City': 139,
             ',': 48723,
             'Leisure': 3,
             '&': 1034,
             'Arts': 8,
             ')': 1160,
             'role': 125,
             'Celimene': 4,
             'played': 53,
             'by': 4495,
             'Kim': 7,
             'Cattrall': 1,
             'was': 3903,
             'mistakenly': 6,
             'att

In [7]:
# Keep only the words that occur more than once.
# The '\n' key is produced by blank separator lines and is not a real word,
# so it is excluded explicitly.
vocab = [token for token, count in freq.items() if token != '\n' and count > 1]
vocab[:5]

['In', 'an', 'Oct.', '19', 'review']

In [8]:
# Order the vocabulary in place using the default string ordering
vocab.sort()

# Peek at ten consecutive entries of the sorted vocabulary
first_shown, end_shown = 2000, 2010
for position in range(first_shown, end_shown):
    print(vocab[position])

ARE
ARTICLE
AS
ASCAP
ASSETS
ASSOCIATION
AST
AT&T
AUS
AVX


#### Processing new text sources

In [10]:
#Assign the tag to unknown word
def assign_unk(word):
    """Map an out-of-vocabulary word to a coarse "unknown" token.

    The checks run in a fixed priority order and the first match wins:
    any digit, any punctuation character, any upper-case character, then
    noun, verb, adjective and adverb suffixes.

    Args:
        word: The word to classify.

    Returns:
        str: One of "--unk_digit--", "--unk_punc--", "--unk_upper--",
        "--unk_noun--", "--unk_verb--", "--unk_adj--", "--unk_adv--",
        or "--unk--" when no rule applies.
    """
    punctuation_chars = set(string.punctuation)

    # (token, suffixes) pairs, listed in the order they must be tried
    suffix_rules = (
        ("--unk_noun--", ("action", "age", "ance", "cy", "dom", "ee", "ence",
                          "er", "hood", "ion", "ism", "ist", "ity", "ling",
                          "ment", "ness", "or", "ry", "scape", "ship", "ty")),
        ("--unk_verb--", ("ate", "ify", "ise", "ize")),
        ("--unk_adj--", ("able", "ese", "ful", "i", "ian", "ible", "ic",
                         "ish", "ive", "less", "ly", "ous")),
        ("--unk_adv--", ("ward", "wards", "wise")),
    )

    # Character-level checks take precedence over the suffix heuristics
    if any(ch.isdigit() for ch in word):
        return "--unk_digit--"
    if any(ch in punctuation_chars for ch in word):
        return "--unk_punc--"
    if any(ch.isupper() for ch in word):
        return "--unk_upper--"

    # str.endswith accepts a tuple and matches if any suffix in it matches
    for unk_token, suffixes in suffix_rules:
        if word.endswith(suffixes):
            return unk_token

    # Nothing matched: plain unknown
    return "--unk--"


In [11]:
#Define a function to get the correct tag
def get_word_tag(line, vocab):
    """Split one corpus line into a (word, tag) pair, masking unknown words.

    Args:
        line: One raw line of the tagged corpus, expected to hold exactly two
            whitespace-separated fields (word, then tag). A blank or
            whitespace-only line is treated as a separator.
        vocab: Container of known words; only membership (`in`) is used.
            With a list every lookup is a linear scan, so a set or dict is
            preferable when this is called once per corpus line.

    Returns:
        tuple: ("--n--", "--s--") for a blank line; otherwise (word, tag),
        where word is replaced by the token from assign_unk(word) when it
        is not in vocab.

    Raises:
        ValueError: If a non-blank line does not contain exactly two fields.
    """
    # Split once and reuse the result for both the blank check and unpacking
    fields = line.split()

    # Blank line: return the placeholder word and tag
    if not fields:
        return "--n--", "--s--"

    # Fail with a message that shows the offending line rather than a bare
    # tuple-unpacking error
    if len(fields) != 2:
        raise ValueError(
            f"expected 'word<TAB>tag' with exactly two fields, got {line!r}"
        )

    word, tag = fields
    if word not in vocab:
        # Out-of-vocabulary word: replace it with its unknown-word token
        word = assign_unk(word)
    return word, tag

In [12]:
# A blank line yields the placeholder word/tag pair
get_word_tag('\n', vocab)

('--n--', '--s--')

In [13]:
# A word that is in the vocabulary is returned unchanged along with its tag
get_word_tag('In\tIN\n', vocab)

('In', 'IN')

In [14]:
# A word outside the vocabulary is replaced by an unknown-word token; the tag is kept
get_word_tag('tardigrade\tNN\n', vocab)

('--unk--', 'NN')