# Q1


*   In this question we use a POS-tagged training set to compute, for each word, the tag that maximizes $p(t|w)$.
*   We will implement a simple tokenizer to deal with sentence boundaries.
*   We start by assuming that all unknown words are NN and compute the error rate on known and unknown words.
*   We then write at least five rules that do a better job of tagging unknown words, and show the difference in error rates.


In [None]:
import re
import math

import nltk
nltk.download('brown')
from nltk.corpus import brown

In [2]:
def generate_dict(_samples):
    """
    Generate a dictionary that captures the count of each (word, tag) combination from a list of samples.

    Args:
    _samples (list): List of tuples containing (word, tag) pairs.

    Returns:
    dict: A dictionary where keys are words and values are lists of dictionaries with 'tag' and 'count' keys.
          Each dictionary in the list represents a unique tag associated with the word and its count.
          Each list is sorted by 'count' in descending order, so the most frequent tag of a word
          is always the first entry. Tags with equal counts keep their first-seen order.

    Example:
      _samples = [('apple', 'fruit'), ('banana', 'fruit'), ('apple', 'fruit'), ('apple', 'color'), ('banana', 'color')]
      After calling generate_dict(_samples), the expected output would be:
      {
          'apple': [
              {'tag': 'fruit', 'count': 2},
              {'tag': 'color', 'count': 1}
          ],
          'banana': [
              {'tag': 'fruit', 'count': 1},
              {'tag': 'color', 'count': 1}
          ]
      }
      This represents that 'apple' appeared twice with the tag 'fruit' and once with the tag 'color',
      while 'banana' appeared once with both 'fruit' and 'color' tags.
    """

    # First pass: word -> {tag: count}. A nested dict makes each update O(1)
    # instead of scanning a list of {'tag', 'count'} records per sample.
    counts_by_word = {}
    for sample in _samples:
        word, tag = sample[0], sample[1]
        tag_counts = counts_by_word.setdefault(word, {})
        tag_counts[tag] = tag_counts.get(tag, 0) + 1

    def get_tag_counts(subitem):
        return subitem['count']

    # Second pass: convert to the documented list-of-records shape.
    dictionary = {}
    for word, tag_counts in counts_by_word.items():
        entries = [{'tag': tag, 'count': count} for tag, count in tag_counts.items()]
        # Sort in place: a bare sorted(...) call returns a new list and would
        # leave the stored list untouched. list.sort is stable, so ties keep
        # first-seen order.
        entries.sort(key=get_tag_counts, reverse=True)
        dictionary[word] = entries

    return dictionary

In [3]:
def predict_tag(_test_set, _tag_dict):
    """
    Predicts the tags for a given test set of words based on a provided tag dictionary.

    Args:
    _test_set (list): A list of tuples containing (word, true_tag) pairs to be predicted.
    _tag_dict (dict): A dictionary containing words as keys and lists of dictionaries with 'tag' and 'count' keys as values.

    Returns:
    float: The accuracy of the predictions, calculated as the ratio of correct predictions to the total number of predictions.
           0.0 is returned for an empty test set.

    Comments:
    - For unknown words, 'NN' (noun) tag is assigned.
    - Tags are assigned based on the highest count for known words.
      - If there are more than 1 tag for a given word, the tag with the highest count is chosen
        (the first such tag in the list when counts are tied).
      - If there is only 1 tag available, it is returned directly.
    """

    def get_tag_counts(subitem):
        return subitem['count']

    correct = 0
    for item in _test_set:
        word = item[0]
        true_tag = item[1]
        candidates = _tag_dict.get(word)
        if candidates:
            # Pick the maximum explicitly rather than trusting the list order,
            # so the result is right even if the lists are not pre-sorted.
            prediction = max(candidates, key=get_tag_counts)['tag']
        else:
            prediction = 'NN'
        if prediction == true_tag:
            correct += 1

    total = len(_test_set)
    # Guard the division so an empty test set yields 0.0 instead of raising.
    accuracy = correct / total if total else 0.0
    print("Assuming that all unknown words are NN")
    print(f">> accuracy: {accuracy}")
    return accuracy


In [4]:
# Digits optionally grouped by common separators, e.g. "1960", "3.5", "1,000", "1/2".
_NUMERIC_RE = re.compile(r'[+-]?\d+(?:[.,:/-]\d+)*')

# Suffixes treated as adjective markers for unknown words.
_ADJECTIVE_SUFFIXES = ('ble', 'ish', 'ful', 'ous', 'ive', 'less', 'ic', 'al')


def _guess_unknown_tag(word):
    """Guess a tag for a word that is absent from the tag dictionary, using surface patterns only."""
    if _NUMERIC_RE.fullmatch(word):
        return 'CD'
    lowered = word.lower()
    # The possessive check must precede the plain 's' check, and both
    # word-shape checks precede the suffix rules so that e.g. a capitalized
    # name ending in 's' or 'ing' is not mistaken for a plural or a gerund.
    if lowered.endswith("'s"):
        return 'NP$'
    if word[:1].isupper():
        return 'NP'
    if lowered.endswith('ing'):
        return 'VBG'
    if lowered.endswith('ly'):
        return 'RB'
    if lowered.endswith('ed'):
        return 'VBN'
    if lowered.endswith(_ADJECTIVE_SUFFIXES):
        return 'JJ'
    # A trailing double 's' (as in "business") is not a plural marker.
    if lowered.endswith('s') and not lowered.endswith('ss'):
        return 'NNS'
    return 'NN'


def predict_tag_with_improvements(_test_set, _tag_dict):
    """
    Predicts the tags for a given test set of words based on a provided tag dictionary, with additional rules for unknown words.

    Args:
    _test_set (list): A list of tuples containing (word, true_tag) pairs to be predicted.
    _tag_dict (dict): A dictionary containing words as keys and lists of dictionaries with 'tag' and 'count' keys as values.

    Returns:
    float: The accuracy of the predictions, calculated as the ratio of correct predictions to the total number of predictions.
           0.0 is returned for an empty test set.

    Comments:
    - Known words receive the tag with the highest count in the dictionary.
    - For unknown words, the first matching rule below decides the tag (checked in this order):
        - 'CD' (cardinal numeral) for numeric strings
        - 'NP$' (noun, possessive) for words ending in "'s"
        - 'NP' (noun, proper singular) for capitalized words
        - 'VBG' (verb, gerund) for words ending in 'ing'
        - 'RB' (adverb) for words ending in 'ly'
        - 'VBN' (verb, past participle) for words ending in 'ed'
        - 'JJ' (adjective) for words ending in 'ble', 'ish', 'ful', 'ous', 'ive', 'less', 'ic' or 'al'
        - 'NNS' (noun, plural) for words ending in 's' (but not 'ss')
        - 'NN' (noun) when no rule matches
    """

    def get_tag_counts(subitem):
        return subitem['count']

    correct = 0
    for item in _test_set:
        word = item[0]
        true_tag = item[1]
        candidates = _tag_dict.get(word)
        if candidates:
            prediction = max(candidates, key=get_tag_counts)['tag']
        else:
            prediction = _guess_unknown_tag(word)
        if prediction == true_tag:
            correct += 1

    total = len(_test_set)
    # Guard the division so an empty test set yields 0.0 instead of raising.
    accuracy = correct / total if total else 0.0
    print("With additional rules for unknown words")
    print(f">> accuracy: {accuracy}")
    return accuracy

In [None]:
# Evaluation driver: split the tagged 'news' words of the Brown corpus 75/25,
# train the tag dictionary on the first part and compare both predictors on the rest.
CORPUS = brown.tagged_words(categories='news')
# Reuse the object loaded above instead of requesting the corpus from NLTK a second time.
CORPUS_SIZE = len(CORPUS)

CUT_OFF = math.floor(CORPUS_SIZE * 0.75)

# section off training and testing lists from corpus
training_list = CORPUS[:CUT_OFF]
testing_list = CORPUS[CUT_OFF:]

# duplicates are ignored in sets
training_set = set(training_list)
testing_set = set(testing_list)
intersection = training_set.intersection(testing_set)

print(f"length of training set:     {len(training_list)}")
print(f"length of testing set:      {len(testing_list)}")

# uncomment to see how much the training set and testing set overlap
# print(f"intersection:               {len(intersection)}")

# uncomment to survey tagged corpus
# print(training_set)

tag_dict = generate_dict(training_list)
accurary_base = predict_tag(testing_list, tag_dict)
accurary_impr = predict_tag_with_improvements(testing_list, tag_dict)
# Both accuracies are (correct / total) floats, so their difference times the
# total is an integer only up to rounding error; round() recovers the exact
# count where floor() could report one word too few.
delta = round((accurary_impr - accurary_base) * len(testing_list))
print(f"{delta} more words got correctly classified.")
