In [3]:
%run initial_preprocessing.ipynb
%run viterbi.ipynb

In [12]:
# Dataset locations. The active (relative) paths expect the .conllu files to sit
# in the notebook's working directory; switch to the commented-out Kaggle paths
# below when running on Kaggle.
# TRAIN_PATH = "/kaggle/input/nlp-datasets/en_ewt-ud-train.conllu"
# TEST_PATH  = "/kaggle/input/nlp-datasets/en_ewt-ud-test.conllu"
TRAIN_PATH = "en_ewt-ud-train.conllu"
TEST_PATH  = "en_ewt-ud-test.conllu"
# read_conllu is not defined in this notebook (presumably it comes from one of
# the notebooks executed with %run above -- verify). The rest of this notebook
# treats each sentence as a sequence of (word, tag) pairs.
train_sentences = read_conllu(TRAIN_PATH)
test_sentences  = read_conllu(TEST_PATH)

In [5]:
# Fit the HMM on the training sentences. train_hmm is not defined in this
# notebook (presumably provided by one of the %run notebooks above -- verify);
# its five return values are passed unchanged to every evaluation helper below.
initial_probs, transition_probs, emission_probs, vocab, tags = train_hmm(train_sentences)

train_sentences
[('al', 'PROPN'), ('-', 'PUNCT'), ('zaman', 'PROPN'), (':', 'PUNCT'), ('american', 'ADJ'), ('forces', 'NOUN'), ('killed', 'VERB'), ('shaikh', 'PROPN'), ('abdullah', 'PROPN'), ('al', 'PROPN'), ('-', 'PUNCT'), ('ani', 'PROPN'), (',', 'PUNCT'), ('the', 'DET'), ('preacher', 'NOUN'), ('at', 'ADP'), ('the', 'DET'), ('mosque', 'NOUN'), ('in', 'ADP'), ('the', 'DET'), ('town', 'NOUN'), ('of', 'ADP'), ('qaim', 'PROPN'), (',', 'PUNCT'), ('near', 'ADP'), ('the', 'DET'), ('syrian', 'ADJ'), ('border', 'NOUN'), ('.', 'PUNCT')]
[('[', 'PUNCT'), ('this', 'DET'), ('killing', 'NOUN'), ('of', 'ADP'), ('a', 'DET'), ('respected', 'ADJ'), ('cleric', 'NOUN'), ('will', 'AUX'), ('be', 'AUX'), ('causing', 'VERB'), ('us', 'PRON'), ('trouble', 'NOUN'), ('for', 'ADP'), ('years', 'NOUN'), ('to', 'PART'), ('come', 'VERB'), ('.', 'PUNCT'), (']', 'PUNCT')]

Most Common Words
[('the', 9075), ('.', 8640), (',', 7021), ('to', 5137), ('and', 5002), ('a', 3782), ('of', 3622), ('i', 3380), ('in', 3112), ('is'

In [6]:
def debug(test_sentences,initial_probs,transition_probs,emission_probs,brute_force,viterbi,vocab,tags,max_examples=6,max_brute_force_len=6):
    """Print gold tags and decoder predictions for the first few test sentences.

    For each displayed sentence the words, the gold tags, the Viterbi
    prediction and its per-token accuracy are printed. The brute-force decoder
    is run (and reported) only for sentences of at most ``max_brute_force_len``
    words; longer ones are reported as skipped.

    Args:
        test_sentences: iterable of sentences, each a sequence of (word, tag) pairs.
        initial_probs, transition_probs, emission_probs: model tables, passed
            through unchanged to both decoders.
        brute_force: callable ``(words, tags, initial, transition, emission)``
            returning one predicted tag per word.
        viterbi: callable ``(words, tags, initial, transition, emission, vocab)``
            returning one predicted tag per word.
        vocab: vocabulary object forwarded to ``viterbi``.
        tags: iterable of tag labels; materialised to a list before use.
        max_examples: maximum number of sentences to print (0 prints nothing).
        max_brute_force_len: longest sentence (in words) for which the
            brute-force decoder is invoked. The default of 6 matches the
            previous hard-coded ``len(words) < 7`` cutoff.

    Returns:
        None. All results are written to stdout.
    """
    all_tags = list(tags)
    shown = 0

    for sent in test_sentences:
        # Check the limit before printing so that max_examples=0 shows nothing.
        if shown >= max_examples:
            break

        # An empty sentence has no tokens to score; skipping it avoids a
        # division by zero in the accuracy computation below.
        if not sent:
            continue

        words = [w for w, _ in sent]
        gold  = [t for _, t in sent]

        print("Sentence:", words)
        print("Gold tags:", gold)

        if len(words) <= max_brute_force_len:
            bf_pred = brute_force(words,all_tags,initial_probs,transition_probs,emission_probs)

            bf_correct = sum(g == p for g, p in zip(gold, bf_pred))
            bf_acc = bf_correct / len(gold)

            print("Brute Force:", bf_pred)
            print("BF Accuracy:", round(bf_acc, 3))
        else:
            print("Brute Force: skipped (sentence too long)")

        vit_pred = viterbi(words,all_tags,initial_probs,transition_probs,emission_probs,vocab)

        vit_correct = sum(g == p for g, p in zip(gold, vit_pred))
        vit_acc = vit_correct / len(gold)

        print("Viterbi:    ", vit_pred)
        print("VT Accuracy:", round(vit_acc, 3))
        print()
        shown += 1

            



In [7]:
# basic_checks is defined outside this notebook; in the recorded output below it
# prints 1.0 three times (presumably the probability tables summing to one --
# verify). debug then prints predictions for the first six test sentences.
basic_checks(initial_probs,transition_probs,emission_probs)
debug(test_sentences,initial_probs,transition_probs,emission_probs,brute_force,viterbi,vocab,tags,max_examples=6)


1.0
1.0
1.0
Sentence: ['what', 'if', 'google', 'morphed', 'into', 'googleos', '?']
Gold tags: ['PRON', 'SCONJ', 'PROPN', 'VERB', 'ADP', 'PROPN', 'PUNCT']
Brute Force: skipped (sentence too long)
Viterbi:     ['PRON', 'SCONJ', 'PROPN', 'PROPN', 'ADP', 'PROPN', 'PUNCT']
VT Accuracy: 0.857

Sentence: ['what', 'if', 'google', 'expanded', 'on', 'its', 'search', '-', 'engine', '(', 'and', 'now', 'e-mail', ')', 'wares', 'into', 'a', 'full', '-', 'fledged', 'operating', 'system', '?']
Gold tags: ['PRON', 'SCONJ', 'PROPN', 'VERB', 'ADP', 'PRON', 'NOUN', 'PUNCT', 'NOUN', 'PUNCT', 'CCONJ', 'ADV', 'NOUN', 'PUNCT', 'NOUN', 'ADP', 'DET', 'ADV', 'PUNCT', 'ADJ', 'NOUN', 'NOUN', 'PUNCT']
Brute Force: skipped (sentence too long)
Viterbi:     ['PRON', 'SCONJ', 'PROPN', 'VERB', 'ADP', 'PRON', 'NOUN', 'PUNCT', 'NOUN', 'PUNCT', 'CCONJ', 'ADV', 'NOUN', 'PUNCT', 'NOUN', 'ADP', 'DET', 'ADJ', 'PUNCT', 'ADJ', 'NOUN', 'NOUN', 'PUNCT']
VT Accuracy: 0.957

Sentence: ['[', 'via', 'microsoft', 'watch', 'from', 'mary'

In [8]:
def compute_accuracy(test_sentences, viterbi,tags, initial_probs,transition_probs, emission_probs,vocab):
    """Compute token-level tagging accuracy of ``viterbi`` over a test set.

    Args:
        test_sentences: iterable of sentences, each a sequence of (word, tag) pairs.
        viterbi: callable ``(words, tags, initial, transition, emission, vocab)``
            returning one predicted tag per word.
        tags, initial_probs, transition_probs, emission_probs, vocab: model
            objects forwarded unchanged to ``viterbi``.

    Returns:
        The fraction of gold tags that were predicted correctly, or 0.0 when
        the test set contains no tokens. The totals and the accuracy are also
        printed.
    """
    correct = 0
    total = 0

    for sent in test_sentences:
        # Nothing to score in an empty sentence; do not bother the decoder.
        if not sent:
            continue

        words = [word for word, tag in sent]
        gold_tags = [tag for word, tag in sent]

        pred_tags = viterbi(words,tags,initial_probs,transition_probs,emission_probs,vocab)

        # Compare tag-by-tag. The denominator is the number of gold tags, so a
        # position without a prediction counts as an error.
        correct += sum(gold == pred for gold, pred in zip(gold_tags, pred_tags))
        total += len(gold_tags)

    # Guard against an empty test set instead of raising ZeroDivisionError.
    accuracy = correct / total if total > 0 else 0.0

    print("Total tags in test set:", total)
    print("Correct predictions:", correct)
    print("Final Accuracy:", accuracy)

    return accuracy



In [9]:
def comp_sen_len_accuracy(test_sentences,viterbi,tags,initial_probs, transition_probs, emission_probs, vocab, medium_min_len=5, long_min_len=9):
    """Compute token-level accuracy separately for short, medium and long sentences.

    A sentence with ``n`` words is bucketed as:
        short   if n <  medium_min_len
        medium  if medium_min_len <= n < long_min_len
        long    if n >= long_min_len

    Args:
        test_sentences: iterable of sentences, each a sequence of (word, tag) pairs.
        viterbi: callable ``(words, tags, initial, transition, emission, vocab)``
            returning one predicted tag per word.
        tags, initial_probs, transition_probs, emission_probs, vocab: model
            objects forwarded unchanged to ``viterbi``.
        medium_min_len: smallest length counted as "medium" (default 5).
        long_min_len: smallest length counted as "long" (default 9).

    Returns:
        ``[accuracy_short, accuracy_medium, accuracy_long]``; a bucket that
        received no tokens reports 0. The three values are also printed.

    Raises:
        ValueError: if ``medium_min_len`` is greater than ``long_min_len``.
    """
    if medium_min_len > long_min_len:
        raise ValueError("medium_min_len must not exceed long_min_len")

    # One pair of counters per length bucket replaces three copy-pasted loops.
    correct = {"short": 0, "medium": 0, "long": 0}
    total = {"short": 0, "medium": 0, "long": 0}

    for sent in test_sentences:
        # Empty sentences contribute no tokens to any bucket.
        if not sent:
            continue

        words = [word for word, tag in sent]
        gold_tags = [tag for word, tag in sent]
        pred_tags = viterbi(words,tags,initial_probs,transition_probs,emission_probs,vocab)

        length = len(words)
        if length < medium_min_len:
            bucket = "short"
        elif length < long_min_len:
            bucket = "medium"
        else:
            bucket = "long"

        # The denominator is the number of gold tags, so a position without a
        # prediction counts as an error.
        correct[bucket] += sum(gold == pred for gold, pred in zip(gold_tags, pred_tags))
        total[bucket] += len(gold_tags)

    accuracy_short = correct["short"] / total["short"] if total["short"] > 0 else 0
    accuracy_medium = correct["medium"] / total["medium"] if total["medium"] > 0 else 0
    accuracy_long = correct["long"] / total["long"] if total["long"] > 0 else 0

    print("Final Accuracy for short sentences:", accuracy_short)
    print("Final Accuracy for medium-length sentences:", accuracy_medium)
    print("FInal accuracy for long sentences", accuracy_long)

    accuracy=[accuracy_short,accuracy_medium,accuracy_long]

    return accuracy

In [10]:
# Overall token-level accuracy on the test set, followed by the same metric
# split by sentence length (short / medium / long).
acc = compute_accuracy(test_sentences,viterbi,tags,initial_probs,transition_probs,emission_probs,vocab)
print()
sent_accuracy= comp_sen_len_accuracy(test_sentences,viterbi, tags, initial_probs, transition_probs, emission_probs, vocab)

Total tags in test set: 25096
Correct predictions: 22575
Final Accuracy: 0.8995457443417277

Final Accuracy for short sentences: 0.8171206225680934
Final Accuracy for medium-length sentences: 0.9093351242444594
FInal accuracy for long sentences 0.903230451687227


In [11]:
def compute_confusion_matrix(test_sentences, viterbi,tags,initial_probs,transition_probs,emission_probs,vocab):
    """Count (gold tag, predicted tag) pairs over the test set.

    Args:
        test_sentences: iterable of sentences, each a sequence of (word, tag) pairs.
        viterbi: callable ``(words, tags, initial, transition, emission, vocab)``
            returning one predicted tag per word.
        tags: iterable of tag labels; defines the initial rows and columns.
        initial_probs, transition_probs, emission_probs, vocab: model objects
            forwarded unchanged to ``viterbi``.

    Returns:
        A nested dict ``confusion[actual][predicted] -> count``. Every label in
        ``tags`` has a row and a column. A gold or predicted label that is not
        in ``tags`` (for example a tag that occurs only in the test data) is
        added as an extra row and column instead of raising ``KeyError``, so
        the matrix always stays square.
    """
    labels = list(tags)
    confusion = {actual: {predicted: 0 for predicted in labels} for actual in labels}

    def _ensure_label(label):
        # Add a zero-filled row and column for a label missing from `tags`.
        if label in confusion:
            return
        for row in confusion.values():
            row[label] = 0
        labels.append(label)
        confusion[label] = {predicted: 0 for predicted in labels}

    for sent in test_sentences:

        words = [word for word, tag in sent]
        gold_tags = [tag for word, tag in sent]

        pred_tags = viterbi(words,tags,initial_probs,transition_probs,emission_probs,vocab)

        for actual, predicted in zip(gold_tags, pred_tags):
            _ensure_label(actual)
            _ensure_label(predicted)
            confusion[actual][predicted] += 1

    return confusion


In [17]:
def print_confusion_matrix(confusion,tags):
    """Print the confusion matrix one gold-tag row at a time.

    Args:
        confusion: nested mapping ``confusion[actual][predicted] -> count``.
        tags: iterable of gold-tag labels; determines which rows are printed
            and in what order. Each label must be a key of ``confusion``.
    """
    print("\nConfusion Matrix:\n")

    for gold_tag in tags:
        row_counts = confusion[gold_tag]
        # The double newline leaves one blank line after every row.
        print(gold_tag, row_counts, sep=" -> ", end="\n\n")

In [18]:
def compute_unk_accuracy(test_sentences, viterbi, all_tags, initial_probs, transition_probs, emission_probs, vocab):
    """Measure tagging accuracy restricted to out-of-vocabulary words.

    Args:
        test_sentences: iterable of sentences, each a sequence of (word, tag) pairs.
        viterbi: callable ``(words, tags, initial, transition, emission, vocab)``
            returning one predicted tag per word.
        all_tags, initial_probs, transition_probs, emission_probs: model
            objects forwarded unchanged to ``viterbi``.
        vocab: container of known words; a word counts as unknown when
            ``word not in vocab``.

    Returns:
        The percentage (0-100) of unknown words tagged correctly, or 0 when the
        test set has no unknown words. The value is also printed with two
        decimals.
    """
    unk_seen = 0
    unk_hits = 0

    for sentence in test_sentences:
        words = [word for word, tag in sentence]
        gold_tags = [tag for word, tag in sentence]
        pred_tags = viterbi(words,all_tags,initial_probs,transition_probs,emission_probs,vocab)

        for position, token in enumerate(words):
            # Only out-of-vocabulary tokens take part in this metric.
            if token in vocab:
                continue
            unk_seen += 1
            if pred_tags[position] == gold_tags[position]:
                unk_hits += 1

    if unk_seen > 0:
        accuracy = (unk_hits / unk_seen) * 100
    else:
        accuracy = 0

    print(f"Accuracy of unknown words: {accuracy:.2f}%")

    return accuracy

In [19]:
# Accuracy restricted to test words that are not in the training vocabulary.
# As the last expression of the cell, the returned percentage is also echoed.
compute_unk_accuracy(test_sentences, viterbi,tags, initial_probs, transition_probs, emission_probs, vocab)

Accuracy of unknown words: 54.32%


54.32199918066366

In [20]:
# Build the gold-vs-predicted tag counts over the test set, then print one row
# per gold tag.
confusion = compute_confusion_matrix(test_sentences,viterbi,tags,initial_probs,transition_probs,emission_probs,vocab)

print_confusion_matrix(confusion,tags)





Confusion Matrix:

PROPN -> {'PROPN': 1523, 'PUNCT': 0, 'ADJ': 76, 'NOUN': 373, 'VERB': 46, 'DET': 0, 'ADP': 6, 'AUX': 2, 'PRON': 4, 'PART': 0, 'SCONJ': 0, 'NUM': 35, 'ADV': 1, 'CCONJ': 0, 'INTJ': 0, 'X': 9, 'SYM': 0}

PUNCT -> {'PROPN': 20, 'PUNCT': 3039, 'ADJ': 0, 'NOUN': 4, 'VERB': 1, 'DET': 0, 'ADP': 0, 'AUX': 0, 'PRON': 0, 'PART': 0, 'SCONJ': 0, 'NUM': 1, 'ADV': 0, 'CCONJ': 0, 'INTJ': 0, 'X': 0, 'SYM': 31}

ADJ -> {'PROPN': 42, 'PUNCT': 0, 'ADJ': 1555, 'NOUN': 72, 'VERB': 57, 'DET': 4, 'ADP': 5, 'AUX': 0, 'PRON': 0, 'PART': 0, 'SCONJ': 1, 'NUM': 9, 'ADV': 40, 'CCONJ': 0, 'INTJ': 1, 'X': 2, 'SYM': 0}

NOUN -> {'PROPN': 224, 'PUNCT': 0, 'ADJ': 85, 'NOUN': 3643, 'VERB': 103, 'DET': 2, 'ADP': 4, 'AUX': 14, 'PRON': 0, 'PART': 0, 'SCONJ': 0, 'NUM': 37, 'ADV': 4, 'CCONJ': 0, 'INTJ': 1, 'X': 1, 'SYM': 5}

VERB -> {'PROPN': 33, 'PUNCT': 0, 'ADJ': 29, 'NOUN': 120, 'VERB': 2315, 'DET': 0, 'ADP': 1, 'AUX': 103, 'PRON': 0, 'PART': 0, 'SCONJ': 0, 'NUM': 0, 'ADV': 3, 'CCONJ': 0, 'INTJ': 0, 'X':