In [1]:
import nltk
import numpy as np
import pandas as pd
import random
from collections import defaultdict, Counter
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix, accuracy_score
import time

In [2]:
nltk.download('brown')
nltk.download('universal_tagset')
nltk_data = list(nltk.corpus.brown.tagged_sents(tagset='universal'))

[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


In [3]:
tagged_words = [tup for sent in nltk_data for tup in sent]
tags = {tag for word, tag in tagged_words}

word_tag_freq = defaultdict(lambda: defaultdict(int))
tag_count = defaultdict(int)
       
for word, tag in tagged_words:
    word_tag_freq[word][tag] += 1
    tag_count[tag] += 1

In [4]:
def t2_given_t1(t2, t1, tags_seq):
    transitions = defaultdict(int)
    tag_seq_len = len(tags_seq)

    for i in range(tag_seq_len - 1):
        if tags_seq[i] == t1 and tags_seq[i + 1] == t2:
            transitions[(t1, t2)] += 1

    count_t1 = tags_seq.count(t1)
    return transitions[(t1, t2)] / count_t1 if count_t1 > 0 else 1e-6

# Emission probabilities for word given tag
def word_given_tag(word, tag):
    count_tag = tag_count[tag]
    count_word_tag = word_tag_freq[word][tag] if word in word_tag_freq else 0
    return count_word_tag / count_tag if count_tag > 0 else 1e-6

In [5]:
# Optimized Viterbi Algorithm with smoothing for unknown words
def Viterbi_optimized(words, tags_df, train_tagged_words):
    state = []
    T = list(tag_count.keys())
    word_prob_cache = {}

    for key, word in enumerate(words):
        p = []
        if word in word_tag_freq:
            possible_tags = list(word_tag_freq[word].keys())
        else:
            possible_tags = T

        for tag in possible_tags:
            if key == 0:
                transition_p = tags_df.loc['.', tag] if '.' in tags_df.index else 1e-6
            else:
                transition_p = tags_df.loc[state[-1], tag]

            if (word, tag) not in word_prob_cache:
                emission_p = word_given_tag(word, tag)
                word_prob_cache[(word, tag)] = emission_p
            else:
                emission_p = word_prob_cache[(word, tag)]

            state_probability = emission_p * transition_p
            p.append(state_probability)

        pmax = max(p)
        state_max = possible_tags[p.index(pmax)]
        state.append(state_max)

    return list(zip(words, state))

# Function to compute 5-fold cross-validation, confusion matrix, and per-POS accuracy
def evaluate_viterbi_with_cross_validation(tagged_sentences, k=5):
    kf = KFold(n_splits=k, shuffle=True, random_state=42)
    all_true_tags = []
    all_pred_tags = []
    fold_accuracies = []
    
    # Prepare the transition matrix
    tags_seq = [pair[1] for pair in tagged_words]
    tags_matrix = np.zeros((len(tags), len(tags)), dtype='float32')
    tag_list = list(tags)

    for i, t1 in enumerate(tag_list):
        for j, t2 in enumerate(tag_list):
            tags_matrix[i, j] = t2_given_t1(t2, t1, tags_seq)
    tags_df = pd.DataFrame(tags_matrix, columns=tag_list, index=tag_list)

    # Cross-validation loop
    for train_index, test_index in kf.split(tagged_sentences):
        # Split the data into train and test sets for this fold
        train_data = [tagged_sentences[i] for i in train_index]
        test_data = [tagged_sentences[i] for i in test_index]

        # Flatten the train data for transition probability calculation
        train_tagged_words = [tup for sent in train_data for tup in sent]

        # Run Viterbi on the test set
        for test_sent in test_data:
            words = [word for word, tag in test_sent]
            true_tags = [tag for word, tag in test_sent]
            predicted_tags = [tag for word, tag in Viterbi_optimized(words, tags_df, train_tagged_words)]

            all_true_tags.extend(true_tags)
            all_pred_tags.extend(predicted_tags)

        # Accuracy for the current fold
        fold_accuracy = accuracy_score(all_true_tags, all_pred_tags)
        fold_accuracies.append(fold_accuracy)

    # Average accuracy over the folds
    avg_accuracy = np.mean(fold_accuracies)
    print(f"5-Fold Cross-Validation Accuracy: {avg_accuracy * 100:.2f}%")

    # Confusion matrix
    conf_matrix = confusion_matrix(all_true_tags, all_pred_tags, labels=tag_list)
    conf_matrix_df = pd.DataFrame(conf_matrix, index=tag_list, columns=tag_list)
    print("\nConfusion Matrix:\n", conf_matrix_df)

    # Per POS accuracy
    per_tag_accuracy = {}
    true_tag_counter = Counter(all_true_tags)

    for tag in tag_list:
        correct_preds = conf_matrix_df.loc[tag, tag]
        total_true = true_tag_counter[tag]
        per_tag_accuracy[tag] = correct_preds / total_true if total_true > 0 else 0.0

    print("\nPer POS Tag Accuracy:")
    for tag, accuracy in per_tag_accuracy.items():
        print(f"{tag}: {accuracy * 100:.2f}%")



# Run the evaluation
evaluate_viterbi_with_cross_validation(nltk_data, k=5)


5-Fold Cross-Validation Accuracy: 96.10%

Confusion Matrix:
          ADP       .     X    NUM    ADV   CONJ    NOUN    ADJ     DET   PRON  \
ADP   135351      20     5      1   1119    148      48     95     201      0   
.          0  147565     0      0      0      0       0      0       0      0   
X         24      24  1158      5      4      4     109      7      17      5   
NUM        0       0     0  14670      0      0     204      0       0      0   
ADV     2668       0     1      0  48879    124     100   3212     373      0   
CONJ       3       0     0      0     55  37965       2      0     126      0   
NOUN      33       1    18    543    132      1  269008   2793     115     16   
ADJ       80       0     3      4   2303      0    1839  78935       0      0   
DET     1161       0     5      2      2     37       1      0  135566    244   
PRON    1776       0     0      0      3      0       0      0    1139  46414   
VERB     162       0     5      0     99      0 

### Final output function

In [6]:
# Prepare the transition matrix
tags_seq = [pair[1] for pair in tagged_words]
tags_matrix = np.zeros((len(tags), len(tags)), dtype='float32')
tag_list = list(tags)

for i, t1 in enumerate(tag_list):
    for j, t2 in enumerate(tag_list):
        tags_matrix[i, j] = t2_given_t1(t2, t1, tags_seq)
tags_df = pd.DataFrame(tags_matrix, columns=tag_list, index=tag_list)
        
def call_viterbi(sentence):
    words = sentence.split()
    predicted_tags_with_words = list(Viterbi_optimized(words, tags_df, tagged_words))
    return predicted_tags_with_words

In [7]:
import gradio as gr

# Define the Gradio interface
interface = gr.Interface(
    fn= call_viterbi,                       # Function that processes input
    inputs="text",                       # Input type: Text field for the user
    outputs="text",                      # Output type: Text display for POS tags
    title="Part-of-Speech Tagger",       # Title of the UI
    description="Enter a sentence and get the Part-of-Speech tags."  # Description
)

# Launch the UI in a browser
interface.launch()

Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.


