In [1]:
# Step 1: Install Hugging Face dataset library
!pip install datasets

# Step 2: Import required modules
from datasets import load_dataset
from collections import defaultdict, Counter
import math
import random



In [2]:
# Step 3: Load the Hindi ("hi") configuration of the WikiANN dataset
# from the Hugging Face Hub (downloaded on first use, see the cell output).
dataset = load_dataset("unimelb-nlp/wikiann", "hi")

# Step 4: Pick out the train / validation / test splits that ship with the
# dataset (nothing is re-split here; the three keys are simply looked up).
train_data = dataset['train']
valid_data = dataset['validation']
test_data = dataset['test']

# Sanity check: report how many sentences each split contains.
print("Train samples:", len(train_data))
print("Validation samples:", len(valid_data))
print("Test samples:", len(test_data))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

hi/validation-00000-of-00001.parquet:   0%|          | 0.00/64.7k [00:00<?, ?B/s]

hi/test-00000-of-00001.parquet:   0%|          | 0.00/65.0k [00:00<?, ?B/s]

hi/train-00000-of-00001.parquet:   0%|          | 0.00/312k [00:00<?, ?B/s]

Generating validation split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/5000 [00:00<?, ? examples/s]

Train samples: 5000
Validation samples: 1000
Test samples: 1000


In [3]:
# Step 5: Get tag names
# label_list is indexed by the integer ids stored in "ner_tags" (see
# convert_data below), so position i holds the string name of tag id i.
label_list = dataset["train"].features["ner_tags"].feature.names
print("NER labels:", label_list)
def convert_data(data, labels=None):
    """Turn a dataset split into a list of ``(tokens, tag_names)`` pairs.

    Args:
        data: Mapping-like object exposing parallel ``"tokens"`` and
            ``"ner_tags"`` columns; each ``ner_tags`` entry is a sequence
            of integer tag ids.
        labels: Optional sequence mapping a tag id to its string name.
            When omitted, the notebook-level ``label_list`` is used, which
            keeps existing one-argument calls working unchanged.

    Returns:
        A list with one ``(tokens, tag_names)`` tuple per sentence, where
        ``tag_names`` is a new list of strings aligned with ``tokens``.
    """
    if labels is None:
        # Resolved at call time so the notebook global is only required
        # when the caller does not pass an explicit label list.
        labels = label_list
    return [
        (tokens, [labels[tag_id] for tag_id in tag_ids])
        for tokens, tag_ids in zip(data["tokens"], data["ner_tags"])
    ]

# Convert every split to the (tokens, tag_names) representation used by
# the training, decoding and evaluation cells below.
train_sents = convert_data(train_data)
valid_sents = convert_data(valid_data)
test_sents = convert_data(test_data)

# Show the first training sentence and its tags as a quick visual check.
print("Example sentence:", train_sents[0][0])
print("Example tags:", train_sents[0][1])

NER labels: ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC']
Example sentence: ['टैपी', 'ने', 'अपने', 'उत्पादों', 'को', 'एशिया', 'के', 'अपतटीय', 'भागों', 'में', 'भेजने', 'का', 'फैसला', 'लिया।', 'उन्होंने', 'प्रोमोशन', 'के', 'लिए', 'मैडोना', 'को', 'अपने', 'साथ', 'मिलाया।']
Example tags: ['O', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PER', 'O', 'O', 'O', 'O']


In [4]:
# Step 6: Train the HMM using counts (with smoothing)
def train_hmm(sents, smoothing=1e-5):
    """Estimate add-k smoothed HMM parameters (as log-probabilities).

    Args:
        sents: Iterable of ``(tokens, tagseq)`` pairs with aligned
            sequences. Pairs whose ``tagseq`` is empty are skipped.
        smoothing: Additive constant applied to every count. It should be
            positive: with ``0`` any unseen start/transition makes
            ``math.log`` fail on a zero probability.

    Returns:
        A dict with keys:
            ``"tags"``  - sorted list of tags seen in training,
            ``"vocab"`` - set of tokens seen in training,
            ``"start"`` - ``{tag: log P(tag at position 0)}``,
            ``"trans"`` - ``{prev: {curr: log P(curr | prev)}}`` for every
                          pair of known tags,
            ``"emit"``  - ``{tag: {word: log P(word | tag)}}`` containing
                          only the words actually observed with that tag.
    """
    start_counts = Counter()
    trans_counts = defaultdict(Counter)
    emit_counts = defaultdict(Counter)
    tags = set()
    vocab = set()

    for tokens, tagseq in sents:
        if not tagseq:
            continue
        start_counts[tagseq[0]] += 1
        for i, word in enumerate(tokens):
            tag = tagseq[i]
            vocab.add(word)
            tags.add(tag)
            emit_counts[tag][word] += 1
            if i > 0:
                trans_counts[tagseq[i - 1]][tag] += 1

    tags = sorted(tags)
    n_tags = len(tags)

    # Every denominator below is computed once per distribution instead of
    # once per entry; for emissions that avoids re-summing the whole
    # per-tag counter for each word (quadratic in the vocabulary size).
    start_denom = sum(start_counts.values()) + smoothing * n_tags
    start_log = {
        t: math.log((start_counts[t] + smoothing) / start_denom) for t in tags
    }

    trans_log = {}
    for t1 in tags:
        row = trans_counts[t1]
        row_denom = sum(row.values()) + smoothing * n_tags
        trans_log[t1] = {
            t2: math.log((row[t2] + smoothing) / row_denom) for t2 in tags
        }

    # The "+ 1" reserves one smoothing slot for out-of-vocabulary words.
    vocab_mass = smoothing * (len(vocab) + 1)
    emit_log = {}
    for t in tags:
        word_counts = emit_counts[t]
        emit_denom = sum(word_counts.values()) + vocab_mass
        emit_log[t] = {
            w: math.log((c + smoothing) / emit_denom)
            for w, c in word_counts.items()
        }

    return {"tags": tags, "vocab": vocab, "start": start_log, "trans": trans_log, "emit": emit_log}

# Fit the HMM on the training sentences (default smoothing) and report how
# many distinct tags were observed.
model = train_hmm(train_sents)
print("Training done! Total tags:", len(model['tags']))

Training done! Total tags: 7


In [5]:
# Step 7: Build simple suffix-based Hebbian associations for unseen words
def build_hebbian(train_sents):
    """Build a suffix -> tag association table from tagged sentences.

    For every token of at least three characters, the lower-cased final
    three characters are counted against the token's tag. The counts are
    then normalised per suffix.

    Args:
        train_sents: Iterable of ``(tokens, tags)`` pairs.

    Returns:
        ``{suffix: {tag: log(count / total count for that suffix)}}``,
        containing only the (suffix, tag) combinations that were observed.
    """
    suffix_tag_counts = defaultdict(Counter)
    for words, labels in train_sents:
        for word, label in zip(words, labels):
            lowered = word.lower()
            if len(lowered) < 3:
                # Too short to carry a three-character suffix.
                continue
            suffix_tag_counts[lowered[-3:]][label] += 1

    assoc_prob = {}
    for suffix, tag_counts in suffix_tag_counts.items():
        total = sum(tag_counts.values())
        assoc_prob[suffix] = {
            tag: math.log(count / total) for tag, count in tag_counts.items()
        }
    return assoc_prob

# Build the suffix/tag association table from the training sentences; the
# decoder consults it for words that have no stored emission score.
hebbian_assoc = build_hebbian(train_sents)
print("Hebbian associative features ready!")

Hebbian associative features ready!


In [6]:
# Step 8: Implement Viterbi algorithm to decode most likely tag sequence
def viterbi(tokens, model, hebb_assoc):
    """Decode the highest-scoring tag sequence for ``tokens``.

    Args:
        tokens: Sequence of word strings. An empty sequence yields ``[]``.
        model: Dict with ``"tags"``, ``"start"``, ``"trans"`` and ``"emit"``
            entries holding log-scores, as produced by ``train_hmm``.
        hebb_assoc: ``{suffix: {tag: log score}}`` table used as a back-off
            when ``(tag, word)`` has no stored emission score.

    Returns:
        A list of tags, one per token.
    """
    T = len(tokens)
    if T == 0:
        # Nothing to decode; previously this crashed on tokens[0].
        return []

    tags = model["tags"]
    floor = math.log(1e-12)  # score for anything the model knows nothing about
    V = [{} for _ in range(T)]      # V[t][tag]: best log-score ending in tag at t
    back = [{} for _ in range(T)]   # back[t][tag]: predecessor tag on that path

    def emit_log(tag, word):
        # Use the trained emission when (tag, word) was observed; otherwise
        # fall back to the 3-character-suffix table, never going below floor.
        seen = model["emit"][tag]
        if word in seen:
            return seen[word]
        lw = word.lower()
        suf = lw[-3:] if len(lw) >= 3 else None
        if suf in hebb_assoc:
            return max(hebb_assoc[suf].get(tag, floor), floor)
        return floor

    for tag in tags:
        V[0][tag] = model["start"].get(tag, floor) + emit_log(tag, tokens[0])
        back[0][tag] = None

    for t in range(1, T):
        for curr in tags:
            # The emission does not depend on the previous tag, so compute
            # it once per (t, curr) instead of once per (t, curr, prev).
            emission = emit_log(curr, tokens[t])
            # -inf (not a finite sentinel) so any real score can win.
            best_score, best_prev = float("-inf"), None
            for prev in tags:
                score = V[t-1][prev] + model["trans"][prev].get(curr, floor) + emission
                if score > best_score:
                    best_score, best_prev = score, prev
            V[t][curr] = best_score
            back[t][curr] = best_prev

    # Follow back-pointers from the best final tag to recover the path.
    best_final = max(tags, key=lambda t: V[-1][t])
    best_path = [best_final]
    for t in range(T-1, 0, -1):
        best_path.append(back[t][best_path[-1]])
    best_path.reverse()
    return best_path

In [7]:
# Step 9: Evaluate performance
def iob_to_entities(tags):
    """Extract entity spans from a sequence of IOB2 tags.

    An entity starts at a ``B-<type>`` tag and extends over the directly
    following ``I-<type>`` tags of the *same* type. Any other tag ("O", a
    new ``B-``, or an ``I-`` of a different type) ends the open entity.
    ``I-`` tags with no open entity of that type do not start an entity.

    Args:
        tags: Sequence of tag strings such as ``"O"``, ``"B-PER"``, ``"I-PER"``.

    Returns:
        A set of ``(type, start_index, end_index)`` tuples with inclusive
        indices.
    """
    ents = set()
    start, label = None, None
    for i, tag in enumerate(tags):
        if tag.startswith("B-"):
            if start is not None:
                ents.add((label, start, i - 1))
            label, start = tag[2:], i
        elif start is not None and tag != "I-" + label:
            # "O" or a continuation tag of another type: close the span
            # here rather than letting a mismatched I- tag extend it.
            ents.add((label, start, i - 1))
            start = None
    if start is not None:
        ents.add((label, start, len(tags) - 1))
    return ents

def evaluate(sents, model, hebb_assoc, max_samples=100):
    """Print entity-level precision, recall and F1 for the tagger.

    Only the first ``max_samples`` sentences of ``sents`` are scored. Each
    sentence is decoded with ``viterbi`` and the predicted spans are
    compared with the gold spans (both obtained via ``iob_to_entities``);
    a span counts as correct only on an exact (type, start, end) match.

    Args:
        sents: Sequence of ``(tokens, gold_tags)`` pairs.
        model: HMM parameter dict passed through to ``viterbi``.
        hebb_assoc: Suffix association table passed through to ``viterbi``.
        max_samples: Upper bound on the number of sentences evaluated.

    Returns:
        None. The three metrics are printed, rounded to three decimals.
    """
    hits = spurious = missed = 0
    for words, gold_tags in sents[:max_samples]:
        predicted_tags = viterbi(words, model, hebb_assoc)
        gold_spans = iob_to_entities(gold_tags)
        pred_spans = iob_to_entities(predicted_tags)
        hits += len(gold_spans & pred_spans)
        spurious += len(pred_spans - gold_spans)
        missed += len(gold_spans - pred_spans)

    # The tiny constant keeps each ratio defined when a denominator is zero.
    precision = hits / (hits + spurious + 1e-12)
    recall = hits / (hits + missed + 1e-12)
    f_score = 2 * precision * recall / (precision + recall + 1e-12)
    print(f"Precision: {precision:.3f}  Recall: {recall:.3f}  F1: {f_score:.3f}")

# Score the tagger on the validation split. With the default max_samples
# only the first 100 validation sentences contribute to the printed metrics.
evaluate(valid_sents, model, hebbian_assoc)

Precision: 0.318  Recall: 0.301  F1: 0.309


In [8]:
# Testing
# Decode one hand-written, pre-tokenised Hindi sentence and print each
# token next to its predicted tag.
sample = ["प्रधानमंत्री", "नरेंद्र", "मोदी", "दिल्ली", "में", "बैठक", "कर", "रहे", "हैं", "."]
predicted = viterbi(sample, model, hebbian_assoc)
print(list(zip(sample, predicted)))

[('प्रधानमंत्री', 'B-PER'), ('नरेंद्र', 'I-PER'), ('मोदी', 'I-PER'), ('दिल्ली', 'O'), ('में', 'O'), ('बैठक', 'O'), ('कर', 'O'), ('रहे', 'O'), ('हैं', 'O'), ('.', 'O')]


In [10]:
# Hand-written sentences for a qualitative look at the tagger.
# NOTE: the gold tags here are coarse labels without B-/I- prefixes, and
# DATE / EVENT are not among the labels the model was trained on, so they
# are for eyeballing only and are not comparable tag-for-tag with the
# predicted IOB2 tags.
examples = [
    # PERSON examples
    (["श्री", "नरेंद्र", "मोदी", "दिल्ली", "गए", "।"],
     ["O", "PER", "PER", "LOC", "O", "O"]),

    (["प्रियंका", "चोपड़ा", "मुंबई", "में", "रहती", "हैं", "।"],
     ["PER", "PER", "LOC", "O", "O", "O", "O"]),

    # ORGANIZATION examples
    (["भारतीय", "अंतरिक्ष", "अनुसंधान", "संस्थान", "ने", "उपग्रह", "लॉन्च", "किया", "।"],
     ["ORG", "ORG", "ORG", "ORG", "O", "O", "O", "O", "O"]),

    (["टाटा", "कंसल्टेंसी", "सर्विसेज़", "भारत", "की", "सबसे", "बड़ी", "कंपनी", "है", "।"],
     ["ORG", "ORG", "ORG", "LOC", "O", "O", "O", "O", "O", "O"]),

    # LOCATION examples
    (["वाराणसी", "उत्तर", "प्रदेश", "में", "एक", "प्राचीन", "शहर", "है", "।"],
     ["LOC", "LOC", "LOC", "O", "O", "O", "O", "O", "O"]),

    (["भारत", "और", "नेपाल", "के", "बीच", "संबंध", "मजबूत", "हैं", "।"],
     ["LOC", "O", "LOC", "O", "O", "O", "O", "O", "O"]),

    # MISC / Date / Miscellaneous examples
    (["२६", "जनवरी", "को", "दिल्ली", "में", "गणतंत्र", "दिवस", "मनाया", "गया", "।"],
     ["DATE", "DATE", "O", "LOC", "O", "EVENT", "EVENT", "O", "O", "O"]),

    # The final "O" (for the sentence-ending danda) was missing, leaving
    # six tokens paired with only five gold tags.
    (["कोहली", "ने", "आईसीसी", "पुरस्कार", "जीता", "।"],
     ["PER", "O", "ORG", "O", "O", "O"])
]

for tokens, gold in examples:
    # Guard against misaligned hand-written annotations.
    assert len(tokens) == len(gold), f"token/tag length mismatch: {tokens}"
    pred = viterbi(tokens, model, hebbian_assoc)
    print("Sentence:", " ".join(tokens))
    print("Gold tags:", gold)
    print("Pred tags:", pred)
    print("-" * 80)


Sentence: श्री नरेंद्र मोदी दिल्ली गए ।
Gold tags: ['O', 'PER', 'PER', 'LOC', 'O', 'O']
Pred tags: ['O', 'B-PER', 'I-PER', 'O', 'O', 'O']
--------------------------------------------------------------------------------
Sentence: प्रियंका चोपड़ा मुंबई में रहती हैं ।
Gold tags: ['PER', 'PER', 'LOC', 'O', 'O', 'O', 'O']
Pred tags: ['B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'O']
--------------------------------------------------------------------------------
Sentence: भारतीय अंतरिक्ष अनुसंधान संस्थान ने उपग्रह लॉन्च किया ।
Gold tags: ['ORG', 'ORG', 'ORG', 'ORG', 'O', 'O', 'O', 'O', 'O']
Pred tags: ['O', 'B-PER', 'I-PER', 'O', 'O', 'O', 'O', 'B-LOC', 'O']
--------------------------------------------------------------------------------
Sentence: टाटा कंसल्टेंसी सर्विसेज़ भारत की सबसे बड़ी कंपनी है ।
Gold tags: ['ORG', 'ORG', 'ORG', 'LOC', 'O', 'O', 'O', 'O', 'O', 'O']
Pred tags: ['O', 'B-PER', 'I-PER', 'I-PER', 'O', 'O', 'B-LOC', 'O', 'O', 'O']
-----------------------------------