SETUP

In [2]:
#importing libraries
import os
import re #regex for chunk id extraction
import numpy as np
from datasets import load_dataset, Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    TrainingArguments,
    Trainer,
    DataCollatorForTokenClassification)
from sklearn.metrics import f1_score

PART 1: TOKEN CLASSIFICATION
- Replicating the Hugging Face token classification tutorial using DistilBERT on the English CoNLL-2003 dataset (to first understand transformer-based token classification before adapting it to the Hindi data).

In [3]:
# Load the CoNLL-2003 corpus (trust_remote_code=True is forwarded to load_dataset).
conll2003 = load_dataset("conll2003", trust_remote_code=True)

# Tag names exactly as declared on the "ner_tags" feature of the training split.
ner_feature = conll2003["train"].features["ner_tags"]
label_names = ner_feature.feature.names

# Two-way lookup between integer ids and tag names; the id is the position in label_names.
id2label = dict(enumerate(label_names))
label2id = {name: idx for idx, name in id2label.items()}

# Tokenizer belonging to the "distilbert-base-uncased" checkpoint.
tokenizer_en = AutoTokenizer.from_pretrained("distilbert-base-uncased")

#note to self:  4 entities PER, ORG, LOC, MISC.



In [4]:
# Tokenisation and label alignment

# Labels are given per word, but the tokenizer splits words into subword tokens, so the pre-split words are tokenised and each word's label is aligned with its subword tokens.
def tokenize_and_align(examples, tokenizer=None):
    """Tokenise a batch of pre-split sentences and align word-level tags to subword tokens.

    Args:
        examples: Batch mapping with "tokens" (one list of words per sentence) and
            "ner_tags" (one list of label ids per sentence, parallel to the words).
        tokenizer: Optional callable to use instead of the module-level
            ``tokenizer_en``. It is invoked as
            ``tokenizer(words, is_split_into_words=True, truncation=True)`` and its
            result must provide ``word_ids(i)`` and support item assignment.

    Returns:
        The tokenizer output with an extra "labels" entry: for each sentence, one
        label id per produced token. The first subword of every word carries the
        word's label; special tokens and later subwords get -100.
    """
    if tokenizer is None:
        # Default keeps the original behaviour for existing callers (e.g. Dataset.map).
        tokenizer = tokenizer_en

    tokenized = tokenizer(
        examples["tokens"],
        is_split_into_words=True,
        truncation=True
    )

    labels = []
    for i, label_seq in enumerate(examples["ner_tags"]):
        label_ids = []
        prev = None
        # word_ids(i) gives, per token of sentence i, the index of its source word
        # (None for tokens that do not come from any word).
        for w in tokenized.word_ids(i):
            if w is None or w == prev:
                # Special token, or a continuation subword of the previous word.
                label_ids.append(-100)
            else:
                # First subword of a new word: use that word's label.
                label_ids.append(label_seq[w])
            prev = w
        labels.append(label_ids)

    tokenized["labels"] = labels
    return tokenized

In [5]:
# Run tokenize_and_align over the dataset in batches (batched=True).
# remove_columns lists every column of the training split, so those original
# columns are dropped from the mapped result.
tokenized_conll = conll2003.map(
    tokenize_and_align,
    batched=True,
    remove_columns=conll2003["train"].column_names
)

Map:   0%|          | 0/14041 [00:00<?, ? examples/s]

Map:   0%|          | 0/3250 [00:00<?, ? examples/s]

Map:   0%|          | 0/3453 [00:00<?, ? examples/s]

In [6]:
# Load the "distilbert-base-uncased" checkpoint for token classification.
# num_labels is taken from label_names, and both id<->label mappings are passed along.
model_p1 = AutoModelForTokenClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(label_names),
    id2label=id2label,
    label2id=label2id
)

#from seqeval.metrics import f1_score
from sklearn.metrics import f1_score

#F1 macro score for token classification
def compute_metrics(p):
    """Compute macro-averaged F1 over all positions whose gold label is not -100.

    Args:
        p: Evaluation output exposing ``predictions`` (scores reduced with argmax
            over axis 2) and ``label_ids`` (gold label ids, -100 where a position
            must be ignored).

    Returns:
        dict with the single key "f1_macro".
    """
    predicted = np.argmax(p.predictions, axis=2)
    gold = p.label_ids

    # Walk predictions and gold labels in parallel, keeping only scored positions.
    kept = [
        (guess, truth)
        for guess_row, truth_row in zip(predicted, gold)
        for guess, truth in zip(guess_row, truth_row)
        if truth != -100
    ]
    all_preds = [guess for guess, _ in kept]
    all_labels = [truth for _, truth in kept]

    return {"f1_macro": f1_score(all_labels, all_preds, average="macro")}


Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:

# Fine-tune DistilBERT on CoNLL-2003: 3 epochs, batch size 16,
# with an evaluation on the validation split after every epoch.
trainer_p1 = Trainer(
    model = model_p1,
    args= TrainingArguments(
        output_dir= "./p1",
        evaluation_strategy = "epoch",
        num_train_epochs = 3,
        per_device_train_batch_size = 16,
        per_device_eval_batch_size = 16,
    ),
    train_dataset = tokenized_conll["train"],
    eval_dataset = tokenized_conll["validation"],
    tokenizer = tokenizer_en,
    data_collator = DataCollatorForTokenClassification(tokenizer_en), #to handle padding
    compute_metrics = compute_metrics,
)

trainer_p1.train()

# Evaluate the test split once and keep the metrics. The earlier version called
# evaluate() twice on the same split and threw the first result away.
results_p1 = trainer_p1.evaluate(tokenized_conll["test"])

print(f"\n Part 1 Results (English NER)")
print(f"Test F1 Macro: {results_p1['eval_f1_macro']:.4f}")




Epoch,Training Loss,Validation Loss,F1 Macro
1,No log,0.053084,0.910687
2,No log,0.048469,0.926186
3,0.098400,0.048771,0.931853







 Part 1 Results (English NER)
Test F1 Macro: 0.8836


PART 2: HINDI CHUNKING

In [None]:
# Adapting the token classification approach from Part 1 to Hindi chunking with IOB labels.
# Note to self: Beginning (B), Inside (I), Outside (O)
import os
import re
import numpy as np
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    TrainingArguments,
    Trainer,
    DataCollatorForTokenClassification
)
from sklearn.metrics import f1_score




In [None]:
# Directory containing the HDTB CoNLL-U files. The default is the original
# hard-coded location; set the HDTB_DATA_DIR environment variable to point the
# notebook at a different copy of the data without editing this cell.
DATA_DIR = os.environ.get("HDTB_DATA_DIR", "/srv/data/lt2326-h25/a2")
train_file = os.path.join(DATA_DIR, "hi_hdtb-ud-train.conllu")
dev_file   = os.path.join(DATA_DIR, "hi_hdtb-ud-dev.conllu")
test_file  = os.path.join(DATA_DIR, "hi_hdtb-ud-test.conllu")

#parsing Hindi conllu file into (return)sentences w/ tokens and chunk info
def parse_hindi_conll(filepath):
    """Read a CoNLL-U style file and return its sentences with chunk annotations.

    Sentences are separated by blank lines; lines starting with '#' are ignored.
    For every remaining line the second column is taken as the token, and the
    chunk is read from a ``ChunkId=<letters><digits>`` annotation anywhere on the
    line (e.g. ``ChunkId=NP2`` -> type ``NP``, raw id ``NP2``). Lines without
    such an annotation get ``'O'`` for both values.

    Multiword-token range lines (ID like ``3-4``) and empty-node lines (ID like
    ``3.1``) are skipped: they are not syntactic words, and keeping them would
    add spurious ``'O'`` tokens to the sentence.

    Args:
        filepath: Path of the UTF-8 encoded file to read.

    Returns:
        A list of dicts, one per sentence, each with the parallel lists
        ``'tokens'``, ``'chunk_ids'`` (chunk type) and ``'raw_chunk_ids'``
        (chunk type plus its numeric suffix).
    """
    # Compiled once per call instead of being re-looked-up for every line.
    chunk_pattern = re.compile(r'ChunkId=([A-Za-z]+)(\d*)')
    # IDs of multiword-token ranges ("3-4") and empty nodes ("3.1").
    non_word_id = re.compile(r'\d+[-.]\d+')

    sentences = []
    current = {'tokens': [], 'chunk_ids': [], 'raw_chunk_ids': []}

    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()

            # Blank line -> sentence boundary; only store non-empty sentences.
            if not line:
                if current['tokens']:
                    sentences.append(current)
                    current = {'tokens': [], 'chunk_ids': [], 'raw_chunk_ids': []}
                continue

            # Comment / metadata line.
            if line.startswith('#'):
                continue

            # Prefer tab-separated columns; fall back to any whitespace.
            parts = line.split('\t') if '\t' in line else line.split()
            if len(parts) < 2:
                continue

            # Not a syntactic word: skip so tokens and labels stay word-aligned.
            if non_word_id.fullmatch(parts[0]):
                continue

            token = parts[1]

            chunk_match = chunk_pattern.search(line)
            if chunk_match:
                chunk_type = chunk_match.group(1)              # e.g. NP
                raw_chunk = chunk_type + chunk_match.group(2)  # e.g. NP2
            else:
                raw_chunk = 'O'
                chunk_type = 'O'

            current['tokens'].append(token)
            current['chunk_ids'].append(chunk_type)
            current['raw_chunk_ids'].append(raw_chunk)

    # The file may end without a trailing blank line.
    if current['tokens']:
        sentences.append(current)
    return sentences

In [None]:

# Converting chunk IDs to IOB form
# Note to self: the first token of a chunk gets B, continuation tokens get I, and tokens outside any chunk get O

def make_iob(sentences):
    """Add an 'iob_labels' list to every sentence dict (in place) and return the input.

    A token whose chunk type is 'O' is labelled 'O'. Otherwise it is labelled
    'I-<type>' when its raw chunk id equals that of the token directly before
    it in the same sentence, and 'B-<type>' in every other case (including the
    first token of a sentence).
    """
    for sentence in sentences:
        chunk_types = sentence['chunk_ids']
        raw_ids = sentence['raw_chunk_ids']
        tags = []
        for position, (chunk_type, raw_id) in enumerate(zip(chunk_types, raw_ids)):
            if chunk_type == 'O':
                tags.append('O')
                continue
            # Same raw id as the immediately preceding token -> still inside that chunk.
            continues_chunk = position > 0 and raw_ids[position - 1] == raw_id
            tags.append(f'I-{chunk_type}' if continues_chunk else f'B-{chunk_type}')
        sentence['iob_labels'] = tags
    return sentences

# Parse each split and attach IOB labels to its sentences.
train_sents = make_iob(parse_hindi_conll(train_file))
dev_sents   = make_iob(parse_hindi_conll(dev_file))
test_sents  = make_iob(parse_hindi_conll(test_file))
# Report how many sentences each split contains.
print(f"Train sentences: {len(train_sents)}")
print(f"Dev sentences: {len(dev_sents)}")
print(f"Test sentences: {len(test_sents)}")

# Print the first training sentence token by token to eyeball the IOB labels.
# NOTE(review): indexing [0] raises IndexError if the training file yields no sentences.
example = train_sents[0]
print("\n Example sentence with IOB labels: ")
for tok, label in zip(example['tokens'], example['iob_labels']):
    print(f"{tok:10s} -> {label}")

In [None]:
# Build the label inventory and the two-way id <-> label mappings for the model.
# Only labels that occur in the training sentences are collected.
# Note: label2id / id2label are re-bound here, replacing the Part 1 mappings.
seen_labels = set()
for train_sent in train_sents:
    seen_labels.update(train_sent["iob_labels"])
labels = sorted(seen_labels)

# The id of a label is its position in the sorted list.
id2label = dict(enumerate(labels))
label2id = {label: idx for idx, label in id2label.items()}

print("\n Labels:", labels)
print( f"Total number of labels: {len(labels)}")
print("Label2ID:", label2id)

In [None]:
#Conv parsed sentences to Hugging face datset format
def to_dataset(sents):
    """Turn parsed sentences into a ``Dataset`` with "tokens" and "labels" columns.

    Args:
        sents: Sentence dicts providing "tokens" and "iob_labels".

    Returns:
        A ``Dataset`` whose "labels" column holds the ids looked up in the
        module-level ``label2id`` (a label missing from it raises KeyError).
    """
    token_column = [sent["tokens"] for sent in sents]
    label_column = []
    for sent in sents:
        # Map each IOB string to its numeric id.
        label_column.append([label2id[tag] for tag in sent["iob_labels"]])
    return Dataset.from_dict({"tokens": token_column, "labels": label_column})
# Bundle the three converted splits under the keys "train", "validation" and "test"
# (the dev sentences are stored as "validation").
data = DatasetDict({
    "train": to_dataset(train_sents),
    "validation": to_dataset(dev_sents),
    "test": to_dataset(test_sents)
})


In [None]:

# Model 1: mBERT
# Load the tokenizer of the "bert-base-multilingual-cased" checkpoint.
tokenizer_hi = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")

#tokenise hindi text + align labels with subword token
def tokenize_hindi(examples, tokenizer=None):
    """Tokenise a batch of pre-split Hindi sentences and align labels to subword tokens.

    Args:
        examples: Batch mapping with "tokens" (one list of words per sentence) and
            "labels" (one list of label ids per sentence, parallel to the words).
        tokenizer: Optional callable to use instead of the module-level
            ``tokenizer_hi``. It is invoked as
            ``tokenizer(words, is_split_into_words=True, truncation=True)`` and its
            result must provide ``word_ids(i)`` and support item assignment.

    Returns:
        The tokenizer output with "labels" set to the aligned ids: the first
        subword of each word carries the word's label, while special tokens and
        later subwords get -100.
    """
    if tokenizer is None:
        # Default keeps the original behaviour for existing callers (e.g. Dataset.map).
        tokenizer = tokenizer_hi

    tok = tokenizer(
        examples["tokens"],
        is_split_into_words=True,
        truncation=True
    )

    aligned = []
    for i, labs in enumerate(examples["labels"]):
        ids, prev = [], None
        # word_ids(i): source-word index per token of sentence i, None if there is none.
        for w in tok.word_ids(i):
            if w is None or w == prev:
                # Special token, or a continuation subword of the previous word.
                ids.append(-100)
            else:
                # First subword of a new word: use that word's label.
                ids.append(labs[w])
            prev = w
        aligned.append(ids)

    tok["labels"] = aligned
    return tok
# Apply tokenize_hindi to the DatasetDict in batches. Only the raw "tokens"
# column is removed; the "labels" value returned by tokenize_hindi is the aligned list.
tokenized_hi = data.map(tokenize_hindi, batched=True, remove_columns=["tokens"])

In [None]:
# Load "bert-base-multilingual-cased" for token classification, sized to the
# Hindi chunk label list and given both id<->label mappings.
model_hi = AutoModelForTokenClassification.from_pretrained(
    "bert-base-multilingual-cased",
    num_labels = len(labels),
    id2label = id2label,
    label2id = label2id
)


In [None]:

def compute_metrics(p):
    """Macro F1 over every position whose gold label differs from -100.

    This re-defines (and therefore replaces) the function of the same name
    declared in Part 1.

    Args:
        p: Evaluation output exposing ``predictions`` (reduced with argmax over
            axis 2) and ``label_ids`` (gold ids, -100 marks ignored positions).

    Returns:
        dict with the single key "f1_macro".
    """
    predicted = np.argmax(p.predictions, axis=2)
    gold = p.label_ids  # local name chosen to avoid shadowing the global `labels` list

    # (gold, predicted) pairs for all scored positions, in reading order.
    pairs = [
        (g_i, p_i)
        for pred_row, gold_row in zip(predicted, gold)
        for p_i, g_i in zip(pred_row, gold_row)
        if g_i != -100
    ]
    if pairs:
        all_labels, all_preds = (list(column) for column in zip(*pairs))
    else:
        all_labels, all_preds = [], []

    return {"f1_macro": f1_score(all_labels, all_preds, average="macro")}


In [None]:
# Training configuration for the mBERT run: 5 epochs, batch size 16,
# evaluation and checkpoint saving after every epoch, outputs under ./hindi.
hindi_args = TrainingArguments(
    output_dir="./hindi",
    evaluation_strategy="epoch",
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    save_strategy="epoch",
)

# Collator built from the mBERT tokenizer; used to batch the tokenised examples.
hindi_collator = DataCollatorForTokenClassification(tokenizer_hi)

# Trainer for mBERT: trains on "train", evaluates on "validation".
trainer_hi = Trainer(
    model=model_hi,
    args=hindi_args,
    train_dataset=tokenized_hi["train"],
    eval_dataset=tokenized_hi["validation"],
    tokenizer=tokenizer_hi,
    data_collator=hindi_collator,
    compute_metrics=compute_metrics,
)

In [None]:
# Train mBERT, then evaluate once on the test split and report its macro F1.
trainer_hi.train()
results_hi = trainer_hi.evaluate(tokenized_hi["test"])
print(f"\n mBERT Test F1 Macro:  {results_hi['eval_f1_macro']:.4f}")

In [None]:
# Model 2: multilingual DistilBERT (chosen for faster training)
# Load the tokenizer of the "distilbert-base-multilingual-cased" checkpoint.
tokenizer_distil = AutoTokenizer.from_pretrained("distilbert-base-multilingual-cased")
#follow same alignment as mBERT
def tokenize_hindi_distil(examples, tokenizer=None):
    """Tokenise a batch of pre-split Hindi sentences for DistilBERT and align labels.

    Args:
        examples: Batch mapping with "tokens" (one list of words per sentence) and
            "labels" (one list of label ids per sentence, parallel to the words).
        tokenizer: Optional callable to use instead of the module-level
            ``tokenizer_distil``. It is invoked as
            ``tokenizer(words, is_split_into_words=True, truncation=True)`` and its
            result must provide ``word_ids(i)`` and support item assignment.

    Returns:
        The tokenizer output with "labels" set to the aligned ids: the first
        subword of each word carries the word's label, while special tokens and
        later subwords get -100.
    """
    if tokenizer is None:
        # Default keeps the original behaviour for existing callers (e.g. Dataset.map).
        tokenizer = tokenizer_distil

    tok = tokenizer(
        examples["tokens"],
        is_split_into_words=True,
        truncation=True
    )

    aligned = []
    for i, labs in enumerate(examples["labels"]):
        ids, prev = [], None
        # word_ids(i): source-word index per token of sentence i, None if there is none.
        for w in tok.word_ids(i):
            if w is None or w == prev:
                # Special token, or a continuation subword of the previous word.
                ids.append(-100)
            else:
                # First subword of a new word: use that word's label.
                ids.append(labs[w])
            prev = w
        aligned.append(ids)

    tok["labels"] = aligned
    return tok
# Tokenise all splits with the DistilBERT tokenizer; the raw "tokens" column is dropped.
tokenized_hi_distil = data.map(tokenize_hindi_distil, batched=True, remove_columns=["tokens"])

# Token-classification model sized to the Hindi chunk label list.
model_distil = AutoModelForTokenClassification.from_pretrained(
    "distilbert-base-multilingual-cased",
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id,
)

# Training configuration: 5 epochs, batch size 16, evaluation after every epoch.
distil_args = TrainingArguments(
    output_dir="./hindi_distilbert",
    evaluation_strategy="epoch",
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
)
distil_collator = DataCollatorForTokenClassification(tokenizer_distil)

trainer_distil = Trainer(
    model=model_distil,
    args=distil_args,
    train_dataset=tokenized_hi_distil["train"],
    eval_dataset=tokenized_hi_distil["validation"],
    tokenizer=tokenizer_distil,
    data_collator=distil_collator,
    compute_metrics=compute_metrics,
)

# Train, then evaluate once on the test split.
trainer_distil.train()
results_distil = trainer_distil.evaluate(tokenized_hi_distil["test"])

# Side-by-side test scores. The gap is printed as an absolute F1 difference and,
# in parentheses, relative to the DistilBERT score.
print("\n Model Comparison on Hindi Chunking:")
print(f"mBERT: F1 = {results_hi['eval_f1_macro']:.4f}")
print(f"DistillBERT: F1 = {results_distil['eval_f1_macro']:.4f}")

print(f"Performance gap: {(results_hi['eval_f1_macro'] - results_distil['eval_f1_macro']):.4f} "
      f"({((results_hi['eval_f1_macro'] - results_distil['eval_f1_macro']) / results_distil['eval_f1_macro'] * 100):.1f}%)")

PART 3: PERFORMANCE ANALYSIS

- Both models were evaluated on Hindi IOB chunking, identifying chunk boundaries using BIO labels.

Evaluation Metric:
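- I used the macro F1 score because it averages the per-label F1 equally across all IOB labels (a B- and an I- label for each chunk type, plus O where it occurs), regardless of class frequency. This suits IOB tagging because it rewards balanced performance across all label types and reduces bias towards the majority class. Note that the score is computed per token (on the first subword of each word), not per complete chunk.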

Individual Model Analysis:
- mBERT F1 = 0.8995
    - Training showed steady improvement, e.g. starting at 0.824 (epoch 1) and peaking at 0.907 (epoch 4).
    - Validation loss decreased consistently from 0.076 to 0.059 and then increased slightly to 0.064.
    - Overall, the model demonstrated effective transfer learning from multilingual to Hindi

- DistilBERT F1 = 0.8716
    - Significantly smaller than mBERT in size
    - Training progression was slower in comparison, e.g. starting at 0.801 (epoch 1) and reaching 0.867 (epoch 4)
    - Validation loss decreased from 0.105 to 0.075
    - Overall, it showed a 3.2% relative performance gap compared to mBERT (about 0.028 absolute F1)

Comparative Analysis:
- Performance: mBERT clearly outperformed DistilBERT, with a 3.2% relative difference (0.8995 vs 0.8716). Such a performance gap is consistent with what is typically seen when comparing full BERT models to their distilled versions, due to e.g. model capacity (number of parameters) and attention mechanisms (the full set of attention layers is better at capturing long-range dependencies in Hindi syntax).
- However, despite lower performance, DistilBERT does offer more practical advantages such as:
    - Faster training time and inference speed (faster predictions)
    - Its smaller model size enables deployment in resource-constrained environments

- Trade-offs: 
    - mBERT: when maximum accuracy is critical and sufficient computational resources are available
    - DistilBERT: when fast training or inference is prioritised, or when memory is limited
    - Essentially, accuracy vs efficiency



In [None]:
# Visualisations
import matplotlib.pyplot as plt

# Per-epoch macro F1 for both models, entered by hand.
# NOTE(review): presumably transcribed from the per-epoch evaluation logs of the
# two training runs - re-check these numbers after any re-training.
epochs = [1, 2, 3, 4, 5]
mbert_scores = [0.824197, 0.837153, 0.899832, 0.906529, 0.904749]
distilbert_scores = [0.800774, 0.826090, 0.834281, 0.867580, 0.866282]

# One figure, one axes: both curves share the same epoch axis.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(epochs, mbert_scores, 'b-o', label='mBERT', linewidth=2)
ax.plot(epochs, distilbert_scores, 'r-s', label='DistilBERT-multilingual', linewidth=2)
ax.set_xlabel('Epoch', fontsize=12)
ax.set_ylabel('F1 Macro Score', fontsize=12)
ax.set_title('Training Progression: mBERT vs DistilBERT on Hindi Chuking', fontsize=14)
ax.legend(fontsize=11)
ax.grid(True, alpha=0.3)
ax.set_ylim(0.75, 0.95)
plt.show()


# Final test-set numbers, read from the evaluation results of both trainers.
print("\n Final Smmary of  Test Results:")
print(f"mBERT: F1 = {results_hi['eval_f1_macro']:.4f}")
print(f"DistilBERT: F1 = {results_distil['eval_f1_macro']:.4f}")
print(f"Performance gap:{(results_hi['eval_f1_macro'] - results_distil['eval_f1_macro']):.4f} ({((results_hi['eval_f1_macro'] - results_distil['eval_f1_macro']) / results_distil['eval_f1_macro'] * 100):.1f}%)")