<a href="https://colab.research.google.com/github/segnig/Amharic-E-commerce-Data-Extractor/blob/task-4/notebooks/task_four.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# =====================================
# Step 0: Install & Import Libraries
# =====================================
!pip install -q transformers datasets torch seqeval accelerate gdown

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m31.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m31.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m17.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.1 MB/s[0m eta [36m

In [None]:
from datasets import DatasetDict, Dataset
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    Trainer,
    TrainingArguments
)
from seqeval.metrics import classification_report
import numpy as np
import gdown

# =====================================
# Step 1: Download and Load CoNLL File
# =====================================
# Google Drive id of the labelled file, and the local file name it is saved under.
file_id = "1oKfqYyLDPcotD85zM59LsBJEQm2bVt2f"
destination_path = "labeled_telegram_product_price_location.txt"

# Left as the last expression of the cell so the value returned by gdown is displayed.
gdown.download(
    url=f"https://drive.google.com/uc?id={file_id}",
    output=destination_path,
    quiet=False,
)



Downloading...
From: https://drive.google.com/uc?id=1oKfqYyLDPcotD85zM59LsBJEQm2bVt2f
To: /content/labeled_telegram_product_price_location.txt
100%|██████████| 2.26M/2.26M [00:00<00:00, 31.9MB/s]


'labeled_telegram_product_price_location.txt'

In [None]:
def load_ner_dataset(file_path):
    """Read a two-column CoNLL-style file into a list of sentences.

    Every non-blank line is expected to hold a token and its tag, separated by
    a tab or, when the line contains no tab, by arbitrary whitespace. A blank
    line ends the current sentence. Lines that do not split into exactly two
    fields are skipped.

    The file is decoded as ``utf-8-sig`` so that a leading byte-order mark is
    dropped instead of being glued to the first token (``str.strip`` does not
    remove it). Files without a BOM decode exactly as with plain UTF-8.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list of ``{'words': [...], 'labels': [...]}`` dicts, one per
        sentence, in file order. A sentence is only emitted if at least one of
        its lines was accepted.
    """
    sentences = []
    words, tags = [], []

    with open(file_path, 'r', encoding='utf-8-sig') as f:
        for raw_line in f:
            line = raw_line.strip()

            if not line:
                # Sentence boundary; consecutive blank lines emit nothing extra.
                if words:
                    sentences.append({'words': words, 'labels': tags})
                    words, tags = [], []
                continue

            # Prefer the tab as separator; fall back to any whitespace run.
            parts = line.split('\t') if '\t' in line else line.split()
            if len(parts) == 2:
                words.append(parts[0])
                tags.append(parts[1])

    # The file may end without a trailing blank line.
    if words:
        sentences.append({'words': words, 'labels': tags})
    return sentences

# Parse the downloaded file, then hold out 20% of the sentences for validation
# (fixed random_state so the split is the same on every run).
all_data = load_ner_dataset(destination_path)
train_data, val_data = train_test_split(all_data, test_size=0.2, random_state=42)

# Turn each list of per-sentence dicts into a column-oriented Dataset.
dataset = DatasetDict({
    split_name: Dataset.from_dict({
        column: [sentence[column] for sentence in split_rows]
        for column in ('words', 'labels')
    })
    for split_name, split_rows in (('train', train_data), ('validation', val_data))
})

In [None]:
# =====================================
# Step 2: Label Mapping
# =====================================
# Tag inventory for the task; a tag's position in this list is its integer id.
label_list = ["O", "B-PRODUCT", "I-PRODUCT", "B-PRICE", "I-PRICE", "B-LOC", "I-LOC"]
label2id = {label: i for i, label in enumerate(label_list)}
id2label = {i: label for i, label in enumerate(label_list)}

def encode_labels(example):
    """Replace the tag strings in ``example["labels"]`` with integer ids.

    Values that are already valid ids are passed through unchanged, so mapping
    this function over data that was encoded before (for instance when the
    cell is executed a second time) is a no-op instead of a ``KeyError``.

    Args:
        example: Mapping with a ``"labels"`` sequence of tag strings and/or ids.

    Returns:
        The same ``example`` object, with ``"labels"`` rebound to a list of ids.

    Raises:
        KeyError: If a value is neither a known tag nor a known id.
    """
    encoded = []
    for label in example["labels"]:
        if label in label2id:
            encoded.append(label2id[label])
        elif label in id2label:
            # Already an id from an earlier pass.
            encoded.append(int(label))
        else:
            raise KeyError(f"Unknown NER label {label!r}; expected one of {label_list}")
    example["labels"] = encoded
    return example

# Run encode_labels over the examples of the DatasetDict and rebind `dataset` to the result.
dataset = dataset.map(encode_labels)

Map:   0%|          | 0/2532 [00:00<?, ? examples/s]

Map:   0%|          | 0/634 [00:00<?, ? examples/s]

In [None]:
# =====================================
# Step 3: Metric Function
# =====================================
def compute_metrics(p):
    """Compute micro-averaged precision, recall and F1 with seqeval.

    Args:
        p: A pair ``(predictions, labels)``. ``predictions`` is arg-maxed over
            axis 2 to obtain one predicted id per position; positions whose
            gold label is ``-100`` are left out of the evaluation.

    Returns:
        A dict with the keys ``"precision"``, ``"recall"`` and ``"f1"``, taken
        from the ``"micro avg"`` row of seqeval's classification report.
    """
    scores, gold_ids = p
    pred_ids = np.argmax(scores, axis=2)

    true_labels, true_preds = [], []
    for pred_row, gold_row in zip(pred_ids, gold_ids):
        # Keep only scored positions and convert both ids to tag strings.
        tag_pairs = [
            (id2label[gold], id2label[pred])
            for pred, gold in zip(pred_row, gold_row)
            if gold != -100
        ]
        true_labels.append([gold_tag for gold_tag, _ in tag_pairs])
        true_preds.append([pred_tag for _, pred_tag in tag_pairs])

    report = classification_report(true_labels, true_preds, output_dict=True)
    micro = report["micro avg"]
    return {
        "precision": micro["precision"],
        "recall": micro["recall"],
        "f1": micro["f1-score"],
    }

In [None]:
# =====================================
# Step 4: Tokenization Function
# =====================================
def tokenize_and_align_labels(examples, tokenizer):
    """Tokenize pre-split sentences and build one label id per token.

    Inputs are truncated and padded to 128 tokens. For every token:

    * no word index (``None``)                         -> ``-100``
    * word index differs from the previous token's     -> that word's label
    * same word index as the previous token            -> the word's label if
      it is an ``I-`` tag, otherwise ``-100``

    Args:
        examples: Batch with ``"words"`` (list of token lists) and ``"labels"``
            (list of label-id lists aligned with ``"words"``).
        tokenizer: Tokenizer that is called on the words and whose result
            provides ``word_ids(batch_index=...)``.

    Returns:
        The tokenizer output with an added ``"labels"`` entry.
    """
    tokenized_inputs = tokenizer(
        examples["words"],
        truncation=True,
        padding="max_length",
        max_length=128,
        is_split_into_words=True,
    )

    labels = []
    for batch_idx, word_labels in enumerate(examples["labels"]):
        label_ids = []
        previous_word_idx = None
        for word_idx in tokenized_inputs.word_ids(batch_index=batch_idx):
            if word_idx is None:
                aligned = -100
            else:
                label_value = word_labels[word_idx]
                starts_word = word_idx != previous_word_idx
                # Continuation pieces are only labelled when the word carries an I- tag.
                if starts_word or id2label[label_value].startswith("I-"):
                    aligned = label_value
                else:
                    aligned = -100
            label_ids.append(aligned)
            previous_word_idx = word_idx

        # Top up with -100 should the label list be shorter than the input ids.
        shortfall = len(tokenized_inputs["input_ids"][batch_idx]) - len(label_ids)
        label_ids.extend([-100] * shortfall)
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs



In [None]:
# =====================================
# Step 5: Train and Compare Models
# =====================================
# Hub checkpoint id -> short display name used in the logs and the results table.
model_candidates = {
    "xlm-roberta-base": "XLM-Roberta",
    "bert-base-multilingual-cased": "mBERT",
    "Davlan/distilbert-base-multilingual-cased-ner-hrl": "DistilBERT",
}

# One dict of evaluation scores per trained model; reset on every run of the cell.
results = []

for model_name, display_name in model_candidates.items():
    print(f"\n🔧 Training model: {display_name} ({model_name})")

    # Checkpoint ids may contain '/', which would otherwise create nested directories.
    output_dir = f"./models/{model_name.replace('/', '_')}"

    # Tokenization is model-specific, so the dataset is re-tokenized per candidate.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenized_dataset = dataset.map(lambda x: tokenize_and_align_labels(x, tokenizer), batched=True)

    model = AutoModelForTokenClassification.from_pretrained(
        model_name,
        num_labels=len(label_list),
        id2label=id2label,
        label2id=label2id,
        # A candidate that is already a fine-tuned NER checkpoint can ship a
        # classifier head with a different number of labels than this task
        # (9 in the checkpoint vs. 7 here). Without this flag loading fails with
        # a size-mismatch RuntimeError; with it the mismatched head is
        # re-initialised and trained from scratch.
        ignore_mismatched_sizes=True,
    )

    training_args = TrainingArguments(
        output_dir=output_dir,
        eval_strategy="epoch",
        save_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=3,
        weight_decay=0.01,
        logging_dir="./logs",
        report_to="none"
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset["train"],
        eval_dataset=tokenized_dataset["validation"],
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )

    trainer.train()
    metrics = trainer.evaluate()

    print(f"📊 {display_name} Evaluation: {metrics}")

    # Trainer prefixes the keys returned by compute_metrics with "eval_".
    results.append({
        "model": display_name,
        "f1": metrics["eval_f1"],
        "precision": metrics["eval_precision"],
        "recall": metrics["eval_recall"]
    })

    # Store the final weights and tokenizer next to the per-epoch checkpoints.
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)


🔧 Training model: XLM-Roberta (xlm-roberta-base)


Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

Map:   0%|          | 0/2532 [00:00<?, ? examples/s]

Map:   0%|          | 0/634 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.036597,0.893158,0.925692,0.909134
2,No log,0.019164,0.945316,0.965517,0.95531
3,No log,0.014884,0.953853,0.973774,0.963711


📊 XLM-Roberta Evaluation: {'eval_loss': 0.014883667230606079, 'eval_precision': 0.9538534728829686, 'eval_recall': 0.9737736765420106, 'eval_f1': 0.9637106464792118, 'eval_runtime': 4.7913, 'eval_samples_per_second': 132.324, 'eval_steps_per_second': 8.349, 'epoch': 3.0}

🔧 Training model: mBERT (bert-base-multilingual-cased)


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

Map:   0%|          | 0/2532 [00:00<?, ? examples/s]

Map:   0%|          | 0/634 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.090762,0.777834,0.719054,0.74729
2,No log,0.064737,0.759342,0.791377,0.775028
3,No log,0.051905,0.825419,0.821975,0.823693


📊 mBERT Evaluation: {'eval_loss': 0.05190509185194969, 'eval_precision': 0.8254189944134078, 'eval_recall': 0.8219749652294854, 'eval_f1': 0.8236933797909408, 'eval_runtime': 4.9092, 'eval_samples_per_second': 129.146, 'eval_steps_per_second': 8.148, 'epoch': 3.0}

🔧 Training model: DistilBERT (Davlan/distilbert-base-multilingual-cased-ner-hrl)


tokenizer_config.json:   0%|          | 0.00/270 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/876 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Map:   0%|          | 0/2532 [00:00<?, ? examples/s]

Map:   0%|          | 0/634 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/539M [00:00<?, ?B/s]

RuntimeError: Error(s) in loading state_dict for Linear:
	size mismatch for bias: copying a param with shape torch.Size([9]) from checkpoint, the shape in current model is torch.Size([7]).

In [None]:
# =====================================
# Step 6: Final Result Summary
# =====================================
# Rows use the same column widths as the header so every score is printed
# directly under its heading.
print("\n📈 Final Model Comparison:")
print("{:<20} {:<10} {:<10} {:<10}".format("Model", "F1", "Precision", "Recall"))
for r in results:
    print("{:<20} {:<10.2f} {:<10.2f} {:<10.2f}".format(r["model"], r["f1"], r["precision"], r["recall"]))