In [None]:
# =====================================
# Step 0: Install & Import Libraries
# =====================================
! pip install -q transformers datasets torch seqeval accelerate gdown

In [15]:
from datasets import DatasetDict, Dataset
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    Trainer,
    TrainingArguments
)
from seqeval.metrics import classification_report
import numpy as np
import gdown

# =====================================
# Step 1: Download and Load CoNLL File
# =====================================
file_id = "1oKfqYyLDPcotD85zM59LsBJEQm2bVt2f"
destination_path = "labeled_telegram_product_price_location.txt"
gdown.download(f"https://drive.google.com/uc?id={file_id}", destination_path, quiet=False)



Downloading...
From: https://drive.google.com/uc?id=1oKfqYyLDPcotD85zM59LsBJEQm2bVt2f
To: /content/labeled_telegram_product_price_location.txt
100%|██████████| 2.26M/2.26M [00:00<00:00, 155MB/s]


'labeled_telegram_product_price_location.txt'

In [16]:
def load_ner_dataset(file_path):
    sentences = []
    current_sentence = []
    current_labels = []

    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                if current_sentence:
                    sentences.append({'words': current_sentence, 'labels': current_labels})
                    current_sentence, current_labels = [], []
            else:
                if '\t' in line:
                    parts = line.split('\t')
                else:
                    parts = line.split()
                if len(parts) == 2:
                    current_sentence.append(parts[0])
                    current_labels.append(parts[1])

    if current_sentence:
        sentences.append({'words': current_sentence, 'labels': current_labels})
    return sentences

all_data = load_ner_dataset(destination_path)
train_data, val_data = train_test_split(all_data, test_size=0.2, random_state=42)

dataset = DatasetDict({
    'train': Dataset.from_dict({
        'words': [x['words'] for x in train_data],
        'labels': [x['labels'] for x in train_data]
    }),
    'validation': Dataset.from_dict({
        'words': [x['words'] for x in val_data],
        'labels': [x['labels'] for x in val_data]
    })
})

In [17]:
# =====================================
# Step 2: Label Mapping
# =====================================
label_list = ["O", "B-PRODUCT", "I-PRODUCT", "B-PRICE", "I-PRICE", "B-LOC", "I-LOC"]
label2id = {label: i for i, label in enumerate(label_list)}
id2label = {i: label for i, label in enumerate(label_list)}

def encode_labels(example):
    example["labels"] = [label2id[label] for label in example["labels"]]
    return example

dataset = dataset.map(encode_labels)

Map:   0%|          | 0/2532 [00:00<?, ? examples/s]

Map:   0%|          | 0/634 [00:00<?, ? examples/s]

In [18]:
# =====================================
# Step 3: Metric Function
# =====================================
def compute_metrics(p):
    predictions, labels = p
    preds = np.argmax(predictions, axis=2)

    true_labels = []
    true_preds = []

    for pred, label in zip(preds, labels):
        temp_labels = []
        temp_preds = []
        for p_, l_ in zip(pred, label):
            if l_ != -100:
                temp_labels.append(id2label[l_])
                temp_preds.append(id2label[p_])
        true_labels.append(temp_labels)
        true_preds.append(temp_preds)

    report = classification_report(true_labels, true_preds, output_dict=True)
    return {
        "precision": report["micro avg"]["precision"],
        "recall": report["micro avg"]["recall"],
        "f1": report["micro avg"]["f1-score"],
    }

In [19]:
# =====================================
# Step 4: Tokenization Function
# =====================================
def tokenize_and_align_labels(examples, tokenizer):
    tokenized_inputs = tokenizer(
        examples["words"],
        truncation=True,
        padding="max_length",
        max_length=128,
        is_split_into_words=True
    )

    labels = []
    for i in range(len(examples["labels"])):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = []
        previous_word_idx = None
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(examples["labels"][i][word_idx])
            else:
                label_value = examples["labels"][i][word_idx]
                tag = id2label[label_value]
                label_ids.append(label_value if tag.startswith("I-") else -100)
            previous_word_idx = word_idx
        while len(label_ids) < len(tokenized_inputs["input_ids"][i]):
            label_ids.append(-100)
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs



In [25]:
# =====================================
# Step 5: Train and Compare Models
# =====================================
model_candidates = {
    "xlm-roberta-base": "XLM-Roberta",
    "bert-base-multilingual-cased": "mBERT",
    "distilbert-base-multilingual-cased": "DistilBERT",
}

results = []

for model_name, display_name in model_candidates.items():
    print(f"\n🔧 Training model: {display_name} ({model_name})")

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenized_dataset = dataset.map(lambda x: tokenize_and_align_labels(x, tokenizer), batched=True)

    model = AutoModelForTokenClassification.from_pretrained(
        model_name,
        num_labels=len(label_list),
        id2label=id2label,
        label2id=label2id
    )

    output_dir = "/content/drive/MyDrive/10 Academy" + f"/models/{model_name.replace('/', '_')}"

    training_args = TrainingArguments(
        output_dir=output_dir,
        eval_strategy="epoch",
        save_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=3,
        weight_decay=0.01,
        logging_dir="./logs",
        report_to="none"
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset["train"],
        eval_dataset=tokenized_dataset["validation"],
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )

    trainer.train()
    metrics = trainer.evaluate()

    print(f"📊 {display_name} Evaluation: {metrics}")

    results.append({
        "model": display_name,
        "f1": metrics["eval_f1"],
        "precision": metrics["eval_precision"],
        "recall": metrics["eval_recall"]
    })

    model.save_pretrained(f"./models/{model_name.replace('/', '_')}")
    tokenizer.save_pretrained(f"./models/{model_name.replace('/', '_')}")


🔧 Training model: XLM-Roberta (xlm-roberta-base)


Map:   0%|          | 0/2532 [00:00<?, ? examples/s]

Map:   0%|          | 0/634 [00:00<?, ? examples/s]

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.030457,0.901402,0.936863,0.91879
2,No log,0.016456,0.952358,0.97086,0.96152
3,No log,0.01282,0.963158,0.977659,0.970354


📊 XLM-Roberta Evaluation: {'eval_loss': 0.012820129282772541, 'eval_precision': 0.9631578947368421, 'eval_recall': 0.9776590577950461, 'eval_f1': 0.970354302241504, 'eval_runtime': 4.8817, 'eval_samples_per_second': 129.873, 'eval_steps_per_second': 8.194, 'epoch': 3.0}

🔧 Training model: mBERT (bert-base-multilingual-cased)


Map:   0%|          | 0/2532 [00:00<?, ? examples/s]

Map:   0%|          | 0/634 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.090762,0.777834,0.719054,0.74729
2,No log,0.064737,0.759342,0.791377,0.775028
3,No log,0.051905,0.825419,0.821975,0.823693


📊 mBERT Evaluation: {'eval_loss': 0.05190509185194969, 'eval_precision': 0.8254189944134078, 'eval_recall': 0.8219749652294854, 'eval_f1': 0.8236933797909408, 'eval_runtime': 5.2348, 'eval_samples_per_second': 121.113, 'eval_steps_per_second': 7.641, 'epoch': 3.0}

🔧 Training model: DistilBERT (distilbert-base-multilingual-cased)


Map:   0%|          | 0/2532 [00:00<?, ? examples/s]

Map:   0%|          | 0/634 [00:00<?, ? examples/s]

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.117443,0.693853,0.544274,0.610029
2,No log,0.076244,0.801778,0.710709,0.753502
3,No log,0.064981,0.848,0.786277,0.815973


📊 DistilBERT Evaluation: {'eval_loss': 0.06498074531555176, 'eval_precision': 0.848, 'eval_recall': 0.7862772369031061, 'eval_f1': 0.8159730574933847, 'eval_runtime': 2.7118, 'eval_samples_per_second': 233.791, 'eval_steps_per_second': 14.75, 'epoch': 3.0}


In [26]:
# =====================================
# Step 6: Final Result Summary
# =====================================
print("\n📈 Final Model Comparison:")
print("{:<20} {:<10} {:<10} {:<10}".format("Model", "F1", "Precision", "Recall"))
for r in results:
    print("{:<20} {:.2f}      {:.2f}       {:.2f}".format(r["model"], r["f1"], r["precision"], r["recall"]))


📈 Final Model Comparison:
Model                F1         Precision  Recall    
XLM-Roberta          0.97      0.96       0.98
mBERT                0.82      0.83       0.82
DistilBERT           0.82      0.85       0.79
