In [20]:
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/tenx/Data-Extraction-From-Telegram-Channels-LLMs-Week_4

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/tenx/Data-Extraction-From-Telegram-Channels-LLMs-Week_4


In [21]:
!pip install transformers datasets accelerate seqeval -q

from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from datasets import Dataset, Features, Sequence, Value, ClassLabel
import pandas as pd
import numpy as np
from seqeval.metrics import f1_score, precision_score, recall_score, classification_report

#experiment with 'Davlan/xlm-roberta-base-finetuned-amharic'
model_checkpoint = "Davlan/xlm-roberta-base-finetuned-amharic"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)


**Read and prepare the CoNLL data**

In [22]:
conll_data = '/content/drive/MyDrive/tenx/Data-Extraction-From-Telegram-Channels-LLMs-Week_4/labeled_data.conll'

# Add a check to see if the file exists
import os
if not os.path.exists(conll_data):
    print(f"Error: File not found at {conll_data}")

else:
    print(f"File found: {conll_data}")
    # Add a check to see if the file is empty
    if os.stat(conll_data).st_size == 0:
        print(f"Error: File is empty at {conll_data}")
    else:
        print(f"File is not empty. Reading content...")


def read_conll(file):
    """Reads the CoNLL formatted file and returns a list of dictionaries.
    Each dictionary represents a sentence with 'tokens' and 'ner_tags'.
    Handles potential format issues by skipping problematic lines and
    ensuring labels are strings.
    """
    sentences = []
    tokens = []
    ner_tags = []
    with open(file, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line:
                parts = line.split('\t')
                if len(parts) == 2: # Assuming simple word \t tag format
                    token = parts[0]
                    tag = parts[1]
                    # Explicitly check if the tag is a string and not empty
                    if isinstance(tag, str) and tag:
                        tokens.append(token)
                        ner_tags.append(tag)
                    else:
                        print(f"Skipping line with invalid tag format (expected non-empty string): {line}")
                        continue # Skip this line if tag is not a valid string
                else:
                    # Print skipping message only if parts were found but not 2
                    if parts:
                        print(f"Skipping line with unexpected format (expected 2 parts, got {len(parts)}): {line}")
                    continue
            else:
                if tokens: # End of a sentence
                    sentences.append({"tokens": tokens, "ner_tags": ner_tags})
                    tokens = []
                    ner_tags = []
        if tokens: # Add the last sentence if file doesn't end with a blank line
            sentences.append({"tokens": tokens, "ner_tags": ner_tags})
    return sentences

raw_datasets = read_conll(conll_data)

# Add a check to see if any data was read
if not raw_datasets:
    print("Error: No sentences were read from the CoNLL file. Please check the file path and format.")
else:
    print(f"Successfully read {len(raw_datasets)} sentences.")
    # Extract all unique NER tags to create a label list
    unique_tags = sorted(list(set(tag for sentence in raw_datasets for tag in sentence['ner_tags'])))
    # Add 'O' if it's not present and ensure the desired order (B-*, I-*, O)
    # This is important for consistent mapping
    bio_tags = ['O']
    for tag in unique_tags:
        if tag.startswith('B-') and tag not in bio_tags:
            bio_tags.append(tag)
    for tag in unique_tags:
        if tag.startswith('I-') and tag not in bio_tags:
            bio_tags.append(tag)
    for tag in unique_tags:
        if tag not in bio_tags: # For any other tags not covered by B/I
            bio_tags.append(tag)

    label_names = bio_tags
    id2label = {i: label for i, label in enumerate(label_names)}
    label2id = {label: i for i, label in enumerate(label_names)}

    # Convert to Hugging Face Dataset format
    # Define the features for our dataset
    features = Features({
        'tokens': Sequence(Value('string')),
        'ner_tags': Sequence(ClassLabel(names=label_names))
    })

    # Create Hugging Face Dataset
    dataset = Dataset.from_list(raw_datasets, features=features)

    # Split the dataset into training and validation sets
    # Adjust test_size as needed, e.g., 0.2 for 80/20 split
    train_test_split = dataset.train_test_split(test_size=0.2)
    train_dataset = train_test_split["train"]
    eval_dataset = train_test_split["test"]

    print(f"Training dataset size: {len(train_dataset)}")
    print(f"Validation dataset size: {len(eval_dataset)}")
    print(f"Label names: {label_names}")
    print(f"Your label2id: {label2id}")
    print(f"Your id2label: {id2label}")

File found: /content/drive/MyDrive/tenx/Data-Extraction-From-Telegram-Channels-LLMs-Week_4/labeled_data.conll
File is not empty. Reading content...
Successfully read 1019 sentences.
Training dataset size: 815
Validation dataset size: 204
Label names: ['O', 'B-LOC', 'B-PRICE', 'B-Product', 'I-LOC', 'I-PRICE', 'I-Product']
Your label2id: {'O': 0, 'B-LOC': 1, 'B-PRICE': 2, 'B-Product': 3, 'I-LOC': 4, 'I-PRICE': 5, 'I-Product': 6}
Your id2label: {0: 'O', 1: 'B-LOC', 2: 'B-PRICE', 3: 'B-Product', 4: 'I-LOC', 5: 'I-PRICE', 6: 'I-Product'}


Tokenize Data and Align Labels

In [23]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], is_split_into_words=True, truncation=True, padding="longest"
    )

    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word ID that is None. We set the label to -100 so they are ignored in the loss function.
            if word_idx is None:
                label_ids.append(-100)
            # We want to set a label only for the first token of each word.
            elif word_idx != previous_word_idx:
                # Debugging step: Check the type and value of 'label[word_idx]'
                current_label_from_data = label[word_idx]

                # IMPORTANT: Ensure current_label_from_data is a string before looking up in label2id
                if not isinstance(current_label_from_data, str):
                    print(f"!!! DEBUG: Unexpected label type for example {i}, word_idx {word_idx}: {type(current_label_from_data)} - Value: {current_label_from_data}")
                    # Attempt to convert if it's an int and exists in id2label, otherwise default to 'O'
                    if isinstance(current_label_from_data, int) and current_label_from_data in id2label:
                        print(f"!!! DEBUG: Converting int {current_label_from_data} to string label {id2label[current_label_from_data]}")
                        current_label_from_data = id2label[current_label_from_data]
                    else:
                        print(f"!!! DEBUG: Cannot convert, defaulting to 'O' for example {i}, word_idx {word_idx}")
                        current_label_from_data = 'O' # Default to 'O' if it's truly unrecoverable

                label_ids.append(label2id[current_label_from_data])
            # For the other tokens of a word, we set the label to -100.
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Add a check to ensure train_dataset and eval_dataset were created
if 'train_dataset' in locals() and 'eval_dataset' in locals():
    tokenized_train_dataset = train_dataset.map(
        tokenize_and_align_labels, batched=True, remove_columns=train_dataset.column_names
    )
    tokenized_eval_dataset = eval_dataset.map(
        tokenize_and_align_labels, batched=True, remove_columns=eval_dataset.column_names
    )

    # Data Collator for Token Classification
    from transformers import DataCollatorForTokenClassification
    data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
else:
    print("Skipping tokenization and alignment as train_dataset/eval_dataset were not created.")

Map:   0%|          | 0/815 [00:00<?, ? examples/s]

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
!!! DEBUG: Unexpected label type for example 750, word_idx 92: <class 'int'> - Value: 0
!!! DEBUG: Converting int 0 to string label O
!!! DEBUG: Unexpected label type for example 750, word_idx 93: <class 'int'> - Value: 0
!!! DEBUG: Converting int 0 to string label O
!!! DEBUG: Unexpected label type for example 750, word_idx 94: <class 'int'> - Value: 0
!!! DEBUG: Converting int 0 to string label O
!!! DEBUG: Unexpected label type for example 750, word_idx 95: <class 'int'> - Value: 0
!!! DEBUG: Converting int 0 to string label O
!!! DEBUG: Unexpected label type for example 750, word_idx 96: <class 'int'> - Value: 0
!!! DEBUG: Converting int 0 to string label O
!!! DEBUG: Unexpected label type for example 750, word_idx 97: <class 'int'> - Value: 0
!!! DEBUG: Converting int 0 to string label O
!!! DEBUG: Unexpected label type for example 750, word_idx 98: <class 'int'> - Value: 0
!!! DEBUG: Converting int 0 to string label

Map:   0%|          | 0/204 [00:00<?, ? examples/s]

!!! DEBUG: Unexpected label type for example 0, word_idx 0: <class 'int'> - Value: 3
!!! DEBUG: Converting int 3 to string label B-Product
!!! DEBUG: Unexpected label type for example 0, word_idx 1: <class 'int'> - Value: 6
!!! DEBUG: Converting int 6 to string label I-Product
!!! DEBUG: Unexpected label type for example 0, word_idx 2: <class 'int'> - Value: 6
!!! DEBUG: Converting int 6 to string label I-Product
!!! DEBUG: Unexpected label type for example 0, word_idx 3: <class 'int'> - Value: 6
!!! DEBUG: Converting int 6 to string label I-Product
!!! DEBUG: Unexpected label type for example 0, word_idx 4: <class 'int'> - Value: 6
!!! DEBUG: Converting int 6 to string label I-Product
!!! DEBUG: Unexpected label type for example 0, word_idx 5: <class 'int'> - Value: 6
!!! DEBUG: Converting int 6 to string label I-Product
!!! DEBUG: Unexpected label type for example 0, word_idx 6: <class 'int'> - Value: 0
!!! DEBUG: Converting int 0 to string label O
!!! DEBUG: Unexpected label type fo

Set Up Training Arguments and Trainer

In [24]:
# Load the model
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint, num_labels=len(label_names), id2label=id2label, label2id=label2id
)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",          # output directory
    eval_strategy="epoch",     # Evaluate every epoch
    learning_rate=2e-5,              # learning rate
    per_device_train_batch_size=16,  # batch size for training
    per_device_eval_batch_size=16,   # batch size for evaluation
    num_train_epochs=5,              # total number of training epochs
    weight_decay=0.01,               # strength of weight decay
    logging_dir="./logs",            # directory for storing logs
    logging_strategy="epoch",        # log every epoch
    save_strategy="epoch",           # save checkpoint every epoch
    load_best_model_at_end=True,     # load the best model when training ends
    metric_for_best_model="f1",      # use f1 score to determine the best model
    report_to="none"                 # disable reporting to services like Weights & Biases
)

# Define compute_metrics function
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_names[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = {
        "precision": precision_score(true_labels, true_predictions),
        "recall": recall_score(true_labels, true_predictions),
        "f1": f1_score(true_labels, true_predictions),
    }
    return results

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at Davlan/xlm-roberta-base-finetuned-amharic and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Fine-tune the Model

In [25]:
trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.3186,0.076896,0.954436,0.827443,0.886414
2,0.0561,0.042498,0.958904,0.873181,0.914037
3,0.0327,0.029856,0.97619,0.93763,0.956522
4,0.0224,0.024231,0.978678,0.954262,0.966316
5,0.0175,0.022484,0.980851,0.95842,0.969506


TrainOutput(global_step=255, training_loss=0.08948417224136053, metrics={'train_runtime': 500.2648, 'train_samples_per_second': 8.146, 'train_steps_per_second': 0.51, 'total_flos': 842299079811750.0, 'train_loss': 0.08948417224136053, 'epoch': 5.0})

Evaluate the Fine-tuned Model

In [26]:
eval_results = trainer.evaluate()
print(eval_results)

# To get a detailed classification report
predictions, labels, _ = trainer.predict(tokenized_eval_dataset)
predictions = np.argmax(predictions, axis=2)

# Remove ignored index (special tokens)
true_predictions = [
    [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
true_labels = [
    [label_names[l] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]

print("\nDetailed Classification Report:")
print(classification_report(true_labels, true_predictions))

{'eval_loss': 0.022483715787529945, 'eval_precision': 0.9808510638297873, 'eval_recall': 0.9584199584199584, 'eval_f1': 0.9695057833859095, 'eval_runtime': 4.2055, 'eval_samples_per_second': 48.508, 'eval_steps_per_second': 3.091, 'epoch': 5.0}

Detailed Classification Report:
              precision    recall  f1-score   support

         LOC       1.00      1.00      1.00       292
       PRICE       0.99      0.99      0.99       110
     Product       0.88      0.76      0.82        79

   micro avg       0.98      0.96      0.97       481
   macro avg       0.96      0.92      0.94       481
weighted avg       0.98      0.96      0.97       481



Save the Model for future use

In [27]:
# Define a path to save your model
output_model_path = "./fine_tuned_amharic_ner_model"

# Save the model and tokenizer
trainer.save_model(output_model_path)
tokenizer.save_pretrained(output_model_path)

print(f"Model and tokenizer saved to {output_model_path}")

# To load the model later
# from transformers import AutoTokenizer, AutoModelForTokenClassification
# loaded_tokenizer = AutoTokenizer.from_pretrained(output_model_path)
# loaded_model = AutoModelForTokenClassification.from_pretrained(output_model_path)
# print("Model loaded successfully!")

Model and tokenizer saved to ./fine_tuned_amharic_ner_model
