In [None]:
# Install the third-party packages used by this notebook (-q keeps pip's output quiet).
!pip install transformers datasets accelerate seqeval -q
!pip install optimum -q # Optional: For ONNX export or quantization later

In [None]:
# NOTE(review): seqeval is already installed by the first install cell, so this cell is redundant.
!pip install seqeval -q

In [None]:
print("\n2. Importing libraries and loading model components...")
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification
from datasets import load_dataset, Dataset, Features, Value, ClassLabel, Sequence
from seqeval.metrics import classification_report, accuracy_score, f1_score, precision_score, recall_score
import numpy as np
import os

# Define the model checkpoint
# Choose one of the following models by uncommenting it:
# MODEL_CHECKPOINT = "xlm-roberta-base"        # Strong general-purpose multilingual model
# MODEL_CHECKPOINT = "attributio/bert-tiny-amharic" # Smaller, faster, Amharic-specific
MODEL_CHECKPOINT = "bert-base-multilingual-cased"  # Good multilingual model for African languages

# Load the tokenizer once; every later cell reuses this `tokenizer` object.
print(f"Loading tokenizer from: {MODEL_CHECKPOINT}")
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

# Define your labels - these MUST match exactly with your CoNLL labels
# IMPORTANT: Ensure this list contains all unique B-I-O tags from your labeled_telegram_product_price_location.txt
# The list position of each tag is its numeric class id (see id2label / label2id below).
label_list = [
    "O",
    "B-PRODUCT",
    "I-PRODUCT",
    "B-PRICE",
    "I-PRICE",
    "B-LOC",
    "I-LOC"
]
id2label = {i: label for i, label in enumerate(label_list)}
label2id = {label: i for i, label in enumerate(label_list)}

print(f"Defined labels: {label_list}")
print(f"id2label mapping: {id2label}")
print(f"label2id mapping: {label2id}")

In [None]:
print("\n3. Loading the labeled dataset...")

# Option A: Upload directly to Colab (Temporary - for small files)
# Run this cell, a file uploader will appear. Select your .txt file.
from google.colab import files
uploaded = files.upload()

# Fail with a clear message instead of an opaque IndexError when no file
# came back from the upload dialog (e.g. it was dismissed).
if not uploaded:
    raise ValueError("No file was uploaded. Re-run this cell and select your labeled CoNLL .txt file.")

# Only the first uploaded file is used.
uploaded_file_name = next(iter(uploaded))
print(f"Uploaded file: {uploaded_file_name}")
file_name = uploaded_file_name # Use the uploaded file name

# # Option B: Mount Google Drive (Recommended - for persistent storage)
# # Uncomment the following lines if you want to use Google Drive and have your file there.
# # from google.colab import drive
# # drive.mount('/content/drive')
# # # Adjust this path to where you saved your file in Google Drive
# # # Example: if it's in a folder named 'my_project' in the root of your Drive:
# # file_path = "/content/drive/MyDrive/10academy_project/labeled_telegram_product_price_location.txt" # <--- IMPORTANT: Adjust this path!
# # print(f"Looking for file at: {file_path}")
# # file_name = file_path # Use the full path as the file_name


# Function to parse CoNLL formatted file
def parse_conll_file(file_path):
    """Parse a two-column CoNLL file into parallel lists of tokens and tags.

    Each non-blank line is expected to hold exactly two whitespace-separated
    fields, ``<token> <tag>``. Blank (or whitespace-only) lines end the current
    sentence. Lines with any other number of fields are reported with a warning
    and skipped; the rest of the sentence is kept.

    Args:
        file_path: Path of the UTF-8 encoded CoNLL file to read.

    Returns:
        A ``(texts, tags)`` tuple. ``texts[k]`` is the list of tokens of the
        k-th sentence and ``tags[k]`` is the equally long list of its string
        tags. Both lists are empty when the file contains no valid lines.
    """
    texts = []
    tags = []
    current_words = []
    current_tags = []

    # "utf-8-sig" reads plain UTF-8 unchanged but drops a leading byte-order
    # mark, which would otherwise be glued onto the first token of the file.
    with open(file_path, "r", encoding="utf-8-sig") as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank line: close the sentence collected so far (if any).
                if current_words:
                    texts.append(current_words)
                    tags.append(current_tags)
                    current_words = []
                    current_tags = []
                continue

            parts = line.split()
            if len(parts) != 2:
                # Handle malformed lines: a non-empty line without exactly 2 fields is skipped
                print(f"Warning: Skipping malformed line (expected 2 parts, got {len(parts)}): '{line}'")
                continue

            word, tag = parts
            current_words.append(word)
            current_tags.append(tag)

    # Flush the last sentence when the file does not end with a blank line.
    if current_words:
        texts.append(current_words)
        tags.append(current_tags)

    return texts, tags

# Read the uploaded CoNLL file into per-sentence token and tag lists.
raw_texts, raw_tags = parse_conll_file(file_name)
print(f"Successfully parsed {len(raw_texts)} sentences from the CoNLL file.")

# Collect every distinct tag seen in the data and report the ones that the
# configured label_list does not know about.
all_unique_tags_in_data = {tag for sentence_tags in raw_tags for tag in sentence_tags}
missing_labels_in_config = all_unique_tags_in_data.difference(label_list)
if len(missing_labels_in_config) > 0:
    print(f"\nWARNING: Found tags in your data not present in 'label_list': {missing_labels_in_config}")
    print("Please update 'label_list' in Cell 2 to include these tags and rerun all cells from the beginning.")
    # This is only a warning; add `raise ValueError("Missing labels in config")` here
    # if execution should stop when unknown tags are found.


# Map each sentence's string tags to their numeric ids from label2id.
numeric_tags = []
for sentence_idx, sentence_tags in enumerate(raw_tags):
    encoded_tags = []
    for tag in sentence_tags:
        if tag not in label2id:
            # Unknown tags (already reported by the warning above) fall back to 'O'.
            print(f"Error: Tag '{tag}' not found in label2id for sentence {sentence_idx}. Assigning 'O'.")
            encoded_tags.append(label2id["O"])
        else:
            encoded_tags.append(label2id[tag])
    numeric_tags.append(encoded_tags)


# Schema of the Hugging Face Dataset: a string id, the word list, and the
# per-word class labels drawn from label_list.
features = Features({
    'id': Value('string'),
    'tokens': Sequence(Value('string')),
    'ner_tags': Sequence(ClassLabel(names=label_list)),
})

# Build one record per sentence, dropping any sentence whose token and tag
# counts disagree.
data_dict_list = []
for sentence_idx, (tokens, tags) in enumerate(zip(raw_texts, numeric_tags)):
    if len(tokens) == len(tags):
        data_dict_list.append({
            'id': str(sentence_idx),
            'tokens': tokens,
            'ner_tags': tags
        })
    else:
        print(f"Warning: Token-tag length mismatch in sentence {sentence_idx}. Skipping this sentence.")
        print(f"Tokens: {tokens}")
        # Show the tags as strings rather than numeric ids.
        print(f"Tags: {[id2label[t] for t in tags]}")

dataset = Dataset.from_list(data_dict_list, features=features)
print(f"Dataset loaded with {len(dataset)} examples.")
print("Example from dataset (first entry):")
print(dataset[0])

# Hold out 20% of the examples for evaluation; the fixed seed keeps the
# split reproducible. Adjust test_size as needed.
train_test_split = dataset.train_test_split(seed=42, test_size=0.2)
train_dataset, eval_dataset = train_test_split['train'], train_test_split['test']

print(f"\nTrain dataset size: {len(train_dataset)}")
print(f"Eval dataset size: {len(eval_dataset)}")

In [None]:
print("\n4. Tokenizing data and aligning labels...")

def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align word labels to subwords.

    Args:
        examples: Batch with ``examples["tokens"]`` (one word list per
            sentence) and ``examples["ner_tags"]`` (one list of numeric tag
            ids per sentence).

    Returns:
        The tokenizer output with an added ``"labels"`` entry holding one label
        list per sentence: -100 for positions without a word id, the word's own
        label on its first subword, and on later subwords the word's label with
        a ``B-`` tag replaced by the matching ``I-`` tag.
    """
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,            # cut sequences that exceed the model's max input length
        is_split_into_words=True,   # inputs are already lists of words
    )

    labels = []
    for sentence_idx, word_labels in enumerate(examples["ner_tags"]):
        aligned = []
        last_word = None
        for word_idx in tokenized_inputs.word_ids(batch_index=sentence_idx):
            if word_idx is None:
                # Special tokens carry no word id; -100 marks them to be ignored in the loss.
                aligned.append(-100)
            else:
                tag_id = word_labels[word_idx]
                if word_idx == last_word:
                    # Continuation subword: a B- tag becomes its I- counterpart,
                    # I- and O tags are kept unchanged.
                    tag_name = id2label[tag_id]
                    if tag_name.startswith("B-"):
                        tag_id = label2id[f"I-{tag_name[2:]}"]
                aligned.append(tag_id)
            last_word = word_idx
        labels.append(aligned)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Tokenize and label-align both splits in batches.
tokenized_train_dataset = train_dataset.map(tokenize_and_align_labels, batched=True)
tokenized_eval_dataset = eval_dataset.map(tokenize_and_align_labels, batched=True)

# Show the first training example before and after tokenization as a sanity check.
print("\nExample of tokenized and aligned data (first entry from training set):")
first_example_raw = train_dataset[0]
first_example_tokenized = tokenized_train_dataset[0]
print("Original Tokens (first sentence in training set):", first_example_raw["tokens"])
print("Original Labels:", [id2label[l] for l in first_example_raw["ner_tags"]])
print("Subword Tokens (after tokenization):", tokenizer.convert_ids_to_tokens(first_example_tokenized["input_ids"]))
print("Aligned Numerical Labels:", first_example_tokenized["labels"])
# -100 entries are shown as "IGNORE".
print("Decoded Aligned Labels:", [id2label[l] if l != -100 else "IGNORE" for l in first_example_tokenized["labels"]])

In [None]:
print("\n4. Tokenizing data and aligning labels...")

# NOTE(review): this definition repeats the tokenize_and_align_labels function
# from the previous cell verbatim; running it simply rebinds the same name.
# The whole cell looks like an accidental copy and can probably be deleted.
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align word labels to subwords.

    Expects ``examples["tokens"]`` to be a list of word lists (sentences) and
    ``examples["ner_tags"]`` to be a list of lists of numeric tag ids.

    Returns the tokenizer output with an added ``"labels"`` entry: -100 for
    positions whose word id is None, the word's label on its first subword,
    and on later subwords the word's label with ``B-`` switched to ``I-``.
    """
    # This function expects examples['tokens'] to be a list of lists of words (sentences)
    # and examples['ner_tags'] to be a list of lists of numerical tag IDs.
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True, # Truncate long sequences to model's max input length
        is_split_into_words=True # Tells the tokenizer that inputs are already pre-split into words
    )
    labels = []
    for i, label in enumerate(examples["ner_tags"]): # Iterate through each sentence's original labels
        word_ids = tokenized_inputs.word_ids(batch_index=i) # Get word IDs for the current tokenized sentence
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens (like CLS, SEP, PAD) have a word_idx of None.
            # We set their label to -100 so they are ignored in loss computation.
            if word_idx is None:
                label_ids.append(-100)
            # If this is the first token of a new word, assign its original label.
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            # If it's a subsequent subword token of the same word:
            else:
                # Get the original string label for the word
                original_label_str = id2label[label[word_idx]]
                # If the original label was a 'B-' tag, change it to 'I-'.
                # Otherwise, keep it as 'I-' or 'O'. This ensures all subwords of an entity
                # are labeled as 'I-' (or 'O' if the word was 'O').
                if original_label_str.startswith("B-"):
                    label_ids.append(label2id[f"I-{original_label_str[2:]}"])
                else:
                    label_ids.append(label[word_idx]) # For I- and O tags, keep them as is
            previous_word_idx = word_idx
        labels.append(label_ids)
    # Attach the aligned labels next to the tokenizer's own outputs.
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# NOTE(review): these statements repeat the end of the previous cell verbatim;
# they recompute the same tokenized datasets and print the same preview again.
# Apply the tokenization and alignment to both training and evaluation datasets
tokenized_train_dataset = train_dataset.map(tokenize_and_align_labels, batched=True)
tokenized_eval_dataset = eval_dataset.map(tokenize_and_align_labels, batched=True)

# Print the first training example before and after tokenization as a sanity check.
print("\nExample of tokenized and aligned data (first entry from training set):")
first_example_tokenized = tokenized_train_dataset[0]
print("Original Tokens (first sentence in training set):", train_dataset[0]["tokens"])
print("Original Labels:", [id2label[l] for l in train_dataset[0]["ner_tags"]])
print("Subword Tokens (after tokenization):", tokenizer.convert_ids_to_tokens(first_example_tokenized["input_ids"]))
print("Aligned Numerical Labels:", first_example_tokenized["labels"])
# Labels equal to -100 are displayed as "IGNORE".
print("Decoded Aligned Labels:", [id2label[l] if l != -100 else "IGNORE" for l in first_example_tokenized["labels"]])

In [None]:
# Build the batch collator and the token-classification model.
print("\n5. Setting up training arguments and model...")

# Collator used by the Trainer to assemble token-classification batches
# (pads the sequences of a batch to a common length and stacks them into tensors).
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# Pre-trained encoder plus a classification head with one output per NER label;
# the two mappings are stored in the model config.
model = AutoModelForTokenClassification.from_pretrained(
    MODEL_CHECKPOINT,
    label2id=label2id,
    id2label=id2label,
    num_labels=len(label_list),
)

# Echo the label configuration the model actually ended up with.
model_config = model.config
print(f"Model num_labels: {model_config.num_labels}")
print(f"Model id2label: {model_config.id2label}")
print(f"Model label2id: {model_config.label2id}")

# Define training arguments
# These parameters significantly impact training time and model performance.
# Adjust `num_train_epochs` and `per_device_train_batch_size` based on your dataset size
# and available GPU memory.
import torch  # Needed only to detect whether a CUDA GPU is available for fp16.

training_args = TrainingArguments(
    output_dir="./results",                   # Directory to save model checkpoints and training logs
    eval_strategy="epoch",                    # Evaluate at the end of each epoch
    learning_rate=2e-5,                       # Learning rate for the optimizer (typical for fine-tuning)
    per_device_train_batch_size=16,           # Batch size per GPU/CPU during training
    per_device_eval_batch_size=16,            # Batch size per GPU/CPU during evaluation
    num_train_epochs=5,                       # Number of full passes over the training data
    weight_decay=0.01,                        # Weight decay (regularization) to limit overfitting
    logging_dir="./logs",                     # Directory for TensorBoard logs
    logging_steps=100,                        # How often to log training information
    save_strategy="epoch",                    # Save a model checkpoint at the end of each epoch
    save_total_limit=2,                       # Limit the number of checkpoints kept on disk
    report_to="none",                         # Disable integrations like Weights & Biases for simplicity
    fp16=torch.cuda.is_available(),           # Mixed precision (float16) only when a CUDA GPU is present;
                                              # hard-coding True fails on CPU-only runtimes
    push_to_hub=False,                        # Do not push the model to the Hugging Face Hub automatically
    load_best_model_at_end=True,              # Load the model with the best evaluation metric at the end of training
    metric_for_best_model="overall_f1",       # The metric to monitor for selecting the best model
    greater_is_better=True,                   # For F1-score, a higher value is better
)

In [None]:
#Initialize and Run the Hugging Face Trainer
print("\n6. Initializing and running the Hugging Face Trainer...")

# Define the compute_metrics function for NER evaluation using seqeval
def compute_metrics(p):
    """Compute seqeval NER metrics from the Trainer's evaluation output.

    Args:
        p: A ``(predictions, labels)`` pair. ``predictions`` holds the logits
            (argmax is taken over axis 2) and ``labels`` the numeric label ids,
            where -100 marks positions to ignore.

    Returns:
        A dict with ``overall_precision``, ``overall_recall``, ``overall_f1``
        and ``overall_accuracy``, plus one ``<ENTITY>_f1`` entry (for example
        ``PRODUCT_f1``) for every entity type of ``label_list`` that appears
        in the seqeval report. All overall values are 0.0 when no sample is
        left to score.
    """
    predictions, labels = p
    # Convert prediction logits to predicted label IDs
    predictions = np.argmax(predictions, axis=2)

    # Convert numerical labels and predictions back to string labels for seqeval,
    # dropping every position whose gold label is the ignored index (-100).
    true_labels = [[id2label[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [id2label[pred] for (pred, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    # seqeval needs prediction and label sequences of identical length per sample;
    # drop (and report) any sample where that does not hold.
    cleaned_true_predictions = []
    cleaned_true_labels = []
    for pred_list, label_list_i in zip(true_predictions, true_labels):
        if len(pred_list) == len(label_list_i):
            cleaned_true_predictions.append(pred_list)
            cleaned_true_labels.append(label_list_i)
        else:
            print(f"Warning: Skipping a sample in metrics calculation due to length mismatch: pred={len(pred_list)}, label={len(label_list_i)}")

    if not cleaned_true_labels: # Handle case where all samples are skipped or no valid labels
        return {"overall_precision": 0.0, "overall_recall": 0.0, "overall_f1": 0.0, "overall_accuracy": 0.0}

    # Generate the classification report from seqeval
    report = classification_report(cleaned_true_labels, cleaned_true_predictions, output_dict=True)

    # Extract overall metrics, using 'micro avg' when the report provides it
    overall_f1 = report['micro avg']['f1-score'] if 'micro avg' in report else f1_score(cleaned_true_labels, cleaned_true_predictions, average='micro')
    overall_precision = report['micro avg']['precision'] if 'micro avg' in report else precision_score(cleaned_true_labels, cleaned_true_predictions, average='micro')
    overall_recall = report['micro avg']['recall'] if 'micro avg' in report else recall_score(cleaned_true_labels, cleaned_true_predictions, average='micro')
    overall_accuracy = accuracy_score(cleaned_true_labels, cleaned_true_predictions)

    metrics = {
        "overall_precision": overall_precision,
        "overall_recall": overall_recall,
        "overall_f1": overall_f1,
        "overall_accuracy": overall_accuracy,
    }

    # seqeval keys its per-entity rows by entity type ("PRODUCT"), not by the
    # B-/I- tag ("B-PRODUCT"), so strip the prefix before looking a row up.
    # dict.fromkeys de-duplicates while keeping the order of label_list.
    entity_types = dict.fromkeys(
        tag_name.split("-", 1)[1] for tag_name in label_list if "-" in tag_name
    )
    for entity_type in entity_types:
        # Only entity types that occurred in this evaluation appear in the report.
        if entity_type in report:
            metrics[f"{entity_type}_f1"] = report[entity_type]['f1-score']

    return metrics


# Wire the model, data, collator and metric function into a Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,      # evaluation metrics (seqeval-based)
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset,  # split used for evaluation
)

print("\nStarting model training...")
# Run the training loop.
trainer.train()
print("\nTraining complete!")

In [None]:
# Evaluate the fine-tuned model (final evaluation)
print("\n7. Evaluating the fine-tuned model on the validation set (final check)...")
# Run one more evaluation pass on the Trainer's eval dataset. Because the training
# arguments set load_best_model_at_end=True, this is expected to score the best checkpoint.
eval_results = trainer.evaluate()
print("Final Evaluation Results:", eval_results)

In [None]:
print("\n8. Saving the fine-tuned model and tokenizer...")

# Define a path to save your model
# IMPORTANT: If you want to save to Google Drive for persistence, uncomment the Drive path
# and ensure you have mounted Drive (Option B in Cell 3 was for this).
model_save_path = "./fine_tuned_amharic_ner_model_v1" # Local path in Colab (temporary, deleted when session ends)
# OR for Google Drive (recommended for persistence):
# model_save_path = "/content/drive/MyDrive/10academy_project/fine_tuned_amharic_ner_model_v1" # <--- ADJUST THIS PATH!

# Create the directory if it doesn't exist (exist_ok=True makes re-running this cell safe)
os.makedirs(model_save_path, exist_ok=True)

# Save the model and the tokenizer side by side in the same directory
trainer.save_model(model_save_path) # Saves the model's weights and configuration
tokenizer.save_pretrained(model_save_path) # Saves the tokenizer files (vocab, merges, etc.)

print(f"Model and tokenizer saved successfully to: {model_save_path}")

# Optional: If you saved to a local Colab path (not Drive), you might want to download it.
# This zips the model directory and initiates a browser download.
# The whole step is best-effort: any failure is reported and the notebook carries on.
try:
    if "MyDrive" not in model_save_path: # Only attempt download if not saved to Drive
        print("\nAttempting to zip and download the model (if saved locally)...")
        import shutil
        from google.colab import files

        # Archive the directory itself (not just its contents) so the zip unpacks
        # into a single model folder, matching what `zip -r <dir>` produced.
        model_dir = os.path.abspath(model_save_path)
        archive_path = shutil.make_archive(
            base_name="/content/fine_tuned_amharic_ner_model_v1",  # ".zip" is appended by make_archive
            format="zip",
            root_dir=os.path.dirname(model_dir),
            base_dir=os.path.basename(model_dir),
        )
        files.download(archive_path)
        print("Model zip file download initiated.")
    else:
        print("Model saved to Google Drive, no need to download from Colab local storage.")
except Exception as e:
    print(f"Could not zip or download model (an error occurred or it was saved to Drive): {e}")

print("\n--- Fine-tuning process complete ---")