<a href="https://colab.research.google.com/github/segnig/Amharic-E-commerce-Data-Extractor/blob/task-3/notebooks/task_three.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>
<a href="https://colab.research.google.com/github/segnig/Amharic-E-commerce-Data-Extractor/blob/main/notebooks/task_three.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
# Step 1: Install required packages
!pip install -q transformers datasets torch seqeval accelerate

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m74.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m19.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m36.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m919.3 kB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.5 MB/s[0m eta [36

## **Data Loading**

In [8]:
from datasets import DatasetDict, Dataset
import pandas as pd
from sklearn.model_selection import train_test_split

def load_ner_dataset(file_path):
    """Parse a two-column (token, label) NER file into sentence records.

    Each non-blank line is expected to hold one ``token<sep>label`` pair,
    where the separator is a tab or any run of whitespace. Blank lines mark
    sentence boundaries. Lines consisting solely of the word ``space`` or
    ``label`` (case-insensitive) are ignored, and any other line that does
    not split into exactly two fields is reported and skipped.

    A preview of up to the first 10 lines and a short parsing summary are
    printed as a debugging aid.

    Args:
        file_path: Path to a UTF-8 encoded text file.

    Returns:
        A list with one dict per sentence; each dict has parallel lists
        under the keys ``'words'`` and ``'labels'``. The list is empty when
        no valid line was found.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    from itertools import islice

    # First let's examine the file structure
    print("\n=== FILE STRUCTURE ANALYSIS ===")
    with open(file_path, 'r', encoding='utf-8') as f:
        # islice simply stops at end-of-file, so a file with fewer than
        # 10 lines (or an empty one) is previewed instead of raising
        # StopIteration the way repeated next(f) calls would.
        sample_lines = list(islice(f, 10))

    print("First 10 lines of file:")
    for i, line in enumerate(sample_lines, 1):
        print(f"{i}: {line.strip()}")

    # Now load the full content with proper parsing
    print("\n=== ATTEMPTING TO PARSE FULL FILE ===")
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()

    sentences = []
    current_sentence = []
    current_labels = []

    for line in content.split('\n'):
        line = line.strip()

        # Skip metadata lines (like "space", "label")
        if line.lower() in ['space', 'label']:
            continue

        if not line:  # Sentence boundary
            # Consecutive blank lines must not produce empty sentences
            if current_sentence:
                sentences.append({
                    'words': current_sentence,
                    'labels': current_labels
                })
                current_sentence = []
                current_labels = []
        else:
            # Handle different separators (tab, space, or multiple spaces)
            if '\t' in line:
                parts = line.split('\t')
            else:
                parts = line.split()

            if len(parts) == 2:  # word and label
                current_sentence.append(parts[0])
                current_labels.append(parts[1])
            else:
                print(f"Warning: Skipping malformed line: {line}")

    # The file may not end with a blank line; flush the sentence in progress
    if current_sentence:
        sentences.append({
            'words': current_sentence,
            'labels': current_labels
        })

    # Print parsing results
    print(f"\nSuccessfully parsed {len(sentences)} sentences")
    if sentences:
        print("\nFirst complete sentence example:")
        print("Words:", sentences[0]['words'][:10])  # First 10 words
        print("Labels:", sentences[0]['labels'][:10])  # First 10 labels

    return sentences

# Parse the labelled file into sentence records
print("Loading dataset...")
all_data = load_ner_dataset('/content/drive/MyDrive/10 Academy/labeled_telegram_product_price_location.txt')

# Abort early when the parser produced nothing to train on
if len(all_data) == 0:
    raise ValueError("No valid sentences were parsed. Please check your file format.")

# Hold out 20% of the sentences for validation (fixed seed for a repeatable split)
train_data, val_data = train_test_split(all_data, test_size=0.2, random_state=42)

# Build one Hugging Face Dataset per split from a column-wise view of the records
split_records = {'train': train_data, 'validation': val_data}
dataset = DatasetDict({
    split_name: Dataset.from_dict({
        column: [record[column] for record in records]
        for column in ('words', 'labels')
    })
    for split_name, records in split_records.items()
})

# Report split sizes and show one example as a sanity check
print("\n=== DATASET SUMMARY ===")
print(f"Training samples: {len(dataset['train'])}")
print(f"Validation samples: {len(dataset['validation'])}")
print("\nSample training example:")
print(dataset['train'][0])

Loading dataset...

=== FILE STRUCTURE ANALYSIS ===
First 10 lines of file:
1: 3pcs B-PRODUCT
2: silicon I-PRODUCT
3: brush I-PRODUCT
4: spatulas I-PRODUCT
5: እስከ O
6: 260°c O
7: ሙቀት O
8: መቆቆም O
9: የሚችል O
10: ዋጋ-550ብር I-PRICE

=== ATTEMPTING TO PARSE FULL FILE ===

Successfully parsed 3166 sentences

First complete sentence example:
Words: ['3pcs', 'silicon', 'brush', 'spatulas', 'እስከ', '260°c', 'ሙቀት', 'መቆቆም', 'የሚችል', 'ዋጋ-550ብር']
Labels: ['B-PRODUCT', 'I-PRODUCT', 'I-PRODUCT', 'I-PRODUCT', 'O', 'O', 'O', 'O', 'O', 'I-PRICE']

=== DATASET SUMMARY ===
Training samples: 2532
Validation samples: 634

Sample training example:
{'words': ['Korean', 'Body', 'Scrub', 'Sponge', 'የሞተ', 'ቆዳን', 'እንዲሁም', 'ቆሻሻን', 'ለማፅዳት', 'ተመራጭ', 'ዋጋ፦', '200', 'ብር', 'ውስን', 'ፍሬ', 'ነው', 'ያለው', 'አድራሻ', 'መገናኛ', 'ታሜ', 'ጋስ', 'ህንፃ', 'ጎን', 'ስሪ', 'ኤም', 'ሲቲ', 'ሞል', 'ሁለተኛ', 'ፎቅ', 'ቢሮ', 'ቁ.', 'SL-05A', '(ከ', 'ሊፍቱ', 'ፊት', 'ለ', 'ፊት)', '0909522840', '0923350054', 'በTelegram', 'ለማዘዝ', 'ይጠቀሙ', '@shager_onlinestore', 'ለተጨማሪ', 'ማብራሪያ',

### Step 2: Define label mappings (update with your actual labels)

In [12]:
# Step 2: Define label mappings (update with your actual labels)
# The position of a tag in label_list is its integer class id.
label_list = [
    "O",
    "B-PRODUCT", "I-PRODUCT",
    "B-PRICE", "I-PRICE",
    "B-LOC", "I-LOC",
]
id2label = dict(enumerate(label_list))
label2id = {tag: idx for idx, tag in id2label.items()}

### Step 3: Load tokenizer (the model itself is initialized in Step 5)

In [13]:
# Step 3: Load tokenizer and model
# Only the tokenizer is created in this cell; the model class imported here
# is instantiated later, in the model-initialisation cell.
from transformers import AutoTokenizer, AutoModelForTokenClassification

# Checkpoint identifier shared by the tokenizer and the model so both come
# from the same pretrained source.
model_name = "Davlan/afro-xlmr-base"  # Best for Amharic
tokenizer = AutoTokenizer.from_pretrained(model_name)

### Step 4: Tokenize and align labels

In [14]:
# Step 4: Tokenize and align labels
def tokenize_and_align_labels(examples):
    """Tokenize pre-split sentences and project word-level tags onto sub-tokens.

    Args:
        examples: A batch mapping with ``"words"`` (list of token lists) and
            ``"labels"`` (list of tag-string lists parallel to the words).

    Returns:
        The tokenizer output for the batch with an added ``"labels"`` entry:
        one list of integer ids per sentence. The first sub-token of each
        word carries that word's id from ``label2id``; every other position
        (no source word, or a continuation sub-token) is set to -100.
    """
    encoded = tokenizer(
        examples["words"],
        truncation=True,
        is_split_into_words=True,
        max_length=128,
        padding="max_length"
    )

    aligned = []
    for batch_idx, word_tags in enumerate(examples["labels"]):
        last_word = None
        row = []
        for word_id in encoded.word_ids(batch_index=batch_idx):
            # Only the first sub-token of a real word receives a class id
            starts_new_word = word_id is not None and word_id != last_word
            row.append(label2id[word_tags[word_id]] if starts_new_word else -100)
            last_word = word_id
        aligned.append(row)

    encoded["labels"] = aligned
    return encoded

# Run the alignment function over every split in batches. The columns that
# exist on the raw training split are passed to remove_columns, so the
# result is built from what tokenize_and_align_labels returns.
tokenized_datasets = dataset.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=dataset["train"].column_names
)

Map:   0%|          | 0/2532 [00:00<?, ? examples/s]

Map:   0%|          | 0/634 [00:00<?, ? examples/s]

### Step 5: Initialize model

In [15]:
# Step 5: Initialize model
# Load the pretrained checkpoint with a token-classification head sized to
# the tag set. The id<->label mappings are passed along so predictions can
# be reported by tag name rather than by integer id.
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id
)

config.json:   0%|          | 0.00/707 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at Davlan/afro-xlmr-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Step 6: Set up training

In [17]:
# Step 6: Set up training
from transformers import TrainingArguments, Trainer
import numpy as np
from seqeval.metrics import f1_score

def compute_metrics(p):
    """Compute the seqeval F1 score for one evaluation pass.

    Args:
        p: A ``(predictions, labels)`` pair. ``predictions`` holds per-token
            class scores (argmax is taken over axis 2) and ``labels`` holds
            the reference class ids, with -100 marking positions to skip.

    Returns:
        A dict with the single key ``"f1"``.
    """
    scores, gold_ids = p
    pred_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for pred_row, gold_row in zip(pred_ids, gold_ids):
        # Keep only positions that carry a real label, then map ids to tag names
        kept = [(pred, gold) for pred, gold in zip(pred_row, gold_row) if gold != -100]
        true_predictions.append([label_list[pred] for pred, _ in kept])
        true_labels.append([label_list[gold] for _, gold in kept])

    return {"f1": f1_score(true_labels, true_predictions)}

# Hyper-parameters for fine-tuning. Evaluation and checkpointing both run
# once per epoch, and checkpoints are written under output_dir.
training_args = TrainingArguments(
    output_dir="amharic-ner-model",
    # Changed 'evaluation_strategy' to 'eval_strategy' as it was deprecated
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy="epoch",
    logging_steps=50,  # log the training loss every 50 optimisation steps
    report_to="none"  # presumably disables external experiment trackers — confirm
)

# Wire the model, the tokenized splits and the metric function into a Trainer.
# The tokenizer is passed as `processing_class`: the older `tokenizer=`
# keyword is deprecated in recent transformers releases (it emits a
# FutureWarning) and is scheduled for removal.
# Requires a transformers version that supports `processing_class` (4.46+).
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


### Step 7: Train and evaluate

In [None]:
# Step 7: Train and evaluate
# Fine-tune with the configured Trainer, then run one more evaluation pass
# and print the returned metrics.
# NOTE(review): `results` is reassigned by the inference cell further down,
# so these metrics are lost once that cell runs.
trainer.train()
results = trainer.evaluate()
print("Evaluation results:", results)

Epoch,Training Loss,Validation Loss,F1
1,0.0605,0.036911,0.943046


Epoch,Training Loss,Validation Loss,F1
1,0.0605,0.036911,0.943046


### Step 8: Save model

In [None]:
# Step 8: Save model
# Write the fine-tuned model and the tokenizer into the same directory.
# This is the same path that TrainingArguments uses as output_dir.
model.save_pretrained("amharic-ner-model")
tokenizer.save_pretrained("amharic-ner-model")

### Step 9: Create inference pipeline

In [None]:
# Step 9: Create inference pipeline
import torch
from transformers import pipeline

# Wrap the fine-tuned model in a token-classification pipeline. Sub-token
# predictions are merged into entity groups using the "average" strategy.
ner_pipeline = pipeline(
    "ner",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="average",
    # Use the first GPU when one is present, otherwise fall back to CPU (-1).
    # A hard-coded device=0 fails on runtimes without CUDA.
    device=0 if torch.cuda.is_available() else -1
)

# Test inference
sample_text = "በአዲስ አበባ ላይ ስልክ ታይ ምርት በ 2500 ብር ይገኛል።"
results = ner_pipeline(sample_text)

print("\nInference Results:")
for entity in results:
    print(f"{entity['word']} -> {entity['entity_group']} (score: {entity['score']:.2f})")