<a href="https://colab.research.google.com/github/segnig/Amharic-E-commerce-Data-Extractor/blob/task-3/notebooks/task_three.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from datasets import DatasetDict, Dataset
import pandas as pd
from sklearn.model_selection import train_test_split

def load_ner_dataset(file_path):
    """Load a CoNLL-style NER file into a list of sentences.

    Each non-blank line is expected to hold one ``token label`` pair. If the
    line contains a tab it is split on tabs only; otherwise it is split on
    any run of whitespace. A blank line ends the current sentence. Lines
    whose stripped, lower-cased content is exactly ``space`` or ``label`` are
    treated as metadata and ignored. Lines that do not split into exactly two
    fields are reported on stdout and skipped.

    For debugging, a preview of up to the first 10 lines of the file and a
    short parsing summary are printed to stdout.

    Args:
        file_path: Path to a UTF-8 encoded text file.

    Returns:
        A list with one dict per sentence. Each dict has the keys ``'words'``
        and ``'labels'``, holding two equally long lists of strings. The list
        is empty when the file contains no valid ``token label`` lines.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    from itertools import islice

    # First let's examine the file structure
    print("\n=== FILE STRUCTURE ANALYSIS ===")
    with open(file_path, 'r', encoding='utf-8') as f:
        # islice stops quietly at end-of-file, so files shorter than 10 lines
        # (or empty files) no longer raise StopIteration here.
        sample_lines = list(islice(f, 10))

    print("First 10 lines of file:")
    for i, line in enumerate(sample_lines, 1):
        print(f"{i}: {line.strip()}")

    # Now load the full content with proper parsing
    print("\n=== ATTEMPTING TO PARSE FULL FILE ===")
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()

    sentences = []
    current_sentence = []
    current_labels = []

    for line in content.split('\n'):
        line = line.strip()

        # Skip metadata lines (like "space", "label")
        if line.lower() in ('space', 'label'):
            continue

        if not line:  # Sentence boundary
            if current_sentence:
                sentences.append({
                    'words': current_sentence,
                    'labels': current_labels
                })
                current_sentence = []
                current_labels = []
            continue

        # Prefer tab as the separator so a token may itself contain spaces;
        # fall back to splitting on any whitespace run.
        parts = line.split('\t') if '\t' in line else line.split()

        if len(parts) == 2:  # word and label
            word, label = parts
            current_sentence.append(word)
            current_labels.append(label)
        else:
            print(f"Warning: Skipping malformed line: {line}")

    # Flush the last sentence when the file does not end with a blank line
    if current_sentence:
        sentences.append({
            'words': current_sentence,
            'labels': current_labels
        })

    # Print parsing results
    print(f"\nSuccessfully parsed {len(sentences)} sentences")
    if sentences:
        print("\nFirst complete sentence example:")
        print("Words:", sentences[0]['words'][:10])  # First 10 words
        print("Labels:", sentences[0]['labels'][:10])  # First 10 labels

    return sentences

# Parse the labelled corpus stored on Google Drive
print("Loading dataset...")
all_data = load_ner_dataset('/content/drive/MyDrive/10 Academy/labeled_telegram_product_price_location.txt')

# Abort early when the parser produced nothing usable
if not all_data:
    raise ValueError("No valid sentences were parsed. Please check your file format.")

# 80/20 train/validation split with a fixed seed for reproducibility
train_data, val_data = train_test_split(all_data, test_size=0.2, random_state=42)

# Turn each list of sentence dicts into a column-oriented Hugging Face Dataset
dataset = DatasetDict({
    split_name: Dataset.from_dict({
        column: [sentence[column] for sentence in split_rows]
        for column in ('words', 'labels')
    })
    for split_name, split_rows in (('train', train_data), ('validation', val_data))
})

# Report split sizes and show one example as a sanity check
print("\n=== DATASET SUMMARY ===")
print(f"Training samples: {len(dataset['train'])}")
print(f"Validation samples: {len(dataset['validation'])}")
print("\nSample training example:")
print(dataset['train'][0])

Loading dataset...

=== FILE STRUCTURE ANALYSIS ===
First 10 lines of file:
1: 3pcs B-PRODUCT
2: silicon I-PRODUCT
3: brush I-PRODUCT
4: spatulas I-PRODUCT
5: እስከ O
6: 260°c O
7: ሙቀት O
8: መቆቆም O
9: የሚችል O
10: ዋጋ-550ብር I-PRICE

=== ATTEMPTING TO PARSE FULL FILE ===

Successfully parsed 3166 sentences

First complete sentence example:
Words: ['3pcs', 'silicon', 'brush', 'spatulas', 'እስከ', '260°c', 'ሙቀት', 'መቆቆም', 'የሚችል', 'ዋጋ-550ብር']
Labels: ['B-PRODUCT', 'I-PRODUCT', 'I-PRODUCT', 'I-PRODUCT', 'O', 'O', 'O', 'O', 'O', 'I-PRICE']

=== DATASET SUMMARY ===
Training samples: 2532
Validation samples: 634

Sample training example:
{'words': ['Korean', 'Body', 'Scrub', 'Sponge', 'የሞተ', 'ቆዳን', 'እንዲሁም', 'ቆሻሻን', 'ለማፅዳት', 'ተመራጭ', 'ዋጋ፦', '200', 'ብር', 'ውስን', 'ፍሬ', 'ነው', 'ያለው', 'አድራሻ', 'መገናኛ', 'ታሜ', 'ጋስ', 'ህንፃ', 'ጎን', 'ስሪ', 'ኤም', 'ሲቲ', 'ሞል', 'ሁለተኛ', 'ፎቅ', 'ቢሮ', 'ቁ.', 'SL-05A', '(ከ', 'ሊፍቱ', 'ፊት', 'ለ', 'ፊት)', '0909522840', '0923350054', 'በTelegram', 'ለማዘዝ', 'ይጠቀሙ', '@shager_onlinestore', 'ለተጨማሪ', 'ማብራሪያ',

In [4]:
# Display the 'words' column of the training split: one list of tokens per sentence.
dataset["train"]["words"]

[['Korean',
  'Body',
  'Scrub',
  'Sponge',
  'የሞተ',
  'ቆዳን',
  'እንዲሁም',
  'ቆሻሻን',
  'ለማፅዳት',
  'ተመራጭ',
  'ዋጋ፦',
  '200',
  'ብር',
  'ውስን',
  'ፍሬ',
  'ነው',
  'ያለው',
  'አድራሻ',
  'መገናኛ',
  'ታሜ',
  'ጋስ',
  'ህንፃ',
  'ጎን',
  'ስሪ',
  'ኤም',
  'ሲቲ',
  'ሞል',
  'ሁለተኛ',
  'ፎቅ',
  'ቢሮ',
  'ቁ.',
  'SL-05A',
  '(ከ',
  'ሊፍቱ',
  'ፊት',
  'ለ',
  'ፊት)',
  '0909522840',
  '0923350054',
  'በTelegram',
  'ለማዘዝ',
  'ይጠቀሙ',
  '@shager_onlinestore',
  'ለተጨማሪ',
  'ማብራሪያ',
  'የቴሌግራም',
  'ገፃችን',
  'https://t.me/Shageronlinestore'],
 ['Long',
  'Handle',
  'Pot',
  'Brush',
  'የእቃ',
  'ማጠቢያ',
  'ብሩሽ',
  'የራሱ',
  'ሳሙና',
  'መያዣ',
  'ያለዉ',
  'Comes',
  'with',
  '2',
  'types',
  'of',
  'easy-to-replace',
  'brush',
  'heads.',
  'Detergent',
  'can',
  'be',
  'added,',
  'press',
  'the',
  'non-slip',
  'button',
  'to',
  'release',
  'the',
  'soap',
  'flow',
  'when',
  'in',
  'use.',
  'The',
  'brush',
  'head',
  'is',
  'not',
  'easy',
  'to',
  'stick',
  'oil',
  'and',
  'easy',
  'to',
  'clean.',
 