In [1]:
import pandas as pd
import re
import os
from typing import List, Tuple

In [10]:
# Load the dataset
def load_telegram_data(file_path: str = 'telegram_data.csv'):
    """
    Read the Telegram message export at ``file_path`` into a DataFrame.

    On success a short summary (row count and column names) is printed.
    Failures are reported on stdout instead of being raised.

    Args:
        file_path: Location of the CSV file to read.

    Returns:
        The loaded ``pandas.DataFrame``, or ``None`` when the file is missing
        or reading it raised any other exception.
    """
    loaded = None
    try:
        frame = pd.read_csv(file_path)
        row_total = len(frame)
        column_names = list(frame.columns)
        print(f"Dataset loaded successfully with {row_total} rows")
        print(f"Columns: {column_names}")
        # Only publish the frame once the summary was printed without error.
        loaded = frame
    except FileNotFoundError:
        print(f"File {file_path} not found. Please ensure the file exists in the current directory.")
    except Exception as err:
        print(f"Error loading data: {err}")
    return loaded

# Load data into the notebook-level `df`; it is None when loading failed.
df = load_telegram_data()
if df is not None:
    # Preview the 'text' column only when a frame was actually loaded.
    print("\nFirst few messages:")
    print(df['text'].head())

Dataset loaded successfully with 27520 rows
Columns: ['message_id', 'channel', 'text', 'timestamp', 'views', 'sender', 'image_path']

First few messages:
0    **ሁለት **ሲገዙ **5%** ቅናሽ ሙሉ በሙሉ የመኝታ ቤትዎን ውበት የሚ...
1    የልጆች ወብ ባማረ ዲዛይን የተሰራ ቀሚስ ለማዘዝ 0974312223 ይደውሉ...
2    አስደናቄ አትዮጵያዊ ምስልሎች ያላቸው የአረቄ መለኪያ ለማዘዝ 0974312...
3    ብዙ ተወዳጀነትን የተረፈ የቃልኪዳን ጉዞ ለሁሉም እድሜ የሚሆን የአማረኛ ...
4    ለቤትዎ ደምቀት የሚጨምሩ ሻማዎች በተለያየ መጠን እና ቀለም ለማዘዝ 097...
Name: text, dtype: object


In [14]:
## Step 2: Text Preprocessing and Tokenization

def clean_and_tokenize(text: str) -> List[str]:
    """
    Split a raw message into whitespace-delimited tokens.

    Runs of whitespace are collapsed before splitting, so the result never
    contains empty or blank tokens. Punctuation and markdown markers stay
    attached to the word they touch.

    Args:
        text: Raw message. Non-string values are converted with ``str()``.

    Returns:
        The tokens in order, or an empty list when ``text`` is missing
        (``None``/``NaN``) or the empty string.
    """
    if pd.isna(text) or text == '':
        return []

    # Trim the ends and squeeze every whitespace run into a single space.
    normalized = re.sub(r'\s+', ' ', str(text).strip())

    pieces: List[str] = []
    for candidate in normalized.split():
        # Defensive: keep only tokens that contain a non-blank character.
        if candidate.strip():
            pieces.append(candidate)
    return pieces

def display_sample_messages(df: pd.DataFrame, n: int = 5):
    """
    Print the first ``n`` messages of ``df['text']`` next to their tokens.

    Rows whose text is missing are skipped, but the printed message number
    still reflects the row position (1-based).

    Args:
        df: Frame holding the messages in a ``'text'`` column.
        n: Maximum number of leading rows to inspect.
    """
    print("Sample messages and their tokenization:")
    print("=" * 50)

    rows_to_show = min(n, len(df))
    for position in range(rows_to_show):
        raw_text = df.iloc[position]['text']
        if pd.isna(raw_text):
            # Nothing to tokenize for this row.
            continue

        pieces = clean_and_tokenize(raw_text)
        print(f"\nMessage {position + 1}: {raw_text}")
        print(f"Tokens: {pieces}")
        print(f"Token count: {len(pieces)}")

# Display sample messages (skipped when `df` is None, i.e. the dataset was not loaded)
if df is not None:
    display_sample_messages(df)

Sample messages and their tokenization:

Message 1: **ሁለት **ሲገዙ **5%** ቅናሽ ሙሉ በሙሉ የመኝታ ቤትዎን ውበት የሚጨምር አነሶላ ለማዘዝ 8420 ይደውሉ ወይም https://t.me/helloo_market_bot?start=175610010 ይጠቀሙ! **መጠን: 2.40 * 1.75** **ብዛት: ሁለት አንሶላ እና ሁለት የተራስ ልብስ** #Madeinethiopia #Bedding #Ethiopian #Marketplace #BuyEthiopia
Tokens: ['**ሁለት', '**ሲገዙ', '**5%**', 'ቅናሽ', 'ሙሉ', 'በሙሉ', 'የመኝታ', 'ቤትዎን', 'ውበት', 'የሚጨምር', 'አነሶላ', 'ለማዘዝ', '8420', 'ይደውሉ', 'ወይም', 'https://t.me/helloo_market_bot?start=175610010', 'ይጠቀሙ!', '**መጠን:', '2.40', '*', '1.75**', '**ብዛት:', 'ሁለት', 'አንሶላ', 'እና', 'ሁለት', 'የተራስ', 'ልብስ**', '#Madeinethiopia', '#Bedding', '#Ethiopian', '#Marketplace', '#BuyEthiopia']
Token count: 33

Message 2: የልጆች ወብ ባማረ ዲዛይን የተሰራ ቀሚስ ለማዘዝ 0974312223 ይደውሉ ወይም https://t.me/helloo_market_bot?start=176010001 ይጠቀሙ! መጠን: S: L እና M #Madeinethiopia #Kids #KidsFashion #Ethiopian #Marketplace #BuyEthiopia
Tokens: ['የልጆች', 'ወብ', 'ባማረ', 'ዲዛይን', 'የተሰራ', 'ቀሚስ', 'ለማዘዝ', '0974312223', 'ይደውሉ', 'ወይም', 'https://t.me/helloo_market_bot?start=17601

In [15]:
## Step 3: Manual Labeling Interface
class AmharicNERLabeler:
    """
    Console helper for hand-labelling message tokens with B-/I-/O tags.

    It prints the tag guidelines, prompts for one tag per token, reports
    progress against a 30-message target and writes the collected labels
    as tab-separated ``token<TAB>label`` lines.
    """

    def __init__(self):
        # Tag -> description. Insertion order is the order shown in the
        # guidelines and in the "Invalid label" hint.
        tag_table = (
            ('B-Product', 'Beginning of Product entity'),
            ('I-Product', 'Inside Product entity'),
            ('B-LOC', 'Beginning of Location entity'),
            ('I-LOC', 'Inside Location entity'),
            ('B-PRICE', 'Beginning of Price entity'),
            ('I-PRICE', 'Inside Price entity'),
            ('O', 'Outside any entity'),
        )
        self.entity_types = dict(tag_table)
        # One entry per labelled message; each entry is a list of
        # (token, label) pairs as consumed by save_conll_format.
        self.labeled_data = []

    def display_guidelines(self):
        """Print the available tags followed by a few worked examples."""
        rule = "=" * 40
        print("LABELING GUIDELINES:")
        print(rule)
        print("Entity Types:")
        for tag in self.entity_types:
            print(f"  {tag}: {self.entity_types[tag]}")

        closing_lines = (
            "\nExamples:",
            "  'Baby bottle' -> Baby(B-Product) bottle(I-Product)",
            "  'Addis Abeba' -> Addis(B-LOC) Abeba(I-LOC)",
            "  'ዋጋ 1000 ብር' -> ዋጋ(B-PRICE) 1000(I-PRICE) ብር(I-PRICE)",
            "  Other words -> O",
            "\n" + rule,
        )
        for text_line in closing_lines:
            print(text_line)

    def label_message(self, message: str, message_id: int):
        """
        Prompt on stdin for a tag for every token of ``message``.

        Args:
            message: Raw message text; tokenized with ``clean_and_tokenize``.
            message_id: Number shown in the prompt header.

        Returns:
            A list of ``(token, label)`` pairs, or ``None`` when the message
            has no tokens or the user answers ``skip`` (any letter case) at
            any prompt. The result is not stored on the instance.
        """
        tokens = clean_and_tokenize(message)
        if len(tokens) == 0:
            return None

        print(f"\nMessage {message_id}: {message}")
        print(f"Tokens to label: {tokens}")
        print("\nEnter labels for each token (or 'skip' to skip this message):")

        pairs = []
        for position, token in enumerate(tokens, start=1):
            answer = input(f"Token {position} '{token}': ").strip()
            # Re-prompt until the answer is either "skip" or a known tag.
            while answer.lower() != 'skip' and answer not in self.entity_types:
                print(f"Invalid label. Choose from: {list(self.entity_types.keys())}")
                answer = input(f"Token {position} '{token}': ").strip()

            if answer.lower() == 'skip':
                return None
            pairs.append((token, answer))

        return pairs

    def save_conll_format(self, output_file: str = 'amharic_ner_conll.txt'):
        """
        Write ``self.labeled_data`` as ``token<TAB>label`` lines.

        Each message is followed by one blank line. The target file is
        overwritten.

        Args:
            output_file: Destination path (UTF-8 text).
        """
        with open(output_file, 'w', encoding='utf-8') as sink:
            for sentence in self.labeled_data:
                rows = [f"{token}\t{label}\n" for token, label in sentence]
                # Trailing blank line separates this message from the next.
                sink.write("".join(rows) + "\n")

        print(f"Data saved to {output_file} in CoNLL format")

    def display_progress(self):
        """Print how many messages are labelled and how many remain of 30."""
        done = len(self.labeled_data)
        print(f"\nProgress: {done} messages labeled")

        outstanding = 30 - done
        if outstanding <= 0:
            print("✅ Minimum requirement met (30 messages)")
        else:
            print(f"⏳ Need {outstanding} more messages")

# Initialize the notebook-level labeler instance and print its tag guidelines
labeler = AmharicNERLabeler()
labeler.display_guidelines()

LABELING GUIDELINES:
Entity Types:
  B-Product: Beginning of Product entity
  I-Product: Inside Product entity
  B-LOC: Beginning of Location entity
  I-LOC: Inside Location entity
  B-PRICE: Beginning of Price entity
  I-PRICE: Inside Price entity
  O: Outside any entity

Examples:
  'Baby bottle' -> Baby(B-Product) bottle(I-Product)
  'Addis Abeba' -> Addis(B-LOC) Abeba(I-LOC)
  'ዋጋ 1000 ብር' -> ዋጋ(B-PRICE) 1000(I-PRICE) ብር(I-PRICE)
  Other words -> O



In [35]:
# Load dataset
import pandas as pd
import re

file_path = 'telegram_data.csv'
df = pd.read_csv(file_path)

# Debug: show which columns the file provides and a preview of the rows
print("Available columns:", df.columns.tolist())
print("First few rows:")
print(df.head())

# Pick the message column: the first well-known name that exists wins
# (priority order: clean_text, Message, text, message).
text_column = next(
    (known for known in ('clean_text', 'Message', 'text', 'message') if known in df.columns),
    None,
)

if text_column is None:
    # Fallback: take the first object-typed column that mentions a currency word
    for col in df.columns:
        if df[col].dtype == 'object':
            if df[col].str.contains('ብር|birr|Br|ETB', na=False).any():
                text_column = col
                break

if text_column is None:
    raise ValueError("Could not find a text column in the dataset")

print(f"Using column: {text_column}")

def clean_and_tokenize(text: str) -> list:
    """
    Tokenize a message on whitespace, detaching currency units from amounts.

    NOTE: this redefines the whitespace-only ``clean_and_tokenize`` from an
    earlier cell; whichever definition ran last is the one other code calls.

    Compared with plain whitespace splitting:
      * ``750ብር`` / ``750birr`` / ``750Br`` / ``750ETB`` become two tokens
        (amount, unit). The unit is matched case-insensitively because the
        rule-based labeler compares currency tokens with ``.lower()``.
      * a bold-wrapped price such as ``**750br**`` loses its ``**`` markers.

    Args:
        text: Raw message. Non-string values are converted with ``str()``.

    Returns:
        The tokens in order, or an empty list when ``text`` is missing
        (``None``/``NaN``) or the empty string.
    """
    if pd.isna(text) or text == '':
        return []

    text = re.sub(r'\s+', ' ', str(text).strip())

    currency = r'(ብር|birr|br|etb)'

    # Unwrap **<amount><unit>** first. Doing the generic split first would
    # insert a space inside the bold span and this pattern could no longer
    # match, leaving tokens such as '**750' and 'ብር**'.
    text = re.sub(r'\*\*(\d+)' + currency + r'\*\*', r'\1 \2', text, flags=re.IGNORECASE)

    # Separate a number from a directly attached currency unit. The lookahead
    # keeps ordinary Latin words that merely start with "br" (e.g. "2bright")
    # in one piece now that matching is case-insensitive.
    text = re.sub(r'(\d+)' + currency + r'(?![A-Za-z])', r'\1 \2', text, flags=re.IGNORECASE)

    # str.split() with no argument never yields empty tokens.
    return text.split()

def _label_tokens(tokens: list) -> list:
    """
    Assign one B-/I-/O label to every token using keyword and regex rules.

    Rules are tried per token in priority order: price, location, product.
    A line describing each decision is printed as a side effect.

    Args:
        tokens: Token strings of a single message.

    Returns:
        A list of labels with the same length as ``tokens``.
    """
    price_keywords = ['ዋጋ', 'ዋጋ:', 'ዋጋ፦', 'price', 'price:']
    currency_words = ['ብር', 'birr', 'br', 'etb']
    place_names = ['ቦሌ', 'መገናኛ', 'ዘፍመሽ', 'ሜክሲኮ', 'ጀሞ', 'ቄራ', 'ፒያሳ', 'መርካቶ']
    product_tags = ('B-Product', 'I-Product')

    labels = ['O'] * len(tokens)

    i = 0
    while i < len(tokens):
        token = tokens[i]
        print(f"Processing token {i}: '{token}'")

        # True when the previous token already belongs to a product entity,
        # whether it opened the entity (B-) or continued it (I-).
        follows_product = i > 0 and labels[i-1] in product_tags

        # === PRICE LABELING (HIGHEST PRIORITY) ===
        if token.lower() in price_keywords:
            print(f"  Found price indicator: {token}")
            labels[i] = 'B-PRICE'
            # Scan at most the next four tokens for numbers, separators and
            # a closing currency word.
            j = i + 1
            while j < len(tokens) and j < i + 5:
                if re.match(r'\d+', tokens[j]):
                    labels[j] = 'I-PRICE'
                    print(f"  Price number: {tokens[j]} -> I-PRICE")
                elif tokens[j].lower() in currency_words:
                    labels[j] = 'I-PRICE'
                    print(f"  Price currency: {tokens[j]} -> I-PRICE")
                    break
                elif tokens[j] in [':', '፦']:
                    labels[j] = 'I-PRICE'
                else:
                    break
                j += 1
            # Resume at j: it is either the first token that did not fit the
            # price span, or the currency word (already labelled, so the
            # product rules below leave it alone).
            i = j
            continue

        elif token == 'በ' and i + 1 < len(tokens) and re.match(r'\d+', tokens[i+1]):
            print(f"  Found 'በ' price pattern")
            labels[i] = 'B-PRICE'
            labels[i+1] = 'I-PRICE'
            if i + 2 < len(tokens) and tokens[i+2].lower() in currency_words:
                labels[i+2] = 'I-PRICE'
                i += 2
            else:
                i += 1
            continue

        elif re.match(r'\d+', token) and i + 1 < len(tokens) and tokens[i+1].lower() in currency_words:
            print(f"  Found direct number+currency: {token} {tokens[i+1]}")
            labels[i] = 'B-PRICE'
            labels[i+1] = 'I-PRICE'
            i += 1
            continue

        elif re.match(r'\*\*\d+br\*\*', token.lower()):
            print(f"  Found **numberbr** pattern: {token}")
            labels[i] = 'B-PRICE'

        # === LOCATION LABELING ===
        elif token.lower() == 'አዲስ' and i + 1 < len(tokens) and tokens[i+1].lower() in ['አበባ', 'አበባ,']:
            print(f"  Found 'አዲስ አበባ'")
            labels[i] = 'B-LOC'
            labels[i+1] = 'I-LOC'
            i += 1
            continue

        elif token.replace(',', '').replace('፡', '').replace(':', '').lower() in place_names:
            print(f"  Found location: {token}")
            labels[i] = 'B-LOC'

        # === PRODUCT LABELING (LOWEST PRIORITY) ===
        elif labels[i] == 'O':  # Only if not already labeled
            # Check for exact product matches
            if token.upper() in ['NIKE', 'ADIDAS', 'PUMA', 'CONVERSE']:
                print(f"  Found brand: {token}")
                labels[i] = 'B-Product'

            elif token.upper() in ['AIRFORCE', 'AIR', 'FORCE', 'ONE', 'JORDAN']:
                print(f"  Found product model: {token}")
                # Continue the entity after B- *or* I-Product, so that
                # "NIKE AIR FORCE ONE" stays a single entity.
                labels[i] = 'I-Product' if follows_product else 'B-Product'

            elif token.upper() in ['TSHIRT', 'T-SHIRT', 'JACKET', 'SHIRT', 'DRESS']:
                print(f"  Found clothing: {token}")
                labels[i] = 'B-Product'

            elif token in ['ቦርሳ', 'ጫማ', 'ልብስ', 'ሸሚዝ', 'ስልክ']:
                print(f"  Found Amharic product: {token}")
                labels[i] = 'B-Product'

            elif token.upper() in ['BRAND', 'ብራንድ', 'QUALITY', 'HIGH', 'CLASSY']:
                print(f"  Found product descriptor: {token}")
                labels[i] = 'I-Product' if follows_product else 'B-Product'

            elif token.upper() in ['MADE', 'SIZE', 'COLOR']:
                print(f"  Found product attribute: {token}")
                # Only labelled as part of a running product description;
                # a standalone attribute word stays 'O'.
                if follows_product:
                    labels[i] = 'I-Product'

            elif token in ['Baby', 'bottle', 'ህፃን', 'ብርጭቆ']:
                print(f"  Found baby product: {token}")
                labels[i] = 'I-Product' if follows_product else 'B-Product'

        i += 1

    # Post-processing: a B-Product directly after any product token continues
    # that entity. Checking for I-Product as well keeps runs of three or more
    # product tokens in one entity (previously they came out as B, I, B, ...).
    for k in range(len(tokens) - 1):
        if labels[k] in product_tags and labels[k+1] == 'B-Product':
            labels[k+1] = 'I-Product'
            print(f"  Post-process: {tokens[k+1]} B-Product -> I-Product")

    return labels


def create_sample_labels(df: pd.DataFrame, n: int = 50, column: str = None, tokenizer=None):
    """
    Auto-label a reproducible random sample of messages with rule-based tags.

    Only messages that mention a currency word or one of two place keywords
    are candidates. Up to ``n`` of them are sampled with a fixed seed (42),
    tokenized, and labelled with PRICE / LOC / Product B-/I-/O tags. Progress
    is printed for every message and token.

    Args:
        df: Frame containing the messages.
        n: Maximum number of messages to sample.
        column: Name of the message column. Defaults to the module-level
            ``text_column`` so existing calls behave as before.
        tokenizer: Callable turning a message into a list of tokens. Defaults
            to the module-level ``clean_and_tokenize`` (looked up at call time).

    Returns:
        A list with one entry per sampled message; each entry is a list of
        ``(token, label)`` tuples.
    """
    if column is None:
        column = text_column
    if tokenizer is None:
        tokenizer = clean_and_tokenize

    # Filter messages with potential entities
    candidates = df[df[column].str.contains('ብር|birr|Br|ETB|አዲስ|ቦሌ', na=False)]
    sample_df = candidates.sample(n=min(n, len(candidates)), random_state=42)
    labeled_data = []

    for message in sample_df[column]:
        if pd.isna(message):
            continue

        tokens = tokenizer(message)

        print(f"\nProcessing message: {message}")
        print(f"Tokens: {tokens}")

        labels = _label_tokens(tokens)

        print(f"Final labels: {labels}")
        labeled_data.append(list(zip(tokens, labels)))

    return labeled_data

def save_sample_conll(n: int = 5, output_file: str = 'amharic_ner_conll.txt'):
    """
    Auto-label a sample of the module-level ``df`` and write it to a file.

    Each token is written as ``token<TAB>label`` on its own line, with one
    blank line after every message. The target file is overwritten.

    Args:
        n: Number of messages to sample via ``create_sample_labels``. The
            default of 5 is a debugging size; pass a larger value for a real
            export.
        output_file: Destination path (UTF-8 text). The default is the same
            path ``AmharicNERLabeler.save_conll_format`` writes to by default,
            so pass another name to keep manually labelled output intact.
    """
    sample_labels = create_sample_labels(df, n=n)
    with open(output_file, 'w', encoding='utf-8') as f:
        for message_labels in sample_labels:
            f.writelines(f"{token}\t{label}\n" for token, label in message_labels)
            f.write("\n")  # Blank line between messages
    print(f"CoNLL file created: {output_file}")

def validate_conll_format(file_path: str):
    """
    Read a ``token<TAB>label`` file and print summary statistics.

    Blank lines separate messages. A non-blank line that does not split into
    exactly two tab-separated fields is reported with a warning and skipped;
    it does not make the validation fail.

    Args:
        file_path: Path of the UTF-8 text file to inspect.

    Returns:
        ``True`` when the file could be read and summarised, ``False`` when
        any exception occurred (the error is printed).
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            # Read everything up front so a read error surfaces before any
            # per-line output is produced.
            raw_lines = list(handle)

        sentences = 0
        total_tokens = 0
        tag_counts = {}
        pending = 0  # tokens seen since the last message boundary

        for raw in raw_lines:
            entry = raw.strip()

            if not entry:
                # A blank line closes the current message, if there is one.
                if pending > 0:
                    sentences += 1
                    pending = 0
                continue

            fields = entry.split('\t')
            if len(fields) != 2:
                print(f"Warning: Skipping malformed line: '{entry}'")
                continue

            tag = fields[1]
            total_tokens += 1
            pending += 1
            if tag in tag_counts:
                tag_counts[tag] += 1
            else:
                tag_counts[tag] = 1

        # The last message may not be followed by a blank line.
        if pending > 0:
            sentences += 1

        print(f"Validation Results for {file_path}:")
        print(f"Messages: {sentences}")
        print(f"Total tokens: {total_tokens}")
        print(f"Label distribution: {tag_counts}")
        return True
    except Exception as e:
        print(f"Error validating file: {e}")
        return False

# Run the functions: write the auto-labeled sample to amharic_ner_conll.txt,
# then read that file back and print its message/token/label statistics.
save_sample_conll()
validate_conll_format('amharic_ner_conll.txt')

Available columns: ['message_id', 'channel', 'text', 'timestamp', 'views', 'sender', 'image_path']
First few rows:
   message_id                channel  \
0        4562  @helloomarketethiopia   
1        4561  @helloomarketethiopia   
2        4560  @helloomarketethiopia   
3        4559  @helloomarketethiopia   
4        4558  @helloomarketethiopia   

                                                text  \
0  **ሁለት **ሲገዙ **5%** ቅናሽ ሙሉ በሙሉ የመኝታ ቤትዎን ውበት የሚ...   
1  የልጆች ወብ ባማረ ዲዛይን የተሰራ ቀሚስ ለማዘዝ 0974312223 ይደውሉ...   
2  አስደናቄ አትዮጵያዊ ምስልሎች ያላቸው የአረቄ መለኪያ ለማዘዝ 0974312...   
3  ብዙ ተወዳጀነትን የተረፈ የቃልኪዳን ጉዞ ለሁሉም እድሜ የሚሆን የአማረኛ ...   
4  ለቤትዎ ደምቀት የሚጨምሩ ሻማዎች በተለያየ መጠን እና ቀለም ለማዘዝ 097...   

                   timestamp  views      sender  \
0  2025-05-23 13:34:44+00:00   2334  1403573865   
1  2025-05-16 13:18:16+00:00   3324  1403573865   
2  2025-05-12 13:45:37+00:00   3626  1403573865   
3  2025-05-10 09:47:16+00:00   3458  1403573865   
4  2025-05-09 09:35:20+00:00   3040  1403573865   



True

In [18]:
# Final Steps and Instructions

def main_labeling_workflow():
    """
    Print the step-by-step instructions for the manual labeling workflow.

    The function only writes text to stdout; it does not load data or start
    a labeling session itself.

    NOTE(review): the text refers to an ``interactive_labeling_session`` call
    "above", which is not visible in this part of the notebook - confirm it
    still exists.
    """
    overview = (
        "AMHARIC NER LABELING WORKFLOW",
        "=" * 40,
        "1. Load your telegram_data.csv file",
        "2. Run the interactive labeling session",
        "3. Label 30-50 messages minimum",
        "4. Save in CoNLL format",
        "5. Validate the output",
    )
    getting_started = (
        "\nTo start labeling:",
        "1. Uncomment the interactive_labeling_session call above",
        "2. Run the cells step by step",
        "3. Follow the prompts to label each token",
    )
    reminders = (
        "\nRemember:",
        "- B- prefix for beginning of entity",
        "- I- prefix for inside/continuation of entity",
        "- O for outside any entity",
        "- Products: items being sold",
        "- Locations: places mentioned",
        "- Prices: monetary amounts",
    )

    for section in (overview, getting_started, reminders):
        for text_line in section:
            print(text_line)

# Print the workflow instructions when this cell runs.
main_labeling_workflow()


AMHARIC NER LABELING WORKFLOW
1. Load your telegram_data.csv file
2. Run the interactive labeling session
3. Label 30-50 messages minimum
4. Save in CoNLL format
5. Validate the output

To start labeling:
1. Uncomment the interactive_labeling_session call above
2. Run the cells step by step
3. Follow the prompts to label each token

Remember:
- B- prefix for beginning of entity
- I- prefix for inside/continuation of entity
- O for outside any entity
- Products: items being sold
- Locations: places mentioned
- Prices: monetary amounts
