In [1]:
# ------------------------
# SETUP AND IMPORTS
# ------------------------
import pandas as pd
import re

# Load the dataset
df = pd.read_csv("../../data/telegram_data.csv")
messages = df["text"].dropna().tolist()[:50]  

In [2]:
def tokenize_amharic(text):
    # Tokenize Amharic and numbers, fallback to whitespace for other tokens
    tokens = re.findall(r'[\u1200-\u137F]+|\d+|[^\s\u1200-\u137F]+', str(text))
    return tokens

In [3]:
# Manually label entities in the messages
# Entity labels: B-Product, I-Product, B-LOC, I-LOC


entity_labels = ["B-Product", "I-Product", "B-LOC", "I-LOC", "B-PRICE", "I-PRICE", "O"]
labeled_sentences = []

for idx, msg in enumerate(messages):
    print(f"\nMessage {idx+1}: {msg}")
    tokens = tokenize_amharic(msg)
    labels = []
    for token in tokens:
        print(f"Token: {token}")
        label = input(f"Label for '{token}' ({'/'.join(entity_labels)}): ")
        if label not in entity_labels:
            label = "O"
        labels.append((token, label))
    labeled_sentences.append(labels)


Message 1: Skechers Ultra Lace  
Size 40,41,43
Price 3400 birr
📌አድራሻ-ሜክሲኮ ኮሜርስ ጀርባ መዚድ ፕላዛ የመጀመሪያ ደረጃ እንደወጡ 101 የቢሮ ቁጥር ያገኙናል or call 0920238243
                                    [EthioBrand](https://t.me/ethio_brand_collection) ✅
Token: Skechers
Token: Ultra
Token: Lace
Token: Size
Token: 40
Token: ,41,43
Token: Price
Token: 3400
Token: birr
Token: 📌
Token: አድራሻ
Token: -
Token: ሜክሲኮ
Token: ኮሜርስ
Token: ጀርባ
Token: መዚድ
Token: ፕላዛ
Token: የመጀመሪያ
Token: ደረጃ
Token: እንደወጡ
Token: 101
Token: የቢሮ
Token: ቁጥር
Token: ያገኙናል
Token: or
Token: call
Token: 0920238243
Token: [EthioBrand](https://t.me/ethio_brand_collection)
Token: ✅

Message 2: ‼️ እሁድ ሁሌም ክፍት ነን ‼️

Reebok Club Vintage   
size 40,41,42,43
Price 3300 birr
📌አድራሻ-ሜክሲኮ ኮሜርስ ጀርባ መዚድ ፕላዛ የመጀመሪያ ደረጃ እንደወጡ 101 የቢሮ ቁጥር ያገኙናል or call 0920238243
                                    [EthioBrand](https://t.me/ethio_brand_collection) ✅
Token: ‼️
Token: እሁድ
Token: ሁሌም
Token: ክፍት
Token: ነን
Token: ‼️
Token: Reebok
Token: Club
Token: Vintage
Token: size

In [4]:
#Save labeled data to a file 
with open("../../data/labeled_amharic.conll", "w", encoding="utf-8") as f:
    for sentence in labeled_sentences:
        for token, label in sentence:
            f.write(f"{token} {label}\n")
        f.write("\n")  # Sentence separator
print("Saved labeled data to ../../data/labeled_amharic.conll")

Saved labeled data to ../../data/labeled_amharic.conll
