In [2]:
import pandas as pd
import numpy as np
import pyarrow as pa

In [3]:
# Read the processed messages back from parquet and preview the first rows
parquet_df = pd.read_parquet(path='../data/processed/messages.parquet')
print(parquet_df.head(n=5))

     ID                                         Clean_Text  \
0  7403  BARDEFU 2 IN 1 Multi purpose juicer ·ä≥·àä·â≤ ·ã®·àÜ·äê ·ã®·åÅ...   
1  7401  portable electrical water dispenser ·â£·àà 3 press...   
2  7399  GROOMING SET ·à∂·àµ·âµ ·â†·ä†·äï·ãµ ·ã®·ã´·ãò ·ã®·çÄ·åâ·à≠ ·àõ·àΩ·äï ·ä•·äì ·àº·â®·à≠ ·ã®·àö·à∞·à´...   
3  7395  GROOMING SET ·à∂·àµ·âµ ·â†·ä†·äï·ãµ ·ã®·ã´·ãò ·ã®·çÄ·åâ·à≠ ·àõ·àΩ·äï ·ä•·äì ·àº·â®·à≠ ·ã®·àö·à∞·à´...   
4  7393  1L Water Bottle High Quality 1L water time sca...   

                                              Tokens  \
0  [BARDEFU, 2, IN, 1, Multi, purpose, juicer, ·ä≥·àä...   
1  [portable, electrical, water, dispenser, ·â£·àà, 3...   
2  [GROOMING, SET, ·à∂·àµ·âµ, ·â†·ä†·äï·ãµ, ·ã®·ã´·ãò, ·ã®·çÄ·åâ·à≠, ·àõ·àΩ·äï, ·ä•·äì,...   
3  [GROOMING, SET, ·à∂·àµ·âµ, ·â†·ä†·äï·ãµ, ·ã®·ã´·ãò, ·ã®·çÄ·åâ·à≠, ·àõ·àΩ·äï, ·ä•·äì,...   
4  [1L, Water, Bottle, High, Quality, 1L, water, ...   

                                           Processed  \
0  [BARDEFU, 2, IN, 1, Mul

In [None]:

# Select the first 30 messages (rows with a missing ID or token list are dropped).
# .copy() gives `subset` its own data, so the column assignment below does not
# write into a slice of another frame (which can raise SettingWithCopyWarning).
subset = parquet_df[['ID', 'Processed']].dropna().head(30).copy()

# Convert stringified token lists (if necessary).
# The `not subset.empty` guard keeps an empty selection from raising IndexError
# on `.iloc[0]`.
if not subset.empty and isinstance(subset.iloc[0]['Processed'], str):
    import ast
    subset['Processed'] = subset['Processed'].apply(ast.literal_eval)

# Labeling function
def label_tokens(tokens, valid_labels=('B-Product', 'I-Product', 'B-LOC', 'I-LOC',
                                       'B-PRICE', 'I-PRICE', 'O')):
    """Interactively ask for a tag for every token of one message.

    Each token is shown as an ``input()`` prompt. The answer is stripped of
    surrounding whitespace; an empty answer means ``'O'``. Answers that are not
    in ``valid_labels`` are rejected and the same token is asked again, so a
    typo cannot end up in the labeled data.

    Args:
        tokens: Iterable of tokens to label, in message order.
        valid_labels: Accepted tags. Defaults to the seven tags listed in the
            printed instructions. Must be non-empty, otherwise no answer can
            ever be accepted.

    Returns:
        A list of ``(token, label)`` tuples, one per token, in input order.
    """
    choices = ', '.join(valid_labels)

    print("\nInstructions:")
    print(f" - Use: {choices}")
    print(" - Press Enter to assign 'O' (Outside) to a token\n")

    labeled_tokens = []
    for tok in tokens:
        # `or "O"` maps an empty answer to the Outside tag.
        label = input(f"{tok}: ").strip() or "O"
        # Re-prompt until the answer is one of the accepted tags.
        while label not in valid_labels:
            print(f"   '{label}' is not a valid label. Use one of: {choices}")
            label = input(f"{tok}: ").strip() or "O"
        labeled_tokens.append((tok, label))
    return labeled_tokens

# Save labeled data to CoNLL format: one "token tag" pair per line, with a
# blank line between messages.
with open("../data/processed/amharic_ner_conll.txt", "w", encoding="utf-8") as f:
    for _, row in subset.iterrows():
        tokens = row['Processed']
        labeled = label_tokens(tokens)
        for token, tag in labeled:
            f.write(f"{token} {tag}\n")
        f.write("\n")  # Blank line separates messages
        # Labeling is slow manual work; push each finished message to disk
        # right away instead of leaving it in the write buffer.
        f.flush()

# Report the number of messages actually processed rather than a fixed count.
print(f"✅ Done! Saved {len(subset)} labeled messages to 'amharic_ner_conll.txt'")



Instructions:
 - Use: B-Product, I-Product, B-LOC, I-LOC, B-PRICE, I-PRICE, O
 - Press Enter to assign 'O' (Outside) to a token


Instructions:
 - Use: B-Product, I-Product, B-LOC, I-LOC, B-PRICE, I-PRICE, O
 - Press Enter to assign 'O' (Outside) to a token



In [None]:
import ipywidgets as widgets
from IPython.display import display, clear_output
import pandas as pd

# First 30 rows that have both an ID and a processed token list
subset = (
    parquet_df[['ID', 'Processed']]
    .dropna()
    .head(30)
)

# Tags offered in every dropdown
labels_list = [
    'B-Product', 'I-Product',
    'B-LOC', 'I-LOC',
    'B-PRICE', 'I-PRICE',
    'O',
    'B-CONTACT', 'I-CONTACT',
]

# Destination file for the labeled tokens
output_path = '../data/processed/amharic_ner_conll_labeled.txt'

def label_message(tokens, idx=0):
    """Show a labeling form for one message and save the labels on submit.

    One dropdown is displayed per token (default tag ``'O'``), followed by a
    "Submit Labels" button and an output area. On submit the chosen tags are
    appended to ``output_path`` as ``token label`` lines followed by a blank
    line, and the form for message ``idx + 1`` of ``subset`` is shown. After
    the last message a completion notice is printed instead.

    Relies on the notebook-level names ``labels_list``, ``output_path`` and
    ``subset``.

    Args:
        tokens: Tokens of the message to label.
        idx: Position of this message in ``subset``; used for the progress
            text and to find the next message.
    """
    # str(tok) keeps a non-string token from failing the description trait.
    # description_width='initial' lets long tokens show in full instead of
    # being cut off in the label.
    dropdowns = [
        widgets.Dropdown(
            options=labels_list,
            value='O',
            description=str(tok),
            style={'description_width': 'initial'},
        )
        for tok in tokens
    ]

    btn = widgets.Button(description="Submit Labels")
    out = widgets.Output()

    def set_form_disabled(disabled):
        # Lock or unlock the whole form (button plus every dropdown).
        btn.disabled = disabled
        for dd in dropdowns:
            dd.disabled = disabled

    def on_submit(b):
        # The file is opened in append mode, so a repeated or queued second
        # click would write this message twice and start a second form.
        if b.disabled:
            return
        set_form_disabled(True)

        with out:
            clear_output()
            labels = [dd.value for dd in dropdowns]
            for token, label in zip(tokens, labels):
                print(f"{token} : {label}")

            # Save current labeled tokens to file (append mode)
            try:
                with open(output_path, 'a', encoding='utf-8') as f:
                    for token, label in zip(tokens, labels):
                        f.write(f"{token} {label}\n")
                    f.write('\n')  # blank line to separate messages
            except OSError:
                # Nothing was recorded as done; unlock the form so the user
                # can retry after fixing the problem.
                set_form_disabled(False)
                raise

            print(f"\nLabeled message {idx+1}/{len(subset)} saved!")

            # Move to next message or finish
            if idx + 1 < len(subset):
                # Remove the finished form so only the current message is
                # on screen instead of an ever-growing stack of dropdowns.
                for finished in (*dropdowns, b):
                    finished.close()
                clear_output(wait=True)
                label_message(subset.iloc[idx+1]['Processed'], idx + 1)
            else:
                print("\n✅ All messages labeled and saved.")

    btn.on_click(on_submit)

    display(*dropdowns, btn, out)

# Truncate the output file so this labeling run starts from an empty file
open(output_path, 'w', encoding='utf-8').close()

# Show the form for the first message; later messages follow on each submit
label_message(tokens=subset.iloc[0]['Processed'], idx=0)


Dropdown(description='BARDEFU', index=6, options=('B-Product', 'I-Product', 'B-LOC', 'I-LOC', 'B-PRICE', 'I-PR‚Ä¶

Dropdown(description='2', index=6, options=('B-Product', 'I-Product', 'B-LOC', 'I-LOC', 'B-PRICE', 'I-PRICE', ‚Ä¶

Dropdown(description='IN', index=6, options=('B-Product', 'I-Product', 'B-LOC', 'I-LOC', 'B-PRICE', 'I-PRICE',‚Ä¶

Dropdown(description='1', index=6, options=('B-Product', 'I-Product', 'B-LOC', 'I-LOC', 'B-PRICE', 'I-PRICE', ‚Ä¶

Dropdown(description='Multi', index=6, options=('B-Product', 'I-Product', 'B-LOC', 'I-LOC', 'B-PRICE', 'I-PRIC‚Ä¶

Dropdown(description='purpose', index=6, options=('B-Product', 'I-Product', 'B-LOC', 'I-LOC', 'B-PRICE', 'I-PR‚Ä¶

Dropdown(description='juicer', index=6, options=('B-Product', 'I-Product', 'B-LOC', 'I-LOC', 'B-PRICE', 'I-PRI‚Ä¶

Dropdown(description='·ä≥·àä·â≤', index=6, options=('B-Product', 'I-Product', 'B-LOC', 'I-LOC', 'B-PRICE', 'I-PRICE'‚Ä¶

Dropdown(description='·ã®·åÅ·àµ', index=6, options=('B-Product', 'I-Product', 'B-LOC', 'I-LOC', 'B-PRICE', 'I-PRICE'‚Ä¶

Dropdown(description='·àò·çç·å´', index=6, options=('B-Product', 'I-Product', 'B-LOC', 'I-LOC', 'B-PRICE', 'I-PRICE'‚Ä¶

Dropdown(description='·ã®·åÄ·à≠·àò·äï', index=6, options=('B-Product', 'I-Product', 'B-LOC', 'I-LOC', 'B-PRICE', 'I-PRIC‚Ä¶

Dropdown(description='·â¥·ä≠·äñ·àé·åÇ', index=6, options=('B-Product', 'I-Product', 'B-LOC', 'I-LOC', 'B-PRICE', 'I-PRIC‚Ä¶

Dropdown(description='3', index=6, options=('B-Product', 'I-Product', 'B-LOC', 'I-LOC', 'B-PRICE', 'I-PRICE', ‚Ä¶

Dropdown(description='·àå·âµ·à≠', index=6, options=('B-Product', 'I-Product', 'B-LOC', 'I-LOC', 'B-PRICE', 'I-PRICE'‚Ä¶

Dropdown(description='·åÅ·àµ', index=6, options=('B-Product', 'I-Product', 'B-LOC', 'I-LOC', 'B-PRICE', 'I-PRICE',‚Ä¶

Dropdown(description='·ã®·àö·çà·å≠', index=6, options=('B-Product', 'I-Product', 'B-LOC', 'I-LOC', 'B-PRICE', 'I-PRICE‚Ä¶

Dropdown(description='·åÜ·åç', index=6, options=('B-Product', 'I-Product', 'B-LOC', 'I-LOC', 'B-PRICE', 'I-PRICE',‚Ä¶

Dropdown(description='·ã´·àà·ãç', index=6, options=('B-Product', 'I-Product', 'B-LOC', 'I-LOC', 'B-PRICE', 'I-PRICE'‚Ä¶

Dropdown(description='·ã®·â•·äì', index=6, options=('B-Product', 'I-Product', 'B-LOC', 'I-LOC', 'B-PRICE', 'I-PRICE'‚Ä¶

Dropdown(description='·ä•·äì', index=6, options=('B-Product', 'I-Product', 'B-LOC', 'I-LOC', 'B-PRICE', 'I-PRICE',‚Ä¶

Dropdown(description='·ã®·âÖ·àò·àõ', index=6, options=('B-Product', 'I-Product', 'B-LOC', 'I-LOC', 'B-PRICE', 'I-PRICE‚Ä¶

Dropdown(description='·âÖ·àò·àù', index=6, options=('B-Product', 'I-Product', 'B-LOC', 'I-LOC', 'B-PRICE', 'I-PRICE'‚Ä¶

Dropdown(description='·àò·çç·å´', index=6, options=('B-Product', 'I-Product', 'B-LOC', 'I-LOC', 'B-PRICE', 'I-PRICE'‚Ä¶

Dropdown(description='·ã´·àà·ãç', index=6, options=('B-Product', 'I-Product', 'B-LOC', 'I-LOC', 'B-PRICE', 'I-PRICE'‚Ä¶

Dropdown(description='8000Watt', index=6, options=('B-Product', 'I-Product', 'B-LOC', 'I-LOC', 'B-PRICE', 'I-P‚Ä¶

Dropdown(description='·àù·àã·åÆ·âπ', index=6, options=('B-Product', 'I-Product', 'B-LOC', 'I-LOC', 'B-PRICE', 'I-PRICE‚Ä¶

Dropdown(description='·å†·äï·ä´·à´', index=6, options=('B-Product', 'I-Product', 'B-LOC', 'I-LOC', 'B-PRICE', 'I-PRICE‚Ä¶

Dropdown(description='·ã®·àÜ·äë', index=6, options=('B-Product', 'I-Product', 'B-LOC', 'I-LOC', 'B-PRICE', 'I-PRICE'‚Ä¶

Dropdown(description='·àà·â§·âµ', index=6, options=('B-Product', 'I-Product', 'B-LOC', 'I-LOC', 'B-PRICE', 'I-PRICE'‚Ä¶

Dropdown(description='·ä•·äï·ã≤·àÅ·àù', index=6, options=('B-Product', 'I-Product', 'B-LOC', 'I-LOC', 'B-PRICE', 'I-PRIC‚Ä¶

Dropdown(description='·àà·àµ·à´', index=6, options=('B-Product', 'I-Product', 'B-LOC', 'I-LOC', 'B-PRICE', 'I-PRICE'‚Ä¶

Dropdown(description='·ã®·àö·àÜ·äï', index=6, options=('B-Product', 'I-Product', 'B-LOC', 'I-LOC', 'B-PRICE', 'I-PRICE‚Ä¶

Dropdown(description='·ä†·à™·çç', index=6, options=('B-Product', 'I-Product', 'B-LOC', 'I-LOC', 'B-PRICE', 'I-PRICE'‚Ä¶

Dropdown(description='·ä•·âÉ', index=6, options=('B-Product', 'I-Product', 'B-LOC', 'I-LOC', 'B-PRICE', 'I-PRICE',‚Ä¶

Dropdown(description='·àà·ä†·å†·âÉ·âÄ·àù', index=6, options=('B-Product', 'I-Product', 'B-LOC', 'I-LOC', 'B-PRICE', 'I-PRI‚Ä¶

Dropdown(description='·âÄ·àã·àç', index=6, options=('B-Product', 'I-Product', 'B-LOC', 'I-LOC', 'B-PRICE', 'I-PRICE'‚Ä¶

Dropdown(description='·â†·à®·ã∂', index=6, options=('B-Product', 'I-Product', 'B-LOC', 'I-LOC', 'B-PRICE', 'I-PRICE'‚Ä¶

Dropdown(description='·ã≠·çà·å´·àç', index=6, options=('B-Product', 'I-Product', 'B-LOC', 'I-LOC', 'B-PRICE', 'I-PRICE‚Ä¶

Dropdown(description='·ãã·åã6800·â•·à≠', index=6, options=('B-Product', 'I-Product', 'B-LOC', 'I-LOC', 'B-PRICE', 'I-P‚Ä¶

Dropdown(description='·ãç·àµ·äï', index=6, options=('B-Product', 'I-Product', 'B-LOC', 'I-LOC', 'B-PRICE', 'I-PRICE'‚Ä¶

Dropdown(description='·çç·à¨', index=6, options=('B-Product', 'I-Product', 'B-LOC', 'I-LOC', 'B-PRICE', 'I-PRICE',‚Ä¶

Dropdown(description='·ã´·àà·äï', index=6, options=('B-Product', 'I-Product', 'B-LOC', 'I-LOC', 'B-PRICE', 'I-PRICE'‚Ä¶

Dropdown(description='Limited', index=6, options=('B-Product', 'I-Product', 'B-LOC', 'I-LOC', 'B-PRICE', 'I-PR‚Ä¶

Dropdown(description='Stock', index=6, options=('B-Product', 'I-Product', 'B-LOC', 'I-LOC', 'B-PRICE', 'I-PRIC‚Ä¶

Dropdown(description='·ä†·ãµ·à´·àª', index=6, options=('B-Product', 'I-Product', 'B-LOC', 'I-LOC', 'B-PRICE', 'I-PRICE‚Ä¶

Dropdown(description='·âÅ1', index=6, options=('B-Product', 'I-Product', 'B-LOC', 'I-LOC', 'B-PRICE', 'I-PRICE',‚Ä¶

Dropdown(description='·àò·åà·äì·äõ', index=6, options=('B-Product', 'I-Product', 'B-LOC', 'I-LOC', 'B-PRICE', 'I-PRICE‚Ä¶

Dropdown(description='·â≥·àú', index=6, options=('B-Product', 'I-Product', 'B-LOC', 'I-LOC', 'B-PRICE', 'I-PRICE',‚Ä¶

Dropdown(description='·åã·àµ', index=6, options=('B-Product', 'I-Product', 'B-LOC', 'I-LOC', 'B-PRICE', 'I-PRICE',‚Ä¶

Dropdown(description='·àÖ·äï·çÉ', index=6, options=('B-Product', 'I-Product', 'B-LOC', 'I-LOC', 'B-PRICE', 'I-PRICE'‚Ä¶

Dropdown(description='·åé·äï', index=6, options=('B-Product', 'I-Product', 'B-LOC', 'I-LOC', 'B-PRICE', 'I-PRICE',‚Ä¶

Dropdown(description='·àµ·à™', index=6, options=('B-Product', 'I-Product', 'B-LOC', 'I-LOC', 'B-PRICE', 'I-PRICE',‚Ä¶

Dropdown(description='·ä§·àù', index=6, options=('B-Product', 'I-Product', 'B-LOC', 'I-LOC', 'B-PRICE', 'I-PRICE',‚Ä¶

Dropdown(description='·à≤·â≤', index=6, options=('B-Product', 'I-Product', 'B-LOC', 'I-LOC', 'B-PRICE', 'I-PRICE',‚Ä¶

Dropdown(description='·àû·àç', index=6, options=('B-Product', 'I-Product', 'B-LOC', 'I-LOC', 'B-PRICE', 'I-PRICE',‚Ä¶

Dropdown(description='·àÅ·àà·â∞·äõ', index=6, options=('B-Product', 'I-Product', 'B-LOC', 'I-LOC', 'B-PRICE', 'I-PRICE‚Ä¶

Dropdown(description='·çé·âÖ', index=6, options=('B-Product', 'I-Product', 'B-LOC', 'I-LOC', 'B-PRICE', 'I-PRICE',‚Ä¶

Dropdown(description='·â¢·àÆ', index=6, options=('B-Product', 'I-Product', 'B-LOC', 'I-LOC', 'B-PRICE', 'I-PRICE',‚Ä¶

Dropdown(description='·âÅ', index=6, options=('B-Product', 'I-Product', 'B-LOC', 'I-LOC', 'B-PRICE', 'I-PRICE', ‚Ä¶

Dropdown(description='SL05A·ä®', index=6, options=('B-Product', 'I-Product', 'B-LOC', 'I-LOC', 'B-PRICE', 'I-PRI‚Ä¶

Dropdown(description='·àä·çç·â±', index=6, options=('B-Product', 'I-Product', 'B-LOC', 'I-LOC', 'B-PRICE', 'I-PRICE'‚Ä¶

Dropdown(description='·çä·âµ', index=6, options=('B-Product', 'I-Product', 'B-LOC', 'I-LOC', 'B-PRICE', 'I-PRICE',‚Ä¶

Dropdown(description='·àà', index=6, options=('B-Product', 'I-Product', 'B-LOC', 'I-LOC', 'B-PRICE', 'I-PRICE', ‚Ä¶

Dropdown(description='·çä·âµ', index=6, options=('B-Product', 'I-Product', 'B-LOC', 'I-LOC', 'B-PRICE', 'I-PRICE',‚Ä¶

Dropdown(description='·âÅ2', index=6, options=('B-Product', 'I-Product', 'B-LOC', 'I-LOC', 'B-PRICE', 'I-PRICE',‚Ä¶

Dropdown(description='·àà·â°', index=6, options=('B-Product', 'I-Product', 'B-LOC', 'I-LOC', 'B-PRICE', 'I-PRICE',‚Ä¶

Dropdown(description='·àò·ã≥·àÖ·äí·ãì·àà·àù', index=6, options=('B-Product', 'I-Product', 'B-LOC', 'I-LOC', 'B-PRICE', 'I-PR‚Ä¶

Dropdown(description='·â§·â∞·ä≠·à≠·àµ·â≤·ã´·äï', index=6, options=('B-Product', 'I-Product', 'B-LOC', 'I-LOC', 'B-PRICE', 'I-P‚Ä¶

Dropdown(description='·ãà·ã∞', index=6, options=('B-Product', 'I-Product', 'B-LOC', 'I-LOC', 'B-PRICE', 'I-PRICE',‚Ä¶

Dropdown(description='·àô·ãö·âÉ', index=6, options=('B-Product', 'I-Product', 'B-LOC', 'I-LOC', 'B-PRICE', 'I-PRICE'‚Ä¶

Dropdown(description='·â§·âµ', index=6, options=('B-Product', 'I-Product', 'B-LOC', 'I-LOC', 'B-PRICE', 'I-PRICE',‚Ä¶

Dropdown(description='·ä®·çç', index=6, options=('B-Product', 'I-Product', 'B-LOC', 'I-LOC', 'B-PRICE', 'I-PRICE',‚Ä¶

Dropdown(description='·â•·àé', index=6, options=('B-Product', 'I-Product', 'B-LOC', 'I-LOC', 'B-PRICE', 'I-PRICE',‚Ä¶

Dropdown(description='·ãõ·àù_·àû·àç', index=6, options=('B-Product', 'I-Product', 'B-LOC', 'I-LOC', 'B-PRICE', 'I-PRIC‚Ä¶

Dropdown(description='2·äõ', index=6, options=('B-Product', 'I-Product', 'B-LOC', 'I-LOC', 'B-PRICE', 'I-PRICE',‚Ä¶

Dropdown(description='·çé·âÖ', index=6, options=('B-Product', 'I-Product', 'B-LOC', 'I-LOC', 'B-PRICE', 'I-PRICE',‚Ä¶

Dropdown(description='·â¢·àÆ·âÅ', index=6, options=('B-Product', 'I-Product', 'B-LOC', 'I-LOC', 'B-PRICE', 'I-PRICE'‚Ä¶

Dropdown(description='214', index=6, options=('B-Product', 'I-Product', 'B-LOC', 'I-LOC', 'B-PRICE', 'I-PRICE'‚Ä¶

Dropdown(description='0909522840', index=6, options=('B-Product', 'I-Product', 'B-LOC', 'I-LOC', 'B-PRICE', 'I‚Ä¶

Dropdown(description='0923350054', index=6, options=('B-Product', 'I-Product', 'B-LOC', 'I-LOC', 'B-PRICE', 'I‚Ä¶

Dropdown(description='·àà·â°', index=6, options=('B-Product', 'I-Product', 'B-LOC', 'I-LOC', 'B-PRICE', 'I-PRICE',‚Ä¶

Dropdown(description='·âÖ·à≠·äï·å´·çç0973611819', index=6, options=('B-Product', 'I-Product', 'B-LOC', 'I-LOC', 'B-PRICE‚Ä¶

Dropdown(description='·â†Telegram', index=6, options=('B-Product', 'I-Product', 'B-LOC', 'I-LOC', 'B-PRICE', 'I-‚Ä¶

Dropdown(description='·àà·àõ·ãò·ãù', index=6, options=('B-Product', 'I-Product', 'B-LOC', 'I-LOC', 'B-PRICE', 'I-PRICE‚Ä¶

Dropdown(description='·ã≠·å†·âÄ·àô', index=6, options=('B-Product', 'I-Product', 'B-LOC', 'I-LOC', 'B-PRICE', 'I-PRICE‚Ä¶

Dropdown(description='shager_onlinestore', index=6, options=('B-Product', 'I-Product', 'B-LOC', 'I-LOC', 'B-PR‚Ä¶

Dropdown(description='·àà·â∞·å®·àõ·à™', index=6, options=('B-Product', 'I-Product', 'B-LOC', 'I-LOC', 'B-PRICE', 'I-PRIC‚Ä¶

Dropdown(description='·àõ·â•·à´·à™·ã´', index=6, options=('B-Product', 'I-Product', 'B-LOC', 'I-LOC', 'B-PRICE', 'I-PRIC‚Ä¶

Dropdown(description='·ã®·â¥·àå·åç·à´·àù', index=6, options=('B-Product', 'I-Product', 'B-LOC', 'I-LOC', 'B-PRICE', 'I-PRI‚Ä¶

Dropdown(description='·åà·çÉ·âΩ·äï', index=6, options=('B-Product', 'I-Product', 'B-LOC', 'I-LOC', 'B-PRICE', 'I-PRICE‚Ä¶

Dropdown(description='httpstmeShageronlinestore', index=6, options=('B-Product', 'I-Product', 'B-LOC', 'I-LOC'‚Ä¶

Button(description='Submit Labels', style=ButtonStyle())

Output()