In [1]:
import pyarrow
import pandas as pd
import numpy as np
import os

# Folder holding the processed dataset, relative to this notebook.
parquet_dir = "../data/processed/"

# Ensure the folder exists; exist_ok=True makes this a no-op when it is already there.
os.makedirs(parquet_dir, exist_ok=True)

# Read the messages table straight from its parquet file.
messages_path = os.path.join(parquet_dir, "messages.parquet")
df = pd.read_parquet(messages_path)


In [2]:
# Plain-text preview of the loaded messages frame, to check its columns and size.
print(df)

        ID                                         Clean_Text  \
0     7403  BARDEFU 2 IN 1 Multi purpose juicer ኳሊቲ የሆነ የጁ...   
1     7401  portable electrical water dispenser ባለ 3 press...   
2     7399  GROOMING SET ሶስት በአንድ የያዘ የፀጉር ማሽን እና ሼቨር የሚሰራ...   
3     7395  GROOMING SET ሶስት በአንድ የያዘ የፀጉር ማሽን እና ሼቨር የሚሰራ...   
4     7393  1L Water Bottle High Quality 1L water time sca...   
...    ...                                                ...   
1861  2640  JORDAN 9 size 44142434445 MADE IN VIETNAM SHEW...   
1862  2639  Reebok hunter Green size 40414243 MADE IN VIET...   
1863  2638  NIKE Alpha Huarache Elite 3 size 40414243 MADE...   
1864  2637  Alexander McQUEEN size 36373839 SHEWA BRAND አድ...   
1865  2636  NIKE TEMPUS size 40414243 MADE IN ITALY SHEWA ...   

                                                 Tokens  \
0     [BARDEFU, 2, IN, 1, Multi, purpose, juicer, ኳሊ...   
1     [portable, electrical, water, dispenser, ባለ, 3...   
2     [GROOMING, SET, ሶስት, በአንድ, የያዘ, የፀጉር

In [4]:
%pip install spacy

Collecting spacy
  Using cached spacy-3.8.7-cp313-cp313-win_amd64.whl.metadata (28 kB)
Collecting spacy-legacy<3.1.0,>=3.0.11 (from spacy)
  Using cached spacy_legacy-3.0.12-py2.py3-none-any.whl.metadata (2.8 kB)
Collecting spacy-loggers<2.0.0,>=1.0.0 (from spacy)
  Using cached spacy_loggers-1.0.5-py3-none-any.whl.metadata (23 kB)
Collecting murmurhash<1.1.0,>=0.28.0 (from spacy)
  Using cached murmurhash-1.0.13-cp313-cp313-win_amd64.whl.metadata (2.2 kB)
Collecting cymem<2.1.0,>=2.0.2 (from spacy)
  Using cached cymem-2.0.11-cp313-cp313-win_amd64.whl.metadata (8.8 kB)
Collecting preshed<3.1.0,>=3.0.2 (from spacy)
  Using cached preshed-3.0.10-cp313-cp313-win_amd64.whl.metadata (2.5 kB)
Collecting thinc<8.4.0,>=8.3.4 (from spacy)
  Using cached thinc-8.3.6-cp313-cp313-win_amd64.whl.metadata (15 kB)
Collecting wasabi<1.2.0,>=0.9.1 (from spacy)
  Using cached wasabi-1.1.3-py3-none-any.whl.metadata (28 kB)
Collecting srsly<3.0.0,>=2.4.3 (from spacy)
  Using cached srsly-2.5.1-cp313-cp313


[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
import re
import pandas as pd
from typing import List, Dict, Optional
import spacy
from spacy.tokens import Doc, Span

# Build a blank 'en' spaCy pipeline and add its 'sentencizer' pipe.
# NOTE(review): no pretrained or Amharic-specific model is loaded here; a custom
# model would have to be trained/loaded separately if one is needed.
nlp = spacy.blank('en')
nlp.add_pipe('sentencizer')

<spacy.pipeline.sentencizer.Sentencizer at 0x2326af3b050>

Custom function for Ethiopian phone-number extraction

In [4]:
def extract_ethiopian_phone(text: str) -> Optional[str]:
    """
    Extract the first Ethiopian phone number found in ``text`` and normalise it.

    Handles formats:
    - 0912345678
    - 911234567
    - +251912345678
    - 251912345678
    - 011-123-4567 (landline)

    The patterns are tried in priority order (mobile, then landline, then
    hyphenated), so a mobile number is preferred over a landline even when
    the landline appears earlier in the text.

    Args:
        text: Free-form message text. Non-string values (e.g. ``None`` or
            ``NaN``) yield ``None`` instead of raising.

    Returns:
        ``'+251'`` followed by the 9-digit national number for every
        recognised mobile, landline and hyphenated number, or ``None`` when
        nothing matches. An 8-digit landline-like match (the landline pattern
        accepts 8 or 9 digits) is returned as bare digits, because prefixing
        it would not produce a full-length number.
    """
    if not isinstance(text, str):
        return None

    patterns = [
        r'(?<!\d)(?:0|251|\+251)?(9\d{8})(?!\d)',  # Mobile
        r'(?<!\d)(?:0|251|\+251)?(1[1-9]\d{6,7})(?!\d)',  # Landline
        r'(?<!\d)(0\d{2}-\d{3}-\d{4})(?!\d)'  # Hyphenated
    ]

    for pattern in patterns:
        match = re.search(pattern, text)
        if not match:
            continue

        national = match.group(1).replace('-', '')
        # Only the hyphenated pattern keeps the leading trunk '0' inside the
        # captured group; the other two leave the 0/251/+251 prefix outside it.
        if national.startswith('0'):
            national = national[1:]

        # Apply the country code uniformly, so landlines are normalised the
        # same way as mobile numbers.
        if len(national) == 9:
            return '+251' + national
        return national
    return None

Extract Entities

In [6]:

def extract_entities(text: str) -> Dict:
    """
    Extract marketplace entities from one message using regex heuristics.

    Args:
        text: Message text (may mix Amharic and English).

    Returns:
        Dict with the keys:
        - 'product_name': first non-blank line of ``text`` (None if there is none)
        - 'price': first price found, as a float (None if no price matched)
        - 'currency': 'ETB' by default; 'USD', 'EUR' or 'GBP' when the matched
          price carries a ``$``, ``€`` or ``£`` symbol
        - 'location': text after a location keyword, up to the next newline,
          '.' or ',' (None if no keyword is present)
        - 'contact_info': result of ``extract_ethiopian_phone(text)``
        - 'is_available': True when any sale/price indicator occurs in the text
    """
    # Numeric part shared by both price patterns. The first alternative accepts
    # comma-grouped thousands ("1,500,000", "12,345.67"), which the simple form
    # alone would truncate to the trailing groups; the second alternative is the
    # simple form (digits, optional single separator, digits).
    number = r'(\d+(?:,\d{3})+(?:\.\d*)?|\d+[\.,]?\d*)'

    # Price patterns in priority order: birr-denominated first, symbol-prefixed second.
    # Latin currency words must not be followed by another Latin letter, so that
    # "42 brand" is not read as "42 br".
    price_patterns = [
        r'(?:[\$]?\s*' + number + r'\s*(?:ብር|ብ|(?:ETB|birr|br)(?![A-Za-z])))',  # Ethiopian
        r'(?:[\$€£]\s*' + number + r')'  # International
    ]

    # Location patterns with Amharic support
    location_pattern = r'(?:pickup|delivery|location|ቦታ|አድራሻ)[:\s]*(.+?)(?:\n|$|\.|,)'

    currency_by_symbol = {'$': 'USD', '€': 'EUR', '£': 'GBP'}

    entities = {
        'product_name': None,
        'price': None,
        'currency': 'ETB',
        'location': None,
        'contact_info': None,
        'is_available': False
    }

    # Price extraction: the first parsable match of the highest-priority pattern
    # wins; lower-priority patterns are consulted only if nothing was found yet.
    for pattern in price_patterns:
        for match in re.finditer(pattern, text):
            try:
                price = float(match.group(1).replace(',', ''))
            except ValueError:
                continue
            entities['price'] = price
            for symbol, code in currency_by_symbol.items():
                if symbol in match.group(0):
                    entities['currency'] = code
                    break
            break
        if entities['price'] is not None:
            break

    # Location extraction
    loc_match = re.search(location_pattern, text, re.IGNORECASE)
    if loc_match:
        entities['location'] = loc_match.group(1).strip(' ,.-')

    # Contact info via the dedicated phone extractor
    entities['contact_info'] = extract_ethiopian_phone(text)

    # Product name: first non-blank line of the message
    clean_lines = [line.strip() for line in text.split('\n') if line.strip()]
    entities['product_name'] = clean_lines[0] if clean_lines else None

    # Availability check with Amharic support (case-insensitive substring test)
    availability_indicators = [
        'for sale', 'selling', 'available', 'የሚገኝ', 'ሽያጭ',
        'በዋጋ', 'ዋጋ', 'price', 'cost', 'ተገኝቷል'
    ]
    lowered_text = text.lower()  # computed once instead of once per indicator
    entities['is_available'] = any(
        ind.lower() in lowered_text for ind in availability_indicators
    )

    return entities

def process_message_dataset(df: pd.DataFrame) -> pd.DataFrame:
    """
    Run ``extract_entities`` over every row of ``df`` and tabulate the results.

    Each row's 'Clean_Text' is passed to the extractor and the row's 'ID' is
    stored under 'message_id'. A row whose extraction raises is reported on
    stdout and left out of the result (best-effort processing).

    Args:
        df: Frame with 'Clean_Text' and 'ID' columns.

    Returns:
        One output row per successfully processed message.
    """
    def _extracted_rows():
        # Yield one entity dict per row; failing rows are logged and skipped.
        for _, row in df.iterrows():
            try:
                fields = extract_entities(row['Clean_Text'])
                fields['message_id'] = row['ID']
            except Exception as e:
                print(f"Error processing message {row['ID']}: {str(e)}")
            else:
                yield fields

    return pd.DataFrame(list(_extracted_rows()))

# Run entity extraction over every message in the DataFrame loaded earlier.
ner_df = process_message_dataset(df)


In [7]:
# Display the extracted entities to inspect them.
ner_df

Unnamed: 0,product_name,price,currency,location,contact_info,is_available,message_id
0,BARDEFU 2 IN 1 Multi purpose juicer ኳሊቲ የሆነ የጁ...,6800.0,ETB,ቁ1 መገናኛ ታሜ ጋስ ህንፃ ጎን ስሪ ኤም ሲቲ ሞል ሁለተኛ ፎቅ ቢሮ ቁ ...,+251909522840,True,7403
1,portable electrical water dispenser ባለ 3 press...,1600.0,ETB,ቁ1 መገናኛ ታሜ ጋስ ህንፃ ጎን ስሪ ኤም ሲቲ ሞል ሁለተኛ ፎቅ ቢሮ ቁ ...,+251909522840,True,7401
2,GROOMING SET ሶስት በአንድ የያዘ የፀጉር ማሽን እና ሼቨር የሚሰራ...,2300.0,ETB,ቁ1 መገናኛ ታሜ ጋስ ህንፃ ጎን ስሪ ኤም ሲቲ ሞል ሁለተኛ ፎቅ ቢሮ ቁ ...,+251909522840,True,7399
3,GROOMING SET ሶስት በአንድ የያዘ የፀጉር ማሽን እና ሼቨር የሚሰራ...,2300.0,ETB,ቁ1 መገናኛ ታሜ ጋስ ህንፃ ጎን ስሪ ኤም ሲቲ ሞል ሁለተኛ ፎቅ ቢሮ ቁ ...,+251909522840,True,7395
4,1L Water Bottle High Quality 1L water time sca...,800.0,ETB,ቁ1 መገናኛ ታሜ ጋስ ህንፃ ጎን ስሪ ኤም ሲቲ ሞል ሁለተኛ ፎቅ ቢሮ ቁ ...,+251909522840,True,7393
...,...,...,...,...,...,...,...
1861,JORDAN 9 size 44142434445 MADE IN VIETNAM SHEW...,,ETB,ድሬዳዋ አሸዋ ሚና 1 ፎቅ እንገኛለን ስልክ 0987336458 0924209...,+251987336458,False,2640
1862,Reebok hunter Green size 40414243 MADE IN VIET...,,ETB,ድሬዳዋ አሸዋ ሚና 1 ፎቅ እንገኛለን ስልክ 0987336458 0924209...,+251987336458,False,2639
1863,NIKE Alpha Huarache Elite 3 size 40414243 MADE...,,ETB,ድሬዳዋ አሸዋ ሚና 1 ፎቅ እንገኛለን ስልክ 0987336458 0924209...,+251987336458,False,2638
1864,Alexander McQUEEN size 36373839 SHEWA BRAND አድ...,,ETB,ድሬዳዋ አሸዋ ሚና 1 ፎቅ እንገኛለን ስልክ 0987336458 0924209...,+251987336458,False,2637


Categorize Products

In [8]:
# Add product categorization
def categorize_product(product_name: str) -> str:
    """
    Categorize a product by keyword lookup in its (lower-cased) name.

    Categories are checked in declaration order and the first one with a
    keyword contained in the name wins. Matching is a plain substring test,
    so a keyword also matches inside longer words.

    Args:
        product_name: Product name text. Missing values (``None``/``NaN``,
            e.g. from a message with no non-blank line) are accepted.

    Returns:
        The matching category name, or 'Other' when no keyword matches or
        the name is missing.
    """
    # A missing name cannot be lower-cased; treat it as uncategorised.
    if not isinstance(product_name, str):
        return 'Other'

    product_name = product_name.lower()

    categories = {
        'Electronics': ['phone', 'tv', 'laptop', 'camera', 'charger', 'juicer', 'dispenser'],
        'Clothing': ['shirt', 'dress', 'jeans', 'shoe', 'jordan', 'nike', 'reebok'],
        'Home Goods': ['bottle', 'grooming', 'set', 'kitchen', 'furniture'],
        'Cosmetics': ['cream', 'makeup', 'perfume', 'cosmetic'],
        'Food': ['coffee', 'tea', 'honey', 'spice']
    }

    for category, keywords in categories.items():
        if any(keyword in product_name for keyword in keywords):
            return category

    return 'Other'

# Derive a category for every row from its extracted product name.
# NOTE: this adds the 'product_category' column to ner_df in place.
ner_df['product_category'] = ner_df['product_name'].apply(categorize_product)

In [9]:
# Display the frame again, now including the product_category column.
ner_df

Unnamed: 0,product_name,price,currency,location,contact_info,is_available,message_id,product_category
0,BARDEFU 2 IN 1 Multi purpose juicer ኳሊቲ የሆነ የጁ...,6800.0,ETB,ቁ1 መገናኛ ታሜ ጋስ ህንፃ ጎን ስሪ ኤም ሲቲ ሞል ሁለተኛ ፎቅ ቢሮ ቁ ...,+251909522840,True,7403,Electronics
1,portable electrical water dispenser ባለ 3 press...,1600.0,ETB,ቁ1 መገናኛ ታሜ ጋስ ህንፃ ጎን ስሪ ኤም ሲቲ ሞል ሁለተኛ ፎቅ ቢሮ ቁ ...,+251909522840,True,7401,Electronics
2,GROOMING SET ሶስት በአንድ የያዘ የፀጉር ማሽን እና ሼቨር የሚሰራ...,2300.0,ETB,ቁ1 መገናኛ ታሜ ጋስ ህንፃ ጎን ስሪ ኤም ሲቲ ሞል ሁለተኛ ፎቅ ቢሮ ቁ ...,+251909522840,True,7399,Home Goods
3,GROOMING SET ሶስት በአንድ የያዘ የፀጉር ማሽን እና ሼቨር የሚሰራ...,2300.0,ETB,ቁ1 መገናኛ ታሜ ጋስ ህንፃ ጎን ስሪ ኤም ሲቲ ሞል ሁለተኛ ፎቅ ቢሮ ቁ ...,+251909522840,True,7395,Home Goods
4,1L Water Bottle High Quality 1L water time sca...,800.0,ETB,ቁ1 መገናኛ ታሜ ጋስ ህንፃ ጎን ስሪ ኤም ሲቲ ሞል ሁለተኛ ፎቅ ቢሮ ቁ ...,+251909522840,True,7393,Home Goods
...,...,...,...,...,...,...,...,...
1861,JORDAN 9 size 44142434445 MADE IN VIETNAM SHEW...,,ETB,ድሬዳዋ አሸዋ ሚና 1 ፎቅ እንገኛለን ስልክ 0987336458 0924209...,+251987336458,False,2640,Clothing
1862,Reebok hunter Green size 40414243 MADE IN VIET...,,ETB,ድሬዳዋ አሸዋ ሚና 1 ፎቅ እንገኛለን ስልክ 0987336458 0924209...,+251987336458,False,2639,Clothing
1863,NIKE Alpha Huarache Elite 3 size 40414243 MADE...,,ETB,ድሬዳዋ አሸዋ ሚና 1 ፎቅ እንገኛለን ስልክ 0987336458 0924209...,+251987336458,False,2638,Clothing
1864,Alexander McQUEEN size 36373839 SHEWA BRAND አድ...,,ETB,ድሬዳዋ አሸዋ ሚና 1 ፎቅ እንገኛለን ስልክ 0987336458 0924209...,+251987336458,False,2637,Other


Save extracted data

In [11]:
# Write the extracted entities to CSV, leaving the row index out of the file.
ner_output_path = "../data/processed/ner_extracted.csv"
ner_df.to_csv(ner_output_path, index=False)