# Task-2: Label Subset to CoNLL
This notebook demonstrates running the labeling script (`src.labeling.conll_labeler`) and inspecting the resulting `data/labels/subset.conll` file.

In [3]:
import sys, runpy, importlib
from pathlib import Path
# Drop every cached module of the ``src`` package so that re-running this
# cell imports the current on-disk code instead of a stale cached copy.
for m in list(sys.modules):
    if m == "src" or m.startswith("src."):
        del sys.modules[m]

# Search the current working directory and then each of its ancestors for
# the first directory that contains a ``src`` sub-directory.
PROJECT_ROOT = next(
    (p for p in [Path.cwd(), *Path.cwd().parents] if (p / "src").is_dir()),
    None,
)
if PROJECT_ROOT is None:
    raise RuntimeError("Could not locate project root containing a 'src' directory")

# Keep the project root as the FIRST entry of sys.path, but only once:
# re-running the cell must not pile up duplicate entries. Slice assignment
# mutates the existing list object rather than rebinding sys.path.
root_entry = str(PROJECT_ROOT)
sys.path[:] = [root_entry, *(entry for entry in sys.path if entry != root_entry)]

# Create an empty ``src/__init__.py`` if it is missing (an existing file's
# contents are left untouched), then reset the import system's finder caches
# so a file created a moment ago is noticed by the next import.
SRC_DIR = PROJECT_ROOT / "src"
SRC_DIR.joinpath("__init__.py").touch(exist_ok=True)
importlib.invalidate_caches()


# Make sure data/preview_messages.csv exists before the labeler is launched.
# NOTE(review): presumably this is the file the labeling module reads —
# confirm against src/labeling/conll_labeler.
csv_path = PROJECT_ROOT.joinpath("data", "preview_messages.csv")
csv_path.parent.mkdir(parents=True, exist_ok=True)
if not csv_path.exists():
    # No CSV yet: write a header line plus a single dummy row.
    csv_path.write_text("text\ntest message\n", encoding="utf-8")
    print(
        f"[info] {csv_path} was missing – created a 1-row placeholder.\n"
        "       Run your ingestion pipeline later to replace it."
    )

# Execute the labeling module with __name__ set to "__main__", i.e. the same
# way ``python -m src.labeling.conll_labeler`` would run it.
runpy.run_module(mod_name="src.labeling.conll_labeler", run_name="__main__")


# CoNLL file inspected by the rest of this cell.
conll_path = PROJECT_ROOT / "data" / "labels" / "subset.conll"

# Read and decode the file exactly once; the preview and the optional parse
# below both work from this in-memory copy.
conll_text = conll_path.read_text(encoding="utf-8")

print("\n── First 60 lines of subset.conll ──\n")
print("\n".join(conll_text.splitlines()[:60]))

# ``conllu`` is an optional dependency. Only the import itself is guarded, so
# an ImportError raised while parsing is not misreported as "not installed".
try:
    from conllu import parse
except ImportError:
    print("\n[optional] pip install conllu  ➜ structured access to the file")
else:
    # Two columns per token line: the surface form and its tag.
    sents = parse(conll_text, fields=("form", "tag"))
    print(f"\nParsed {len(sents)} sentences  ({sum(len(s) for s in sents)} tokens)")

Wrote C:\Users\senta\OneDrive\Documents\Proj\10 Ac\Amharic-E-commerce-Data-Extractor\data\labels\subset.conll with 1032 lines

── First 60 lines of subset.conll ──

nan	O

Saachi	O
Electric	O
Kettle	O
Borosilicate	O
Glass	O
Body	O
Overheat	O
protection	O
Automatic	O
switch	O
off	O
2200	B-PRICE
w	O
ዋጋ፦	O
2700	B-PRICE
ብር	O
ውስን	O
ፍሬ	O
ነው	O
ያለው	O
አድራሻ	O
መገናኛ	O
መሰረት	O
ደፋር	O
ሞል	O
ሁለተኛ	O
ፎቅ	O
ቢሮ	O
ቁ	O
S	O
05	B-PRICE
S	O
06	B-PRICE
0902660722	B-PRICE
0928460606	B-PRICE
በ	O
Telegram	O
ለማዘዝ	O
ይጠቀሙ	O
zemencallcenter	O
zemenexpressadmin	O
ለተጨማሪ	O
ማብራሪያ	O
የቴሌግራም	O
ገፃችን	O
https	O
telegram	O
me	O
zemenexpress	O

nan	O

nan	O

nan	O

nan	O


Parsed 25 sentences  (1007 tokens)
