
# MorphoTokenizer Test Harness (`test_harness.ipynb`)

This notebook lets you **interactively tune** the parameters of `MorphoTokenizer`
and see how they affect segmentation on a real biomedical PDF:

> `nasal-congestion_ijgm-3-059.pdf`

**Workflow**

1. Adjust the parameters in **Cell 2** (`TOKENIZER_PARAMS`).
2. Run the cells:
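   - The notebook will load the PDF from the current working directory (normally the folder that contains this notebook).
   - It will extract the text (skipping the first two pages) and keep the first 5,000 characters as a sample chunk.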
3. Use the inspection cells to:
   - Segment specific words (e.g., `hyperparathyroidism`, `thyroid`, `secondary`).
   - See a side‑by‑side view of original vs. `<ETY>`‑marked text.


In [None]:
# !pip install -U pypdf --quiet

# 1) Imports and file paths

import os
from pathlib import Path
import re
from typing import List, Tuple

# PDF reading
try:
    from pypdf import PdfReader
except ImportError as exc:
    # Retrying the same import here would only raise the same error again, so
    # fail once with a message that tells the reader how to fix the environment.
    raise ImportError(
        "pypdf is required to read the PDF. Install it into this kernel's "
        "environment (for example with `%pip install pypdf`) and re-run this cell.",
        name="pypdf",
    ) from exc

# Our tokenizer module (must be in the same directory or on PYTHONPATH)
from morpho_vtb_tokenizer import MorphoTokenizer

# All input files are resolved against the current working directory.
# NOTE: Path.cwd() is the notebook's folder only if the kernel was started there.
BASE_DIR = Path.cwd()

# File names of the three inputs this notebook reads.
PDF_NAME = "nasal-congestion_ijgm-3-059.pdf"
ROOT_LEX_NAME_GZ = "morph_lexicon_by_root.json.gz"
PROTO_LEX_NAME_GZ = "morph_lexicon_by_protoroot.json.gz"

# Full paths, built in the same order as the names above.
PDF_PATH, ROOT_LEX_PATH, PROTO_LEX_PATH = (
    BASE_DIR / _file_name
    for _file_name in (PDF_NAME, ROOT_LEX_NAME_GZ, PROTO_LEX_NAME_GZ)
)

# Report where each input is expected and whether it is actually present.
print("Working directory:", BASE_DIR)
for _label, _path in (
    ("PDF path:", PDF_PATH),
    ("Root lexicon:", ROOT_LEX_PATH),
    ("Proto lexicon:", PROTO_LEX_PATH),
):
    print(_label, _path, "exists?", _path.exists())


Working directory: g:\My Drive\Education\UMBC\2025\CMSC 691\Project
PDF path: g:\My Drive\Education\UMBC\2025\CMSC 691\Project\nasal-congestion_ijgm-3-059.pdf exists? True
Root lexicon: g:\My Drive\Education\UMBC\2025\CMSC 691\Project\morph_lexicon_by_root.json.gz exists? True
Proto lexicon: g:\My Drive\Education\UMBC\2025\CMSC 691\Project\morph_lexicon_by_protoroot.json.gz exists? True


In [2]:

# 2) Parameter block for MorphoTokenizer
# Edit any value below, re-run this cell, then re-run Cell 3 so the tokenizer
# is rebuilt with the new values.
#
# Recommended range for the three adjustments: [-100, +100]

TOKENIZER_PARAMS = {
    # Lexicon files (paths defined in Cell 1)
    "lexicon_json": ROOT_LEX_PATH,
    "proto_lexicon_json": PROTO_LEX_PATH,

    # Candidate generation and scoring
    "max_morpheme_len": 12,         # hard cap on candidate substring length
    "unk_base_penalty": -2.0,       # base score for unknown pieces
    "unk_per_char": -0.2,           # per-character penalty for unknown pieces
    "add_generic_suffixes": True,
    "lambda_penalty": 4.5,          # cost per piece (discourages over-segmentation)
    "long_unsplit_min_len": 5,      # word length at which we start penalizing "no split"
    "long_unsplit_penalty": 3.0,    # penalty if long word kept as a single piece

    # Tunable adjustments (change these and re-run)
    "short2_adjust": -5.0,          # adjustment for 2-char roots (negative = penalize)
    "mid_root_adjust": 2.0,         # adjustment for mid-length roots (4–9 chars)
    "long_root_adjust": -0.5,       # per-char adjustment when length > LONG_ROOT_THRESHOLD
}

# Threshold length for "long" roots; meant to be read together with long_root_adjust.
# NOTE(review): this value is only printed below and is not passed to
# MorphoTokenizer in Cell 3 — confirm how (or whether) the tokenizer picks it up.
LONG_ROOT_THRESHOLD = 7

# Echo the effective configuration so it is recorded next to the results.
print("Current TOKENIZER_PARAMS:")
for k, v in TOKENIZER_PARAMS.items():
    print(f"  {k}: {v}")
print("LONG_ROOT_THRESHOLD:", LONG_ROOT_THRESHOLD)


Current TOKENIZER_PARAMS:
  lexicon_json: g:\My Drive\Education\UMBC\2025\CMSC 691\Project\morph_lexicon_by_root.json.gz
  proto_lexicon_json: g:\My Drive\Education\UMBC\2025\CMSC 691\Project\morph_lexicon_by_protoroot.json.gz
  max_morpheme_len: 12
  unk_base_penalty: -2.0
  unk_per_char: -0.2
  add_generic_suffixes: True
  lambda_penalty: 4.5
  long_unsplit_min_len: 5
  long_unsplit_penalty: 3.0
  short2_adjust: -5.0
  mid_root_adjust: 2.0
  long_root_adjust: -0.5
LONG_ROOT_THRESHOLD: 7


In [3]:

# 3) Instantiate MorphoTokenizer with the current parameters
#
# Every entry of TOKENIZER_PARAMS (Cell 2) is forwarded as the keyword argument
# of the same name, so the dict is the single place where values are edited.

morpho_tok = MorphoTokenizer(**TOKENIZER_PARAMS)

print("MorphoTokenizer is ready.")


MorphoTokenizer is ready.


In [4]:

# 4) Helper: morpho_preseg(text) and visualization utilities

def _tokenize_with_separators(text: str):
    # Split text into tokens, keeping whitespace and punctuation as separate tokens.
    return re.findall(r"\w+|\s+|[^\w\s]", text, flags=re.UNICODE)


def morpho_preseg(text: str) -> str:
    """Return ``text`` with ``<ETY>`` appended to every root piece.

    The text is split with ``_tokenize_with_separators``. Purely alphabetic
    tokens are segmented by ``morpho_tok.segment_word``; every other token
    (whitespace, punctuation, tokens containing digits or underscores) is
    copied through unchanged.
    """

    def _mark_roots(word: str) -> str:
        # Rebuild the word piece by piece, tagging only pieces of kind "root".
        marked = "".join(
            f"{p.text}<ETY>" if p.kind == "root" else p.text
            for p in morpho_tok.segment_word(word)
        )
        # Fall back to the untouched word if segmentation produced no text.
        return marked or word

    return "".join(
        _mark_roots(token) if token.isalpha() else token
        for token in _tokenize_with_separators(text)
    )


def show_word_segmentation(words):
    """Print one line per word showing how ``morpho_tok`` segments it.

    ``words`` may be a single string or an iterable of strings. Each line has
    the form ``word -> piece[kind:score] + piece[kind:score] ...`` with scores
    shown to two decimals.
    """
    # A bare string is treated as a one-word list rather than iterated by character.
    word_list = [words] if isinstance(words, str) else words
    for w in word_list:
        label = " + ".join(
            f"{p.text}[{p.kind}:{p.score:.2f}]" for p in morpho_tok.segment_word(w)
        )
        print(f"{w} -> {label}")


def show_side_by_side(text: str, max_chars: int = 400):
    """Print the first ``max_chars`` characters of ``text`` and their segmentation.

    The snippet is printed under an ``ORIGINAL:`` heading, followed by the
    output of ``morpho_preseg`` on that same snippet under a ``SEGMENTED``
    heading.
    """
    snippet = text[:max_chars]
    # Arguments are evaluated before anything is written, so the snippet is
    # segmented first and the four lines are then emitted in a single call.
    print(
        "ORIGINAL:",
        snippet,
        "\nSEGMENTED (with <ETY>):",
        morpho_preseg(snippet),
        sep="\n",
    )


In [5]:
# 5) Load text from the PDF and keep a sample chunk

if not PDF_PATH.exists():
    raise FileNotFoundError(f"Expected PDF at {PDF_PATH}, but it does not exist.")

reader = PdfReader(str(PDF_PATH))

# Number of leading pages to leave out of the extracted text.
SKIP_PAGES = 2

all_text_pages = []

# Count from SKIP_PAGES + 1 so that a warning names the page's real (1-based)
# position in the PDF instead of its position in the sliced list.
for page_number, page in enumerate(reader.pages[SKIP_PAGES:], start=SKIP_PAGES + 1):
    try:
        # extract_text() may yield nothing for a page; normalise that to "".
        page_text = page.extract_text() or ""
    except Exception as e:
        # Best effort: one unreadable page should not abort the whole load.
        print(f"Warning: could not extract text from page {page_number}: {e}")
        page_text = ""
    all_text_pages.append(page_text)

# Pages are separated by a blank line in the combined text.
full_text = "\n\n".join(all_text_pages)

print("Total text length:", len(full_text))
print("First 500 characters:\n")
print(full_text[:500])

# For convenience, keep a shorter sample for repeated experiments.
SAMPLE_TEXT = full_text[:5000]


Total text length: 26515
First 500 characters:

International Journal of General Medicine 2010:3 61
Nasal congestion diagnosisDovepress
submit your manuscript | www.dovepress.com
Dovepress 
T able 1 Differential diagnosis of nasal congestion
Rhinitis Duration T ypical other symptoms
Infectious rhinitis
 viral 10 days Sneezing, watery rhinorrhea, sore throat  
Purulent discharge, facial pain
 Bacterial 10 days
 Other infectious agents 10 days
Allergic rhinitis
 Intermittent 4 days/week For all allergic rhinitis: Sneezing, watery rhinorrhea


In [6]:

# 6) Quick sanity checks on key words
#
# Prints the piece-by-piece segmentation (text, kind, score) of a few probe
# words, then the <ETY>-marked form of one of them.

words_to_check = [
    "hyperparathyroidism", "thyroid", "secondary", "oxacalcitriol", "biomedical",
]

print("=== Word-level segmentation ===")
show_word_segmentation(words_to_check)

print("\n=== morpho_preseg('hyperparathyroidism') ===")
etymology_marked = morpho_preseg("hyperparathyroidism")
print(etymology_marked)


=== Word-level segmentation ===
hyperparathyroidism -> hyper[root:7.60] + para[root:7.30] + thyroid[root:7.79] + ism[root:5.00]
thyroid -> thyroid[root:7.79]
secondary -> second[root:7.49] + ary[root:4.59]
oxacalcitriol -> oxa[unk:-2.60] + cal[root:5.00] + citri[root:7.19] + ol[root:-0.30]
biomedical -> bio[root:5.29] + medical[root:7.79]

=== morpho_preseg('hyperparathyroidism') ===
hyper<ETY>para<ETY>thyroid<ETY>ism<ETY>


In [7]:

# 7) Side-by-side view on a snippet from the PDF
#
# Prints the first 800 characters of SAMPLE_TEXT, followed by the same snippet
# with <ETY> markers inserted by morpho_preseg.
# NOTE: the snippet is cut at a fixed character offset, so its last word may be
# truncated and then segmented as a fragment.

show_side_by_side(SAMPLE_TEXT, max_chars=800)


ORIGINAL:
International Journal of General Medicine 2010:3 61
Nasal congestion diagnosisDovepress
submit your manuscript | www.dovepress.com
Dovepress 
T able 1 Differential diagnosis of nasal congestion
Rhinitis Duration T ypical other symptoms
Infectious rhinitis
 viral 10 days Sneezing, watery rhinorrhea, sore throat  
Purulent discharge, facial pain
 Bacterial 10 days
 Other infectious agents 10 days
Allergic rhinitis
 Intermittent 4 days/week For all allergic rhinitis: Sneezing, watery rhinorrhea,  
itch, eye symptoms, lower airway symptoms
4 weeks/year
 Persistent 4 days/week
4 weeks/year
Occupational rhinitis
 Allergic
Usually less when away from workplace As allergic rhinitis
 Nonallergic Usually mainly nasal blockage, rhinorrhea
Drug induced Sometimes difficult to make relation,  
ca

SEGMENTED (with <ETY>):
Inter<ETY>national<ETY> Journal<ETY> of<ETY> General<ETY> Medicine<ETY> 2010:3 61
Nasal<ETY> con<ETY>gestion<ETY> dia<ETY>gnosis<ETY>Dove<ETY>press<ETY>
sub<ETY>mit