In [13]:
import json
from pathlib import Path
import stanza
import re
from difflib import get_close_matches
import sys 

sys.path.append(str(Path.cwd().parent))
from utils.regexes import SPEAKER_PAIRS, SPEAKERS, speaker_labels, speaker_labels_restricted

# Shared English Stanza pipeline, built once and passed to the helpers below.
# detect_proper_nouns reads doc.ents and word.upos from its output, which is
# presumably why the "ner" and "pos" processors are enabled here.
nlp = stanza.Pipeline(lang="en", processors="tokenize,mwt,pos,lemma,ner", verbose=False)

def detect_proper_nouns(text: str, nlp_pipeline, include_non_person: bool = False, show_title_pairs: bool = True):
    """
    Run ``nlp_pipeline`` on ``text`` and print the name-like items it finds.

    Up to three sections are printed, in this order:
      1. (optional) every named entity with its type,
      2. the entities whose type is PERSON or PER,
      3. (optional) "title + name" pairs: a title token immediately followed
         by a token tagged PROPN within the same sentence.

    Args:
        text: Raw text to analyse.
        nlp_pipeline: Callable returning a document exposing ``ents``
            (items with ``text`` and ``type``) and ``sentences`` (items with
            ``words``, each word having ``text`` and ``upos``).
        include_non_person: When True, also print section 1.
        show_title_pairs: When True, also print section 3.

    Returns:
        None. Results are only printed.
    """
    doc = nlp_pipeline(text)

    # Section 1: every NER entity, text left-aligned in a 30-char column.
    if include_non_person:
        print("All Named Entities (NER):")
        for ent in doc.ents:
            print(f"{ent.text:<30} -> {ent.type}")

    # Section 2: PERSON-only view.
    print("\nPERSON entities:")
    person_tags = {"PERSON", "PER"}
    people = [ent.text for ent in doc.ents if ent.type in person_tags]
    for name in people:
        print(name)
    if not people:
        print("(none)")

    # Section 3: lightweight title + name detection using POS tags.
    if show_title_pairs:
        print("\nTitle + Name pairs (POS-based):")
        # Titles are stored without a trailing period and tokens are compared
        # with their trailing periods stripped, so "Dr." and "Dr" both match.
        # The previous set contained "Miss." (never written with a period),
        # which meant "Miss <Name>" could not be detected at all.
        titles = {"Dr", "Mr", "Ms", "Miss", "Mrs", "Prof", "Professor"}
        found_pair = False
        for sent in doc.sentences:
            words = sent.words
            for title, following in zip(words, words[1:]):
                if title.text.rstrip(".") in titles and following.upos == "PROPN":
                    found_pair = True
                    print(f"{title.text} {following.text}")
        if not found_pair:
            print("(none)")

def process_transcripts(input_folder, nlp_pipeline, only_stem: str | None = None,
                        include_non_person: bool = False, show_title_pairs: bool = True):
    """
    Run detect_proper_nouns on every ``*.txt.json`` transcript in a folder.

    Each file is parsed as JSON; the ``content`` values of its ``rows`` list
    are joined with newlines and handed to detect_proper_nouns. Progress and
    warnings are printed; nothing is returned.

    Args:
        input_folder: Directory containing the transcript files.
        nlp_pipeline: Pipeline object forwarded to detect_proper_nouns.
        only_stem: If truthy, keep only files whose ``Path.stem`` starts with it.
        include_non_person: Forwarded to detect_proper_nouns.
        show_title_pairs: Forwarded to detect_proper_nouns.
    """
    input_path = Path(input_folder).resolve()
    print(f"[INFO] Input folder: {input_path}")

    transcripts = sorted(input_path.glob("*.txt.json"))
    print(f"[INFO] Found {len(transcripts)} files matching *.txt.json")

    if only_stem:
        transcripts = [path for path in transcripts if path.stem.startswith(only_stem)]
        print(f"[INFO] After filtering by stem '{only_stem}': {len(transcripts)} file(s)")

    if len(transcripts) == 0:
        print("[WARN] No matching files.")
        return

    banner = "\n" + "=" * 70
    for path in transcripts:
        print(banner)
        print(f"[FILE] {path.name}")
        with open(path, "r", encoding="utf-8") as handle:
            payload = json.load(handle)

        rows = payload.get("rows", [])
        if rows:
            text = "\n".join(row.get("content", "") for row in rows)
            detect_proper_nouns(
                text,
                nlp_pipeline,
                include_non_person=include_non_person,
                show_title_pairs=show_title_pairs,
            )
        else:
            print("[WARN] Skipping: No 'rows' key or it's empty.")

def process_string(sample_text: str, nlp_pipeline, include_non_person: bool = True, show_title_pairs: bool = True):
    """Announce the run, then apply detect_proper_nouns to an in-memory string."""
    print("[INFO] Running detection on a literal string…")
    options = {
        "include_non_person": include_non_person,
        "show_title_pairs": show_title_pairs,
    }
    detect_proper_nouns(sample_text, nlp_pipeline, **options)

# Example usage: scan the REVIEW folder that sits next to the notebook directory.
if __name__ == "__main__":
    # Adjust these two paths to your project layout.
    project_root = Path.cwd().parent
    input_folder = project_root.joinpath("data", "s1055-1058", "REVIEW")

    # Restrict the run to a single file; the two test files in use are
    # 057_707 and 055_684.
    #   only_stem          -> None scans every file
    #   include_non_person -> False prints PERSON entities only
    process_transcripts(
        input_folder,
        nlp,
        only_stem="055_684",
        include_non_person=True,
        show_title_pairs=True,
    )


[INFO] Input folder: /Users/admin/Documents/coding_land/HoardingDisorderScripts/data/s1055-1058/REVIEW
[INFO] Found 37 files matching *.txt.json
[INFO] After filtering by stem '055_684': 1 file(s)

[FILE] 055_684.txt.json
All Named Entities (NER):
26:28                          -> CARDINAL
26:42                          -> CARDINAL
26:48                          -> CARDINAL
I’ve                           -> PERSON
27:01                          -> CARDINAL
27:07                          -> CARDINAL
Gotcha                         -> PERSON
two                            -> CARDINAL
27:33                          -> CARDINAL
27:38                          -> CARDINAL
27:41                          -> CARDINAL
28:09                          -> CARDINAL
28:18                          -> CARDINAL
Gotcha                         -> PERSON
28:42                          -> CARDINAL
29:04                          -> CARDINAL
29:12                          -> CARDINAL
29:18                      

In [16]:
import json
from pathlib import Path
import re
from difflib import get_close_matches

# ---------- Canonical speaker set (fallback if import fails) ----------
try:
    SPEAKERS  # probe only: is the canonical list (e.g. from utils.regexes) in scope?
except NameError:
    # Import did not happen in this kernel; fall back to generic labels.
    speaker_set = {"Interviewer", "Participant", "Interviewee", "Speaker"}
else:
    speaker_set = set(SPEAKERS)

# ---------- Helpers ----------
def _alt(speakers: set[str]) -> str:
    return "|".join(re.escape(s) for s in sorted(speakers, key=len, reverse=True))

def _iter_lines(text: str):
    for i, line in enumerate(text.splitlines(), start=1):
        yield i, line

# ---------- Producer 1 ----------
def find_speaker_format_issues(text: str, speakers: set[str]):
    """
    Scan ``text`` line by line for speaker labels with a malformed prefix.

    Returns a dict with two keys; each value is a list of
    (line_no, snippet, full_line) tuples, where ``line_no`` is 1-based and
    ``snippet`` is the first 40 characters of the stripped line:
      {
        "spacing_issue":   [(ln, snip, full), ...],  # whitespace right before the colon
        "bad_punctuation": [(ln, snip, full), ...],  # label not followed by an allowed pattern
      }

    Allowed patterns after the label:
      - optional whitespace + ':'                        -> "Interviewer: ..."
      - optional whitespace + '(' timestamp ')' + ':'    -> "Interviewer (20:14): ..." or "Interviewer (1:02:33): ..."

    Whitespace directly before the colon is reported under "spacing_issue"
    only; such a line is never also listed under "bad_punctuation".
    An empty ``speakers`` collection yields two empty lists.
    """
    spacing_hits: list[tuple[int, str, str]] = []
    bad_punct_hits: list[tuple[int, str, str]] = []
    result = {
        "spacing_issue": spacing_hits,
        "bad_punctuation": bad_punct_hits,
    }

    # With no labels the alternation would be empty and the "starts with a
    # label" test would accept almost every line, flagging the whole text.
    if not speakers:
        return result

    labels = _alt(speakers)

    # Timestamp patterns like (20:14) or (1:02:33), with optional spaces inside
    ts = r"\(\s*\d{1,2}:\d{2}(?::\d{2})?\s*\)"

    # All patterns are applied with .match() to a single line, so they are
    # implicitly anchored at the start of that line.

    # Line opens with a label that is not merely the prefix of a longer word.
    # (?!\w) is used instead of \b so labels ending in punctuation also work.
    starts_with_label = re.compile(rf"\s*(?:{labels})(?!\w)")

    # A fully valid prefix is: Label [spaces] [optional (timestamp)] [spaces] ':'
    valid_prefix = re.compile(rf"\s*(?:{labels})\s*(?:{ts})?\s*:")

    # Spacing issue = whitespace immediately before the colon, either right
    # after the label ("Interviewer :") or after the timestamp
    # ("Interviewer (20:14) :"). No lookahead after the colon: a label that
    # sits alone on its line must be caught too.
    space_before_colon = re.compile(rf"\s*(?:{labels})\s*(?:{ts})?\s+:")

    for ln, line in _iter_lines(text):
        if space_before_colon.match(line):
            spacing_hits.append((ln, line.strip()[:40], line))
            continue

        # Begins with a label but is not followed by an allowed pattern,
        # e.g. "Interviewer." "Speaker-" "Participant's" "Interviewee?"
        if starts_with_label.match(line) and not valid_prefix.match(line):
            bad_punct_hits.append((ln, line.strip()[:40], line))

    return result


# ---------- Producer 2 ----------
def find_spelling_variants(text: str, speakers: set[str], threshold: float = 0.8):
    """
    Suggest canonical labels for line-initial labels that look misspelled.

    A candidate is any run at the start of a line that begins with an
    uppercase ASCII letter, continues with 1-30 letters, digits, underscores
    or spaces, and is followed by optional whitespace and a colon.
    Candidates already in ``speakers`` are ignored.

    Args:
        text: Transcript text (may span multiple lines).
        speakers: Canonical speaker labels.
        threshold: ``cutoff`` passed to difflib.get_close_matches.

    Returns:
        Dict mapping each suspicious candidate to its closest canonical label.
    """
    cand_re = re.compile(r'^([A-Z][A-Za-z0-9_ ]{1,30})(?=\s*:)', re.MULTILINE)

    # The character class admits spaces, so "Interviewer :" is captured as
    # "Interviewer " (trailing space). Strip it; otherwise a correctly spelled
    # label with a space before the colon is reported as a misspelling.
    candidates = {m.group(1).strip() for m in cand_re.finditer(text)} - set(speakers)

    known = sorted(speakers)
    out = {}
    # Sorted iteration keeps the result order stable across runs.
    for cand in sorted(candidates):
        match = get_close_matches(cand, known, n=1, cutoff=threshold)
        if match:
            out[cand] = match[0]
    return out

# ---------- Producer 3 ----------
def find_multi_speaker_lines(text: str, speakers: set[str]):
    """
    Find lines on which two or more "label:" occurrences appear.

    Matching is case-insensitive and a label may be followed by optional
    whitespace before the colon.

    Returns:
        List of (line_no, line, matched_labels) tuples for each such line,
        with ``matched_labels`` in order of appearance as written in the text.
    """
    labels = _alt(speakers)
    label_colon = re.compile(rf'\b({labels})\s*:', re.IGNORECASE)

    scanned = ((ln, line, label_colon.findall(line)) for ln, line in _iter_lines(text))
    return [entry for entry in scanned if len(entry[2]) >= 2]

# ---------- Main runner ----------
def process_file(file_path: Path):
    """
    Load one transcript JSON file, run the three detectors, and print a report.

    The ``content`` values of the file's ``rows`` list are joined with
    newlines to form the text that is checked. Prints "NO ERRORS" when no
    detector reports anything; otherwise prints a per-category breakdown.
    Returns None.
    """
    print(f"\n[FILE] {file_path.name}")
    with open(file_path, "r", encoding="utf-8") as handle:
        payload = json.load(handle)

    rows = payload.get("rows", [])
    if not rows:
        print("[WARN] Skipping: No 'rows' key or empty")
        return

    text = "\n".join(row.get("content", "") for row in rows)

    # Run detectors
    fmt_issues = find_speaker_format_issues(text, speaker_set)
    misspellings = find_spelling_variants(text, speaker_set, threshold=0.8)
    multi_lines = find_multi_speaker_lines(text, speaker_set)

    total = sum(map(len, fmt_issues.values())) + len(misspellings) + len(multi_lines)
    if not total:
        print("NO ERRORS")
        return

    def _report_lines(title, hits):
        # Shared layout for the two format-issue categories.
        if hits:
            print(f"  • {title}: {len(hits)}")
            for ln, snip, _full in hits:
                print(f"      line {ln}: {snip!r}")

    print(f"YES — errors found: {total}")
    _report_lines("Space-before-colon", fmt_issues["spacing_issue"])
    _report_lines("Bad punctuation after label", fmt_issues["bad_punctuation"])

    if misspellings:
        print(f"  • Likely misspellings: {len(misspellings)}")
        for bad, sug in misspellings.items():
            print(f"      {bad!r} -> {sug!r}")

    if multi_lines:
        print(f"  • Multiple labels on one line: {len(multi_lines)}")
        for ln, line_txt, labels in multi_lines:
            print(f"      line {ln}: labels={labels} | {line_txt[:120]}")

# ---------- Example call ----------
if __name__ == "__main__":
    # Paths are relative to the notebook's parent directory; adjust if needed.
    project_root = Path.cwd().parent
    input_folder = project_root.joinpath("data", "s1055-1058", "REVIEW")

    # Change the file name here to check a different transcript.
    target_file = input_folder.joinpath("057_707.txt.json")
    process_file(target_file)

from pathlib import Path

def process_all_files(folder_path):
    """Run process_file on every ``*.txt.json`` file in ``folder_path``, in sorted order."""
    transcript_paths = sorted(Path(folder_path).glob("*.txt.json"))
    for transcript_path in transcript_paths:
        process_file(transcript_path)

# Example usage
if __name__ == "__main__":
    # Check every transcript in the REVIEW folder next to the notebook directory.
    input_folder = Path.cwd().parent.joinpath("data", "s1055-1058", "REVIEW")
    process_all_files(input_folder)




[FILE] 057_707.txt.json
NO ERRORS

[FILE] 055_678.txt.json
YES — errors found: 1
  • Bad punctuation after label: 1
      line 6: 'Participant 055 (0:33)'

[FILE] 055_679.txt.json
NO ERRORS

[FILE] 055_680.txt.json
NO ERRORS

[FILE] 055_681.txt.json
NO ERRORS

[FILE] 055_682.txt.json
NO ERRORS

[FILE] 055_683.txt.json
NO ERRORS

[FILE] 055_684.txt.json
NO ERRORS

[FILE] 055_685.txt.json
NO ERRORS

[FILE] 055_686.txt.json
YES — errors found: 1
  • Bad punctuation after label: 1
      line 20: 'Participant 055 (36:06)'

[FILE] 055_687.txt.json
NO ERRORS

[FILE] 055_688.txt.json
NO ERRORS

[FILE] 055_689.txt.json
NO ERRORS

[FILE] 056_690.txt.json
NO ERRORS

[FILE] 056_691.txt.json
NO ERRORS

[FILE] 056_692.txt.json
NO ERRORS

[FILE] 056_693.txt.json
NO ERRORS

[FILE] 056_694.txt.json
NO ERRORS

[FILE] 056_695.txt.json
YES — errors found: 1
  • Bad punctuation after label: 1
      line 10: 'Participant 056 (2:47)'

[FILE] 056_696.txt.json
NO ERRORS

[FILE] 056_697.txt.json
YES — errors f