# Code#1

## This code uses MT as baseline, unfortunately the output we want mustn't have vowels for easier text rendering when looking for TORH and YHWH

NB: Had to discard for an easier alternative with already pre-existing non-vowels baseline. This one had too many vowels, I'd have to clean manually prior to the later search for our characters

In [None]:
#!/usr/bin/env python3
"""
reconstruct_torah.py

Usage:
    - Put your Sefaria-exported Torah CSV (one verse per row, in canonical order) as:
        input_sefaria_verses.csv

    - Put the retroversion mapping CSV as:
        lxx_hebrew_retroversions.csv
      with columns: Book,Chapter,Verse,LXX_Retro_Hebrew,RetroversionConfidence

    - Run:
        python reconstruct_torah.py

Outputs:
    - torah_reconstructed.csv  (Book,Chapter,Verse,MT_Text,Authoritative_Text,Replaced,Reason,RetroversionConfidence)
    - a short audit printed at the end
"""

import csv
import requests
import re
import difflib
import sys
from collections import namedtuple

# ---------- CONFIG ----------
SEFARIA_BASE = "https://www.sefaria.org"
BOOKS = ["Genesis", "Exodus", "Leviticus", "Numbers", "Deuteronomy"]

INPUT_VERSES_CSV = "input_sefaria_verses.csv"            # user-provided one-verse-per-row CSV
LXX_RETRO_CSV = "lxx_hebrew_retroversions.csv"          # mapping we created earlier
OUTPUT_CSV = "torah_reconstructed.csv"

# fuzzy matching thresholds
FUZZY_MATCH_RATIO_ACCEPT = 0.90     # treat as good match
FUZZY_MATCH_RATIO_WARN = 0.75       # treat as weak match (flag)
SEARCH_WINDOW = 6                   # when index mismatch, search +/- window for best match

# ---------- DATA STRUCTS ----------
VerseRef = namedtuple("VerseRef", ["book", "chapter", "verse"])
RetroEntry = namedtuple("RetroEntry", ["book", "chapter", "verse", "retro_he", "confidence"])

# ---------- HELPERS ----------
def clean_text_for_compare(s):
    """Normalize text for safe comparison: remove punctuation, vowels, extra spaces."""
    if s is None:
        return ""
    # remove non-Hebrew letters and spaces (keep Hebrew letters and spaces)
    s2 = re.sub(r'[^א-ת\s]', '', s)
    s2 = re.sub(r'\s+', ' ', s2).strip()
    return s2

def fetch_chapter_from_sefaria(book, chapter, lang='he'):
    """Fetch chapter JSON from Sefaria. Returns JSON or None."""
    url = f"{SEFARIA_BASE}/api/texts/{book}.{chapter}"
    try:
        resp = requests.get(url, params={'lang': lang}, timeout=20)
        if resp.status_code == 200:
            return resp.json()
    except Exception as e:
        print("SEFARIA FETCH ERROR:", e)
    return None

def get_hebrew_verses_from_chapter_json(chjson):
    """Return list of verse strings from possible fields."""
    if not chjson:
        return []
    for key in ("he", "text", "heText"):
        if key in chjson and isinstance(chjson[key], list):
            # elements may be str or dict with 'text'
            arr = []
            for el in chjson[key]:
                if isinstance(el, str):
                    arr.append(el)
                elif isinstance(el, dict) and 'text' in el:
                    arr.append(el['text'])
                else:
                    arr.append(str(el))
            return arr
    # fallback: sometimes the 'chapter' endpoint wraps content differently
    if 'text' in chjson and isinstance(chjson['text'], list):
        return [str(x) for x in chjson['text']]
    return []

def build_canonical_index():
    """
    Build a canonical list of (book,chapter,verse,mt_text) in order.
    This queries Sefaria; it may take ~10-30s depending on network.
    """
    canonical = []
    print("Building canonical verse index from Sefaria (this may take a little while)...")
    for book in BOOKS:
        chapter = 1
        while True:
            chjson = fetch_chapter_from_sefaria(book, chapter, lang='he')
            if not chjson:
                # no chapter at this index -> stop for this book
                break
            verses = get_hebrew_verses_from_chapter_json(chjson)
            if not verses:
                break
            for i, v in enumerate(verses):
                canonical.append((VerseRef(book, chapter, i+1), v))
            chapter += 1
    print(f"Canonical index built: {len(canonical)} verses.")
    return canonical

def load_input_verses(input_csv):
    """Load user-provided verses (one verse per row). Returns list of strings."""
    verses = []
    with open(input_csv, newline='', encoding='utf-8') as f:
        reader = csv.reader(f)
        for row in reader:
            if not row:
                continue
            # take first non-empty column as verse text
            text = None
            for c in row:
                if str(c).strip() != "":
                    text = c
                    break
            if text is None:
                text = ""
            verses.append(str(text))
    return verses

def load_retroversions(csvpath):
    """Load retroversion mapping into dict keyed by (book,chapter,verse)."""
    d = {}
    with open(csvpath, newline='', encoding='utf-8') as f:
        reader = csv.DictReader(f)
        # expected headers: Book,Chapter,Verse,LXX_Retro_Hebrew,RetroversionConfidence
        for r in reader:
            try:
                b = r.get("Book") or r.get("book")
                ch = int(r.get("Chapter") or r.get("chapter"))
                vs = int(r.get("Verse") or r.get("verse"))
                retro = r.get("LXX_Retro_Hebrew") or r.get("retro_hebrew") or ""
                conf = r.get("RetroversionConfidence") or r.get("confidence") or ""
                d[(b, ch, vs)] = RetroEntry(b, ch, vs, retro, conf)
            except Exception as e:
                print("Skipping bad retroversion row:", r, e)
    return d

# ---------- CORE PROCESS ----------
def reconstruct_torah(input_csv, retro_csv, output_csv):
    # Load data
    input_verses = load_input_verses(input_csv)
    print("Loaded input verses:", len(input_verses))
    retro_map = load_retroversions(retro_csv)
    print("Loaded retroversion entries:", len(retro_map))

    # Build canonical index
    canonical = build_canonical_index()
    canon_len = len(canonical)
    input_len = len(input_verses)

    if input_len != canon_len:
        print("WARNING: input verse count != canonical verse count.")
        print(" input:", input_len, " canonical:", canon_len)
        print("The script will try to map by index and attempt fuzzy alignment when mismatches occur.")
    else:
        print("Input length matches canonical length. Proceeding with index mapping.")

    rows_out = []
    replaced = 0
    mismatch_warns = 0
    manual_flags = []

    # Iterate by index -- primary mapping method
    for idx in range(input_len):
        user_text = input_verses[idx]
        clean_user = clean_text_for_compare(user_text)

        # Determine canonical reference for this index
        if idx < canon_len:
            ref, canon_text = canonical[idx]
            clean_canon = clean_text_for_compare(canon_text)
            mapping_method = "index"
        else:
            # user has more verses than canonical: fallback - map to last verse repeatedly (flag)
            ref = VerseRef("UNKNOWN", 0, 0)
            canon_text = ""
            clean_canon = ""
            mapping_method = "index_oob"

        # sanity check: exact match?
        exact = (clean_user == clean_canon and clean_user != "")
        fuzzy_ratio = difflib.SequenceMatcher(None, clean_user, clean_canon).ratio() if clean_canon and clean_user else 0.0

        # if exact or high fuzzy, accept mapping; else try local search
        chosen_ref = ref
        chosen_canon_text = canon_text
        chosen_clean_canon = clean_canon
        sanity_note = ""

        if not exact and fuzzy_ratio < FUZZY_MATCH_RATIO_ACCEPT:
            # search nearby canonical verses
            best_ratio = fuzzy_ratio
            best_pos = idx
            left = max(0, idx - SEARCH_WINDOW)
            right = min(canon_len - 1, idx + SEARCH_WINDOW)
            for j in range(left, right + 1):
                refj, textj = canonical[j]
                r = difflib.SequenceMatcher(None, clean_user, clean_text_for_compare(textj)).ratio()
                if r > best_ratio:
                    best_ratio = r
                    best_pos = j
            if best_pos != idx and best_ratio >= FUZZY_MATCH_RATIO_ACCEPT:
                chosen_ref, chosen_canon_text = canonical[best_pos]
                chosen_clean_canon = clean_text_for_compare(chosen_canon_text)
                mapping_method = f"local_search_from_{idx}_to_{best_pos}"
                sanity_note = f"index_mismatch_but_found_best_at_{best_pos}_ratio_{best_ratio:.3f}"
            else:
                # no good local match
                mapping_method = "index_primary_but_no_good_match"
                sanity_note = f"no_good_match (best_ratio={best_ratio:.3f})"
                mismatch_warns += 1
                manual_flags.append({
                    "pos": idx+1,
                    "user_text": user_text,
                    "best_ratio": best_ratio,
                    "reason": "no good match"
                })

        # Now we have chosen_ref
        key = (chosen_ref.book, chosen_ref.chapter, chosen_ref.verse)
        mt_text = chosen_canon_text or user_text  # prefer canonical MT as authoritative for source text
        mt_clean = clean_text_for_compare(mt_text)

        # check retroversion map
        if key in retro_map:
            retro = retro_map[key]
            authoritative = retro.retro_he or retro.retro_he.strip()
            if not authoritative:
                # empty retroversion -> fallback to MT but mark
                authoritative = mt_text
                replaced_flag = "NO"
                reason = "Retroversion empty"
                retro_conf = retro.confidence
            else:
                replaced_flag = "YES"
                reason = f"Replaced via LXX retroversion ({retro.confidence})"
                retro_conf = retro.confidence
                replaced += 1
        else:
            authoritative = mt_text
            replaced_flag = "NO"
            reason = "No retroversion entry"

        rows_out.append({
            "Index": idx + 1,
            "Book": chosen_ref.book,
            "Chapter": chosen_ref.chapter,
            "Verse": chosen_ref.verse,
            "MT_Text": mt_text,
            "User_Input_Text": user_text,
            "Authoritative_Text": authoritative,
            "Replaced": replaced_flag,
            "Reason": reason,
            "RetroversionConfidence": retro_conf if key in retro_map else "",
            "MappingMethod": mapping_method,
            "SanityNote": sanity_note
        })

    # write output CSV
    fieldnames = ["Index","Book","Chapter","Verse","MT_Text","User_Input_Text",
                  "Authoritative_Text","Replaced","Reason","RetroversionConfidence",
                  "MappingMethod","SanityNote"]
    with open(output_csv, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        for r in rows_out:
            writer.writerow(r)

    # summary
    print("\n=== RECONSTRUCTION SUMMARY ===")
    print("Input verses:", input_len)
    print("Canonical verses:", canon_len)
    print("Total replacements made:", replaced)
    print("Total mapping warnings (manual review suggested):", mismatch_warns)
    print("Output written to:", output_csv)
    if manual_flags:
        print("\nSample manual flags (first 5):")
        for m in manual_flags[:5]:
            print(m)
    print("\nNote: Check rows where Replaced == NO but Reason indicates 'Retroversion empty' or where MappingMethod starts with 'index_primary_but_no_good_match'")

# ---------- ENTRY POINT ----------
if __name__ == "__main__":
    try:
        reconstruct_torah(INPUT_VERSES_CSV, LXX_RETRO_CSV, OUTPUT_CSV)
    except KeyboardInterrupt:
        print("Interrupted by user.")
        sys.exit(1)


Loaded input verses: 5846
Loaded retroversion entries: 52
Building canonical verse index from Sefaria (this may take a little while)...
Canonical index built: 5846 verses.
Input length matches canonical length. Proceeding with index mapping.

=== RECONSTRUCTION SUMMARY ===
Input verses: 5846
Canonical verses: 5846
Total replacements made: 52
Output written to: torah_reconstructed.csv

Sample manual flags (first 5):
{'pos': 93, 'user_text': 'ויאמר קין אל יהוה גדול עוני מנשא', 'best_ratio': 0.7126436781609196, 'reason': 'no good match'}
{'pos': 107, 'user_text': 'זה ספר תולדת אדם ביום ברא אלהים אדם בדמות אלהים עשה אתו', 'best_ratio': 0.8088235294117647, 'reason': 'no good match'}
{'pos': 109, 'user_text': 'ויחי אדם מאתים ושלשים שנה ויולד בדמותו כצלמו ויקרא את שמו שת', 'best_ratio': 0.8888888888888888, 'reason': 'no good match'}
{'pos': 112, 'user_text': 'ויחי שת מאתים וחמש שנים ויולד את אנוש', 'best_ratio': 0.7567567567567568, 'reason': 'no good match'}
{'pos': 121, 'user_text': 'ויחי מה

# Code #2

## This code uses my input_sefaria_verses.csv as baseline which we have already edited out vowels from

In [7]:
import csv
import requests

INPUT_VERSES_CSV = "input_sefaria_verses.csv"
LXX_RETRO_CSV = "lxx_hebrew_retroversions.csv"
OUTPUT_CSV = "torah_reconstructed.csv"

BOOKS = ["Genesis", "Exodus", "Leviticus", "Numbers", "Deuteronomy"]

def fetch_canonical_refs():
    """Build ordered list of Book,Chapter,Verse WITHOUT storing MT text"""
    refs = []

    for book in BOOKS:
        chapter = 1
        while True:
            url = f"https://www.sefaria.org/api/texts/{book}.{chapter}?lang=he"
            r = requests.get(url)

            if r.status_code != 200:
                break

            data = r.json()
            verses = data.get("he")

            if not verses:
                break

            for i in range(len(verses)):
                refs.append((book, chapter, i + 1))

            chapter += 1

    return refs


def load_user_verses():
    verses = []
    with open(INPUT_VERSES_CSV, encoding="utf-8") as f:
        reader = csv.reader(f)
        for row in reader:
            if row:
                verses.append(row[0].strip())
    return verses


def load_retroversions():
    retro = {}
    with open(LXX_RETRO_CSV, encoding="utf-8") as f:
        reader = csv.DictReader(f)
        for r in reader:
            key = (r["Book"], int(r["Chapter"]), int(r["Verse"]))
            retro[key] = r["LXX_Retro_Hebrew"]
    return retro


def reconstruct():
    print("Loading canonical verse references...")
    refs = fetch_canonical_refs()

    print("Loading user verses...")
    user_verses = load_user_verses()

    print("Loading retroversions...")
    retro = load_retroversions()

    if len(user_verses) != len(refs):
        print("WARNING: verse count mismatch!")
        print("User:", len(user_verses))
        print("Canonical:", len(refs))

    rows = []
    replacements = 0

    for i in range(min(len(user_verses), len(refs))):
        book, ch, vs = refs[i]
        user_text = user_verses[i]

        key = (book, ch, vs)

        if key in retro:
            final_text = retro[key]
            replaced = "YES"
            replacements += 1
        else:
            final_text = user_text
            replaced = "NO"

        rows.append([
            book,
            ch,
            vs,
            user_text,
            final_text,
            replaced
        ])

    print("Saving reconstructed Torah...")

    with open(OUTPUT_CSV, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow([
            "Book",
            "Chapter",
            "Verse",
            "Original_User_Text",
            "Authoritative_Text",
            "Replaced"
        ])
        writer.writerows(rows)

    print("DONE")
    print("Total replacements:", replacements)


if __name__ == "__main__":
    reconstruct()


Loading canonical verse references...
Loading user verses...
Loading retroversions...
Saving reconstructed Torah...
DONE
Total replacements: 52


# Code#3

## This code uses MT as baseline but removes all vowels prior to processing

NB: Had to discard this version because the MT baseline had a lot of special characters like tags

In [6]:
#!/usr/bin/env python3
"""
reconstruct_torah_strip_diacritics.py

Same behavior as the earlier reconstruct_torah.py but automatically strips
Hebrew vowels (niqqud), cantillation marks (trop), and the maqaf (dash)
from any Hebrew text fetched from Sefaria (and from retroversions).
"""

import csv
import requests
import re
import difflib
import sys
import unicodedata
from collections import namedtuple

# ---------- CONFIG ----------
SEFARIA_BASE = "https://www.sefaria.org"
BOOKS = ["Genesis", "Exodus", "Leviticus", "Numbers", "Deuteronomy"]

INPUT_VERSES_CSV = "input_sefaria_verses.csv"            # user-provided one-verse-per-row CSV
LXX_RETRO_CSV = "lxx_hebrew_retroversions.csv"          # mapping we created earlier
OUTPUT_CSV = "torah_reconstructed.csv"

# fuzzy matching thresholds
FUZZY_MATCH_RATIO_ACCEPT = 0.90     # treat as good match
FUZZY_MATCH_RATIO_WARN = 0.75       # treat as weak match (flag)
SEARCH_WINDOW = 6                   # when index mismatch, search +/- window for best match

# ---------- DATA STRUCTS ----------
VerseRef = namedtuple("VerseRef", ["book", "chapter", "verse"])
RetroEntry = namedtuple("RetroEntry", ["book", "chapter", "verse", "retro_he", "confidence"])

# ---------- DIACRITICS STRIPPING ----------
def strip_hebrew_diacritics(text):
    """
    Remove Hebrew niqqud (vowel points), cantillation (trope) and maqaf (dash).
    Returns NFC-normalized text without combining marks.
    """
    if text is None:
        return ""
    # Normalize to NFD to separate base letters and combining marks
    nfd = unicodedata.normalize('NFD', text)
    # Remove all combining marks (category 'Mn') — this strips niqqud & cantillation
    no_marks = ''.join(ch for ch in nfd if unicodedata.category(ch) != 'Mn')
    # Remove maqaf (U+05BE) and any remaining combining-like punctuation we don't want
    no_marks = no_marks.replace('\u05BE', ' ')  # maqaf -> space (so words don't glue)
    # Normalize back to NFC
    return unicodedata.normalize('NFC', no_marks)

# ---------- HELPERS ----------
def clean_text_for_compare(s):
    """Normalize text for safe comparison: strip diacritics, remove non-Hebrew, collapse spaces."""
    if s is None:
        return ""
    s = strip_hebrew_diacritics(s)
    # keep only Hebrew letters and spaces
    s2 = re.sub(r'[^א-ת\s]', '', s)
    s2 = re.sub(r'\s+', ' ', s2).strip()
    return s2

def fetch_chapter_from_sefaria(book, chapter, lang='he'):
    """Fetch chapter JSON from Sefaria. Returns JSON or None."""
    url = f"{SEFARIA_BASE}/api/texts/{book}.{chapter}"
    try:
        resp = requests.get(url, params={'lang': lang}, timeout=20)
        if resp.status_code == 200:
            return resp.json()
    except Exception as e:
        print("SEFARIA FETCH ERROR:", e)
    return None

def get_hebrew_verses_from_chapter_json(chjson):
    """Return list of verse strings from possible fields."""
    if not chjson:
        return []
    for key in ("he", "text", "heText"):
        if key in chjson and isinstance(chjson[key], list):
            # elements may be str or dict with 'text'
            arr = []
            for el in chjson[key]:
                if isinstance(el, str):
                    arr.append(el)
                elif isinstance(el, dict) and 'text' in el:
                    arr.append(el['text'])
                else:
                    arr.append(str(el))
            return arr
    # fallback: sometimes the 'chapter' endpoint wraps content differently
    if 'text' in chjson and isinstance(chjson['text'], list):
        return [str(x) for x in chjson['text']]
    return []

def build_canonical_index():
    """
    Build a canonical list of (book,chapter,verse,mt_text) in order.
    Strip diacritics from MT text when storing.
    """
    canonical = []
    print("Building canonical verse index from Sefaria (this may take a little while)...")
    for book in BOOKS:
        chapter = 1
        while True:
            chjson = fetch_chapter_from_sefaria(book, chapter, lang='he')
            if not chjson:
                # no chapter at this index -> stop for this book
                break
            verses = get_hebrew_verses_from_chapter_json(chjson)
            if not verses:
                break
            for i, v in enumerate(verses):
                # strip diacritics immediately so canonical text is consonantal Hebrew
                v_stripped = strip_hebrew_diacritics(v)
                canonical.append((VerseRef(book, chapter, i+1), v_stripped))
            chapter += 1
    print(f"Canonical index built: {len(canonical)} verses.")
    return canonical

def load_input_verses(input_csv):
    """Load user-provided verses (one verse per row). Returns list of strings."""
    verses = []
    with open(input_csv, newline='', encoding='utf-8') as f:
        reader = csv.reader(f)
        for row in reader:
            if not row:
                continue
            # take first non-empty column as verse text
            text = None
            for c in row:
                if str(c).strip() != "":
                    text = c
                    break
            if text is None:
                text = ""
            verses.append(str(text))
    return verses

def load_retroversions(csvpath):
    """Load retroversion mapping into dict keyed by (book,chapter,verse)."""
    d = {}
    with open(csvpath, newline='', encoding='utf-8') as f:
        reader = csv.DictReader(f)
        # expected headers: Book,Chapter,Verse,LXX_Retro_Hebrew,RetroversionConfidence
        for r in reader:
            try:
                b = r.get("Book") or r.get("book")
                ch = int(r.get("Chapter") or r.get("chapter"))
                vs = int(r.get("Verse") or r.get("verse"))
                retro_raw = r.get("LXX_Retro_Hebrew") or r.get("retro_hebrew") or ""
                # ensure retroversions are also stripped of diacritics
                retro = strip_hebrew_diacritics(retro_raw)
                conf = r.get("RetroversionConfidence") or r.get("confidence") or ""
                d[(b, ch, vs)] = RetroEntry(b, ch, vs, retro, conf)
            except Exception as e:
                print("Skipping bad retroversion row:", r, e)
    return d

# ---------- CORE PROCESS ----------
def reconstruct_torah(input_csv, retro_csv, output_csv):
    # Load data
    input_verses = load_input_verses(input_csv)
    print("Loaded input verses:", len(input_verses))
    retro_map = load_retroversions(retro_csv)
    print("Loaded retroversion entries:", len(retro_map))

    # Build canonical index
    canonical = build_canonical_index()
    canon_len = len(canonical)
    input_len = len(input_verses)

    if input_len != canon_len:
        print("WARNING: input verse count != canonical verse count.")
        print(" input:", input_len, " canonical:", canon_len)
        print("The script will try to map by index and attempt fuzzy alignment when mismatches occur.")
    else:
        print("Input length matches canonical length. Proceeding with index mapping.")

    rows_out = []
    replaced = 0
    mismatch_warns = 0
    manual_flags = []

    # Iterate by index -- primary mapping method
    for idx in range(input_len):
        user_text = input_verses[idx]
        clean_user = clean_text_for_compare(user_text)

        # Determine canonical reference for this index
        if idx < canon_len:
            ref, canon_text = canonical[idx]
            # canon_text already stripped of diacritics in build_canonical_index
            clean_canon = clean_text_for_compare(canon_text)
            mapping_method = "index"
        else:
            # user has more verses than canonical: fallback - map to last verse repeatedly (flag)
            ref = VerseRef("UNKNOWN", 0, 0)
            canon_text = ""
            clean_canon = ""
            mapping_method = "index_oob"

        # sanity check: exact match?
        exact = (clean_user == clean_canon and clean_user != "")
        fuzzy_ratio = difflib.SequenceMatcher(None, clean_user, clean_canon).ratio() if clean_canon and clean_user else 0.0

        # if exact or high fuzzy, accept mapping; else try local search
        chosen_ref = ref
        chosen_canon_text = canon_text
        chosen_clean_canon = clean_canon
        sanity_note = ""

        if not exact and fuzzy_ratio < FUZZY_MATCH_RATIO_ACCEPT:
            # search nearby canonical verses
            best_ratio = fuzzy_ratio
            best_pos = idx
            left = max(0, idx - SEARCH_WINDOW)
            right = min(canon_len - 1, idx + SEARCH_WINDOW)
            for j in range(left, right + 1):
                refj, textj = canonical[j]
                r = difflib.SequenceMatcher(None, clean_user, clean_text_for_compare(textj)).ratio()
                if r > best_ratio:
                    best_ratio = r
                    best_pos = j
            if best_pos != idx and best_ratio >= FUZZY_MATCH_RATIO_ACCEPT:
                chosen_ref, chosen_canon_text = canonical[best_pos]
                chosen_clean_canon = clean_text_for_compare(chosen_canon_text)
                mapping_method = f"local_search_from_{idx}_to_{best_pos}"
                sanity_note = f"index_mismatch_but_found_best_at_{best_pos}_ratio_{best_ratio:.3f}"
            else:
                # no good local match
                mapping_method = "index_primary_but_no_good_match"
                sanity_note = f"no_good_match (best_ratio={best_ratio:.3f})"
                mismatch_warns += 1
                manual_flags.append({
                    "pos": idx+1,
                    "user_text": user_text,
                    "best_ratio": best_ratio,
                    "reason": "no good match"
                })

        # Now we have chosen_ref
        key = (chosen_ref.book, chosen_ref.chapter, chosen_ref.verse)
        # mt_text: use canonical MT (already stripped of diacritics) if available, else keep user text (strip diacritics for MT_Text)
        mt_text = chosen_canon_text or strip_hebrew_diacritics(user_text)
        mt_clean = clean_text_for_compare(mt_text)

        retro_conf = ""
        # check retroversion map
        if key in retro_map:
            retro = retro_map[key]
            authoritative = retro.retro_he or retro.retro_he.strip()
            # authoritative is already stripped when loaded
            if not authoritative:
                # empty retroversion -> fallback to MT but mark
                authoritative = mt_text
                replaced_flag = "NO"
                reason = "Retroversion empty"
                retro_conf = retro.confidence
            else:
                replaced_flag = "YES"
                reason = f"Replaced via LXX retroversion ({retro.confidence})"
                retro_conf = retro.confidence
                replaced += 1
        else:
            authoritative = mt_text
            replaced_flag = "NO"
            reason = "No retroversion entry"

        rows_out.append({
            "Index": idx + 1,
            "Book": chosen_ref.book,
            "Chapter": chosen_ref.chapter,
            "Verse": chosen_ref.verse,
            "MT_Text": mt_text,
            "User_Input_Text": user_text,
            "Authoritative_Text": authoritative,
            "Replaced": replaced_flag,
            "Reason": reason,
            "RetroversionConfidence": retro_conf if key in retro_map else "",
            "MappingMethod": mapping_method,
            "SanityNote": sanity_note
        })

    # write output CSV
    fieldnames = ["Index","Book","Chapter","Verse","MT_Text","User_Input_Text",
                  "Authoritative_Text","Replaced","Reason","RetroversionConfidence",
                  "MappingMethod","SanityNote"]
    with open(output_csv, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        for r in rows_out:
            writer.writerow(r)

    # summary
    print("\n=== RECONSTRUCTION SUMMARY ===")
    print("Input verses:", input_len)
    print("Canonical verses:", canon_len)
    print("Total replacements made:", replaced)
    print("Total mapping warnings (manual review suggested):", mismatch_warns)
    print("Output written to:", output_csv)
    if manual_flags:
        print("\nSample manual flags (first 5):")
        for m in manual_flags[:5]:
            print(m)
    print("\nNote: Check rows where Replaced == NO but Reason indicates 'Retroversion empty' or where MappingMethod starts with 'index_primary_but_no_good_match'")

# ---------- ENTRY POINT ----------
if __name__ == "__main__":
    try:
        reconstruct_torah(INPUT_VERSES_CSV, LXX_RETRO_CSV, OUTPUT_CSV)
    except KeyboardInterrupt:
        print("Interrupted by user.")
        sys.exit(1)


Loaded input verses: 5846
Loaded retroversion entries: 52
Building canonical verse index from Sefaria (this may take a little while)...
Canonical index built: 5846 verses.
Input length matches canonical length. Proceeding with index mapping.

=== RECONSTRUCTION SUMMARY ===
Input verses: 5846
Canonical verses: 5846
Total replacements made: 52
Output written to: torah_reconstructed.csv

Sample manual flags (first 5):
{'pos': 93, 'user_text': 'ויאמר קין אל יהוה גדול עוני מנשא', 'best_ratio': 0.7272727272727273, 'reason': 'no good match'}
{'pos': 107, 'user_text': 'זה ספר תולדת אדם ביום ברא אלהים אדם בדמות אלהים עשה אתו', 'best_ratio': 0.8088235294117647, 'reason': 'no good match'}
{'pos': 109, 'user_text': 'ויחי אדם מאתים ושלשים שנה ויולד בדמותו כצלמו ויקרא את שמו שת', 'best_ratio': 0.8983050847457628, 'reason': 'no good match'}
{'pos': 112, 'user_text': 'ויחי שת מאתים וחמש שנים ויולד את אנוש', 'best_ratio': 0.7894736842105263, 'reason': 'no good match'}
{'pos': 121, 'user_text': 'ויחי מה