# PHASE ONE

## This code uses my input_sefaria_verses.csv as the baseline; its vowel marks have already been removed

In [None]:
import csv
import requests

# CSV of the user's verse texts. It is read without a header row and only the
# first column of each non-empty row is used (see load_user_verses).
INPUT_VERSES_CSV = "input_sefaria_verses.csv"
# CSV with the columns Book, Chapter, Verse and LXX_Retro_Hebrew
# (see load_retroversions).
LXX_RETRO_CSV = "lxx_hebrew_retroversions.csv"
# CSV written by reconstruct().
OUTPUT_CSV = "torah_reconstructed.csv"

# Books walked, in this order, when building the verse reference list.
BOOKS = ["Genesis", "Exodus", "Leviticus", "Numbers", "Deuteronomy"]

def fetch_canonical_refs():
    """Build the ordered list of (book, chapter, verse) references.

    For every book in BOOKS, chapters are requested one by one from the
    Sefaria texts API, starting at chapter 1. Only the *number* of Hebrew
    verses in each chapter is used; the verse text itself is not stored.

    A book is considered finished at the first chapter whose response is not
    HTTP 200 or whose "he" field is missing or empty.

    Returns:
        list[tuple[str, int, int]]: (book, chapter, verse) tuples with
        1-based chapter and verse numbers, in canonical order.

    Raises:
        requests.RequestException: if a request fails or times out.
    """
    refs = []

    for book in BOOKS:
        chapter = 1
        while True:
            url = f"https://www.sefaria.org/api/texts/{book}.{chapter}?lang=he"
            # requests has no default timeout; without one a stalled
            # connection would block this loop indefinitely.
            r = requests.get(url, timeout=30)

            if r.status_code != 200:
                break

            verses = r.json().get("he")
            if not verses:
                break

            refs.extend(
                (book, chapter, verse) for verse in range(1, len(verses) + 1)
            )
            chapter += 1

    return refs


def load_user_verses(path=None):
    """Load the user's verse texts, one per non-empty CSV row.

    Args:
        path: CSV file to read. Defaults to INPUT_VERSES_CSV.

    Returns:
        list[str]: the first column of every non-empty row, stripped of
        surrounding whitespace, in file order.
    """
    if path is None:
        path = INPUT_VERSES_CSV
    # "utf-8-sig" drops a leading byte-order mark (otherwise it would be glued
    # to the first verse); newline="" is what the csv module expects so that
    # line endings inside quoted fields are preserved.
    with open(path, encoding="utf-8-sig", newline="") as f:
        return [row[0].strip() for row in csv.reader(f) if row]


def load_retroversions(path=None):
    """Load the LXX retroversions keyed by verse reference.

    Args:
        path: CSV file to read. Defaults to LXX_RETRO_CSV. The file must have
            the columns Book, Chapter, Verse and LXX_Retro_Hebrew.

    Returns:
        dict[tuple[str, int, int], str]: maps (book, chapter, verse) to the
        retroverted Hebrew text. A later row with the same key overwrites an
        earlier one.

    Raises:
        KeyError: if a required column is missing.
        ValueError: if Chapter or Verse is not an integer.
    """
    if path is None:
        path = LXX_RETRO_CSV
    retro = {}
    # "utf-8-sig" strips a leading byte-order mark; with plain "utf-8" the
    # first header would be read as "\ufeffBook" and row["Book"] would fail.
    with open(path, encoding="utf-8-sig", newline="") as f:
        for row in csv.DictReader(f):
            key = (row["Book"], int(row["Chapter"]), int(row["Verse"]))
            retro[key] = row["LXX_Retro_Hebrew"]
    return retro


def reconstruct():
    """Merge the user's verses with the retroversions and write OUTPUT_CSV.

    User verses are paired positionally with the canonical
    (book, chapter, verse) references. Where a retroversion exists for a
    reference it becomes the "Authoritative_Text"; otherwise the user's own
    text is kept. A warning is printed when the two lists differ in length,
    and only the overlapping prefix is processed.
    """
    print("Loading canonical verse references...")
    canonical = fetch_canonical_refs()

    print("Loading user verses...")
    verses = load_user_verses()

    print("Loading retroversions...")
    retroversions = load_retroversions()

    if len(verses) != len(canonical):
        print("WARNING: verse count mismatch!")
        print("User:", len(verses))
        print("Canonical:", len(canonical))

    out_rows = []
    replacements = 0

    # zip() stops at the shorter list, i.e. the overlapping prefix.
    for (book, ch, vs), original_text in zip(canonical, verses):
        key = (book, ch, vs)
        has_retro = key in retroversions
        final_text = retroversions[key] if has_retro else original_text
        if has_retro:
            replacements += 1
        out_rows.append(
            [book, ch, vs, original_text, final_text, "YES" if has_retro else "NO"]
        )

    print("Saving reconstructed Torah...")

    header = [
        "Book",
        "Chapter",
        "Verse",
        "Original_User_Text",
        "Authoritative_Text",
        "Replaced"
    ]
    with open(OUTPUT_CSV, "w", newline="", encoding="utf-8") as f:
        out = csv.writer(f)
        out.writerow(header)
        for record in out_rows:
            out.writerow(record)

    print("DONE")
    print("Total replacements:", replacements)


# Run the reconstruction only when this cell/script is executed directly,
# not when the module is imported.
if __name__ == "__main__":
    reconstruct()


Loading canonical verse references...
Loading user verses...
Loading retroversions...
Saving reconstructed Torah...
DONE
Total replacements: 52


# PHASE TWO

Indexing our result into a letter-table

In [None]:
#!/usr/bin/env python3

import csv
import re

# CSV read by main(); its "Authoritative_Text" column supplies the verse text.
INPUT_FILE = "torah_reconstructed.csv"
# CSV written by main(): one row per letter (Index, Letter, Hebrew Word).
OUTPUT_FILE = "torah_letter_index.csv"


def clean_word(word):
    """
    Return ``word`` reduced to the characters in the range א..ת.

    Every other character (punctuation, digits, vowel points, Latin
    letters, ...) is dropped; the order of the kept letters is preserved.
    """
    return "".join(ch for ch in word if "א" <= ch <= "ת")


def main():
    """Index every Hebrew letter of the reconstructed text.

    Reads the "Authoritative_Text" column of INPUT_FILE, splits each verse
    into words, keeps only Hebrew letters (via clean_word) and writes one row
    per letter to OUTPUT_FILE with a running 1-based "Index", the "Letter"
    and the cleaned "Hebrew Word" it belongs to.
    """
    # Maqaf (U+05BE) is the Hebrew hyphen that joins two words without a
    # space. It is treated as a word boundary so that "Hebrew Word" holds a
    # single word; the letter sequence and the indices are unaffected.
    maqaf = "\u05be"

    count = 0
    rows_out = []

    with open(INPUT_FILE, newline='', encoding="utf-8") as f:
        for row in csv.DictReader(f):
            for chunk in row["Authoritative_Text"].split():
                for word in chunk.split(maqaf):
                    clean = clean_word(word)
                    # An empty result yields no letters and is skipped.
                    for letter in clean:
                        count += 1
                        rows_out.append({
                            "Index": count,
                            "Letter": letter,
                            "Hebrew Word": clean
                        })

    # Write output
    with open(OUTPUT_FILE, "w", newline='', encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=["Index", "Letter", "Hebrew Word"])
        writer.writeheader()
        writer.writerows(rows_out)

    print("✅ Done.")
    print("Total letters indexed:", count)
    print("Output saved to:", OUTPUT_FILE)


# Build the letter index only when this cell/script is executed directly,
# not when the module is imported.
if __name__ == "__main__":
    main()


✅ Done.
Total letters indexed: 304493
Output saved to: torah_letter_index.csv


# Code to count YHWH occurrences

## This is borrowed from paintedpotato (2024 project)

NB: I counted 20 mentions of YHWH from the results

In [None]:
import csv

# Function to read the CSV and load the letter data
def read_torah_csv(filename):
    """Load the letter index CSV into a list of dicts.

    Args:
        filename: path of a CSV with the columns "Index", "Letter" and
            "Hebrew Word".

    Returns:
        list[dict]: one dict per row with the same three keys; "Index" is
        converted to int, the other values stay strings.

    Raises:
        KeyError: if a required column is missing.
        ValueError: if an "Index" value is not an integer.
    """
    # newline='' is what the csv module expects; it keeps line endings inside
    # quoted fields intact.
    with open(filename, mode='r', encoding='utf-8', newline='') as file:
        return [
            {
                'Index': int(row['Index']),
                'Letter': row['Letter'],
                'Hebrew Word': row['Hebrew Word'],
            }
            for row in csv.DictReader(file)
        ]

# Function to find sequences of Tav, Vav, Resh, Hey separated by `skip` letters
def find_tav_vav_resh_hey_sequences(letters_info, skip=50):
    """Find Tav-Vav-Resh-Hey spelled at a fixed letter interval.

    Args:
        letters_info: list of dicts with the keys 'Index', 'Letter' and
            'Hebrew Word', as returned by read_torah_csv.
        skip: distance, in list positions, between consecutive letters of
            the sequence. Defaults to 50.

    Returns:
        list[dict]: one dict per match holding '<Name> Index' and
        '<Name> Word' for each of Tav, Vav, Resh and Hey, ordered by the
        position of the Tav.

    Raises:
        ValueError: if skip is smaller than 1.
    """
    if skip < 1:
        raise ValueError("skip must be a positive integer")

    wanted = (('Tav', 'ת'), ('Vav', 'ו'), ('Resh', 'ר'), ('Hey', 'ה'))
    first_letter = wanted[0][1]
    # Distance from the first to the last letter of a match.
    reach = skip * (len(wanted) - 1)

    sequences = []
    for start in range(len(letters_info) - reach):
        # Cheap rejection: most positions do not hold the first letter.
        if letters_info[start]['Letter'] != first_letter:
            continue
        hits = [letters_info[start + k * skip] for k in range(len(wanted))]
        if all(hit['Letter'] == letter for hit, (_, letter) in zip(hits, wanted)):
            found = {}
            for hit, (name, _) in zip(hits, wanted):
                found[f'{name} Index'] = hit['Index']
                found[f'{name} Word'] = hit['Hebrew Word']
            sequences.append(found)

    return sequences

# Function to print the sequences found
def print_sequences(sequences):
    """Print each sequence as four "<Letter> at index N (Word: W)" lines.

    A line of 40 dashes is printed after every sequence.
    """
    divider = '-' * 40
    for seq in sequences:
        for name in ('Tav', 'Vav', 'Resh', 'Hey'):
            position = seq[name + ' Index']
            word = seq[name + ' Word']
            print(f"{name} at index {position} (Word: {word})")
        print(divider)

# Main process
# NOTE: these statements are not wrapped in a __main__ guard, so they run
# whenever this cell/module is executed or imported.
filename = 'torah_letter_index.csv'  # The CSV file generated earlier
letters_info = read_torah_csv(filename)

# Find and print the sequences (Tav, Vav, Resh, Hey at a spacing of 50)
sequences = find_tav_vav_resh_hey_sequences(letters_info)
print_sequences(sequences)

# Optionally, you can save the sequences to another CSV file if needed.


Tav at index 6 (Word: בראשית)
Vav at index 56 (Word: תהום)
Resh at index 106 (Word: וירא)
Hey at index 156 (Word: אלהים)
----------------------------------------
Tav at index 18472 (Word: ותאמר)
Vav at index 18522 (Word: והתעני)
Resh at index 18572 (Word: מרב)
Hey at index 18622 (Word: יהוה)
----------------------------------------
Tav at index 76069 (Word: וזאת)
Vav at index 76119 (Word: ויאמר)
Resh at index 76169 (Word: עפרון)
Hey at index 76219 (Word: קנה)
----------------------------------------
Tav at index 94183 (Word: אתו)
Vav at index 94233 (Word: בחפזון)
Resh at index 94283 (Word: בארץ)
Hey at index 94333 (Word: והיה)
----------------------------------------
Tav at index 96259 (Word: תשברו)
Vav at index 96309 (Word: המול)
Resh at index 96359 (Word: תורה)
Hey at index 96409 (Word: יהוה)
----------------------------------------
Tav at index 97983 (Word: את)
Vav at index 98033 (Word: בחור)
Resh at index 98083 (Word: וירדף)
Hey at index 98133 (Word: אחריהם)
-----------------------

## This was freshly coded to compare results

In [None]:
#!/usr/bin/env python3

import csv

# Letter index CSV; only its "Letter" column is read (see load_letters).
INPUT_FILE = "torah_letter_index.csv"
# CSV written by save_results().
OUTPUT_FILE = "yhvh_els_50.csv"

# Letters searched for, in order, and the distance (in letters) between them.
TARGET = ["י", "ה", "ו", "ה"]
SKIP = 50


def load_letters():
    """Return the "Letter" column of INPUT_FILE as a list, in file order."""
    with open(INPUT_FILE, newline='', encoding="utf-8") as f:
        return [row["Letter"] for row in csv.DictReader(f)]


def find_els_matches(letters, target=None, skip=None):
    """Find equidistant-letter-sequence (ELS) occurrences of a target.

    Args:
        letters: sequence of single letters to scan.
        target: letters to look for, in order (list or string). Defaults to
            the module-level TARGET. Any length is supported.
        skip: distance between consecutive target letters. Defaults to the
            module-level SKIP.

    Returns:
        list[dict]: one dict per match with
            "StartIndex" - 1-based position of the first letter,
            "Indices"    - comma-separated 1-based positions of all letters,
            "Letters"    - the matched letters joined into one string.

    Raises:
        ValueError: if skip is smaller than 1.
    """
    target = list(TARGET if target is None else target)
    skip = SKIP if skip is None else skip
    if skip < 1:
        raise ValueError("skip must be a positive integer")
    if not target:
        return []

    # Distance from the first to the last letter of a match.
    reach = skip * (len(target) - 1)
    first_letter = target[0]

    matches = []
    for start in range(len(letters) - reach):
        # Cheap rejection before building the strided slice.
        if letters[start] != first_letter:
            continue
        seq = list(letters[start:start + reach + 1:skip])
        if seq != target:
            continue
        positions = [start + 1 + k * skip for k in range(len(target))]
        matches.append({
            "StartIndex": start + 1,
            "Indices": ",".join(map(str, positions)),
            "Letters": "".join(seq)
        })

    return matches


def save_results(matches):
    """Write the match dicts to OUTPUT_FILE as CSV, preceded by a header row."""
    columns = ["StartIndex", "Indices", "Letters"]
    with open(OUTPUT_FILE, "w", newline='', encoding="utf-8") as f:
        out = csv.DictWriter(f, fieldnames=columns)
        out.writeheader()
        for match in matches:
            out.writerow(match)


def main():
    """Load the letters, search for the target at the fixed skip, save and report."""
    print("Loading Torah letters...")
    letters = load_letters()

    print("Searching for יהוה in skip = 50...")
    matches = find_els_matches(letters)
    save_results(matches)

    print("\n===== RESULTS =====")
    report = (
        ("Total letters scanned:", len(letters)),
        ("ELS skip:", SKIP),
        ("Matches found:", len(matches)),
        ("Saved to:", OUTPUT_FILE),
    )
    for label, value in report:
        print(label, value)


# Run the ELS search only when this cell/script is executed directly,
# not when the module is imported.
if __name__ == "__main__":
    main()


Loading Torah letters...
Searching for יהוה in skip = 50...

===== RESULTS =====
Total letters scanned: 304493
ELS skip: 50
Matches found: 25
Saved to: yhvh_els_50.csv


# Fuzzy vs Strict search of a 50-window sequence

In [None]:
#!/usr/bin/env python3
# yhwh_hybrid_windowed.py
# Input: torah_letter_index.csv (Index,Letter,Hebrew Word)
# Output: yhwh_comparison.csv (one row per 50-letter window with any matches)

import csv
from difflib import SequenceMatcher

# Letter index CSV read by main() through load_letter_index().
INPUT_LETTER_CSV = "torah_letter_index.csv"
# CSV written by save_results(): one row per window that contains a match.
OUTPUT_CSV = "yhwh_comparison.csv"

WINDOW_SIZE = 50        # number of consecutive letters in each sliding window
TARGET = "יהוה"         # tetragram to detect
# Length of the segments compared against TARGET.
TARGET_LEN = len(TARGET)
FUZZY_THRESHOLD = 0.80  # minimum SequenceMatcher ratio (0..1) for a fuzzy match. Tune as needed.

def similarity(a: str, b: str) -> float:
    """Return difflib's SequenceMatcher ratio of ``a`` against ``b`` (0.0 to 1.0)."""
    matcher = SequenceMatcher(isjunk=None, a=a, b=b)
    return matcher.ratio()

def load_letter_index(path):
    """Load torah_letter_index.csv and return two parallel lists.

    Args:
        path: CSV with an "Index" (or "index") column and a "Letter"
            (or "letter") column.

    Returns:
        tuple[list[int], list[str]]: (indices, letters). Rows without an
        index or letter value, and rows whose letter is blank after
        stripping, are skipped.

    Raises:
        ValueError: if an index value is not an integer.
    """
    indices = []
    letters = []
    with open(path, newline='', encoding='utf-8') as f:
        # Expect columns: Index, Letter, Hebrew Word
        for row in csv.DictReader(f):
            idx_raw = row.get("Index") or row.get("index")
            letter = row.get("Letter") or row.get("letter")
            if idx_raw is None or letter is None:
                continue
            try:
                # int() already ignores surrounding whitespace, so there is
                # nothing useful to retry if this fails.
                idx = int(idx_raw)
            except ValueError as err:
                raise ValueError(
                    f"Non-integer Index value {idx_raw!r} in {path}"
                ) from err
            letter = str(letter).strip()
            if letter == "":
                continue
            indices.append(idx)
            letters.append(letter)
    return indices, letters

def run_hybrid_scan(indices, letters,
                    window_size=None,
                    target=None,
                    target_len=None,
                    fuzzy_thresh=None):
    """Report, for every sliding window, the exact and fuzzy target matches.

    Windows of ``window_size`` letters start at every position (step 1), so
    neighbouring windows overlap and a single occurrence is reported by every
    window that fully contains it.

    Each position is scored once up front and the per-window rows are then
    assembled from those hits, instead of re-scoring every segment for every
    window that contains it.

    Args:
        indices: the Index value of each letter (parallel to ``letters``).
        letters: list of single letters.
        window_size: letters per window. Defaults to WINDOW_SIZE.
        target: string to detect. Defaults to TARGET.
        target_len: length of the compared segments. Defaults to
            ``len(target)``.
        fuzzy_thresh: minimum SequenceMatcher ratio for a fuzzy match.
            Defaults to FUZZY_THRESHOLD.

    Returns:
        tuple[list[dict], dict]: (results, summary).
        ``results`` holds one dict per window with at least one match
        (WindowStartIndex, WindowEndIndex, ExactCount, ExactPositions,
        FuzzyCount, FuzzyDetails).
        ``summary`` holds LettersScanned, WindowsScanned, TotalExactMatches
        and TotalFuzzyMatches (both summed over windows, so overlapping
        windows count the same occurrence repeatedly) plus UniqueExactMatches
        and UniqueFuzzyMatches, which count each matching position once.

    Raises:
        ValueError: if target_len is smaller than 1.
    """
    if window_size is None:
        window_size = WINDOW_SIZE
    if target is None:
        target = TARGET
    if target_len is None:
        target_len = len(target)
    if fuzzy_thresh is None:
        fuzzy_thresh = FUZZY_THRESHOLD
    if target_len < 1:
        raise ValueError("target_len must be at least 1")

    n = len(letters)
    windows_scanned = max(0, n - window_size + 1)
    # Largest in-window offset at which a full segment still fits.
    last_offset = window_size - target_len
    target_letters = list(target)

    # Pass 1: score every segment start once.
    exact_hits = []   # array positions of exact matches
    fuzzy_hits = []   # (array position, segment text, rounded score)
    if windows_scanned and last_offset >= 0:
        # The matcher keeps its analysis of the second sequence (the target)
        # between calls; only the first sequence is swapped per segment.
        matcher = SequenceMatcher(None, "", target)
        score_cache = {}
        for pos in range(n - target_len + 1):
            seg = letters[pos:pos + target_len]
            if seg == target_letters:
                exact_hits.append(pos)
            seg_text = "".join(seg)
            score = score_cache.get(seg_text)
            if score is None:
                matcher.set_seq1(seg_text)
                score = score_cache[seg_text] = matcher.ratio()
            if score >= fuzzy_thresh:
                fuzzy_hits.append((pos, seg_text, round(score, 3)))

    # Pass 2: slide the window; the hit lists are sorted by position, so two
    # moving pointers per list delimit the hits inside the current window.
    results = []
    total_exact = 0
    total_fuzzy = 0
    n_exact = len(exact_hits)
    n_fuzzy = len(fuzzy_hits)
    e_lo = e_hi = f_lo = f_hi = 0
    if n_exact or n_fuzzy:
        for start in range(windows_scanned):
            last = start + last_offset
            while e_lo < n_exact and exact_hits[e_lo] < start:
                e_lo += 1
            while e_hi < n_exact and exact_hits[e_hi] <= last:
                e_hi += 1
            while f_lo < n_fuzzy and fuzzy_hits[f_lo][0] < start:
                f_lo += 1
            while f_hi < n_fuzzy and fuzzy_hits[f_hi][0] <= last:
                f_hi += 1
            if e_lo == e_hi and f_lo == f_hi:
                continue

            exact_in = exact_hits[e_lo:e_hi]
            fuzzy_in = fuzzy_hits[f_lo:f_hi]
            results.append({
                "WindowStartIndex": indices[start],
                "WindowEndIndex": indices[start + window_size - 1],
                "ExactCount": len(exact_in),
                "ExactPositions": ",".join(str(indices[p]) for p in exact_in),
                "FuzzyCount": len(fuzzy_in),
                # fuzzy details as 'Index:Segment:Score' separated by |
                "FuzzyDetails": "|".join(
                    f"{indices[p]}:{seg}:{score}" for (p, seg, score) in fuzzy_in
                )
            })
            total_exact += len(exact_in)
            total_fuzzy += len(fuzzy_in)

    summary = {
        "LettersScanned": n,
        "WindowsScanned": windows_scanned,
        "TotalExactMatches": total_exact,
        "TotalFuzzyMatches": total_fuzzy,
        "UniqueExactMatches": n_exact,
        "UniqueFuzzyMatches": n_fuzzy
    }
    return results, summary

def save_results(path, rows):
    """Write the per-window result dicts to ``path`` as CSV with a header row."""
    with open(path, "w", newline="", encoding="utf-8") as f:
        out = csv.DictWriter(
            f,
            fieldnames=["WindowStartIndex","WindowEndIndex","ExactCount","ExactPositions","FuzzyCount","FuzzyDetails"],
        )
        out.writeheader()
        out.writerows(rows)

def main():
    """Load the letter index, run the windowed scan, save and summarise it."""
    print("Loading letters from:", INPUT_LETTER_CSV)
    indices, letters = load_letter_index(INPUT_LETTER_CSV)
    if not letters:
        print("No letters loaded. Check file and headers (Index,Letter,Hebrew Word).")
        return

    print(f"Total letters loaded: {len(letters)}")
    print(f"Running hybrid scan: window={WINDOW_SIZE}, target='{TARGET}', fuzzy_thresh={FUZZY_THRESHOLD}")

    scan_options = {
        "window_size": WINDOW_SIZE,
        "target": TARGET,
        "target_len": len(TARGET),
        "fuzzy_thresh": FUZZY_THRESHOLD,
    }
    results, summary = run_hybrid_scan(indices, letters, **scan_options)

    save_results(OUTPUT_CSV, results)

    print("\n=== SCAN SUMMARY ===")
    summary_report = (
        ("Letters scanned:", "LettersScanned"),
        ("Windows scanned:", "WindowsScanned"),
        ("Total exact matches found:", "TotalExactMatches"),
        ("Total fuzzy matches found:", "TotalFuzzyMatches"),
    )
    for label, key in summary_report:
        print(label, summary[key])
    print("Results saved to:", OUTPUT_CSV)

    # Show at most the first five result rows.
    sample = results[:5]
    print("\nSample output rows:", len(sample))
    for row in sample:
        print(row)

# Run the windowed scan only when this cell/script is executed directly,
# not when the module is imported.
if __name__ == "__main__":
    main()


Loading letters from: torah_letter_index.csv
Total letters loaded: 304493
Running hybrid scan: window=50, target='יהוה', fuzzy_thresh=0.8

=== SCAN SUMMARY ===
Letters scanned: 304493
Windows scanned: 304444
Total exact matches found: 86386
Total fuzzy matches found: 86386
Results saved to: yhwh_comparison.csv

Sample output rows: 5
{'WindowStartIndex': 1803, 'WindowEndIndex': 1852, 'ExactCount': 1, 'ExactPositions': '1849', 'FuzzyCount': 1, 'FuzzyDetails': '1849:יהוה:1.0'}
{'WindowStartIndex': 1804, 'WindowEndIndex': 1853, 'ExactCount': 1, 'ExactPositions': '1849', 'FuzzyCount': 1, 'FuzzyDetails': '1849:יהוה:1.0'}
{'WindowStartIndex': 1805, 'WindowEndIndex': 1854, 'ExactCount': 1, 'ExactPositions': '1849', 'FuzzyCount': 1, 'FuzzyDetails': '1849:יהוה:1.0'}
{'WindowStartIndex': 1806, 'WindowEndIndex': 1855, 'ExactCount': 1, 'ExactPositions': '1849', 'FuzzyCount': 1, 'FuzzyDetails': '1849:יהוה:1.0'}
{'WindowStartIndex': 1807, 'WindowEndIndex': 1856, 'ExactCount': 1, 'ExactPositions': '18

## More sequences

In [None]:
#!/usr/bin/env python3
"""
yhwh_full_analysis.py

Input:
 - torah_letter_index.csv  (columns: Index,Letter,Hebrew Word)

Outputs:
 - exact_occurrences.csv
 - els_matches.csv
 - fuzzy_matches.csv
 - spacing_stats.csv
 - clusters.csv
 - summary.txt

Configurable parameters are near the top of the file.
"""

import csv
import math
import sys
from collections import Counter
from difflib import SequenceMatcher
from statistics import mean, median, pstdev

# ---------------- CONFIG ----------------
# Letter index CSV read by main() through load_letters().
INPUT_LETTER_CSV = "torah_letter_index.csv"

# Output files written by main().
OUTPUT_EXACT = "exact_occurrences.csv"
OUTPUT_ELS = "els_matches.csv"
OUTPUT_FUZZY = "fuzzy_matches.csv"
OUTPUT_SPACING = "spacing_stats.csv"
OUTPUT_CLUSTERS = "clusters.csv"
OUTPUT_SUMMARY = "summary.txt"

# Word to search for, and the same word as a list of letters for comparing
# against slices of the letter list.
TARGET = "יהוה"
TARGET_LETTERS = list(TARGET)

# ELS scanning: search skips s in 1..MAX_SKIP
MAX_SKIP = 200      # default; increase to search larger skips (slower)

# Fuzzy contiguous scanning parameters
FUZZY_ENABLED = True
FUZZY_THRESHOLD = 0.80    # similarity threshold (0..1)
# Segment lengths (inclusive range) compared against TARGET in the fuzzy scan.
FUZZY_MIN_LEN = 3
FUZZY_MAX_LEN = 6

# Clustering gap threshold (letters). If consecutive occurrences are <= CLUSTER_GAP apart they are in same cluster
CLUSTER_GAP = 200

# ---------------- HELPERS ----------------
def similarity(a, b):
    """Return difflib's SequenceMatcher ratio of ``a`` against ``b`` (0.0 to 1.0)."""
    matcher = SequenceMatcher(isjunk=None, a=a, b=b)
    return matcher.ratio()

def load_letters(path):
    """Load the letter index CSV into three parallel lists.

    Args:
        path: CSV with an "Index"/"index" column, a "Letter"/"letter" column
            and optionally a "Hebrew Word"/"HebrewWord"/"word" column.

    Returns:
        tuple[list[int], list[str], list[str]]: (indices, letters, words).
        Rows without an index or letter value, and rows whose index is not
        an integer, are skipped. A missing or empty word becomes "".
    """
    indices = []
    letters = []
    words = []
    with open(path, newline='', encoding='utf-8') as f:
        for row in csv.DictReader(f):
            idx_raw = row.get("Index") or row.get("index")
            letter = row.get("Letter") or row.get("letter")
            word = row.get("Hebrew Word") or row.get("HebrewWord") or row.get("word")
            if idx_raw is None or letter is None:
                continue
            try:
                idx = int(str(idx_raw).strip())
            except ValueError:
                # Malformed index: skip the row. Only ValueError is caught so
                # that KeyboardInterrupt and real bugs are not swallowed.
                continue
            indices.append(idx)
            letters.append(str(letter).strip())
            words.append(word if word is not None else "")
    return indices, letters, words

def write_csv(path, header, rows):
    """Write ``header`` followed by every row of ``rows`` to ``path`` as CSV."""
    with open(path, "w", newline='', encoding='utf-8') as f:
        out = csv.writer(f)
        out.writerow(header)
        out.writerows(rows)

# ---------------- MAIN ANALYSIS ----------------
def _find_exact_starts(letters, target_letters):
    """Return every array position where target_letters occurs contiguously."""
    size = len(target_letters)
    return [i for i in range(len(letters) - size + 1)
            if letters[i:i + size] == target_letters]


def _consecutive_gaps(positions):
    """Return the differences between neighbouring values of sorted(positions)."""
    ordered = sorted(positions)
    return [b - a for a, b in zip(ordered, ordered[1:])]


def _cluster_positions(positions, max_gap):
    """Group positions, in the given order, so neighbours <= max_gap apart share a cluster."""
    clusters = []
    for pos in positions:
        if clusters and pos - clusters[-1][-1] <= max_gap:
            clusters[-1].append(pos)
        else:
            clusters.append([pos])
    return clusters


def _find_els_rows(indices, letters, target_letters, max_skip):
    """Return ELS rows [skip, pos1, pos2, pos3, pos4, span, letters] for skips 1..max_skip."""
    t0, t1, t2, t3 = target_letters
    n = len(letters)
    # Only positions holding the first target letter can start a match, so
    # collect them once instead of re-testing every position for every skip.
    candidates = [i for i, ch in enumerate(letters) if ch == t0]
    matched_text = t0 + t1 + t2 + t3

    rows = []
    for s in range(1, max_skip + 1):
        # Valid starts are 0 .. n - 3*s - 1, so the 4th letter stays in range.
        last_start = n - 3 * s
        if last_start <= 0:
            break
        for i in candidates:
            if i >= last_start:
                break
            if letters[i + s] == t1 and letters[i + 2 * s] == t2 and letters[i + 3 * s] == t3:
                pos1 = indices[i]
                pos4 = indices[i + 3 * s]
                rows.append([s, pos1, indices[i + s], indices[i + 2 * s], pos4,
                             pos4 - pos1 + 1, matched_text])
        # optional progress print for large runs
        if s % 50 == 0:
            print("  scanned skip", s)
    return rows


def _find_fuzzy_rows(indices, letters, target, min_len, max_len, threshold):
    """Return fuzzy rows [pos, array_index, segment, length, score, span] for each segment length."""
    n = len(letters)
    # The same short segment recurs many times; score each distinct text once.
    score_cache = {}
    rows = []
    for seg_len in range(min_len, max_len + 1):
        for i in range(n - seg_len + 1):
            seg = "".join(letters[i:i + seg_len])
            score = score_cache.get(seg)
            if score is None:
                score = score_cache[seg] = similarity(seg, target)
            if score >= threshold:
                pos = indices[i]
                span = indices[i + seg_len - 1] - pos + 1
                rows.append([pos, i, seg, seg_len, round(score, 3), span])
    return rows


def main():
    """Run the full analysis of TARGET over the letter index and write all reports.

    Steps, each writing its own output file:
      1. exact contiguous occurrences      -> OUTPUT_EXACT
      2. histogram of gaps between them    -> OUTPUT_SPACING
      3. clusters of nearby occurrences    -> OUTPUT_CLUSTERS
      4. ELS matches for skips 1..MAX_SKIP -> OUTPUT_ELS
      5. fuzzy contiguous matches          -> OUTPUT_FUZZY
      6. distribution of match spans       -> appearance_window_sizes.csv
      7. text summary                      -> OUTPUT_SUMMARY
    Progress and a short summary are printed to the console.
    """
    print("Loading letters from:", INPUT_LETTER_CSV)
    indices, letters, words = load_letters(INPUT_LETTER_CSV)
    N = len(letters)
    if N == 0:
        print("No letters found. Check input file.")
        return
    print("Total letters:", N)

    # indices[i] is the original global index of letters[i].

    # 1) Exact contiguous detection (יהוה)
    target_len = len(TARGET_LETTERS)
    exact_starts = _find_exact_starts(letters, TARGET_LETTERS)
    # Global index of the first letter of each occurrence.
    exact_positions = [indices[i] for i in exact_starts]
    total_exact = len(exact_positions)
    print("Exact contiguous occurrences found:", total_exact)

    # The array position of each occurrence is already known, so no lookup
    # from global index back to array position is needed.
    exact_rows = [
        [indices[i], i, "".join(letters[i:i + target_len]), words[i]]
        for i in exact_starts
    ]
    write_csv(OUTPUT_EXACT, ["StartGlobalIndex","ArrayIndex","Letters","HebrewWord"], exact_rows)

    # 2) Spacing statistics between consecutive exact occurrences
    gaps = _consecutive_gaps(exact_positions)
    spacing_rows = [[gap, cnt] for gap, cnt in sorted(Counter(gaps).items())]
    write_csv(OUTPUT_SPACING, ["GapBetweenOccurrences","Count"], spacing_rows)

    # 3) Clustering consecutive occurrences by gap threshold (simple 1D clustering)
    clusters = _cluster_positions(exact_positions, CLUSTER_GAP)
    cluster_rows = [[c[0], c[-1], len(c), ",".join(map(str, c))] for c in clusters]
    write_csv(OUTPUT_CLUSTERS, ["ClusterStart","ClusterEnd","Count","PositionsCSV"], cluster_rows)

    # 4) ELS search for skips 1..MAX_SKIP
    print("Running ELS search for skips 1..", MAX_SKIP)
    els_matches = _find_els_rows(indices, letters, TARGET_LETTERS, MAX_SKIP)
    write_csv(OUTPUT_ELS, ["Skip","Pos1","Pos2","Pos3","Pos4","Span","LettersConcat"], els_matches)
    print("ELS matches found:", len(els_matches))

    # 5) Fuzzy contiguous matches (optional)
    fuzzy_rows = []
    if FUZZY_ENABLED:
        print("Running fuzzy contiguous scan (len", FUZZY_MIN_LEN, "-", FUZZY_MAX_LEN, "), threshold", FUZZY_THRESHOLD)
        fuzzy_rows = _find_fuzzy_rows(indices, letters, TARGET,
                                      FUZZY_MIN_LEN, FUZZY_MAX_LEN, FUZZY_THRESHOLD)
    total_fuzzy = len(fuzzy_rows)
    write_csv(OUTPUT_FUZZY, ["StartGlobalIndex","ArrayIndex","Segment","SegmentLen","Score","Span"], fuzzy_rows)
    print("Fuzzy matches found:", total_fuzzy)

    # 6) Appearance window sizes, aggregated over exact + ELS + fuzzy matches.
    # NOTE(review): a contiguous occurrence is also found by the ELS search at
    # skip 1 and by the fuzzy scan at length 4 with score 1.0, so it is
    # counted once per list here. Confirm this is the intended statistic.
    span_counter = Counter()
    if total_exact:
        # Exact contiguous occurrences are recorded with span 4.
        span_counter[4] += total_exact
    span_counter.update(row[5] for row in els_matches)
    span_counter.update(row[5] for row in fuzzy_rows)
    span_rows = sorted(span_counter.items())
    appearance_file = "appearance_window_sizes.csv"
    write_csv(appearance_file, ["SpanChars","Count"], span_rows)

    # 7) Summary statistics and spacing basic metrics
    summary_lines = [
        f"Total letters: {N}",
        f"Exact contiguous יהוה occurrences: {total_exact}",
        f"ELS matches total: {len(els_matches)} (skips scanned 1..{MAX_SKIP})",
        f"Fuzzy matches total: {total_fuzzy} (threshold {FUZZY_THRESHOLD})",
        "",
    ]
    if gaps:
        summary_lines.append("Spacing between consecutive exact occurrences (basic):")
        summary_lines.append(f"  min gap: {min(gaps)}")
        summary_lines.append(f"  mean gap: {mean(gaps):.2f}")
        summary_lines.append(f"  median gap: {median(gaps)}")
        summary_lines.append(f"  stddev gap: {pstdev(gaps):.2f}")
        # produce top 20 most common gaps
        most_common_gaps = Counter(gaps).most_common(20)
        summary_lines.append("  most common gaps (gap:count): " + ", ".join(f"{g}:{c}" for g,c in most_common_gaps))
    else:
        summary_lines.append("Not enough exact occurrences to compute gaps.")

    # clusters summary
    summary_lines.append("")
    summary_lines.append(f"Clusters found (gap threshold {CLUSTER_GAP}): {len(clusters)}")
    cluster_sizes = [len(c) for c in clusters]
    if cluster_sizes:
        summary_lines.append(f"  largest cluster size: {max(cluster_sizes)}")
        summary_lines.append(f"  mean cluster size: {mean(cluster_sizes):.2f}")

    # span distribution summary (first 40 spans only)
    summary_lines.append("")
    summary_lines.append("Appearance window sizes (span chars) summary (span:count):")
    for span, cnt in span_rows[:40]:
        summary_lines.append(f"  {span}: {cnt}")

    # write summary to file
    with open(OUTPUT_SUMMARY, "w", encoding="utf-8") as f:
        f.write("\n".join(summary_lines))

    # Print short summary to console
    print("\n=== ANALYSIS SUMMARY ===")
    for line in summary_lines[:12]:
        print(line)
    print("... (full summary written to", OUTPUT_SUMMARY, ")")

# Run the full analysis only when this cell/script is executed directly,
# not when the module is imported.
if __name__ == "__main__":
    main()


Loading letters from: torah_letter_index.csv
Total letters: 304493
Exact contiguous occurrences found: 1838
Running ELS search for skips 1.. 200
  scanned skip 50
  scanned skip 100
  scanned skip 150
  scanned skip 200
ELS matches found: 7142
Running fuzzy contiguous scan (len 3 - 6 ), threshold 0.8
Fuzzy matches found: 15867

=== ANALYSIS SUMMARY ===
Total letters: 304493
Exact contiguous יהוה occurrences: 1838
ELS matches total: 7142 (skips scanned 1..200)
Fuzzy matches total: 15867 (threshold 0.8)

Spacing between consecutive exact occurrences (basic):
  min gap: 4
  mean gap: 164.70
  median gap: 76
  stddev gap: 532.41
  most common gaps (gap:count): 28:25, 38:22, 51:22, 19:20, 14:20, 33:20, 39:19, 36:19, 31:19, 24:19, 42:18, 44:18, 62:18, 9:18, 37:18, 21:18, 48:18, 22:17, 50:16, 35:16

... (full summary written to summary.txt )


# Save file sessions

In [None]:
from google.colab import drive
import os

# 1. Mount Google Drive
drive.mount('/content/drive')

# 2. Create a destination folder in Drive
destination_folder = '/content/drive/MyDrive/Colab_Project_Files/LXX_DSS_Torahcode'
if not os.path.exists(destination_folder):
    os.makedirs(destination_folder)

# 3. Run the sync command
# Flags explained:
# -a: archive mode (keeps permissions/dates)
# -v: verbose (shows progress)
# --exclude 'drive': CRITICAL. Prevents trying to copy Drive into itself.
# --exclude 'sample_data': Skips the default Colab sample files.
!rsync -av --exclude='drive' --exclude='.config' --exclude='sample_data' /content/ "$destination_folder"

print(f"Files synced to {destination_folder}")