# Data Parser

### Installations

In [1]:
# %pip install bs4
# %pip install lxml

### Imports

In [1]:
import re
from collections import OrderedDict
from bs4 import BeautifulSoup
import lxml 
import os
import glob
import pandas as pd
import io
import csv
from typing import List

## By verse

### Step 1: Download the raw files and place them into your folder

The data scraper outputs raw data as `.txt` files. Put them in the `/parser/yourname/raw` folder (see `/parser/yna/raw` for an example). The folder structure is as follows.
```
/parser 
-- /yourname 
---- /raw
------ /language1
-------- text files here
------ /language2 
------ /language3 
------ /language4 
```

### Step 2: Convert Raw TXT Files into Clean HTML
After scraping, the resulting files were saved as `.txt` documents, but their contents still contained raw HTML fragments such as `<div>`, `<span>`, and other tags. To standardize and clean these files, we implemented a conversion pipeline that transforms the raw text into properly structured `.html` files. This pipeline automates the cleanup of scraped `.txt` files across multiple language datasets, standardizing them into well-formed HTML files while preserving the original folder hierarchy.

**Source Language Directories**  
   The script is configured to process multiple languages at once. Each entry in `LANG_DIRS` corresponds to a folder containing raw `.txt` files (e.g., `yna/raw/Cebuano`, `yna/raw/Spanish`, etc.). Commented-out entries can be enabled later when needed.

**Folder Mapping (Raw → HTML)**  
   - The variable `SRC_SEGMENT = "raw"` defines the source segment of the path.  
   - The variable `DST_SEGMENT = "html"` defines the target segment.  
   - The helper function `swap_segment` replaces `"raw"` with `"html"` in a given path so that outputs mirror the original folder structure.

**Conversion Process**  
- The `convert_tree` function traverses each source folder recursively.  
- For every `.txt` file found, it creates a corresponding `.html` file in the mirrored output directory.  
- Files are wrapped with a minimal `<html> ... </html>` block, parsed with BeautifulSoup, and then prettified to ensure clean and valid HTML formatting.  
- If a file cannot be decoded with UTF-8, the script falls back to `latin-1` encoding to prevent crashes.

**Execution Flow**  
- The main loop iterates through all `LANG_DIRS`.  
- For each language folder, the script ensures that the destination directory exists.  
- The conversion runs and counts how many files were successfully transformed.  
- A log message prints the mapping for every file processed.  

**Completion Summary**  
After processing all directories, the script prints the total number of `.txt` files converted into `.html`. This ensures visibility into the batch operation and confirms that all language folders were handled consistently.

In [4]:
# CHANGE: List all source folders that contain .txt files.
# Each entry has the form "<contributor>/raw/<Language>"; the "raw" segment is
# the one that gets swapped for "html" when the output folder is derived.
LANG_DIRS = [
    "yna/raw/Cebuano",
    "yna/raw/Spanish",
    "yna/raw/Chavacano",
    "yna/raw/Tausug",
    "cj/raw/Ivatan",
    "cj/raw/Pangasinense",
    "cj/raw/Tagalog",
    "cj/raw/Yami",
    "trish/raw/Bikolano",
    "trish/raw/Ilokano",
    "trish/raw/Kalinga",
    "trish/raw/Kapampangan",
    "enz/raw/Isnag",
    "enz/raw/Kankanaey",
    "enz/raw/Sambal",
    "enz/raw/Yakan",
]

# Path segment marking the input tree, and the segment that replaces it in the
# mirrored output tree.
SRC_SEGMENT = "raw"   
DST_SEGMENT = "html"

In [5]:
# helper functions
def swap_segment(path: str, old: str, new: str) -> str:
    """Return ``path`` with its first ``old`` segment replaced by ``new``.

    The path is split on ``os.sep``. When no segment equals ``old``, the
    directory directly above the leaf is replaced instead, i.e. the result is
    ``<grandparent>/<new>/<leaf>``.
    """
    segments = path.split(os.sep)
    if old in segments:
        # Only the first occurrence is swapped.
        segments[segments.index(old)] = new
        return os.sep.join(segments)

    head, tail = os.path.split(path)
    grandparent = os.path.dirname(head)
    return os.path.join(grandparent, new, tail)

def convert_tree(src_root: str, dst_root: str) -> int:
    """Convert every ``.txt`` file under ``src_root`` to a prettified ``.html`` file.

    The directory layout below ``src_root`` is recreated below ``dst_root``.
    Each file is read as UTF-8 (falling back to latin-1 on a decode error),
    wrapped in ``<html>...</html>``, parsed with BeautifulSoup's
    ``html.parser`` and written back out as UTF-8.

    Returns:
        The number of ``.txt`` files converted.
    """
    converted = 0
    for current_dir, _subdirs, filenames in os.walk(src_root):
        relative = os.path.relpath(current_dir, src_root)
        out_dir = os.path.join(dst_root, relative) if relative != "." else dst_root
        os.makedirs(out_dir, exist_ok=True)

        txt_names = [fn for fn in filenames if fn.lower().endswith(".txt")]
        for fn in txt_names:
            source_path = os.path.join(current_dir, fn)
            stem, _ext = os.path.splitext(fn)
            dest_path = os.path.join(out_dir, stem + ".html")

            # Prefer UTF-8; latin-1 accepts any byte sequence, so it is the fallback.
            try:
                with open(source_path, "r", encoding="utf-8") as handle:
                    body = handle.read()
            except UnicodeDecodeError:
                with open(source_path, "r", encoding="latin-1") as handle:
                    body = handle.read()

            markup = "<html>\n" + body + "\n</html>"
            pretty = BeautifulSoup(markup, "html.parser").prettify()

            with open(dest_path, "w", encoding="utf-8") as handle:
                handle.write(pretty)

            converted += 1
            print(f"Converted {source_path} → {dest_path}")
    return converted

In [6]:
# main
# Running count of converted files across every language folder.
total = 0
for src in LANG_DIRS:
    # Mirror the source folder by swapping the SRC_SEGMENT path segment for DST_SEGMENT.
    dst = swap_segment(src, SRC_SEGMENT, DST_SEGMENT)
    os.makedirs(dst, exist_ok=True)
    print(f"\nFolder {src} → {dst}")
    # convert_tree returns how many .txt files it converted under this folder.
    total += convert_tree(src, dst)

print(f"\nDone. Converted {total} file(s).")


Folder yna/raw/Cebuano → yna/html/Cebuano
Converted yna/raw/Cebuano/ABCEB_PHP_raw.txt → yna/html/Cebuano/ABCEB_PHP_raw.html
Converted yna/raw/Cebuano/ABCEB_ZEP_raw.txt → yna/html/Cebuano/ABCEB_ZEP_raw.html
Converted yna/raw/Cebuano/ABCEB_2SA_raw.txt → yna/html/Cebuano/ABCEB_2SA_raw.html
Converted yna/raw/Cebuano/ABCEB_PSA_raw.txt → yna/html/Cebuano/ABCEB_PSA_raw.html
Converted yna/raw/Cebuano/ABCEB_HAB_raw.txt → yna/html/Cebuano/ABCEB_HAB_raw.html
Converted yna/raw/Cebuano/ABCEB_EZR_raw.txt → yna/html/Cebuano/ABCEB_EZR_raw.html
Converted yna/raw/Cebuano/ABCEB_PHM_raw.txt → yna/html/Cebuano/ABCEB_PHM_raw.html
Converted yna/raw/Cebuano/ABCEB_HOS_raw.txt → yna/html/Cebuano/ABCEB_HOS_raw.html
Converted yna/raw/Cebuano/ABCEB_JUD_raw.txt → yna/html/Cebuano/ABCEB_JUD_raw.html
Converted yna/raw/Cebuano/ABCEB_EST_raw.txt → yna/html/Cebuano/ABCEB_EST_raw.html
Converted yna/raw/Cebuano/ABCEB_1JN_raw.txt → yna/html/Cebuano/ABCEB_1JN_raw.html
Converted yna/raw/Cebuano/ABCEB_GAL_raw.txt → yna/html/

### Step 3: Parse HTML Files into Structured CSV and TXT Outputs
Once the `.html` files have been generated, the next stage is to parse them into structured tabular data for analysis. The script below handles this by reading verse spans from HTML, extracting metadata, and producing parallel `.csv` and `.txt` files for each language dataset. This stage converts the cleaned HTML corpus into normalized, language-specific `.csv` and `.txt` files, providing a standardized dataset.


**Configuration**
- `LANG_DIRS` lists all the language folders to be processed (e.g., Cebuano, Spanish, Chavacano, etc.).  
- `SRC_SEGMENT = "html"` identifies the location of the input files.  
- `DST_SEGMENT = "parsed"` defines the destination where outputs will be written.  

Helper functions (`to_src_root`, `to_dst_root`, and `replace_segment`) ensure that the folder structure is preserved, with only the path segment swapped (`html → parsed`).

**HTML Parsing**
- Each `.html` file is read and parsed using **BeautifulSoup** with the `lxml` parser (set via `_PARSER`; there is no fallback, so `lxml` must be installed).  
- Verses are identified by the CSS selector `span.verse[data-usfm]`.  
- Inside each verse:
  - Child spans such as labels or notes are removed.  
  - Remaining text fragments are normalized (whitespace collapsed, punctuation spacing fixed).  
  - Metadata (book, chapter, verse) is parsed from the `data-usfm` attribute.  
  - Additional fields such as language code (`iso6393`) and version ID (`vid`) are extracted from parent `<div class="version">`.

**Handling Tables** 
- Any `<table>` elements encountered in the HTML are intentionally discarded during parsing. This ensures that only verse text and metadata are retained, preventing structural markup or formatting artifacts from polluting the output dataset.

**DataFrame Assembly**
- Extracted rows are assembled into a **Pandas DataFrame** using `rows_to_df`.  
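- The `book`/`chapter`/`verse` columns are re-derived from the `usfm` string; for combined references such as `1CO.5.12+1CO.5.13`, the first reference is used.  
- Data is sorted by `book` (alphabetically by code), then by `chapter` and `verse`.  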

**Output Generation**
For each input `.html` file:
- A corresponding `.csv` file is written with structured columns (`usfm`, `book`, `chapter`, `verse`, `text`, `iso6393`, `vid`).  
- A `.txt` file is also written, containing verse-level text prefixed by its `usfm` code.  

**Logging and Error Handling**
- Empty outputs (no verses found) are tracked separately.  
- Errors during parsing are collected and summarized at the end.  
- A final summary prints:
  - Total files processed  
  - Number of empty outputs  
  - Number of errors 

In [8]:
# CHANGE: List all source folders that contain .html files.
# Entries containing a "sentence" or "raw" segment are also accepted; to_src_root
# rewrites that segment to SRC_SEGMENT before the folder is read.

LANG_DIRS = [
    "yna/html/Cebuano",
    "yna/html/Spanish",
    "yna/html/Chavacano",
    "yna/html/Tausug",
    "cj/html/Ivatan",
    "cj/html/Pangasinense",
    "cj/html/Tagalog",
    "cj/html/Yami",
    "trish/html/Bikolano",
    "trish/html/Ilokano",
    "trish/html/Kalinga",
    "trish/html/Kapampangan",
    "enz/html/Isnag",
    "enz/html/Kankanaey",
    "enz/html/Sambal",
    "enz/html/Yakan",
]

# Path segment of the input tree, and the segment that replaces it for outputs.
SRC_SEGMENT = "html"    
DST_SEGMENT = "parsed"   

# Parser name handed to BeautifulSoup in parse_usfm_html.
_PARSER = "lxml"

In [9]:
# helper functions

def replace_segment(path: str, old_seg: str, new_seg: str) -> str:
    """Return ``path`` with its first ``old_seg`` segment replaced by ``new_seg``.

    The path is split on ``os.sep``. If no segment equals ``old_seg``, the
    directory directly above the leaf is replaced instead
    (``<grandparent>/<new_seg>/<leaf>``); a bare leaf becomes
    ``<new_seg>/<leaf>``.
    """
    segments = path.split(os.sep)
    try:
        hit = segments.index(old_seg)
    except ValueError:
        hit = None

    if hit is not None:
        segments[hit] = new_seg
        return os.sep.join(segments)

    head, tail = os.path.split(path)
    if not head:
        return os.path.join(new_seg, tail)
    return os.path.join(os.path.dirname(head), new_seg, tail)

def to_src_root(lang_dir: str) -> str:
    """
    Map a LANG_DIR entry to the folder that holds its .html files.

    A 'sentence' segment (checked first) or a 'raw' segment is rewritten to
    SRC_SEGMENT; a path containing neither is returned unchanged.
    """
    segments = lang_dir.split(os.sep)
    for marker in ("sentence", "raw"):
        if marker in segments:
            return replace_segment(lang_dir, marker, SRC_SEGMENT)
    return lang_dir

def to_dst_root(src_root: str) -> str:
    """
    Derive the output root for ``src_root`` by swapping SRC_SEGMENT for DST_SEGMENT.

    When ``src_root`` has no SRC_SEGMENT segment, the directory directly above
    the leaf is replaced by DST_SEGMENT instead.
    """
    if SRC_SEGMENT not in src_root.split(os.sep):
        head, tail = os.path.split(src_root)
        return os.path.join(os.path.dirname(head), DST_SEGMENT, tail)
    return replace_segment(src_root, SRC_SEGMENT, DST_SEGMENT)

def _norm_space(s: str) -> str:
    s = re.sub(r"\s+", " ", s).strip()
    s = re.sub(r"\s+([,.;:?!])", r"\1", s)
    s = re.sub(r"\s+([”\)\]])", r"\1", s)
    return s

def parse_usfm_html(raw_html: str):
    """Extract one row per verse reference from an HTML document.

    Verses are the ``span.verse`` elements carrying a ``data-usfm`` attribute.
    Every ``<table>`` is removed first, and ``span.label`` / ``span.note``
    descendants are removed from each verse before its text is collected.
    Fragments sharing the same ``data-usfm`` value are joined, in document
    order, into a single row.

    Args:
        raw_html: HTML markup to parse (parsed with the module-level ``_PARSER``).

    Returns:
        A list of dicts with the keys ``usfm``, ``book``, ``chapter``,
        ``verse``, ``text``, ``iso6393`` and ``vid``, in order of first
        appearance. ``book``/``chapter``/``verse`` are all ``None`` unless the
        reference has exactly the form ``BOOK.<int>.<int>``. ``iso6393`` and
        ``vid`` come from the ``data-iso6393`` / ``data-vid`` attributes of the
        nearest enclosing ``div.version`` of the first element carrying that
        reference, or ``None`` when there is no such ancestor.
    """
    soup = BeautifulSoup(raw_html, _PARSER)

    # Tables are dropped wholesale so their cell text never reaches a verse.
    for tbl in soup.find_all("table"):
        tbl.decompose()

    parts_by_usfm = OrderedDict()
    # First element seen for each reference. Remembering it here avoids
    # re-querying the whole document once per verse (quadratic in the number of
    # verses) and avoids building a CSS selector from the raw attribute value.
    first_tag_by_usfm = {}

    for vtag in soup.select('span.verse[data-usfm]'):
        usfm = vtag.get("data-usfm")
        if not usfm:
            continue
        first_tag_by_usfm.setdefault(usfm, vtag)

        for t in vtag.select("span.label, span.note"):
            t.decompose()

        frag = " ".join(s.strip() for s in vtag.stripped_strings)
        frag = _norm_space(frag)
        if frag:
            parts_by_usfm.setdefault(usfm, []).append(frag)

    rows = []
    for usfm, frags in parts_by_usfm.items():
        text = _norm_space(" ".join(frags))

        # Assign all three fields together so a failed int() cannot leave a
        # half-converted mix of strings and ints behind.
        book = ch = ver = None
        pieces = usfm.split(".")
        if len(pieces) == 3:
            try:
                book, ch, ver = pieces[0], int(pieces[1]), int(pieces[2])
            except ValueError:
                book = ch = ver = None

        iso = vid = None
        ver_div = first_tag_by_usfm[usfm].find_parent("div", class_="version")
        if ver_div is not None:
            iso = ver_div.get("data-iso6393")
            vid = ver_div.get("data-vid")

        rows.append({
            "usfm": usfm,
            "book": book,
            "chapter": ch,
            "verse": ver,
            "text": text,
            "iso6393": iso,
            "vid": vid,
        })
    return rows

# def expand_multi_usfm_rows(rows):
#     """
#     If a row has usfm like '1CO.5.12+1CO.5.13', split into two rows
#     with identical text/metadata but separate usfm/book/chapter/verse.
#     """
#     out = []
#     for r in rows:
#         usfm = (r.get("usfm") or "").strip()
#         if "+" not in usfm:
#             out.append(r)
#             continue

#         tokens = [t.strip() for t in usfm.split("+") if t.strip()]

#         def split_parts(tok, fb=None):
#             parts = tok.split(".")
#             if len(parts) == 3:
#                 b, c, v = parts[0], int(parts[1]), int(parts[2])
#                 return b, c, v
#             if len(parts) == 1 and parts[0].isdigit() and fb:
#                 b, c = fb
#                 return b, c, int(parts[0])
#             return None, None, None

#         fb_b, fb_c, _ = split_parts(tokens[0], None)

#         for tok in tokens:
#             b, c, v = split_parts(tok, (fb_b, fb_c))
#             new = r.copy()
#             new["usfm"] = f"{b}.{c}.{v}" if b and c is not None and v is not None else tok
#             new["book"] = b
#             new["chapter"] = c
#             new["verse"] = v
#             out.append(new)
#     return out

def rows_to_df(rows):
    """Build a DataFrame from verse rows, ordered by book, chapter and verse.

    An empty input yields an empty frame that still has the columns ``usfm``,
    ``book``, ``chapter``, ``verse``, ``text``, ``iso6393`` and ``vid``.
    Otherwise ``book``/``chapter``/``verse`` are recomputed from each row's
    ``usfm`` value and the frame is stably sorted on those three columns with
    a fresh 0..n-1 index.
    """
    expected_cols = ("usfm", "book", "chapter", "verse", "text", "iso6393", "vid")
    frame = pd.DataFrame(rows)

    if frame.empty:
        for name in expected_cols:
            if name not in frame.columns:
                frame[name] = pd.Series(dtype="object")
        return frame

    def _token_bcv(token: str, fallback=None):
        """Parse 'BOOK.C.V', or a bare verse number using ``fallback`` (book, chapter)."""
        pieces = token.split(".")
        try:
            if len(pieces) == 3:
                return pieces[0], int(pieces[1]), int(pieces[2])
            if len(pieces) == 1 and pieces[0].isdigit() and fallback:
                fb_book, fb_chapter = fallback
                return fb_book, fb_chapter, int(pieces[0])
        except Exception:
            pass
        return None, None, None

    def _primary_bcv(usfm):
        """
        Return (book, chapter, first_verse) for a reference such as
        '1CO.5.12+1CO.5.13'. Later '+'-separated tokens are only consulted to
        fill in whatever the first token could not provide.
        """
        if not isinstance(usfm, str):
            return (None, None, None)

        tokens = [tok for tok in (piece.strip() for piece in usfm.split("+")) if tok]
        if not tokens:
            return (None, None, None)

        book, chapter, verse = _token_bcv(tokens[0])

        if len(tokens) > 1 and (book is None or chapter is None):
            for later in tokens[1:]:
                later_book, later_chapter, later_verse = _token_bcv(later, (book, chapter))
                if book is None:
                    book = later_book
                if chapter is None:
                    chapter = later_chapter
                if verse is None:
                    verse = later_verse

        return (book, chapter, verse)

    bcv = frame["usfm"].apply(lambda ref: pd.Series(_primary_bcv(ref)))
    frame[["book", "chapter", "verse"]] = bcv

    ordered = frame.sort_values(["book", "chapter", "verse"], kind="stable")
    return ordered.reset_index(drop=True)

def process_tree(src_root: str, dst_root: str):
    """Parse every ``.html`` under ``src_root`` into a ``.csv`` and ``.txt`` under ``dst_root``.

    The relative layout of ``src_root`` is mirrored under ``dst_root``. The CSV
    holds the frame returned by ``rows_to_df``; the TXT holds one
    ``"<usfm> <text>"`` line per row. A failure on one file is recorded and
    does not stop the walk.

    Returns:
        ``(processed, empties, errors)``: the number of ``.html`` files seen,
        how many produced an empty frame, and a list of
        ``(relative_path, message)`` tuples for files that raised.
    """
    processed, empties, errors = 0, 0, []

    if not os.path.isdir(src_root):
        print(f"[Skip] Source not found: {os.path.abspath(src_root)}")
        return processed, empties, errors

    os.makedirs(dst_root, exist_ok=True)

    for folder, _subdirs, filenames in os.walk(src_root):
        html_names = (n for n in filenames if n.lower().endswith(".html"))
        for html_name in html_names:
            # Counted before parsing, so files that later fail are included.
            processed += 1

            source_path = os.path.join(folder, html_name)
            rel = os.path.relpath(source_path, start=src_root)

            try:
                with io.open(source_path, "r", encoding="utf-8") as fh:
                    markup = fh.read()

                table = rows_to_df(parse_usfm_html(markup))

                stem = os.path.splitext(os.path.join(dst_root, rel))[0]
                os.makedirs(os.path.dirname(stem), exist_ok=True)
                csv_target = stem + ".csv"
                txt_target = stem + ".txt"

                table.to_csv(csv_target, index=False, encoding="utf-8")

                with io.open(txt_target, "w", encoding="utf-8") as out:
                    for _idx, record in table.iterrows():
                        ref = record.get("usfm")
                        body = record.get("text")
                        ref = "" if pd.isna(ref) else str(ref)
                        body = "" if pd.isna(body) else str(body)
                        out.write(f"{ref} {body}\n")

                if not table.empty:
                    print(f"Parsed {rel} → {os.path.basename(csv_target)}, {os.path.basename(txt_target)}")
                else:
                    empties += 1
                    print(f"Empty {rel}")

            except Exception as exc:
                errors.append((rel, str(exc)))
                print(f"Error {rel}: {exc}")

    return processed, empties, errors

In [10]:
#main

# Running totals across every language folder.
grand_total = 0
grand_empties = 0
grand_errors = []

for lang_dir in LANG_DIRS:
    # Resolve where the .html inputs live, then the mirrored output root.
    src_root = to_src_root(lang_dir)  
    dst_root = to_dst_root(src_root)  
    # process_tree returns (files seen, empty outputs, list of (path, message) errors).
    p, e, errs = process_tree(src_root, dst_root)
    grand_total  += p
    grand_empties += e
    grand_errors += errs

print("\nSummary")
print(f"Processed (all folders): {grand_total}")
print(f"Empty filesets         : {grand_empties}")
print(f"Errors                 : {len(grand_errors)}")
if grand_errors:
    # Only the first 10 errors are listed.
    for pth, msg in grand_errors[:10]:
        print(f" - {pth}: {msg}")

Parsed ABCEB_JON_raw.html → ABCEB_JON_raw.csv, ABCEB_JON_raw.txt
Parsed ABCEB_1CH_raw.html → ABCEB_1CH_raw.csv, ABCEB_1CH_raw.txt
Parsed ABCEB_EST_raw.html → ABCEB_EST_raw.csv, ABCEB_EST_raw.txt
Parsed ABCEB_MAL_raw.html → ABCEB_MAL_raw.csv, ABCEB_MAL_raw.txt
Parsed ABCEB_2KI_raw.html → ABCEB_2KI_raw.csv, ABCEB_2KI_raw.txt
Parsed ABCEB_MIC_raw.html → ABCEB_MIC_raw.csv, ABCEB_MIC_raw.txt
Parsed ABCEB_DAN_raw.html → ABCEB_DAN_raw.csv, ABCEB_DAN_raw.txt
Parsed ABCEB_HAG_raw.html → ABCEB_HAG_raw.csv, ABCEB_HAG_raw.txt
Parsed ABCEB_COL_raw.html → ABCEB_COL_raw.csv, ABCEB_COL_raw.txt
Parsed ABCEB_2JN_raw.html → ABCEB_2JN_raw.csv, ABCEB_2JN_raw.txt
Parsed ABCEB_HEB_raw.html → ABCEB_HEB_raw.csv, ABCEB_HEB_raw.txt
Parsed ABCEB_PHM_raw.html → ABCEB_PHM_raw.csv, ABCEB_PHM_raw.txt
Parsed ABCEB_AMO_raw.html → ABCEB_AMO_raw.csv, ABCEB_AMO_raw.txt
Parsed ABCEB_MAT_raw.html → ABCEB_MAT_raw.csv, ABCEB_MAT_raw.txt
Parsed ABCEB_EPH_raw.html → ABCEB_EPH_raw.csv, ABCEB_EPH_raw.txt
Parsed ABCEB_1SA_raw.html

## By sentence

### Step 1: Remove Duplicate Verse Entries

**Input Sources**  
   - Each language’s processed `.csv` files are located under the `parsed/` directory.  
   - The script reads these CSVs recursively, ensuring that subfolder structure is preserved.  

**Target Outputs**  
   - Cleaned `.txt` files are written into the corresponding `sentence/` directory for each language.  
   - The folder hierarchy is mirrored so that each input CSV has a matching output TXT.

**Robustness Features**  
   - CSVs are read with multiple encoding fallbacks (`utf-8`, `utf-8-sig`, `latin-1`).  
   - Bad lines are skipped to prevent failures on malformed files.  
   - Files without usable text columns are skipped gracefully with log messages.  

**Logging and Summary**  
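   - For each file, the script prints the number of lines written. Note that the code below does not currently drop duplicate lines, so this count can include repeated text.  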
   - At the end of processing, a summary reports:  
     - total CSVs processed  
     - total TXT files written  
     - skipped files (due to missing `text` column or read errors)

In [12]:
# CHANGE: List the output folders for this step (one per language).
# The input CSVs are read from the matching "parsed" folder, which is derived
# by swapping the "sentence" path segment (see derive_input_root_from_lang_dir).

LANG_DIRS = [
    "yna/sentence/Cebuano",
    "yna/sentence/Spanish",
    "yna/sentence/Chavacano",
    "yna/sentence/Tausug",
    "cj/sentence/Ivatan",
    "cj/sentence/Pangasinense",
    "cj/sentence/Tagalog",
    "cj/sentence/Yami",
    "trish/sentence/Bikolano",
    "trish/sentence/Ilokano",
    "trish/sentence/Kalinga",
    "trish/sentence/Kapampangan",
    "enz/sentence/Isnag",
    "enz/sentence/Kankanaey",
    "enz/sentence/Sambal",
    "enz/sentence/Yakan",
]

# If True, only columns named exactly "text" (case-insensitive, surrounding
# whitespace ignored) are used; if False, any column whose name contains "text".
EXACT_TEXT_COL_NAME_ONLY = True # if True, only columns named exactly "text" (case-insensitive) are used.


In [13]:
# helper functions
def read_csv_any_encoding(path: str) -> pd.DataFrame:
    """
    Read a CSV as strings, trying utf-8, utf-8-sig and latin-1 in turn.

    Malformed lines are skipped (``on_bad_lines="skip"``). Every attempt's
    failure is caught so the next encoding can be tried.

    Raises:
        RuntimeError: if every attempt fails. The last underlying exception is
            included in the message and chained as ``__cause__`` so the real
            reason (missing file, empty file, parser error, ...) is not lost.
    """
    last_error = None
    for enc in ("utf-8", "utf-8-sig", "latin-1"):
        try:
            return pd.read_csv(path, dtype=str, encoding=enc, on_bad_lines="skip")
        except Exception as exc:
            # Remember why this attempt failed before moving to the next encoding.
            last_error = exc
    raise RuntimeError(f"Failed to read CSV: {path} ({last_error!r})") from last_error

def replace_segment(path: str, old_seg: str, new_seg: str) -> str:
    """
    Replace the first path segment equal to old_seg (e.g. 'sentence' -> 'parsed').
    If no segment matches, the directory directly above the leaf is replaced
    instead, giving grandparent/NEW/leaf.
    """
    segments = path.split(os.sep)
    if old_seg not in segments:
        head, tail = os.path.split(path)
        return os.path.join(os.path.dirname(head), new_seg, tail)

    position = segments.index(old_seg)
    return os.sep.join(segments[:position] + [new_seg] + segments[position + 1:])

def derive_input_root_from_lang_dir(lang_dir: str) -> str:
    """
    Map an output root such as '.../sentence/Tagalog' to its input root
    '.../parsed/Tagalog'. Without a 'sentence' segment, the directory directly
    above the leaf is replaced by 'parsed' instead.
    """
    if "sentence" not in lang_dir.split(os.sep):
        head, tail = os.path.split(lang_dir)
        return os.path.join(os.path.dirname(head), "parsed", tail)
    return replace_segment(lang_dir, "sentence", "parsed")

def process_pair(input_root: str, output_root: str,
                 exact_text_only: bool = True,
                 dedupe: bool = False):
    """
    Write the text column(s) of every CSV under input_root to a TXT under output_root.

    The folder structure below input_root is mirrored below output_root. For
    each CSV the selected text columns are stripped, empty values are dropped,
    and the remaining values are written one per line.

    Args:
        input_root: folder searched recursively for ``*.csv``.
        output_root: folder that receives the ``.txt`` files.
        exact_text_only: if True, use only columns named exactly "text"
            (case-insensitive); otherwise any column whose name contains "text".
        dedupe: if True, drop repeated lines (keeping the first occurrence)
            before writing. Defaults to False, which writes every line.

    Returns:
        dict with the counters ``found``, ``processed``, ``written``,
        ``skipped_no_text_col``, ``skipped_read_error`` and
        ``skipped_empty_text``.
    """
    pattern = os.path.join(input_root, "**", "*.csv")
    # Sorted so that the processing order and the log are deterministic.
    csv_paths = sorted(glob.glob(pattern, recursive=True))

    print(f"Found {len(csv_paths)} CSV file(s).")

    processed = 0
    written = 0
    skipped_no_text_col = 0
    skipped_read_error = 0
    skipped_empty_text = 0

    for in_path in csv_paths:
        processed += 1

        try:
            df = read_csv_any_encoding(in_path)
        except Exception as e:
            print(f"Warning Read error; skipping: {in_path} | {e}")
            skipped_read_error += 1
            continue

        cols = list(df.columns)
        if exact_text_only:
            text_cols = [c for c in cols if c.strip().lower() == "text"]
        else:
            text_cols = [c for c in cols if "text" in c.strip().lower()]

        if not text_cols:
            print(f"Info No matching 'text' column in: {in_path}; skipping.")
            skipped_no_text_col += 1
            continue

        series_list = []
        for col in text_cols:
            s = df[col].dropna().astype(str).map(str.strip)
            s = s[s != ""]
            if not s.empty:
                series_list.append(s)

        if not series_list:
            print(f"Info No non-empty text in: {in_path}; skipping write.")
            # Counted so the summary accounts for every processed file.
            skipped_empty_text += 1
            continue

        combined = pd.concat(series_list, ignore_index=True)
        if dedupe:
            combined = combined.drop_duplicates(keep="first")

        rel_path = os.path.relpath(in_path, input_root)
        base_no_ext, _ = os.path.splitext(rel_path)
        out_path = os.path.join(output_root, base_no_ext + ".txt")

        out_dir = os.path.dirname(out_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

        with open(out_path, "w", encoding="utf-8", newline="\n") as f:
            f.writelines(line + "\n" for line in combined)

        written += 1
        # Only claim "unique" when duplicates were actually removed.
        label = "unique line(s)" if dedupe else "line(s)"
        print(f"Wrote {len(combined)} {label} → {out_path}")

    print("Summary")
    print(f"Processed              : {processed}")
    print(f"Wrote TXT files        : {written}")
    print(f"Skipped (no 'text' col): {skipped_no_text_col}")
    print(f"Skipped (read errors)  : {skipped_read_error}")
    print(f"Skipped (empty text)   : {skipped_empty_text}")

    return {
        "found": len(csv_paths),
        "processed": processed,
        "written": written,
        "skipped_no_text_col": skipped_no_text_col,
        "skipped_read_error": skipped_read_error,
        "skipped_empty_text": skipped_empty_text,
    }

In [14]:
# main
# Grand totals across languages.
# NOTE(review): only grand_csv is updated in the loop below; the other four
# accumulators are initialised but never changed in this cell, and none of the
# five is printed here.
grand_csv = 0
grand_proc = 0
grand_written = 0
grand_skip_text = 0
grand_skip_read = 0

for lang_dir in LANG_DIRS:
    # Inputs come from the matching "parsed" folder; outputs go to lang_dir itself.
    in_root  = derive_input_root_from_lang_dir(lang_dir) 
    out_root = lang_dir                                  

    if not os.path.isdir(in_root):
        print(f"\n[Skip] Input root not found: {os.path.abspath(in_root)}")
        continue

    os.makedirs(out_root, exist_ok=True)

   
    # Count the CSVs up front; process_pair runs the same glob again internally.
    pattern = os.path.join(in_root, "**", "*.csv")
    found = len(glob.glob(pattern, recursive=True))
    grand_csv += found

  
    process_pair(in_root, out_root, exact_text_only=EXACT_TEXT_COL_NAME_ONLY)

Found 66 CSV file(s).
Wrote 113 unique line(s) → yna/sentence/Cebuano/ABCEB_1TI_raw.txt
Wrote 1068 unique line(s) → yna/sentence/Cebuano/ABCEB_MAT_raw.txt
Wrote 406 unique line(s) → yna/sentence/Cebuano/ABCEB_NEH_raw.txt
Wrote 357 unique line(s) → yna/sentence/Cebuano/ABCEB_DAN_raw.txt
Wrote 1288 unique line(s) → yna/sentence/Cebuano/ABCEB_NUM_raw.txt
Wrote 673 unique line(s) → yna/sentence/Cebuano/ABCEB_MRK_raw.txt
Wrote 618 unique line(s) → yna/sentence/Cebuano/ABCEB_JDG_raw.txt
Wrote 48 unique line(s) → yna/sentence/Cebuano/ABCEB_JON_raw.txt
Wrote 1364 unique line(s) → yna/sentence/Cebuano/ABCEB_JER_raw.txt
Wrote 1533 unique line(s) → yna/sentence/Cebuano/ABCEB_GEN_raw.txt
Wrote 155 unique line(s) → yna/sentence/Cebuano/ABCEB_EPH_raw.txt
Wrote 222 unique line(s) → yna/sentence/Cebuano/ABCEB_ECC_raw.txt
Wrote 47 unique line(s) → yna/sentence/Cebuano/ABCEB_NAM_raw.txt
Wrote 1213 unique line(s) → yna/sentence/Cebuano/ABCEB_EXO_raw.txt
Wrote 15 unique line(s) → yna/sentence/Cebuano/ABCE

### Step 2: Clean - From Verses to Sentences
After removing duplicates in the previous step, we now split each verse into individual sentences, ensuring that every sentence is clean, normalized, and stored on its own line.  This step transforms the dataset from verse-level text into a clean, deduplicated, and sentence-aligned corpus, stored in both `.txt` and `.csv` formats. This sentence-level structure is now suitable for tasks like tokenization, alignment, and natural language processing.  

**Input and Output** 
- Deduplicated `.txt` files from the `sentence/` directories of each language.  
- Sentence-level `.txt` and `.csv` files stored under the `by_sentence/` directory, preserving the folder hierarchy.  

**Cleaning Operations** <br>
Before splitting into sentences, each verse undergoes a series of cleanup steps:  
- **Header removal:** Lines such as `ORIGINAL TEXT` are stripped using `HEADER_RE`.  
- **Verse markers:** References like `1CO.1.11` at the start of lines are removed with `VERSE_RE`.  
- **Unwanted characters:** Quotation marks, brackets, and other symbols (e.g., `“”‘’()[]«»`) are deleted.  
- **Whitespace normalization:** Newlines are flattened into spaces, and multiple spaces are collapsed.  

**Sentence Splitting** <br>
After cleaning, text is segmented into sentences using punctuation rules:  
- The regex `SENTENCE_SPLIT_RE` detects sentence boundaries after `.`, `!`, or `?`.  
- Each detected sentence is trimmed and filtered to drop empties.  
- Final output has one sentence per line.

**Conversion to CSV** 
- Every sentence `.txt` file is converted to a `.csv` with a single column named `sentence`.  
- This ensures consistency across languages and allows for easier downstream analysis.  

**Logging and Summary**
- For each language, the script prints how many `.txt` files were processed and where outputs were saved.  
- Every text-to-CSV conversion is logged with the source and destination file paths.  
- At the end, a summary reports the total number of files processed across all languages.  

In [15]:
# CHANGE: List all source folders that contain .txt files.
# Each folder is read recursively; its outputs are written under
# OUT_BASE/<same relative path>.

LANG_DIRS = [
    "yna/sentence/Cebuano",
    "yna/sentence/Spanish",
    "yna/sentence/Chavacano",
    "yna/sentence/Tausug",
    "cj/sentence/Ivatan",
    "cj/sentence/Pangasinense",
    "cj/sentence/Tagalog",
    "cj/sentence/Yami",
    "trish/sentence/Bikolano",
    "trish/sentence/Ilokano",
    "trish/sentence/Kalinga",
    "trish/sentence/Kapampangan",
    "enz/sentence/Isnag",
    "enz/sentence/Kankanaey",
    "enz/sentence/Sambal",
    "enz/sentence/Yakan",

]

# Root folder for the sentence-level outputs. os.path.join with a single
# argument returns it unchanged, so this is the normalised path "by_sentence".
OUT_BASE = os.path.normpath(os.path.join("by_sentence"))

In [16]:
# REGEXES
# "ORIGINAL TEXT" at the start of a line (case-insensitive, leading whitespace
# allowed), together with trailing whitespace and at most one newline.
HEADER_RE = re.compile(r'(?im)^\s*ORIGINAL\s+TEXT\s*\n?')
# verse markers like "1CO.1.11" at the start of lines: an optional digit 1-3,
# two or more uppercase ASCII letters, then ".<digits>.<digits>"
VERSE_RE  = re.compile(r'(?m)^\s*(?:[1-3]?[A-Z]{2,}\.\d+\.\d+)\s*')

# characters to delete entirely (curly quotes, parentheses, square brackets, guillemets)
STRIP_CHARS_RE = re.compile(r'[“”‘’()\[\]«»]')

# split sentences on ., !, ? (end the sentence right after the mark);
# the lookbehind keeps the punctuation attached to the preceding sentence
SENTENCE_SPLIT_RE = re.compile(r'(?<=[.!?])\s+')

In [17]:
# helper functions

def clean_and_split(text: str) -> str:
    """Strip headers, verse markers and unwanted characters, then split into sentences.

    Newlines are flattened to spaces and whitespace runs are collapsed before
    the text is split after '.', '!' or '?'. Returns the sentences joined with
    newlines (one per line), or "" when nothing is left after cleaning.
    """
    # header, verse markers and unwanted characters are all deleted outright
    for pattern in (HEADER_RE, VERSE_RE, STRIP_CHARS_RE):
        text = pattern.sub("", text)

    # newlines (with surrounding spaces/tabs) -> one space; then collapse runs
    flattened = re.sub(r'[ \t]*\n[ \t]*', ' ', text)
    flattened = re.sub(r'\s{2,}', ' ', flattened).strip()
    if flattened == "":
        return ""

    # split, trim each piece, and keep only the non-empty ones
    pieces = (chunk.strip() for chunk in SENTENCE_SPLIT_RE.split(flattened))
    return "\n".join(piece for piece in pieces if piece)

In [18]:
# main

# Number of files cleaned across all language folders.
total_processed = 0

for lang_dir in LANG_DIRS:
    # Outputs keep the full lang_dir path below OUT_BASE,
    # e.g. by_sentence/yna/sentence/Cebuano.
    in_root  = lang_dir                        
    out_root = os.path.join(OUT_BASE, lang_dir)  
    os.makedirs(out_root, exist_ok=True)

    pattern = os.path.join(in_root, "**", "*.txt")
    in_files = sorted(glob.glob(pattern, recursive=True))
    print(f"\n[{lang_dir}] Found {len(in_files)} .txt files under {os.path.abspath(in_root)}")
    if not in_files:
        continue

    processed = 0
    for in_path in in_files:
        rel_path = os.path.relpath(in_path, in_root)
        out_path = os.path.join(out_root, rel_path)
        os.makedirs(os.path.dirname(out_path), exist_ok=True)

        # errors="ignore" silently drops any bytes that are not valid UTF-8.
        with open(in_path, "r", encoding="utf-8", errors="ignore") as f:
            raw = f.read()

        # One sentence per line; an empty string if nothing survives cleaning.
        cleaned = clean_and_split(raw)

        with open(out_path, "w", encoding="utf-8") as f:
            f.write(cleaned)

        processed += 1
        total_processed += 1

    print(f"[{lang_dir}] Processed {processed} files. Output → {os.path.abspath(out_root)}")

print(f"\nAll done. Total processed: {total_processed} files.")


[yna/sentence/Cebuano] Found 66 .txt files under /Users/armina/Documents/GitHub/bible-dot-com-scraper/parser/yna/sentence/Cebuano
[yna/sentence/Cebuano] Processed 66 files. Output → /Users/armina/Documents/GitHub/bible-dot-com-scraper/parser/by_sentence/yna/sentence/Cebuano

[yna/sentence/Spanish] Found 66 .txt files under /Users/armina/Documents/GitHub/bible-dot-com-scraper/parser/yna/sentence/Spanish
[yna/sentence/Spanish] Processed 66 files. Output → /Users/armina/Documents/GitHub/bible-dot-com-scraper/parser/by_sentence/yna/sentence/Spanish

[yna/sentence/Chavacano] Found 27 .txt files under /Users/armina/Documents/GitHub/bible-dot-com-scraper/parser/yna/sentence/Chavacano
[yna/sentence/Chavacano] Processed 27 files. Output → /Users/armina/Documents/GitHub/bible-dot-com-scraper/parser/by_sentence/yna/sentence/Chavacano

[yna/sentence/Tausug] Found 27 .txt files under /Users/armina/Documents/GitHub/bible-dot-com-scraper/parser/yna/sentence/Tausug
[yna/sentence/Tausug] Processed 27 

In [19]:
# Folder that is walked recursively for .txt files to convert to .csv.
BASE_DIR = "by_sentence"   
OVERWRITE = True           # False: skip if CSV already exists
SKIP_BLANKS = False        # True: drop completely empty lines
TRIM_WHITESPACE = False    # True: strip leading/trailing spaces per line

#helpers
def read_txt_lines_any_encoding(path: str) -> List[str]:
    """
    Read a text file robustly, returning its lines without newline characters.

    The file is decoded as UTF-8 (with or without a byte-order mark); if that
    fails it is decoded as latin-1.
    """
    # "utf-8-sig" must be tried first: plain "utf-8" decodes a BOM successfully
    # as U+FEFF, which would then be left at the start of the first line.
    # "utf-8-sig" strips a BOM when present and otherwise behaves like "utf-8".
    for enc in ("utf-8-sig", "latin-1"):
        try:
            with open(path, "r", encoding=enc) as f:
                return f.read().splitlines()
        except UnicodeDecodeError:
            continue
    # latin-1 accepts every byte, so this is only a safety net.
    with open(path, "r", encoding="utf-8", errors="replace") as f:
        return f.read().splitlines()

def convert_txt_to_csv(txt_path: str, csv_path: str) -> int:
    """
    Convert a single .txt to .csv with header 'sentence', one row per line.

    Lines are stripped when TRIM_WHITESPACE is set and empty lines are dropped
    when SKIP_BLANKS is set. The parent folder of csv_path is created if
    needed.

    Returns the number of data rows written (excluding header).
    """
    lines = read_txt_lines_any_encoding(txt_path)

    if TRIM_WHITESPACE:
        lines = [ln.strip() for ln in lines]
    if SKIP_BLANKS:
        lines = [ln for ln in lines if ln != ""]

    # A bare filename has an empty dirname, and os.makedirs("") raises.
    out_dir = os.path.dirname(csv_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    with open(csv_path, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["sentence"])
        writer.writerows([ln] for ln in lines)
    return len(lines)

# main
# Counters: .txt files seen, .csv files written, and data rows written.
total_txt = 0
total_csv = 0
total_rows = 0

if not os.path.isdir(BASE_DIR):
    raise FileNotFoundError(f"Base directory not found: {BASE_DIR}")

for root, _, files in os.walk(BASE_DIR):
    for name in files:
        if not name.lower().endswith(".txt"):
            continue
        total_txt += 1
        txt_path = os.path.join(root, name)
        # The CSV is written next to its source .txt, with the same base name.
        csv_path = os.path.join(root, os.path.splitext(name)[0] + ".csv")

        if (not OVERWRITE) and os.path.exists(csv_path):
            print(f"SKIP (exists): {csv_path}")
            continue

        rows = convert_txt_to_csv(txt_path, csv_path)
        total_rows += rows
        total_csv += 1
        print(f"WROTE {rows:5d} rows → {csv_path}")

print("\nDone.")
print(f"TXT files found:   {total_txt}")
print(f"CSV files written: {total_csv}")
print(f"Total rows:        {total_rows}")

WROTE   217 rows → by_sentence/enz/sentence/Kankanaey/XNN_1JN_raw.csv
WROTE   304 rows → by_sentence/enz/sentence/Kankanaey/XNN_GAL_raw.csv
WROTE    55 rows → by_sentence/enz/sentence/Kankanaey/XNN_AMO_raw.csv
WROTE    77 rows → by_sentence/enz/sentence/Kankanaey/XNN_WIS_raw.csv
WROTE    53 rows → by_sentence/enz/sentence/Kankanaey/XNN_HAB_raw.csv
WROTE  3124 rows → by_sentence/enz/sentence/Kankanaey/XNN_PSA_raw.csv
WROTE    38 rows → by_sentence/enz/sentence/Kankanaey/XNN_PHM_raw.csv
WROTE    29 rows → by_sentence/enz/sentence/Kankanaey/XNN_HOS_raw.csv
WROTE     9 rows → by_sentence/enz/sentence/Kankanaey/XNN_JDT_raw.csv
WROTE    71 rows → by_sentence/enz/sentence/Kankanaey/XNN_JUD_raw.csv
WROTE    36 rows → by_sentence/enz/sentence/Kankanaey/XNN_2SA_raw.csv
WROTE   150 rows → by_sentence/enz/sentence/Kankanaey/XNN_PHP_raw.csv
WROTE    31 rows → by_sentence/enz/sentence/Kankanaey/XNN_ZEP_raw.csv
WROTE  1687 rows → by_sentence/enz/sentence/Kankanaey/XNN_ACT_raw.csv
WROTE   600 rows → b