# Data Cleaner

### Download the raw files and place them into your folder.

CJ's scraper produces the raw text files. Put them in your `/cleaning/yourname/raw` folder, organized into one subfolder per language. See the `/cleaning/yna/raw` folder for an example.

### Installations

In [39]:
# %pip install bs4
# %pip install lxml

### Imports

In [40]:
import re
from collections import OrderedDict
from bs4 import BeautifulSoup
import lxml 
import os
import pandas as pd
import io

### Convert txt files to html files

In [41]:
# IMPORTANT: point both folders at your own directories before running.
source_parent = "cj/raw"          # parent folder that holds the original .txt files
output_parent = "cj/html"         # parent folder that receives the generated .html files

os.makedirs(output_parent, exist_ok=True)

# Mirror the source tree: each .txt file becomes an .html file at the same
# relative location under output_parent.
for root, dirs, files in os.walk(source_parent):
    txt_names = [name for name in files if name.endswith(".txt")]
    if not txt_names:
        # Folders without .txt files get no counterpart in the output tree.
        continue

    target_dir = os.path.join(output_parent, os.path.relpath(root, source_parent))
    os.makedirs(target_dir, exist_ok=True)

    for file in txt_names:
        txt_path = os.path.join(root, file)
        stem, _ext = os.path.splitext(file)
        html_path = os.path.join(target_dir, stem + ".html")

        with open(txt_path, "r", encoding="utf-8") as src:
            # Wrap the raw markup in a single <html> root before parsing.
            wrapped = "<html>\n" + src.read() + "\n</html>"

        pretty_html = BeautifulSoup(wrapped, "html.parser").prettify()

        with open(html_path, "w", encoding="utf-8") as dst:
            dst.write(pretty_html)

        print(f"Converted {txt_path} → {html_path}")

Converted cj/raw/Yami/SNT_MRK_raw.txt → cj/html/Yami/SNT_MRK_raw.html
Converted cj/raw/Yami/SNT_MAT_raw.txt → cj/html/Yami/SNT_MAT_raw.html
Converted cj/raw/Yami/SNT_1TI_raw.txt → cj/html/Yami/SNT_1TI_raw.html
Converted cj/raw/Yami/SNT_EPH_raw.txt → cj/html/Yami/SNT_EPH_raw.html
Converted cj/raw/Yami/SNT_LUK_raw.txt → cj/html/Yami/SNT_LUK_raw.html
Converted cj/raw/Yami/SNT_3JN_raw.txt → cj/html/Yami/SNT_3JN_raw.html
Converted cj/raw/Yami/SNT_1PE_raw.txt → cj/html/Yami/SNT_1PE_raw.html
Converted cj/raw/Yami/SNT_1TH_raw.txt → cj/html/Yami/SNT_1TH_raw.html
Converted cj/raw/Yami/SNT_1CO_raw.txt → cj/html/Yami/SNT_1CO_raw.html
Converted cj/raw/Yami/SNT_ROM_raw.txt → cj/html/Yami/SNT_ROM_raw.html
Converted cj/raw/Yami/SNT_2JN_raw.txt → cj/html/Yami/SNT_2JN_raw.html
Converted cj/raw/Yami/SNT_HEB_raw.txt → cj/html/Yami/SNT_HEB_raw.html
Converted cj/raw/Yami/SNT_JHN_raw.txt → cj/html/Yami/SNT_JHN_raw.html
Converted cj/raw/Yami/SNT_PHP_raw.txt → cj/html/Yami/SNT_PHP_raw.html
Converted cj/raw/Yam

### HTML Parser

In [42]:
# Folders for the parsing stage. source_parent is walked for .html files and
# output_parent receives the parsed results; the output folder is created up
# front so later writes do not fail on a missing directory.
source_parent = "cj/html"         # IMPORTANT to change: folder containing the .html files
output_parent = "cj/parsed"      # IMPORTANT to change: folder for the parsed output

os.makedirs(output_parent, exist_ok=True)

In [43]:
# Choose the BeautifulSoup parser backend: "lxml" when the package can be
# imported, otherwise the standard library's "html.parser".
try:
    import lxml  # noqa: F401
except ImportError:
    # Only an unavailable lxml triggers the fallback; any other failure is a
    # real problem and is allowed to surface instead of being masked.
    _PARSER = "html.parser"
else:
    _PARSER = "lxml"

def _norm_space(s: str) -> str:
    """Normalize whitespace in *s*.

    Runs of whitespace collapse to a single space and the ends are trimmed;
    then any whitespace directly before ``, . ; : ? !`` or before a closing
    ``”``, ``)`` or ``]`` is removed so the mark attaches to the preceding
    text.
    """
    cleaned = re.sub(r"\s+", " ", s).strip()
    # Apply the two attachment rules in order: punctuation first, then
    # closing quote/brackets.
    for pattern in (r"\s+([,.;:?!])", r"\s+([”\)\]])"):
        cleaned = re.sub(pattern, r"\1", cleaned)
    return cleaned

def parse_usfm_html(raw_html: str):
    """Extract one row per USFM reference from an HTML document.

    Every ``span.verse`` element that carries a non-empty ``data-usfm``
    attribute contributes its visible text, after its ``span.label`` and
    ``span.note`` descendants have been removed.  Fragments that share the
    same ``data-usfm`` value are joined with a single space in document
    order and whitespace-normalized with ``_norm_space``.

    Args:
        raw_html: HTML markup to parse.

    Returns:
        A list of dicts, one per ``data-usfm`` value that produced text, in
        the order each value first produced text.  Keys:

        * ``usfm``: the raw ``data-usfm`` value.
        * ``book``, ``chapter``, ``verse``: parsed from a ``BOOK.int.int``
          reference; all three are ``None`` when the value does not have
          exactly that shape (for example a ``+``-joined reference).
        * ``text``: the joined verse text.
        * ``iso6393``, ``vid``: the ``data-iso6393`` / ``data-vid``
          attributes of the nearest enclosing ``div.version`` of the first
          verse span with that reference, or ``None`` when absent.
    """
    soup = BeautifulSoup(raw_html, _PARSER)
    parts_by_usfm = OrderedDict()
    # First verse span seen for each reference.  Remembering it here avoids
    # re-querying the whole document once per reference later, and avoids
    # interpolating the raw attribute value into a CSS selector.
    first_tag_by_usfm = {}

    for vtag in soup.select('span.verse[data-usfm]'):
        usfm = vtag.get("data-usfm")
        if not usfm:
            continue

        # Recorded before the empty-text check so the metadata source is the
        # first span with this reference, even if that span has no text.
        first_tag_by_usfm.setdefault(usfm, vtag)

        # Drop verse-number labels and footnotes so only verse text remains.
        for t in vtag.select("span.label, span.note"):
            t.decompose()

        frag = _norm_space(" ".join(vtag.stripped_strings))
        if frag:
            parts_by_usfm.setdefault(usfm, []).append(frag)

    rows = []
    for usfm, frags in parts_by_usfm.items():
        text = _norm_space(" ".join(frags))

        book = ch = ver = None
        pieces = usfm.split(".")
        if len(pieces) == 3:
            try:
                # The right-hand side is evaluated in full before anything is
                # bound, so a non-numeric part leaves all three fields None
                # rather than a mix of parsed and unparsed values.
                book, ch, ver = pieces[0], int(pieces[1]), int(pieces[2])
            except ValueError:
                pass  # not a plain BOOK.chapter.verse reference

        iso = vid = None
        ver_div = first_tag_by_usfm[usfm].find_parent("div", class_="version")
        if ver_div is not None:
            iso = ver_div.get("data-iso6393")
            vid = ver_div.get("data-vid")

        rows.append({
            "usfm": usfm,
            "book": book,
            "chapter": ch,
            "verse": ver,
            "text": text,
            "iso6393": iso,
            "vid": vid,
        })
    return rows

def _split_usfm_token(tok, fallback=None):
    """Parse one USFM token into a ``(book, chapter, verse)`` tuple.

    ``BOOK.chapter.verse`` is parsed directly.  A bare verse number borrows
    book and chapter from *fallback*, a ``(book, chapter)`` pair.  Anything
    else, including non-numeric chapter or verse parts, yields
    ``(None, None, None)``.
    """
    parts = tok.split(".")
    try:
        if len(parts) == 3:
            return parts[0], int(parts[1]), int(parts[2])
        if len(parts) == 1 and parts[0].isdigit() and fallback:
            book, chapter = fallback
            return book, chapter, int(parts[0])
    except ValueError:
        # Non-numeric chapter/verse: report as unparseable instead of
        # aborting the whole expansion.
        return None, None, None
    return None, None, None


def expand_multi_usfm_rows(rows):
    """Split rows whose usfm joins several references with ``+``.

    A row with usfm like '1CO.5.12+1CO.5.13' becomes one row per reference,
    each a shallow copy with identical text/metadata but its own
    usfm/book/chapter/verse.  Rows without ``+`` are passed through as the
    same objects, and input rows are never modified.

    Args:
        rows: iterable of dict rows with a ``usfm`` key.

    Returns:
        A new list of rows in input order, with multi-reference rows
        expanded in place.  A token that cannot be parsed keeps its raw text
        as ``usfm`` and gets ``None`` for book, chapter and verse.
    """
    out = []
    for r in rows:
        usfm = (r.get("usfm") or "").strip()
        if "+" not in usfm:
            out.append(r)
            continue

        tokens = [t.strip() for t in usfm.split("+") if t.strip()]
        if not tokens:
            # usfm consisted only of '+' separators; there is nothing to
            # split, so keep the row as it is.
            out.append(r)
            continue

        # Book and chapter of the first token serve as the fallback for any
        # later token that is just a verse number.
        fb_b, fb_c, _ = _split_usfm_token(tokens[0])

        for tok in tokens:
            b, c, v = _split_usfm_token(tok, (fb_b, fb_c))
            new = r.copy()
            new["usfm"] = f"{b}.{c}.{v}" if b and c is not None and v is not None else tok
            new["book"] = b
            new["chapter"] = c
            new["verse"] = v
            out.append(new)
    return out

def rows_to_df(rows):
    """Build a DataFrame from parsed verse rows, sorted by book, chapter, verse.

    An empty result is returned with the expected columns present (added as
    empty object-dtype columns where missing).  If any book/chapter/verse
    value is null, those three columns are re-derived from ``usfm`` for
    every row; references that are not ``BOOK.int.int`` become nulls.  The
    sort is stable and the index is reset to 0..n-1.
    """
    frame = pd.DataFrame(rows)

    if frame.empty:
        # Guarantee the schema even when nothing was parsed.
        expected = ("usfm", "book", "chapter", "verse", "text", "iso6393", "vid")
        for name in expected:
            if name not in frame.columns:
                frame[name] = pd.Series(dtype="object")
        return frame

    ref_cols = ["book", "chapter", "verse"]

    def _reparse(ref):
        # Best-effort split of a usfm value; any failure yields an all-null triple.
        try:
            book, chapter, verse = ref.split(".")
            return pd.Series([book, int(chapter), int(verse)])
        except Exception:
            return pd.Series([None, None, None])

    has_missing = frame[ref_cols].isnull().any().any()
    if has_missing:
        frame[ref_cols] = frame["usfm"].apply(_reparse)

    ordered = frame.sort_values(ref_cols, kind="stable")
    return ordered.reset_index(drop=True)

In [44]:
# Batch driver: parse every .html file under source_parent and write a .csv
# and a .txt next to each other at the mirrored location under output_parent.
if not os.path.isdir(source_parent):
    raise FileNotFoundError(f"source_parent not found: {os.path.abspath(source_parent)}")

processed = 0
empties = 0
errors = []

for root, dirs, files in os.walk(source_parent):
    html_names = [name for name in files if name.lower().endswith(".html")]
    for fname in html_names:
        # Counted before parsing, so failed files are included in the total.
        processed += 1

        in_path = os.path.join(root, fname)
        rel = os.path.relpath(in_path, start=source_parent)

        try:
            with io.open(in_path, "r", encoding="utf-8") as src:
                raw_html = src.read()

            # Split 1CO.5.12+1CO.5.13 → two rows before building the frame.
            rows = expand_multi_usfm_rows(parse_usfm_html(raw_html))
            df = rows_to_df(rows)

            out_base, _ext = os.path.splitext(os.path.join(output_parent, rel))
            os.makedirs(os.path.dirname(out_base), exist_ok=True)

            out_csv = f"{out_base}.csv"
            out_txt = f"{out_base}.txt"

            df.to_csv(out_csv, index=False, encoding="utf-8")

            # Plain-text companion: one "<usfm> <text>" line per row.
            with io.open(out_txt, "w", encoding="utf-8") as dst:
                dst.writelines(f"{r['usfm']} {r['text']}\n" for _, r in df.iterrows())

            if not df.empty:
                print(f"Parsed {rel} -> {os.path.basename(out_csv)}, {os.path.basename(out_txt)}")
            else:
                empties += 1
                print(f"Empty {rel}")

        except Exception as e:
            # Keep going: one bad file is recorded and reported, not fatal.
            errors.append((rel, str(e)))
            print(f"Error {rel}: {e}")

print("\n--- Summary ---")
print(f"Processed: {processed}")
print(f"Empty     : {empties}")
print(f"Errors    : {len(errors)}")
# Only the first five failures are listed.
for pth, msg in errors[:5]:
    print(f" - {pth}: {msg}")

Parsed Yami/SNT_LUK_raw.html -> SNT_LUK_raw.csv, SNT_LUK_raw.txt
Parsed Yami/SNT_3JN_raw.html -> SNT_3JN_raw.csv, SNT_3JN_raw.txt
Parsed Yami/SNT_JUD_raw.html -> SNT_JUD_raw.csv, SNT_JUD_raw.txt
Parsed Yami/SNT_1TI_raw.html -> SNT_1TI_raw.csv, SNT_1TI_raw.txt
Parsed Yami/SNT_1TH_raw.html -> SNT_1TH_raw.csv, SNT_1TH_raw.txt
Parsed Yami/SNT_2CO_raw.html -> SNT_2CO_raw.csv, SNT_2CO_raw.txt
Parsed Yami/SNT_2PE_raw.html -> SNT_2PE_raw.csv, SNT_2PE_raw.txt
Parsed Yami/SNT_1JN_raw.html -> SNT_1JN_raw.csv, SNT_1JN_raw.txt
Parsed Yami/SNT_PHP_raw.html -> SNT_PHP_raw.csv, SNT_PHP_raw.txt
Parsed Yami/SNT_TIT_raw.html -> SNT_TIT_raw.csv, SNT_TIT_raw.txt
Parsed Yami/SNT_REV_raw.html -> SNT_REV_raw.csv, SNT_REV_raw.txt
Parsed Yami/SNT_COL_raw.html -> SNT_COL_raw.csv, SNT_COL_raw.txt
Parsed Yami/SNT_2JN_raw.html -> SNT_2JN_raw.csv, SNT_2JN_raw.txt
Parsed Yami/SNT_MRK_raw.html -> SNT_MRK_raw.csv, SNT_MRK_raw.txt
Parsed Yami/SNT_1PE_raw.html -> SNT_1PE_raw.csv, SNT_1PE_raw.txt
Parsed Yami/SNT_JAS_raw.h