## Delete duplicates from verses 

(Needed because merged verses were split back into separate verse rows during parsing, which duplicated the verse text across those rows.)

In [53]:
import os
import glob
import pandas as pd
import re

In [54]:
# Root directory that is searched recursively for *.csv files.
INPUT_ROOT  = "cj/parsed"  
# Root directory for the de-duplicated .txt output; each input file's path
# relative to INPUT_ROOT is reused under this directory.
OUTPUT_ROOT = "cj/sentence"
# True  -> only columns named exactly "text" (case-insensitive, surrounding
#          whitespace ignored) are used.
# False -> any column whose name contains "text" is used.
EXACT_TEXT_COL_NAME_ONLY = True

In [55]:
def read_csv_any_encoding(path):
    """
    Read a CSV into a DataFrame, trying a few common encodings in turn.

    Encodings are tried in the order utf-8, utf-8-sig, latin-1 and the first
    successful read is returned. Every column is read as ``str`` and
    malformed rows are skipped (``on_bad_lines="skip"``) instead of aborting
    the read.

    Args:
        path: Path of the CSV file to read.

    Returns:
        The parsed ``pandas.DataFrame``.

    Raises:
        RuntimeError: If no encoding produced a successful read. The last
            underlying error is included in the message and chained as
            ``__cause__`` so the real reason (missing file, empty file, ...)
            is not lost.
    """
    last_error = None
    for enc in ("utf-8", "utf-8-sig", "latin-1"):
        try:
            return pd.read_csv(path, dtype=str, encoding=enc, on_bad_lines="skip")
        except Exception as err:
            # Remember why this attempt failed; the next encoding may still work.
            last_error = err
    raise RuntimeError(
        f"Failed to read CSV: {path} ({type(last_error).__name__}: {last_error})"
    ) from last_error

In [56]:
# Collect every CSV below INPUT_ROOT, at any depth.
pattern = os.path.join(INPUT_ROOT, "**", "*.csv")
csv_paths = glob.glob(pattern, recursive=True)

print(f"Found {len(csv_paths)} CSV file(s) under: {os.path.abspath(INPUT_ROOT)}")

# Run statistics, reported in the summary at the end.
processed = 0
written   = 0
skipped_no_text_col = 0
skipped_read_error  = 0
skipped_empty       = 0

for in_path in csv_paths:
    processed += 1

    try:
        df = read_csv_any_encoding(in_path)
    except Exception as e:
        print(f"[WARN] Read error; skipping: {in_path} | {e}")
        skipped_read_error += 1
        continue

    # Select the column(s) that hold the verse text.
    cols = list(df.columns)
    if EXACT_TEXT_COL_NAME_ONLY:
        text_cols = [c for c in cols if c.strip().lower() == "text"]
    else:
        text_cols = [c for c in cols if "text" in c.strip().lower()]

    if not text_cols:
        print(f"[INFO] No matching 'text' column in: {in_path}; skipping.")
        skipped_no_text_col += 1
        continue

    # One cleaned Series per text column: NaN dropped, values stripped,
    # empty strings removed.
    series_list = []
    for col in text_cols:
        s = df[col].dropna().astype(str).map(str.strip)
        series_list.append(s[s != ""])

    combined = pd.concat(series_list, ignore_index=True)

    # series_list always has one entry per text column, so emptiness has to
    # be checked on the combined values; otherwise an empty .txt is written.
    if combined.empty:
        print(f"[INFO] No non-empty text in: {in_path}; skipping write.")
        skipped_empty += 1
        continue

    # Keep the first occurrence of every distinct line, in original order.
    uniq = combined.drop_duplicates(keep="first")

    # Reuse the input's path relative to INPUT_ROOT under OUTPUT_ROOT,
    # with the extension swapped to .txt.
    rel_path = os.path.relpath(in_path, INPUT_ROOT)
    base_no_ext, _ = os.path.splitext(rel_path)
    out_path = os.path.join(OUTPUT_ROOT, base_no_ext + ".txt")

    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    with open(out_path, "w", encoding="utf-8", newline="\n") as f:
        f.writelines(f"{line}\n" for line in uniq)

    written += 1
    print(f"Wrote {len(uniq)} unique line(s) -> {out_path}")

print("\n==== Summary ====")
print(f"CSV files found        : {len(csv_paths)}")
print(f"Processed              : {processed}")
print(f"Wrote TXT files        : {written}")
print(f"Skipped (no 'text' col): {skipped_no_text_col}")
print(f"Skipped (read errors)  : {skipped_read_error}")
print(f"Skipped (empty text)   : {skipped_empty}")

Found 210 CSV file(s) under: /Users/armina/Documents/GitHub/bible-dot-com-scraper/parser/cj/parsed
Wrote 104 unique line(s) -> cj/sentence/Yami/SNT_PHP_raw.txt
Wrote 25 unique line(s) -> cj/sentence/Yami/SNT_JUD_raw.txt
Wrote 25 unique line(s) -> cj/sentence/Yami/SNT_PHM_raw.txt
Wrote 148 unique line(s) -> cj/sentence/Yami/SNT_GAL_raw.txt
Wrote 105 unique line(s) -> cj/sentence/Yami/SNT_1JN_raw.txt
Wrote 83 unique line(s) -> cj/sentence/Yami/SNT_2TI_raw.txt
Wrote 403 unique line(s) -> cj/sentence/Yami/SNT_REV_raw.txt
Wrote 95 unique line(s) -> cj/sentence/Yami/SNT_COL_raw.txt
Wrote 46 unique line(s) -> cj/sentence/Yami/SNT_2TH_raw.txt
Wrote 256 unique line(s) -> cj/sentence/Yami/SNT_2CO_raw.txt
Wrote 46 unique line(s) -> cj/sentence/Yami/SNT_TIT_raw.txt
Wrote 61 unique line(s) -> cj/sentence/Yami/SNT_2PE_raw.txt
Wrote 107 unique line(s) -> cj/sentence/Yami/SNT_JAS_raw.txt
Wrote 997 unique line(s) -> cj/sentence/Yami/SNT_ACT_raw.txt
Wrote 152 unique line(s) -> cj/sentence/Yami/SNT_EPH_r

## Cleaning

In [None]:
import os
import re
import glob

print("CWD:", os.getcwd())

# Language folders grouped by the top-level directory they sit under.
# Add more entries here to process additional languages.
_LANGUAGES_BY_ROOT = {
    "yna": ("Cebuano", "Spanish", "Chavacano", "Tausug"),
    "cj": ("Ivatan", "Pangasinense", "Tagalog", "Yami"),
    "trish": ("Bikolano", "Ilokano", "Kalinga", "Kapampangan"),
}

# Input directories, each of the form "<root>/sentence/<language>".
LANG_DIRS = [
    f"{root}/sentence/{language}"
    for root, languages in _LANGUAGES_BY_ROOT.items()
    for language in languages
]

# Base directory under which the cleaned output is written.
OUT_BASE = os.path.normpath("by_sentence")

# REGEXES
# "ORIGINAL TEXT" header line (case-insensitive), including its line break
HEADER_RE = re.compile(r'(?im)^\s*ORIGINAL\s+TEXT\s*\n?')
# verse markers like "1CO.1.11" at the start of lines
VERSE_RE  = re.compile(r'(?m)^\s*(?:[1-3]?[A-Z]{2,}\.\d+\.\d+)\s*')

# characters to delete entirely
STRIP_CHARS_RE = re.compile(r'[“”‘’()\[\]«»]')

# split sentences on ., !, ? (end the sentence right after the mark)
SENTENCE_SPLIT_RE = re.compile(r'(?<=[.!?])\s+')

# any run of whitespace (spaces, tabs, newlines, NBSP, ...)
WHITESPACE_RE = re.compile(r'\s+')

def clean_and_split(text: str) -> str:
    """Clean raw verse text and return it with one sentence per line.

    Steps, in order: remove the "ORIGINAL TEXT" header and line-leading verse
    markers, delete the characters matched by STRIP_CHARS_RE, collapse every
    run of whitespace to a single space, then split after '.', '!' or '?'
    when followed by whitespace.

    Args:
        text: Raw file content.

    Returns:
        The sentences joined with newline characters (no trailing newline),
        or "" when nothing is left after cleaning.
    """
    # remove header + verse markers
    text = HEADER_RE.sub("", text)
    text = VERSE_RE.sub("", text)

    # delete specific characters
    text = STRIP_CHARS_RE.sub("", text)

    # Collapse ALL whitespace runs, including a lone tab or non-breaking
    # space, so no stray whitespace characters survive inside a sentence.
    text = WHITESPACE_RE.sub(" ", text).strip()

    if not text:
        return ""

    # split into sentences; trim each and drop empties
    pieces = (piece.strip() for piece in SENTENCE_SPLIT_RE.split(text))
    return "\n".join(piece for piece in pieces if piece)

# Number of files cleaned across all language directories.
total_processed = 0

for lang_dir in LANG_DIRS:
    # Output goes to OUT_BASE/<lang_dir>, keeping each file's relative path.
    in_root = lang_dir
    out_root = os.path.join(OUT_BASE, lang_dir)
    os.makedirs(out_root, exist_ok=True)

    in_files = sorted(
        glob.glob(os.path.join(in_root, "**", "*.txt"), recursive=True)
    )
    print(f"\n[{lang_dir}] Found {len(in_files)} .txt files under {os.path.abspath(in_root)}")

    if in_files:
        processed = 0
        for in_path in in_files:
            out_path = os.path.join(out_root, os.path.relpath(in_path, in_root))
            os.makedirs(os.path.dirname(out_path), exist_ok=True)

            # Bytes that are not valid UTF-8 are dropped while reading.
            with open(in_path, "r", encoding="utf-8", errors="ignore") as src:
                raw = src.read()

            cleaned = clean_and_split(raw)

            with open(out_path, "w", encoding="utf-8") as dst:
                dst.write(cleaned)

            processed += 1
            total_processed += 1

        print(f"[{lang_dir}] Processed {processed} files. Output -> {os.path.abspath(out_root)}")

print(f"\nAll done. Total processed: {total_processed} files.")

CWD: /Users/armina/Documents/GitHub/bible-dot-com-scraper/parser

[yna/sentence/Cebuano] Found 66 .txt files under /Users/armina/Documents/GitHub/bible-dot-com-scraper/parser/yna/sentence/Cebuano
[yna/sentence/Cebuano] Processed 66 files. Output -> /Users/armina/Documents/GitHub/bible-dot-com-scraper/parser/by_sentence/yna/sentence/Cebuano

[yna/sentence/Spanish] Found 66 .txt files under /Users/armina/Documents/GitHub/bible-dot-com-scraper/parser/yna/sentence/Spanish
[yna/sentence/Spanish] Processed 66 files. Output -> /Users/armina/Documents/GitHub/bible-dot-com-scraper/parser/by_sentence/yna/sentence/Spanish

[yna/sentence/Chavacano] Found 27 .txt files under /Users/armina/Documents/GitHub/bible-dot-com-scraper/parser/yna/sentence/Chavacano
[yna/sentence/Chavacano] Processed 27 files. Output -> /Users/armina/Documents/GitHub/bible-dot-com-scraper/parser/by_sentence/yna/sentence/Chavacano

[yna/sentence/Tausug] Found 27 .txt files under /Users/armina/Documents/GitHub/bible-dot-com-sc

In [None]:
import os
import csv

# Tree that is walked for sentence-per-line .txt files.
IN_ROOT = "by_sentence"

# Output root folder (same tree: each CSV is written beside its .txt source).
OUT_ROOT = "by_sentence"
os.makedirs(OUT_ROOT, exist_ok=True)

for root, _subdirs, names in os.walk(IN_ROOT):
    for name in names:
        if not name.endswith(".txt"):
            continue

        in_path = os.path.join(root, name)

        # Recreate the directory's position relative to IN_ROOT under OUT_ROOT.
        out_dir = os.path.join(OUT_ROOT, os.path.relpath(root, IN_ROOT))
        os.makedirs(out_dir, exist_ok=True)

        stem, _ext = os.path.splitext(name)
        out_path = os.path.join(out_dir, stem + ".csv")

        with open(in_path, "r", encoding="utf-8") as infile, \
             open(out_path, "w", newline="", encoding="utf-8") as outfile:

            writer = csv.writer(outfile)

            # Single-column CSV: header row, then one row per non-blank line.
            writer.writerow(["sentence"])
            stripped_lines = (raw_line.strip() for raw_line in infile)
            writer.writerows([text] for text in stripped_lines if text)

        print(f"Converted: {in_path} → {out_path}")

Converted: by_sentence/yna/sentence/Chavacano/CBKNT_PHM_raw.txt → by_sentence/yna/sentence/Chavacano/CBKNT_PHM_raw.csv
Converted: by_sentence/yna/sentence/Chavacano/CBKNT_JUD_raw.txt → by_sentence/yna/sentence/Chavacano/CBKNT_JUD_raw.csv
Converted: by_sentence/yna/sentence/Chavacano/CBKNT_1JN_raw.txt → by_sentence/yna/sentence/Chavacano/CBKNT_1JN_raw.csv
Converted: by_sentence/yna/sentence/Chavacano/CBKNT_GAL_raw.txt → by_sentence/yna/sentence/Chavacano/CBKNT_GAL_raw.csv
Converted: by_sentence/yna/sentence/Chavacano/CBKNT_PHP_raw.txt → by_sentence/yna/sentence/Chavacano/CBKNT_PHP_raw.csv
Converted: by_sentence/yna/sentence/Chavacano/CBKNT_JAS_raw.txt → by_sentence/yna/sentence/Chavacano/CBKNT_JAS_raw.csv
Converted: by_sentence/yna/sentence/Chavacano/CBKNT_2PE_raw.txt → by_sentence/yna/sentence/Chavacano/CBKNT_2PE_raw.csv
Converted: by_sentence/yna/sentence/Chavacano/CBKNT_TIT_raw.txt → by_sentence/yna/sentence/Chavacano/CBKNT_TIT_raw.csv
Converted: by_sentence/yna/sentence/Chavacano/CB