## Remove duplicates and filter by line length

In [1]:
import os
import random
from pathlib import Path
from typing import Optional

In [2]:
# Corpus directory, resolved under the datasets root named by the
# GLOBAL_DATASETS_DIR environment variable (raises KeyError if it is unset).
DATA_ROOT = Path(os.environ["GLOBAL_DATASETS_DIR"]).joinpath("llm/mixed_text/out/")
print("Dedupe and filter corpus in ", DATA_ROOT)

Dedupe and filter corpus in  /home/pooja-saxena/PoojaVault/Professional/Workbench/Datasets/llm/mixed_text/out


In [3]:
# Path of the raw corpus file that the cells below read and deduplicate.
p = DATA_ROOT.joinpath("elephant_human_90_10_corpus.txt")
print("Reading", p)


Reading /home/pooja-saxena/PoojaVault/Professional/Workbench/Datasets/llm/mixed_text/out/elephant_human_90_10_corpus.txt


In [4]:
# Load the corpus: strip trailing whitespace from every line and drop lines
# that are blank. The `with` block closes the file handle as soon as reading
# finishes instead of leaving it open until garbage collection.
with p.open(encoding="utf-8") as corpus_file:
    lines = [row.rstrip() for row in corpus_file if row.strip()]
# dedupe (preserve order)
seen = set()
uniq = []

In [5]:
# Number of non-blank lines read from the corpus, before deduplication.
print(len(lines))

1522


In [6]:
# Order-preserving dedupe, rebuilt from `lines` on every run so the result
# does not depend on accumulator state left behind by another cell.
# dict.fromkeys keeps the first occurrence of each line, in original order.
uniq = list(dict.fromkeys(lines))
seen = set(uniq)  # kept so any cell that still reads `seen` keeps working

In [7]:
# extra filters: remove lines that are too short or too long
MIN_LINE_CHARS = 10   # inclusive lower bound on line length, in characters
MAX_LINE_CHARS = 500  # inclusive upper bound on line length, in characters
filtered = [row for row in uniq if MIN_LINE_CHARS <= len(row) <= MAX_LINE_CHARS]


In [8]:

# Persist the filtered lines, newline-separated with a trailing newline,
# then report how many lines survived each stage.
out_path = DATA_ROOT.joinpath("elephant_human_90_10_corpus.dedup.txt")
with out_path.open("w", encoding="utf-8") as sink:
    sink.write("\n".join(filtered))
    sink.write("\n")
print("Orig:", len(lines), "Unique:", len(uniq), "Filtered:", len(filtered))


Orig: 1522 Unique: 1482 Filtered: 1329


## Split into train, validation, and test sets

In [17]:
def _read_nonblank_lines(path: Path) -> list:
    """Return the lines of ``path`` with trailing whitespace removed, skipping blank lines."""
    with path.open(encoding="utf-8") as fh:
        return [row.rstrip() for row in fh if row.strip()]


def smart_split_corpus(
    text_file: Optional[Path] = None,
    seed: int = 42,
    splits=(0.98, 0.01, 0.01),
    out_dir: Optional[Path] = None,
):
    """Shuffle a one-sample-per-line corpus and write train/val/test files.

    Blank lines are dropped and trailing whitespace is stripped before the
    shuffle. The validation and test splits each get ``int(n * fraction)``
    lines, but never fewer than one; the training split receives every
    remaining line. ``splits[0]`` is therefore not read.

    Args:
        text_file: Corpus to split. When omitted, defaults to
            ``DATA_ROOT / "elephant_human_90_10_corpus.dedup.txt"``, resolved
            at call time so the function can be defined before the path
            variables exist.
        seed: Seed for the shuffle. A private ``random.Random(seed)`` is used,
            which yields the same permutation as seeding the module-level
            generator but leaves the global random state untouched.
        splits: ``(train, val, test)`` fractions; only the val and test
            entries are used.
        out_dir: Directory that receives ``train.txt``, ``val.txt`` and
            ``test.txt``. Defaults to the notebook-level ``DATA_ROOT``.

    Raises:
        ValueError: If the corpus has too few non-blank lines to leave at
            least one line for the training split.
    """
    # Resolve defaults lazily: a default evaluated at definition time would
    # fail on a fresh kernel where the path variables are assigned later.
    if out_dir is None:
        out_dir = DATA_ROOT
    if text_file is None:
        text_file = DATA_ROOT / "elephant_human_90_10_corpus.dedup.txt"
    out_dir = Path(out_dir)
    text_file = Path(text_file)

    corpus = _read_nonblank_lines(text_file)
    random.Random(seed).shuffle(corpus)

    n = len(corpus)
    n_val = max(1, int(n * splits[1]))
    n_test = max(1, int(n * splits[2]))
    n_train = n - n_val - n_test
    if n_train < 1:
        # Without this guard a tiny corpus silently produces an empty train.txt.
        raise ValueError(
            f"{text_file} has {n} non-blank line(s); val={n_val} and test={n_test} "
            "leave no lines for the training split"
        )

    partitions = {
        "train.txt": corpus[:n_train],
        "val.txt": corpus[n_train:n_train + n_val],
        "test.txt": corpus[n_train + n_val:],
    }
    for filename, rows in partitions.items():
        (out_dir / filename).write_text("\n".join(rows) + "\n", encoding="utf-8")

    train = partitions["train.txt"]
    print("Counts: train", len(train), "val", len(partitions["val.txt"]), "test", len(partitions["test.txt"]))
    print("Chars train:", sum(len(row) for row in train))

    # Re-read what was actually written so the report reflects the files on disk.
    for filename in partitions:
        rows = _read_nonblank_lines(out_dir / filename)
        chars = sum(len(row) for row in rows)
        words = sum(len(row.split()) for row in rows)
        print(filename, "lines", len(rows), "chars", chars, "words", words)


In [15]:
# Re-declare the corpus directory for the split section and point `src` at
# the deduplicated corpus file.
DATA_ROOT = Path(os.environ["GLOBAL_DATASETS_DIR"], "llm/mixed_text/out/")
src = DATA_ROOT.joinpath("elephant_human_90_10_corpus.dedup.txt")

In [18]:
# Split the deduplicated corpus using the default seed and split fractions.
smart_split_corpus(src)

Counts: train 1303 val 13 test 13
Chars train: 102113
train.txt lines 1303 chars 102113 words 16281
val.txt lines 13 chars 683 words 110
test.txt lines 13 chars 943 words 149


In [12]:
# Re-read each split from disk and report its size. The loop uses
# split-specific variable names so this cell does not overwrite the `p` and
# `lines` globals that the dedupe cells earlier in the notebook depend on.
for split_name in ["train.txt", "val.txt", "test.txt"]:
    split_path = DATA_ROOT / split_name
    with split_path.open(encoding="utf-8") as split_file:  # closes the handle promptly
        split_lines = [row.rstrip() for row in split_file if row.strip()]
    split_chars = sum(len(row) for row in split_lines)
    split_words = sum(len(row.split()) for row in split_lines)
    print(split_name, "lines", len(split_lines), "chars", split_chars, "words", split_words)

train.txt lines 1303 chars 101362 words 16167
val.txt lines 13 chars 1018 words 154
test.txt lines 13 chars 1359 words 219
