In [2]:
import os
import re
import json
import hashlib
import unicodedata
from tqdm import tqdm

# === CONFIGURATION ===
INPUT_DIR = "/home/blu-bridge004/Desktop/pytorch/preprocessed06_10"           # Folder with input .jsonl files (1 doc per line)
OUTPUT_DIR = "/home/blu-bridge004/Desktop/pytorch/test2deduplicatedshards06_10"     # Folder to save deduplicated output
HASH_STORE_DIR = "./hash_bins"            # Where all paragraph hashes are stored

os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(HASH_STORE_DIR, exist_ok=True)

# === TEXT NORMALIZATION FUNCTION ===
def normalize_text(text):
    text = text.lower()
    text = re.sub(r"\d", "0", text)
    text = ''.join(c for c in unicodedata.normalize("NFKD", text)
                  if not unicodedata.combining(c) and not unicodedata.category(c).startswith('P'))
    return text.strip()

# === HASHING FUNCTION (first 64 bits of SHA1) ===
def paragraph_hash(paragraph):
    norm = normalize_text(paragraph)
    sha1 = hashlib.sha1(norm.encode("utf-8")).hexdigest()
    return sha1[:8]  # 64 bits = 16 hex chars

# === STEP 1: CREATE A SINGLE GLOBAL HASH BIN WITH ALL SHARDS ===
def create_global_hash_bin():
    all_hashes = set()
    
    # Process each shard and add paragraph hashes to global set
    for fname in tqdm(sorted(os.listdir(INPUT_DIR)), desc="[1/2] Creating global hash bin"):
        if not fname.endswith(".jsonl"): continue

        with open(os.path.join(INPUT_DIR, fname), "r", encoding="utf-8") as f:
            for line in f:
                doc = json.loads(line)
                text = doc.get("text", "")
                paragraphs = [p.strip() for p in text.split("\n") if p.strip()]
                for para in paragraphs:
                    h = paragraph_hash(para)
                    all_hashes.add(h)

    # Save all unique hashes in a single global hash bin file
    global_hash_bin_path = os.path.join(HASH_STORE_DIR, "global_hashes.hashes")
    with open(global_hash_bin_path, "w") as out:
        out.write("\n".join(sorted(all_hashes)))
    
    print(f"Total unique paragraphs across all shards: {len(all_hashes)}")

# === STEP 2: DEDUPLICATE PARAGRAPHS ACROSS ALL SHARDS ===
def deduplicate_across_shards():
    # Load all hashes from the global hash bin
    all_hashes = set()
    global_hash_bin_path = os.path.join(HASH_STORE_DIR, "global_hashes.hashes")
    
    with open(global_hash_bin_path, "r") as f:
        for line in f:
            all_hashes.add(line.strip())

    # Process each shard for deduplication
    for fname in tqdm(sorted(os.listdir(INPUT_DIR)), desc="[2/2] Deduplicating across shards"):
        if not fname.endswith(".jsonl"): continue

        in_path = os.path.join(INPUT_DIR, fname)
        out_path = os.path.join(OUTPUT_DIR, fname)

        with open(in_path, "r", encoding="utf-8") as in_f, \
             open(out_path, "w", encoding="utf-8") as out_f:

            for line in in_f:
                doc = json.loads(line)
                text = doc.get("text", "")
                new_paragraphs = []
                for p in text.split("\n"):
                    p = p.strip()
                    if not p: continue
                    h = paragraph_hash(p)
                    if h in all_hashes:
                        continue  # Skip duplicate paragraph
                    new_paragraphs.append(p)
                    all_hashes.add(h)  # Mark this paragraph hash as seen

                if new_paragraphs:
                    doc["text"] = "\n".join(new_paragraphs)
                    out_f.write(json.dumps(doc, ensure_ascii=False) + "\n")

# === RUN FULL PIPELINE ===
if __name__ == "__main__":
    # Step 1: Create the global hash bin with unique paragraph hashes across all shards
    create_global_hash_bin()
    
    # Step 2: Deduplicate paragraphs across all shards using the global hash bin
    deduplicate_across_shards()
    
    print("\n✅ Deduplication across shards complete.")


[1/2] Creating global hash bin: 100%|██████████| 2/2 [19:01<00:00, 570.56s/it]


Total unique paragraphs across all shards: 39326958


[2/2] Deduplicating across shards: 100%|██████████| 2/2 [19:16<00:00, 578.00s/it]


✅ Deduplication across shards complete.



