## **Setup & shared helpers:**

In [None]:
# Install minimal deps used below
!pip -q install datasets huggingface_hub

import os, re, json, random, datetime, subprocess, shlex
from pathlib import Path
from datasets import load_dataset
from huggingface_hub import hf_hub_download

# Reproducibility: seed Python's global RNG. The random.shuffle calls in the
# split cells draw from this global state, so re-running a later cell without
# re-running this one continues from wherever the RNG currently is.
random.seed(42)

# Directory that receives every split file written by this notebook. It is
# created up front (parents included) so later writes never hit a missing folder.
SPLITS_DIR = Path("data/splits")
SPLITS_DIR.mkdir(parents=True, exist_ok=True)

def write_jsonl(path, rows):
    """Serialize *rows* to *path* as JSON Lines (one JSON document per line).

    Missing parent directories are created and an existing file is
    overwritten. Text is written as UTF-8 with non-ASCII characters kept
    verbatim rather than escaped.
    """
    target = Path(path)
    target.parent.mkdir(parents=True, exist_ok=True)
    with target.open("w", encoding="utf-8") as handle:
        handle.writelines(
            json.dumps(row, ensure_ascii=False) + "\n" for row in rows
        )

def safe_wc_glob(glob_expr: str) -> None:
    """Print a ``wc -l``-style line-count report for files matching *glob_expr*.

    The pattern is expanded and the files are counted in-process, so the
    helper needs neither ``bash`` nor ``wc`` and never passes *glob_expr*
    through a shell. One right-aligned ``<count> <path>`` line is printed per
    matching file (sorted by path), followed by a ``total`` line when more
    than one file was counted. Like ``wc -l``, a count is the number of
    newline characters in the file.

    This stays a best-effort convenience: nothing is printed when no file
    matches, and a file that cannot be read is reported on its own line and
    left out of the totals instead of raising.

    Args:
        glob_expr: Glob pattern, e.g. ``"data/splits/cs_sum_*.jsonl"``.
    """
    import glob  # function-scope import: not among the notebook's top-level imports

    counted = []
    for name in sorted(glob.glob(glob_expr)):
        if not os.path.isfile(name):
            continue
        try:
            with open(name, "rb") as fh:
                # Binary mode avoids decode errors; only newline bytes matter.
                counted.append((sum(line.endswith(b"\n") for line in fh), name))
        except OSError as err:
            print(f"{name}: unreadable ({err})")
    if not counted:
        return

    total = sum(count for count, _ in counted)
    width = len(str(total)) + 2
    for count, name in counted:
        print(f"{count:>{width}} {name}")
    if len(counted) > 1:
        print(f"{total:>{width}} total")

## **CS-Sum (HuggingFace) → split 80/10/10 → unified JSONL:**

In [None]:
# Load the HF dataset. The two prints show which splits and columns came
# back; the conversion code looks columns up by name ("cs_dialogue",
# "dialogue", "summary"), so this is the place to spot a schema change.
cs = load_dataset("SkAndMl/cs-sum")
print("CS-Sum splits available:", list(cs.keys()))
print("CS-Sum columns:", cs["train"].column_names)

# Speaker-prefixed line such as "#Person1#: hello": group 1 is the label
# between the hashes, group 2 is the utterance after the colon.
SPEAKER_LINE = re.compile(r"^#([^#]+)#:\s*(.*)$")

def turns_from_cs_example(ex):
    """Split one CS-Sum example into an ordered list of (speaker, text) turns.

    The text comes from ``ex["cs_dialogue"]``, falling back to
    ``ex["dialogue"]`` and then to an empty string. Lines are stripped and
    blank lines dropped. A line matching ``SPEAKER_LINE`` yields its captured
    speaker and text; any other line is kept whole under the synthetic
    speaker ``S<k>``, where ``k`` is the line's position among the non-blank
    lines. Turns whose text is empty are omitted.
    """
    dialogue = ex.get("cs_dialogue") or ex.get("dialogue") or ""
    stripped = (part.strip() for part in str(dialogue).splitlines())
    result = []
    for pos, entry in enumerate(s for s in stripped if s):
        hit = SPEAKER_LINE.match(entry)
        speaker, utterance = hit.groups() if hit else (f"S{pos}", entry)
        if utterance:
            result.append((speaker, utterance))
    return result

def convert_cs_split(ds_split, dataset_tag, out_path):
    """Convert one CS-Sum split to the unified thread schema and write it as JSONL.

    Each example becomes one record whose ``messages`` are the dialogue turns
    returned by ``turns_from_cs_example`` and whose ``summary``/``refs`` carry
    the stripped reference summary. Examples with no turns or an empty
    summary are skipped.

    Args:
        ds_split: Iterable of dict-like examples.
        dataset_tag: Short dataset name; stored as ``source`` and used as the
            ``thread_id`` prefix.
        out_path: Destination JSONL path.

    Returns:
        The number of records written.
    """
    # Examples without an "id" need a synthetic one. A bare per-split index
    # restarts at 0 for every split, which gave train/dev/test identical
    # thread_ids; including the output file stem keeps them distinct.
    fallback_prefix = Path(out_path).stem

    out = []
    for i, ex in enumerate(ds_split):
        turns = turns_from_cs_example(ex)
        summ = (ex.get("summary") or "").strip()
        if not turns or not summ:
            continue

        ex_id = ex.get("id")
        if ex_id is None:
            ex_id = f"{fallback_prefix}_{i}"

        msgs = [
            {"mid": j, "parent": None, "author": spk, "time": None, "lang": None, "text": txt}
            for j, (spk, txt) in enumerate(turns)
        ]
        out.append({
            "thread_id": f"{dataset_tag}:{ex_id}",
            "source": dataset_tag,
            "domain": "dialogue",
            "title": ex.get("title"),
            "messages": msgs,
            "summary": summ,
            "refs": [summ],
            "meta": {},
        })
    write_jsonl(out_path, out)
    print(f"Wrote {len(out):,} → {out_path}")
    return len(out)

# Create explicit 80/10/10 splits from the single 'train' split.
# Shuffling a list of row indices (rather than the dataset itself) lets us
# carve out disjoint index ranges; the shuffle uses the global RNG.
train_full = cs["train"]
idx = list(range(len(train_full)))
random.shuffle(idx)

# int() truncates, so any rounding leftovers end up in the test split,
# which takes everything after train + dev.
n = len(idx)
n_train = int(0.8 * n)
n_dev   = int(0.1 * n)

# Indices are sorted again before select(), so each split lists its rows in
# original dataset order.
train_ds = train_full.select(sorted(idx[:n_train]))
dev_ds   = train_full.select(sorted(idx[n_train:n_train+n_dev]))
test_ds  = train_full.select(sorted(idx[n_train+n_dev:]))

# Convert and write each split; the return values are record counts, which
# can be lower than the split sizes because empty examples are skipped.
cs_train = convert_cs_split(train_ds, "cs-sum", SPLITS_DIR / "cs_sum_train.jsonl")
cs_dev   = convert_cs_split(dev_ds,   "cs-sum", SPLITS_DIR / "cs_sum_dev.jsonl")
cs_test  = convert_cs_split(test_ds,  "cs-sum", SPLITS_DIR / "cs_sum_test.jsonl")

# Summary plus an on-disk line count as a sanity check.
print(f"\nCS-Sum totals: Train={cs_train}, Dev={cs_dev}, Test={cs_test}")
safe_wc_glob("data/splits/cs_sum_*.jsonl")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/30.0 [00:00<?, ?B/s]

chinese.csv: 0.00B [00:00, ?B/s]

malay.csv: 0.00B [00:00, ?B/s]

tamil.csv: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/3238 [00:00<?, ? examples/s]

CS-Sum splits available: ['train']
CS-Sum columns: ['dialogue', 'cs_dialogue', 'summary']
Wrote 2,584 → data/splits/cs_sum_train.jsonl
Wrote 323 → data/splits/cs_sum_dev.jsonl
Wrote 325 → data/splits/cs_sum_test.jsonl

CS-Sum totals: Train=2584, Dev=323, Test=325
    323 data/splits/cs_sum_dev.jsonl
    325 data/splits/cs_sum_test.jsonl
   2584 data/splits/cs_sum_train.jsonl
   3232 total



## **CroCoSum (HuggingFace Hub files) → unified JSONL (train/val/test already provided):**

In [None]:
# Download the split manifests directly from the dataset repo (original
# author's note: the repo's script files aren't supported by datasets.load
# on HF). hf_hub_download returns a local file path for each file.
train_file = hf_hub_download("ruochenz/CroCoSum", filename="data/train.jsonl", repo_type="dataset")
val_file   = hf_hub_download("ruochenz/CroCoSum", filename="data/val.jsonl",   repo_type="dataset")
test_file  = hf_hub_download("ruochenz/CroCoSum", filename="data/test.jsonl",  repo_type="dataset")

# Load those JSONLs as datasets; the repo's "val" file is exposed under the
# split name "validation".
croco = load_dataset("json", data_files={"train": train_file, "validation": val_file, "test": test_file})
print("CroCoSum splits:", list(croco.keys()))

# Load the source-documents dictionary (id → {title, body}). It is kept as
# the module-level `src_docs`, which convert_croco_split reads directly.
src_file = hf_hub_download("ruochenz/CroCoSum", filename="data/src_docs.json", repo_type="dataset")
with open(src_file, "r", encoding="utf-8") as f:
    src_docs = json.load(f)
print(f"Loaded {len(src_docs):,} source docs")

def convert_croco_split(ds_split, split_name, out_path):
    """Convert one CroCoSum split to the unified thread schema and write it as JSONL.

    For each post, the first entry of ``links`` is looked up in the
    module-level ``src_docs`` mapping; that document's title and body form
    the single ``ARTICLE`` message, and ``post_text`` is the summary. A post
    is skipped (and counted) when it has no links, its first link has no
    usable source document, the article text is empty, or ``post_text`` is
    missing or empty.

    Args:
        ds_split: Iterable of dict-like post records.
        split_name: Split label, used only in the fallback ``thread_id`` for
            posts without a ``post_id``.
        out_path: Destination JSONL path.

    Returns:
        The number of records written.
    """
    out, skipped = [], 0
    for i, ex in enumerate(ds_split):
        links = ex.get("links") or []
        # Only the first linked document is used as the source article.
        src = src_docs.get(str(links[0])) if links else None
        if src is None:
            skipped += 1
            continue

        # `or ""` keeps a null title/body from being rendered as the literal
        # text "None" by the f-string below.
        title = src.get("title") or ""
        body = src.get("body") or ""
        article_txt = f"{title}\n\n{body}".strip()
        post_text = ex.get("post_text")
        if not article_txt or not post_text:
            skipped += 1
            continue

        out.append({
            "thread_id": f"crocosum:{ex.get('post_id', f'{split_name}_{i}')}",
            "source": "crocosum",
            "domain": "news",
            "title": ex.get("post_title"),
            "messages": [
                {"mid": 0, "parent": None, "author": "ARTICLE", "time": None, "lang": "en", "text": article_txt}
            ],
            "summary": post_text,
            "refs": [post_text],
            "meta": {"url": ex.get("post_url")},
        })
    write_jsonl(out_path, out)
    print(f"Wrote {len(out):,} → {out_path} (skipped {skipped})")
    return len(out)

# Convert the three provided splits. The loaded "validation" split is
# written out under the "dev" name used for the other datasets' files.
croco_train = convert_croco_split(croco["train"],      "train", SPLITS_DIR / "croco_train.jsonl")
croco_dev   = convert_croco_split(croco["validation"], "dev",   SPLITS_DIR / "croco_dev.jsonl")
croco_test  = convert_croco_split(croco["test"],       "test",  SPLITS_DIR / "croco_test.jsonl")

# Summary plus an on-disk line count as a sanity check.
print(f"\nCroCoSum totals: Train={croco_train}, Dev={croco_dev}, Test={croco_test}")
safe_wc_glob("data/splits/croco_*.jsonl")

data/train.jsonl:   0%|          | 0.00/21.7M [00:00<?, ?B/s]

val.jsonl: 0.00B [00:00, ?B/s]

test.jsonl: 0.00B [00:00, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

CroCoSum splits: ['train', 'validation', 'test']


data/src_docs.json:   0%|          | 0.00/128M [00:00<?, ?B/s]

Loaded 24,171 source docs
Wrote 12,989 → data/splits/croco_train.jsonl (skipped 0)
Wrote 2,784 → data/splits/croco_dev.jsonl (skipped 0)
Wrote 2,784 → data/splits/croco_test.jsonl (skipped 0)

CroCoSum totals: Train=12989, Dev=2784, Test=2784
     2784 data/splits/croco_dev.jsonl
     2784 data/splits/croco_test.jsonl
    12989 data/splits/croco_train.jsonl
    18557 total



## **Kaggle Email Thread Summary (local JSON) → unified JSONL + 80/10/10 split:**

Before running this cell, upload these two files from your local machine to the same folder as the notebook:

*   email_thread_details.json
*   email_thread_summaries.json

In [None]:
# Input files, resolved relative to the working directory; adjust the paths
# if your files live in a different folder.
# DETAILS: list of individual messages, each carrying a "thread_id".
# SUMS:    list of {"thread_id", "summary"} records.
DETAILS = "email_thread_details.json"
SUMS    = "email_thread_summaries.json"

# Quick cleaners
# From an "Original Message" marker (optionally prefixed by ">" / "-" and
# whitespace) at the start of the text or of a line, through the end of the text.
ORIG_MSG_RE = re.compile(r"(^|\n)[>-]*\s*Original Message.*", re.IGNORECASE | re.DOTALL)
# A trailing block introduced by a line that starts with "--"; the pattern
# allows optional whitespace plus up to 1000 further characters before the end.
SIG_RE = re.compile(r"\n--+\s*[\s\S]{0,1000}$")

def clean_body(txt: str) -> str:
    """Return an email body with quoted history, signature and extra blanks removed.

    Steps, in order: cut everything from the first "Original Message" marker
    onward, drop a trailing "--" signature block, strip spaces/tabs that
    precede a newline, collapse runs of three or more newlines to two, and
    trim the result. Falsy input (``None`` or ``""``) yields ``""``.
    """
    if not txt:
        return ""
    quoted = ORIG_MSG_RE.search(txt)
    if quoted:
        txt = txt[:quoted.start()]
    without_sig = SIG_RE.sub("", txt)
    tidy_line_ends = re.sub(r"[ \t]+\n", "\n", without_sig)
    return re.sub(r"\n{3,}", "\n\n", tidy_line_ends).strip()

def ts_to_iso(ts):
    """Convert a Unix timestamp to a UTC string formatted ``YYYY-MM-DDTHH:MM:SSZ``.

    Args:
        ts: Seconds or milliseconds since the epoch, as a number or numeric
            string. Values above 10_000_000_000 are treated as milliseconds.

    Returns:
        The formatted UTC timestamp, or ``None`` when *ts* is ``None``, not
        convertible to an integer, or outside the representable date range.
    """
    if ts is None:
        return None
    try:
        seconds = int(ts)
        if seconds > 10_000_000_000:  # too large for seconds → treat as ms
            seconds //= 1000
        # Timezone-aware conversion; datetime.utcfromtimestamp is deprecated
        # since Python 3.12.
        moment = datetime.datetime.fromtimestamp(seconds, tz=datetime.timezone.utc)
    except (TypeError, ValueError, OverflowError, OSError):
        # Non-numeric input, or a value the platform cannot turn into a date.
        return None
    return moment.strftime("%Y-%m-%dT%H:%M:%SZ")

# Load raw jsons. `details` is iterated as a list of message dicts;
# `summaries` becomes a thread_id → summary lookup (a later duplicate
# thread_id overwrites an earlier one).
with open(DETAILS, "r", encoding="utf-8") as f:
    details = json.load(f)
with open(SUMS, "r", encoding="utf-8") as f:
    summaries = {r["thread_id"]: r["summary"] for r in json.load(f)}

# Group messages by thread
by_tid = {}
for m in details:
    by_tid.setdefault(m["thread_id"], []).append(m)

# Build one record per thread; threads without a summary are dropped.
records = []
for tid, msgs in by_tid.items():
    if tid not in summaries or not msgs:
        continue
    # Chronological order; messages lacking a "timestamp" key sort as 0 (first).
    # NOTE(review): a key that is present but None would not compare with
    # ints here — confirm the dump never contains null timestamps.
    msgs = sorted(msgs, key=lambda x: x.get("timestamp", 0))
    built = [{
        "mid": i,
        "parent": None,  # reply graph not present in this dump
        "author": (m.get("from") or "").strip(),
        "time": ts_to_iso(m.get("timestamp")),
        "lang": None,
        "text": clean_body(m.get("body", "")),
    } for i, m in enumerate(msgs)]
    # Skip threads where cleaning left every message empty.
    if all(len(x["text"]) == 0 for x in built):
        continue
    records.append({
        "thread_id": f"kaggle:{tid}",
        "source": "kaggle_email",
        "domain": "email",
        # Thread title = subject of the earliest message.
        "title": msgs[0].get("subject", "") if msgs else "",
        "messages": built,
        "summary": summaries[tid],
        "refs": [summaries[tid]],
        "meta": {"num_messages": len(built)}
    })

# 80/10/10 split by thread (no leakage): whole threads are shuffled, so a
# thread's messages never straddle two splits. Shuffle uses the global RNG;
# int() truncation sends rounding leftovers to the test split.
random.shuffle(records)
n = len(records); n_tr = int(0.8 * n); n_de = int(0.1 * n)
splits = {"train": records[:n_tr], "dev": records[n_tr:n_tr+n_de], "test": records[n_tr+n_de:]}

for name, rows in splits.items():
    write_jsonl(SPLITS_DIR / f"kaggle_{name}.jsonl", rows)

# Summary plus an on-disk line count as a sanity check.
print(f"Kaggle Email: built {n} threads → train={len(splits['train'])}, dev={len(splits['dev'])}, test={len(splits['test'])}")
safe_wc_glob("data/splits/kaggle_*.jsonl")

FileNotFoundError: [Errno 2] No such file or directory: 'email_thread_details.json'

## **DialogSum (local JSONL splits) → unified JSONL (train/dev/test):**
Before running this cell, upload these three files from your local machine:

*   dialogsum.train.jsonl
*   dialogsum.dev.jsonl
*   dialogsum.test.jsonl


In [None]:
def ds_to_messages(dialogue_text: str):
    """Turn a multi-line dialogue string into a list of unified message dicts.

    Each non-blank line is stripped and matched against ``Speaker: text``
    (split at the first colon). A matching line with non-empty text starts a
    new message; a matching line with empty text is dropped. A non-matching
    line is appended, space-separated, to the previous message, or becomes a
    message by the generic author ``"Speaker"`` when there is no previous
    message. ``mid`` values are consecutive, starting at 0. Falsy input
    yields an empty list.
    """
    if not dialogue_text:
        return []

    speaker_line = re.compile(r"^([^:]+):\s*(.*)$")
    messages = []
    for raw_line in dialogue_text.split("\n"):
        content = raw_line.strip()
        if not content:
            continue

        hit = speaker_line.match(content)
        if hit is None:
            if messages:
                # Continuation of the previous speaker's turn.
                messages[-1]["text"] = (messages[-1]["text"] + " " + content).strip()
                continue
            author, body = "Speaker", content
        else:
            author, body = hit.group(1).strip(), hit.group(2).strip()
            if not body:
                continue

        # Every append advances the id by one, so it equals the current length.
        messages.append(
            {"mid": len(messages), "parent": None, "author": author, "time": None, "lang": None, "text": body}
        )
    return messages

def convert_dialogsum_split(in_path: Path, out_path: Path) -> int:
    """Convert one DialogSum JSONL split to the unified thread schema.

    Each input line is a JSON object. The dialogue is read from ``dialogue``
    (or ``dialog``) and the summary from the first non-empty of ``summary``,
    ``summary1``, ``summary2``. Records lacking a dialogue, a summary, or any
    parsed message are skipped, as are blank lines.

    Args:
        in_path: Source JSONL file.
        out_path: Destination JSONL file.

    Returns:
        The number of records written.

    Raises:
        FileNotFoundError: If *in_path* does not exist.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    out_rows = []
    with Path(in_path).open("r", encoding="utf-8") as fin:
        for i, line in enumerate(fin):
            # Blank lines (e.g. a trailing newline at end of file) are not
            # valid JSON; skip them rather than crash in json.loads. `i`
            # remains the physical line index, so fallback ids are unchanged.
            if not line.strip():
                continue
            ex = json.loads(line)
            dlg = ex.get("dialogue") or ex.get("dialog") or ""
            summ = ex.get("summary") or ex.get("summary1") or ex.get("summary2")
            if not dlg or not summ:
                continue
            msgs = ds_to_messages(dlg)
            if not msgs:
                continue
            out_rows.append({
                "thread_id": f"dialogsum:{ex.get('id', i)}",
                "source": "dialogsum",
                "domain": "dialogue",
                "title": ex.get("topic"),
                "messages": msgs,
                "summary": summ,
                "refs": [summ],
                "meta": {},
            })
    write_jsonl(out_path, out_rows)
    n = len(out_rows)
    print(f"Wrote {n:,} → {out_path}")
    return n

# Convert the three uploaded splits; input paths are relative to the
# working directory and each call returns the number of records written.
d_train = convert_dialogsum_split(Path("dialogsum.train.jsonl"), SPLITS_DIR / "dialogsum_train.jsonl")
d_dev   = convert_dialogsum_split(Path("dialogsum.dev.jsonl"),   SPLITS_DIR / "dialogsum_dev.jsonl")
d_test  = convert_dialogsum_split(Path("dialogsum.test.jsonl"),  SPLITS_DIR / "dialogsum_test.jsonl")

# Summary plus an on-disk line count as a sanity check.
print(f"\nDialogSum totals: Train={d_train}, Dev={d_dev}, Test={d_test}")
safe_wc_glob("data/splits/dialogsum_*.jsonl")

## **Mount Drive & copy splits to Drive:**

In [None]:
from google.colab import drive
from pathlib import Path
import shutil

# Mount Drive (force_remount=True remounts even if a mount already exists)
drive.mount("/content/drive", force_remount=True)

# SRC: local split files produced by this notebook.
# DST: destination folder inside the mounted Drive; created if missing.
SRC = Path("data/splits")
DST = Path("/content/drive/MyDrive/nlp_data/splits")
DST.mkdir(parents=True, exist_ok=True)

# Copy *.jsonl to Drive. Files of the same name in DST are overwritten;
# copy2 also carries over file metadata.
copied = 0
for p in sorted(SRC.glob("*.jsonl")):
    shutil.copy2(p, DST / p.name)
    copied += 1
print(f"Copied {copied} files to {DST}")

# Quick integrity check: compare line counts
def _wc(path: Path) -> int:
    """Return the number of lines in *path*, or 0 when the file does not exist."""
    total = 0
    try:
        with path.open("r", encoding="utf-8") as handle:
            # enumerate from 1 so `total` ends at the index of the last line.
            for total, _ in enumerate(handle, start=1):
                pass
    except FileNotFoundError:
        return 0
    return total

# Compare every local split file with its Drive copy by line count. A copy
# that is missing on Drive counts as 0 lines and therefore shows up as a
# mismatch (unless the local file is empty as well).
mismatches = []
for p in sorted(SRC.glob("*.jsonl")):
    s, d = _wc(p), _wc(DST / p.name)
    if s != d:
        mismatches.append((p.name, s, d))

if mismatches:
    print("Line-count mismatches:")
    for name, s, d in mismatches:
        print(f"  {name}: local={s}, drive={d}")
else:
    print("Verified: all copied files match by line count.")

# Optional: list what's in Drive (includes any *.jsonl already there from
# earlier runs, not only the files copied just now)
for p in sorted((DST).glob("*.jsonl")):
    print("•", p.name)

## **Statistics (counts + message totals/averages):**

In [None]:
def count_lines(path: Path) -> int:
    """Count the lines of the UTF-8 text file at *path*; a missing file counts as 0."""
    lines_seen = 0
    try:
        with path.open("r", encoding="utf-8") as stream:
            for _ in stream:
                lines_seen += 1
    except FileNotFoundError:
        return 0
    return lines_seen

def message_stats(path: Path):
    """Summarize a unified-schema JSONL file without modifying anything.

    Returns a tuple ``(threads, messages, average)``: the number of records,
    the summed length of their ``"messages"`` lists (a record without that
    key contributes 0), and messages per record (``0.0`` for no records).
    A missing file yields ``(0, 0, 0.0)``.
    """
    n_threads = n_msgs = 0
    try:
        with path.open("r", encoding="utf-8") as stream:
            for raw in stream:
                record = json.loads(raw)
                n_msgs += len(record.get("messages", []))
                n_threads += 1
    except FileNotFoundError:
        # Missing split file: fall through and report zeros.
        pass
    mean = n_msgs / n_threads if n_threads else 0.0
    return n_threads, n_msgs, mean

# Datasets we expect, as label → [train, dev, test] file names under
# SPLITS_DIR. Safe if some are missing: absent files count as zero and a
# dataset with no items at all is left out of the report.
datasets = {
    "Kaggle Email Threads":   ["kaggle_train.jsonl",   "kaggle_dev.jsonl",   "kaggle_test.jsonl"],
    "CS-Sum (Code-Switched)": ["cs_sum_train.jsonl",   "cs_sum_dev.jsonl",   "cs_sum_test.jsonl"],
    "CroCoSum (News)":        ["croco_train.jsonl",    "croco_dev.jsonl",    "croco_test.jsonl"],
    "DialogSum (Dialogue)":   ["dialogsum_train.jsonl","dialogsum_dev.jsonl","dialogsum_test.jsonl"],
}

# Running totals across all datasets.
grand_items = 0
grand_msgs  = 0

print("STATISTICS:")

for label, files in datasets.items():
    paths     = [SPLITS_DIR / f for f in files]
    counts    = [count_lines(p) for p in paths]
    msg_stats = [message_stats(p) for p in paths]   # (threads, msgs, avg)
    # Nothing on disk for this dataset: skip it entirely.
    if sum(counts) == 0:
        continue

    msgs_total = sum(m for _, m, _ in msg_stats)

    print(f"\n{label}:")
    # The split labels rely on the file lists above being ordered train, dev, test.
    for split_name, n, (thr, m, avg) in zip(["Train", "Dev", "Test"], counts, msg_stats):
        print(f"  {split_name:<5} {n:>6,} items | msgs: {m:>7,}  (avg {avg:.1f}/item)")
    total = sum(counts)
    print(f"  Total {total:>6,} items | msgs: {msgs_total:>7,}")

    grand_items += total
    grand_msgs  += msgs_total

print(f"\nCombined Total: {grand_items:,} items | msgs: {grand_msgs:,}")

In [None]:
# TAR.GZ version: pack the whole local `data` folder into one archive
# (shell command via the notebook's `!` escape), then trigger a browser
# download of that archive through the Colab files helper.
!tar -czf data_folder.tar.gz data
from google.colab import files
files.download("data_folder.tar.gz")