In [1]:
# ✅ Notebook 1: Run this in Jupyter or Colab

import os, json
from sklearn.model_selection import train_test_split

# 🔍 Step 1: Extract uploaded zip
import zipfile
# Unpack every member of the uploaded archive into the "parsed-json" directory;
# the context manager closes the archive once extraction finishes.
with zipfile.ZipFile("parsed-json.zip", mode="r") as archive:
    archive.extractall(path="parsed-json")

# Top-level keys that are rendered into the flattened text, in output order.
_SECTIONS = (
    "header", "summary", "experience", "education", "skills",
    "projects", "certifications", "achievements", "organizations", "hobbies",
)


def _render_section(name, value):
    """Return one section as text: a title-cased heading followed by its body.

    Lists become one ``str(item)`` per line, dicts one ``key: value`` per
    line, and any other value is rendered with ``str``.
    """
    if isinstance(value, list):
        body = "\n".join(str(item) for item in value)
    elif isinstance(value, dict):
        body = "\n".join(f"{key}: {val}" for key, val in value.items())
    else:
        body = str(value)
    return f"\n\n{name.title()}:\n{body}"


def _flatten_sections(content):
    """Join every non-empty section of ``content`` into one stripped text block."""
    parts = []
    for name in _SECTIONS:
        value = content.get(name)
        if value:  # missing, None and empty sections are left out
            parts.append(_render_section(name, value))
    return "".join(parts).strip()


data = []
json_folder = "parsed-json"

# 📦 Step 2: Load + flatten JSON files
# Sorted listing keeps the order of `data` stable between runs.
for fname in sorted(os.listdir(json_folder)):
    if not fname.endswith(".json"):
        continue
    path = os.path.join(json_folder, fname)

    # Only parsing needs the file handle; close it before building the pair.
    with open(path, "r", encoding="utf-8") as f:
        content = json.load(f)

    # The section lookup needs a JSON object. Fail with the offending path
    # rather than an AttributeError raised by `.get` on a list/str/None.
    if not isinstance(content, dict):
        raise ValueError(
            f"{path}: expected a JSON object at the top level, "
            f"got {type(content).__name__}"
        )

    # Add training pair: flattened text as input, the original JSON as target.
    data.append({
        "input": _flatten_sections(content),
        "output": json.dumps(content, ensure_ascii=False),
    })

# ✂️ Step 3: Split into train / val
# Hold out 20% of the examples for validation; the fixed seed keeps the
# shuffle, and therefore the split, repeatable across runs.
split_options = {"test_size": 0.2, "random_state": 42}
train_data, val_data = train_test_split(data, **split_options)

# 💾 Step 4: Save to JSONL
def save_jsonl(dataset, filename):
    """Write ``dataset`` to ``filename`` as JSON Lines.

    Each item is serialized with ``json.dumps`` and written on its own
    line. The file is opened in write mode as UTF-8, so existing content
    is replaced, and non-ASCII characters are written unescaped.

    Args:
        dataset: Iterable of JSON-serializable items.
        filename: Path of the file to write.
    """
    with open(filename, "w", encoding="utf-8") as sink:
        # The generator is consumed lazily, one record per output line.
        sink.writelines(
            json.dumps(record, ensure_ascii=False) + "\n" for record in dataset
        )

# Write each split to its own JSONL file, training set first.
for split, target in ((train_data, "train.jsonl"), (val_data, "val.jsonl")):
    save_jsonl(split, target)

print("✅ Saved train.jsonl and val.jsonl")


✅ Saved train.jsonl and val.jsonl
