In [1]:
# 🚢 03_dataset_formatting.ipynb

import os
import json
from pathlib import Path

# 📂 Input & Output Paths
# Folder holding the cleaned source texts, and the JSONL file the dataset is written to.
CLEANED_DIR = Path("data/emsa_cleaned")
OUTPUT_FILE = Path("data/emsa_dataset.jsonl")

# ⚙️ Config
# System message attached to every chat sample. The sentences are kept as
# separate literals (each carrying its own trailing space) and joined once.
SYSTEM_PROMPT = "".join(
    [
        "You are a maritime compliance assistant. ",
        "Answer questions based on maritime safety, regulatory, and operational documents. ",
        "Use precise and factual tone.",
    ]
)

def read_cleaned_files(directory, *, min_chars=100):
    """
    Load every ``*.txt`` file directly inside ``directory`` and return the usable ones.

    Files are visited in sorted path order. Each one is read as UTF-8 and
    stripped of surrounding whitespace; anything shorter than ``min_chars``
    characters after stripping is dropped as a fragment. A file that cannot be
    read or decoded is reported on stdout and skipped, so one bad file does
    not abort the whole load.

    Args:
        directory: Folder to scan, given as a ``str`` or ``pathlib.Path``.
        min_chars: Minimum stripped length a document needs to be kept.
            Defaults to 100.

    Returns:
        A list of ``{"filename": <file name>, "text": <stripped text>}`` dicts,
        one per kept file, in sorted path order.
    """
    docs = []

    # Path(...) makes a plain string argument work as well as a Path.
    for path in sorted(Path(directory).glob("*.txt")):
        # Keep the try body to the one call that can fail, and catch only the
        # errors reading/decoding a file raises, so real bugs still surface.
        try:
            text = path.read_text(encoding="utf-8").strip()
        except (OSError, UnicodeDecodeError) as e:
            print(f"⚠️ Skipping {path.name}: {e}")
            continue

        if len(text) < min_chars:
            continue  # skip tiny fragments

        docs.append({"filename": path.name, "text": text})

    return docs


def convert_to_chat_format(text, filename, *, max_chars=2000):
    """
    Wrap one document as a system/user/assistant chat sample (instruction-tuning style).

    Args:
        text: Document text. Only its first ``max_chars`` characters are
            embedded in the user prompt.
        filename: Source file name, quoted in the assistant reply.
        max_chars: Upper bound on how many characters of ``text`` go into the
            prompt. Defaults to 2000; pass ``None`` to include the whole text.

    Returns:
        A dict with a single ``"messages"`` key holding three
        ``{"role": ..., "content": ...}`` dicts (system, user, assistant).

    Raises:
        ValueError: If ``max_chars`` is negative.
    """
    # A negative bound would silently slice from the end of the text instead
    # of limiting its length, so reject it explicitly.
    if max_chars is not None and max_chars < 0:
        raise ValueError(f"max_chars must be non-negative or None, got {max_chars}")

    excerpt = text[:max_chars]

    # NOTE: the assistant turn is a stub (it ends in "[PLACEHOLDER RESPONSE]"),
    # not an actual summary of the excerpt.
    return {
        "messages": [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": f"Summarize the key points of the following maritime document:\n\n{excerpt}"},
            {"role": "assistant", "content": f"This is a regulatory excerpt from file '{filename}'. It includes technical maritime details. [PLACEHOLDER RESPONSE]"}
        ]
    }

# 📥 Load cleaned docs
documents = read_cleaned_files(CLEANED_DIR)
print(f"📄 Loaded {len(documents)} documents.")

# 💬 Format to chat-style dataset: one chat sample per loaded document.
chat_data = [convert_to_chat_format(d["text"], d["filename"]) for d in documents]

# 💾 Save to JSONL
# Create the output folder first; open() raises FileNotFoundError if it is missing.
OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)

# newline="\n" keeps line endings identical on every platform; plain text
# mode would translate them to "\r\n" on Windows.
with open(OUTPUT_FILE, "w", encoding="utf-8", newline="\n") as f:
    for item in chat_data:
        # One JSON object per line; ensure_ascii=False writes non-ASCII
        # characters as-is instead of \u escapes.
        f.write(json.dumps(item, ensure_ascii=False) + "\n")

print(f"✅ Saved JSONL dataset: {OUTPUT_FILE} ({len(chat_data)} samples)")


📄 Loaded 1 documents.
✅ Saved JSONL dataset: data\emsa_dataset.jsonl (1 samples)
