In [2]:
import os
import json
import pickle
from datetime import datetime
from typing import Dict, Any, Tuple, Optional

import numpy as np
import pandas as pd
import h5py

try:
    import yaml
    HAVE_YAML = True
except Exception:
    HAVE_YAML = False

# --------- USER PATHS ----------
# CSV_PATH: the source CSV that gets converted.
# OUT_DIR:  directory that receives every generated artifact.
CSV_PATH = r"C:\Users\sagni\Downloads\Code Generator\archive\train.csv"
OUT_DIR  = r"C:\Users\sagni\Downloads\Code Generator"
# --------------------------------

# Case-insensitive candidate names, one list per role.
# Within a list the order is the priority: earlier names are tried first.

# Columns that may hold the natural-language task.
INSTR_CANDS = [
    "instruction",
    "prompt",
    "question",
    "task",
    "query",
    "title",
    "description",
    "desc",
    "nl",
    "problem",
    "user",
    "spec",
]

# Columns that may hold optional extra context for the task.
INPUT_CANDS = [
    "input",
    "context",
    "constraints",
    "examples",
    "params",
    "stdin",
    "additional_input",
]

# Columns that may hold the expected code / answer.
OUTPUT_CANDS = [
    "output",
    "answer",
    "solution",
    "completion",
    "code",
    "response",
    "target",
    "program",
    "result",
]

def ensure_out_dir(path: str):
    """Make sure the directory ``path`` exists, creating missing parents as needed.

    An already-existing directory is left untouched.
    """
    if os.path.isdir(path):
        return
    # exist_ok is kept so a directory created concurrently is still not an error.
    os.makedirs(path, exist_ok=True)

def _read_csv_any(path: str) -> pd.DataFrame:
    """Load a CSV file, retrying with progressively more permissive settings.

    Strategies, in order:
      1. pandas defaults;
      2. UTF-8 with undecodable bytes dropped;
      3. latin-1;
      4. the pure-Python parser engine.

    Args:
        path: Location of the CSV file.

    Returns:
        The parsed DataFrame from the first strategy that succeeds.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        Exception: Whatever the final (python-engine) attempt raises when
            every strategy fails.
    """
    if not os.path.exists(path):
        raise FileNotFoundError(path)

    attempts = (
        {},
        # ``encoding_errors`` is the read_csv keyword for decode-error policy;
        # the previous ``errors=`` keyword was rejected with TypeError, so
        # this strategy never actually ran.
        {"encoding": "utf-8", "encoding_errors": "ignore"},
        {"encoding": "latin-1"},
    )
    for kwargs in attempts:
        try:
            return pd.read_csv(path, **kwargs)
        except Exception:
            # Deliberate best-effort: any failure just moves to the next strategy.
            continue

    # Last resort; an error here propagates to the caller.
    return pd.read_csv(path, engine="python")

def _trim_keep_code(s: Optional[str]) -> str:
    """Return ``s`` as text with leading/trailing whitespace removed.

    Internal whitespace and newlines are preserved so code formatting survives.
    ``None`` and scalar missing-value markers (float NaN, ``pd.NA``, ``NaT``)
    become the empty string; any other value is converted with ``str()``.
    """
    # Only strip ends; DO NOT collapse internal whitespace/newlines
    if s is None:
        return ""
    # A missing cell is NaN/NA rather than None; without this check it would be
    # stringified to the literal "nan" and later counted as real content.
    missing = pd.isna(s)
    # pd.isna returns an array for list-likes, so only trust a scalar answer.
    if isinstance(missing, (bool, np.bool_)) and missing:
        return ""
    return str(s).strip()

def detect_columns(df: pd.DataFrame) -> Tuple[pd.Series, pd.Series, pd.Series, Dict[str,str]]:
    """Choose the instruction / input / output columns of ``df`` and clean them.

    Columns are first matched case-insensitively against ``INSTR_CANDS``,
    ``INPUT_CANDS`` and ``OUTPUT_CANDS``. A role with no name match falls back
    to a column by position:

    * instruction / output take the first column not already used by the other
      of the two, preferring one that is not the input column either;
    * input takes the first column that is neither instruction nor output, or
      is all empty strings when no such column exists.

    Every value is passed through ``_trim_keep_code``; rows whose instruction
    or output is empty are dropped and the result is re-indexed from 0.

    Args:
        df: Raw frame as read from the CSV.

    Returns:
        ``(instruction, input, output, names_used)``: three equally long
        Series, plus a dict naming the column actually used for each role
        (``""`` when the role had no column).
    """
    cols = list(df.columns)
    # str() lets non-string labels (e.g. integer headers) take part in the lookup.
    lower = {str(c).lower(): c for c in cols}

    def _by_name(candidates):
        """Return the column for the first candidate name present, else None."""
        return next((lower[c] for c in candidates if c in lower), None)

    instr_col = _by_name(INSTR_CANDS)
    input_col = _by_name(INPUT_CANDS)
    output_col = _by_name(OUTPUT_CANDS)

    def _fallback(other_role):
        """Return the first column distinct from ``other_role``, avoiding the input column if possible."""
        pool = [c for c in cols if other_role is None or c != other_role]
        unclaimed = [c for c in pool if input_col is None or c != input_col]
        choices = unclaimed or pool
        return choices[0] if choices else None

    # Positional fallback. Picking blindly by index could hand the same column
    # to both instruction and output, so each pick excludes the other role.
    if instr_col is None:
        instr_col = _fallback(output_col)
    if output_col is None:
        output_col = _fallback(instr_col)

    if input_col is None:
        leftovers = [c for c in cols if c != instr_col and c != output_col]
        if leftovers:
            input_col = leftovers[0]

    # ``is not None`` rather than truthiness: a label such as 0 or "" is valid.
    if instr_col is not None:
        instr = df[instr_col].map(_trim_keep_code)
    else:
        instr = pd.Series([], dtype=str)

    if input_col is not None:
        inp = df[input_col].map(_trim_keep_code)
    else:
        inp = pd.Series([""] * len(df))

    if output_col is not None:
        out = df[output_col].map(_trim_keep_code)
    else:
        out = pd.Series([""] * len(df))

    # Align on the shortest series (only differs when there is no instruction column).
    n = min(len(instr), len(inp), len(out))
    instr = instr.iloc[:n].reset_index(drop=True)
    inp = inp.iloc[:n].reset_index(drop=True)
    out = out.iloc[:n].reset_index(drop=True)

    # Keep only rows that have both an instruction and an output.
    mask = (instr != "") & (out != "")
    instr = instr[mask].reset_index(drop=True)
    inp = inp[mask].reset_index(drop=True)
    out = out[mask].reset_index(drop=True)

    # Report the columns that were really read, including positional fallbacks.
    names_used = {
        "instruction_column_used": "" if instr_col is None else str(instr_col),
        "input_column_used": "" if input_col is None else str(input_col),
        "output_column_used": "" if output_col is None else str(output_col)
    }
    return instr, inp, out, names_used

def write_h5(path: str, instr: pd.Series, inp: pd.Series, out: pd.Series) -> None:
    """Write the three text series to an HDF5 file at ``path``.

    The file is opened in "w" mode and gets a ``train`` group holding the
    gzip-compressed UTF-8 string datasets ``instruction``, ``input`` and
    ``output``.
    """
    fields = (
        ("instruction", instr),
        ("input", inp),
        ("output", out),
    )
    with h5py.File(path, "w") as handle:
        train = handle.create_group("train")
        utf8 = h5py.string_dtype(encoding="utf-8")
        for field_name, series in fields:
            train.create_dataset(
                field_name,
                data=series.astype(str).values,
                dtype=utf8,
                compression="gzip",
            )

def write_pkl(path: str, df: pd.DataFrame, meta: Dict[str, Any]) -> None:
    """Pickle a copy of ``df`` together with ``meta`` into the file at ``path``.

    The stored object is a dict with the keys ``"data"`` (the frame copy) and
    ``"meta"``, serialized with the highest available pickle protocol.
    """
    snapshot = df.copy()
    bundle = {"data": snapshot, "meta": meta}
    with open(path, "wb") as sink:
        pickle.dump(bundle, sink, protocol=pickle.HIGHEST_PROTOCOL)

def write_yaml(path: str, meta: Dict[str, Any]) -> None:
    """Write ``meta`` to ``path`` as UTF-8 text.

    When PyYAML was importable (``HAVE_YAML``) the content is YAML with the
    key order preserved; otherwise it is indented JSON written to the same path.
    """
    with open(path, "w", encoding="utf-8") as fh:
        if not HAVE_YAML:
            # No YAML library available: fall back to JSON text.
            fh.write(json.dumps(meta, ensure_ascii=False, indent=2))
            return
        yaml.safe_dump(meta, fh, sort_keys=False, allow_unicode=True)

def write_jsonl(path: str, instr: pd.Series, inp: pd.Series, out: pd.Series) -> None:
    """Write one JSON object per line to ``path``.

    Each object has the keys ``instruction``, ``input`` and ``output``, taken
    position by position from the three series. Non-ASCII text is written
    as-is (UTF-8) rather than escaped.
    """
    keys = ("instruction", "input", "output")
    with open(path, "w", encoding="utf-8") as sink:
        for row in zip(instr, inp, out):
            record = dict(zip(keys, row))
            sink.write(json.dumps(record, ensure_ascii=False) + "\n")

def basic_stats(instr: pd.Series, inp: pd.Series, out: pd.Series) -> Dict[str, Any]:
    """Summarize the dataset size and text lengths.

    Returns a dict with ``num_examples`` plus min / max / mean / median
    summaries of character counts for each series and of line counts for
    ``out``. Mean and median are rounded to two decimals; an empty series
    yields all zeros. Non-string values count as length 0.
    """
    def _summary(counts: pd.Series) -> Dict[str, float]:
        # Shared by the character and line summaries.
        if not len(counts):
            return {"min": 0, "max": 0, "mean": 0.0, "median": 0.0}
        return {
            "min": int(counts.min()),
            "max": int(counts.max()),
            "mean": float(round(counts.mean(), 2)),
            "median": float(round(counts.median(), 2)),
        }

    def _n_chars(value) -> int:
        if isinstance(value, str):
            return len(value)
        return 0

    def _n_lines(value) -> int:
        # An empty string has zero lines; otherwise newlines + 1.
        if isinstance(value, str) and value != "":
            return value.count("\n") + 1
        return 0

    report: Dict[str, Any] = {"num_examples": int(len(instr))}
    report["instruction_chars"] = _summary(instr.map(_n_chars))
    report["input_chars"] = _summary(inp.map(_n_chars))
    report["output_chars"] = _summary(out.map(_n_chars))
    report["output_lines"] = _summary(out.map(_n_lines))
    return report

def main():
    """Convert the CSV at ``CSV_PATH`` into the artifact set under ``OUT_DIR``.

    Steps: create the output directory, load the CSV, detect and clean the
    instruction / input / output columns, compute summary statistics, then
    write the HDF5, pickle, YAML config, JSONL and JSON summary files and
    print a short report.
    """
    # Local import: the module header only imports ``datetime`` itself.
    from datetime import timezone

    ensure_out_dir(OUT_DIR)

    df_raw = _read_csv_any(CSV_PATH)
    instr, inp, out, names_used = detect_columns(df_raw)

    data = pd.DataFrame({"instruction": instr.astype(str), "input": inp.astype(str), "output": out.astype(str)})

    stats = basic_stats(instr, inp, out)

    # datetime.utcnow() is deprecated since Python 3.12. Take an aware UTC
    # timestamp and drop the tzinfo so the text keeps its "...Z" form.
    created_utc = datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + "Z"

    meta = {
        "dataset_name": "codegen_single_csv",
        "source_csv": CSV_PATH,
        "created_utc": created_utc,
        "columns_used": names_used,
        "sizes": {"train": int(len(data))},
        "stats": stats,
        "schema": {"fields": ["instruction", "input", "output"]}
    }

    out_h5    = os.path.join(OUT_DIR, "codegen.h5")
    out_pkl   = os.path.join(OUT_DIR, "codegen.pkl")
    out_yaml  = os.path.join(OUT_DIR, "codegen_config.yaml")
    out_jsonl = os.path.join(OUT_DIR, "codegen.jsonl")
    out_sum   = os.path.join(OUT_DIR, "codegen_summary.json")

    write_h5(out_h5, instr, inp, out)
    write_pkl(out_pkl, data, meta)
    write_yaml(out_yaml, meta)
    write_jsonl(out_jsonl, instr, inp, out)
    # The JSON summary carries the same metadata as the YAML config.
    with open(out_sum, "w", encoding="utf-8") as f:
        json.dump(meta, f, ensure_ascii=False, indent=2)

    print("=== CodeGenerator Artifacts Written ===")
    print(f"H5:    {out_h5}")
    print(f"PKL:   {out_pkl}")
    print(f"YAML:  {out_yaml}")
    print(f"JSONL: {out_jsonl}")
    print(f"SUM:   {out_sum}")
    print("\nSizes:", meta["sizes"])
    print("Columns used:", meta["columns_used"])
    print("Stats:", json.dumps(stats, indent=2))

# Run the conversion only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()


=== CodeGenerator Artifacts Written ===
H5:    C:\Users\sagni\Downloads\Code Generator\codegen.h5
PKL:   C:\Users\sagni\Downloads\Code Generator\codegen.pkl
YAML:  C:\Users\sagni\Downloads\Code Generator\codegen_config.yaml
JSONL: C:\Users\sagni\Downloads\Code Generator\codegen.jsonl
SUM:   C:\Users\sagni\Downloads\Code Generator\codegen_summary.json

Sizes: {'train': 18612}
Columns used: {'instruction_column_used': 'instruction', 'input_column_used': 'input', 'output_column_used': 'output'}
Stats: {
  "num_examples": 18612,
  "instruction_chars": {
    "min": 21,
    "max": 7755,
    "mean": 98.19,
    "median": 83.0
  },
  "input_chars": {
    "min": 1,
    "max": 800,
    "mean": 29.24,
    "median": 14.0
  },
  "output_chars": {
    "min": 1,
    "max": 20778,
    "mean": 465.94,
    "median": 260.0
  },
  "output_lines": {
    "min": 1,
    "max": 542,
    "mean": 16.88,
    "median": 11.0
  }
}
