In [1]:
# ==========================================================
# EduVision dataset packer: CSV -> PKL, HDF5, JSONL, YAML
# ==========================================================
import os, sys, json, math, gc
from pathlib import Path
from typing import Dict, List
import pandas as pd

# ----------------------------
# INPUT FILES (edit if needed)
# ----------------------------
# Source CSV files to convert. Each entry is handled independently by the
# main loop further down; a path that does not exist on disk is reported
# with a [SKIP] message and ignored.
inputs = [
    # Word-embedding tables
    r"C:\Users\sagni\Downloads\Edu Vision\archive\Word Embeddings\MOOC_50d.csv",
    r"C:\Users\sagni\Downloads\Edu Vision\archive\Word Embeddings\MOOC_300d.csv",
    r"C:\Users\sagni\Downloads\Edu Vision\archive\Word Embeddings\MOOC_200d.csv",
    r"C:\Users\sagni\Downloads\Edu Vision\archive\Word Embeddings\MOOC_100d.csv",
    # Topic-vector tables
    r"C:\Users\sagni\Downloads\Edu Vision\archive\Topic Vectors\DT50.csv",
    r"C:\Users\sagni\Downloads\Edu Vision\archive\Topic Vectors\DT100.csv",
    r"C:\Users\sagni\Downloads\Edu Vision\archive\Topic Vectors\DT200.csv",
    r"C:\Users\sagni\Downloads\Edu Vision\archive\Topic Vectors\DT300.csv",
    r"C:\Users\sagni\Downloads\Edu Vision\archive\Topic Vectors\Fine_grained_Categories.csv",
    r"C:\Users\sagni\Downloads\Edu Vision\archive\Topic Vectors\General_Level_Categories.csv",
]

# ----------------------------
# OUTPUT DIRECTORY
# ----------------------------
# Every generated artifact is written directly into this folder, named after
# the stem of the CSV it was produced from.
OUT_DIR = Path(r"C:\Users\sagni\Downloads\Edu Vision")
# Create the folder (and any missing parents) up front; an already existing
# folder is not an error.
OUT_DIR.mkdir(parents=True, exist_ok=True)

# ----------------------------
# CONFIG
# ----------------------------
JSONL_CHUNK_ROWS = 50_000   # rows converted per batch when streaming JSONL (bounds peak memory)
YAML_PREVIEW_ROWS = 200     # leading rows copied into the preview YAML (not the entire dataset)
HDF5_KEY = "data"           # key passed to DataFrame.to_hdf for the stored table
HDF5_COMP = dict(complevel=5, complib="blosc")  # compression keyword arguments forwarded to DataFrame.to_hdf
PRINT_HEAD = 3              # rows echoed to the console after each CSV is read

# ----------------------------
# HELPERS
# ----------------------------
def basename_no_ext(p: Path) -> str:
    """Return the final path component of ``p`` with its last suffix removed.

    Only the last suffix is stripped, so ``x.tar.gz`` yields ``x.tar``.
    """
    name_without_suffix = p.stem
    return name_without_suffix

def compute_min_itemsize(df: pd.DataFrame) -> Dict[str, int]:
    """
    Compute an HDF5 ``min_itemsize`` for every object-dtype column of ``df``.

    HDF5 string columns are fixed width, so the declared width has to fit the
    longest value that will be stored. The width is therefore derived from the
    maximum length of the non-null values, measured in UTF-8 encoded bytes.
    (A percentile of the character count would declare a width that the
    longest values, or any non-ASCII values, do not fit into.)

    Args:
        df: Frame whose object-dtype columns should be measured.

    Returns:
        Mapping of column name to width: longest value plus 4 of padding,
        never below 8 and capped at 50,000. Columns without any non-null
        value are omitted; a column that cannot be measured gets 1024.
    """
    sizes: Dict[str, int] = {}
    for col in df.columns:
        if df[col].dtype != object:
            continue
        try:
            values = df[col].dropna()
            if values.empty:
                continue
            # Measure encoded bytes rather than characters: multi-byte
            # characters occupy more than one unit of the stored width.
            byte_lens = values.astype(str).map(lambda s: len(s.encode("utf-8")))
            longest = int(byte_lens.max())
            # Pad a little; the cap keeps one pathological cell from
            # inflating the fixed width of every row in the column.
            sizes[col] = max(8, min(longest + 4, 50_000))
        except Exception:
            # Best-effort fallback for values that cannot be measured.
            sizes[col] = 1024
    return sizes

def to_jsonl_stream(df: pd.DataFrame, out_path: Path, chunk_rows: int = 50_000):
    """Stream ``df`` to a JSON Lines file, one row (record) per line.

    Rows are converted in batches of ``chunk_rows`` so that only one batch of
    Python dicts is alive at a time. Missing values are written as JSON
    ``null``. An empty frame produces an empty file.

    Args:
        df: Frame to serialize.
        out_path: Destination file; overwritten if it exists.
        chunk_rows: Number of rows converted per batch; must be positive.

    Raises:
        ValueError: If ``chunk_rows`` is not positive.
    """
    # Validate before opening so a bad argument does not clobber the file.
    if chunk_rows <= 0:
        raise ValueError(f"chunk_rows must be a positive integer, got {chunk_rows!r}")

    total = len(df)
    with open(out_path, "w", encoding="utf-8") as f:
        for start in range(0, total, chunk_rows):
            chunk = df.iloc[start:start + chunk_rows]
            # Cast to object before masking: on a float column
            # ``where(..., None)`` keeps NaN, which json.dumps would emit as
            # the bare token ``NaN`` (not valid JSON).
            chunk = chunk.astype(object).where(chunk.notna(), None)
            f.writelines(
                json.dumps(rec, ensure_ascii=False) + "\n"
                for rec in chunk.to_dict(orient="records")
            )
            del chunk
            gc.collect()

def save_yaml(data, path: Path):
    """Write ``data`` to ``path`` as YAML, on a best-effort basis.

    Any failure (PyYAML not installed, a value that ``safe_dump`` cannot
    represent, an I/O error) is reported as a ``[WARN]`` line on stdout and
    otherwise ignored; nothing is raised to the caller.

    Args:
        data: Object to serialize; must consist of types ``yaml.safe_dump``
            accepts.
        path: Destination file; overwritten on success.
    """
    try:
        import yaml
        # Serialize completely before opening the file so that a dump failure
        # does not leave a truncated or empty YAML file behind.
        text = yaml.safe_dump(data, allow_unicode=True, sort_keys=False)
        with open(path, "w", encoding="utf-8") as f:
            f.write(text)
    except Exception as e:
        print(f"[WARN] Could not write YAML {path.name}: {e}")

def summarize_df(df: pd.DataFrame) -> Dict:
    """Describe the structure of ``df`` as a plain dictionary.

    Returns:
        A dict with the row count (``rows``), the column count (``cols``),
        the column labels in order (``columns``) and each column's dtype
        rendered as a string (``dtypes``).
    """
    n_rows, n_cols = df.shape
    return {
        "rows": int(n_rows),
        "cols": int(n_cols),
        "columns": list(df.columns),
        "dtypes": df.dtypes.astype(str).to_dict(),
    }

# ----------------------------
# MAIN LOOP
# ----------------------------
# For every input CSV: load it, then write a pickle, an HDF5 table, a JSON
# Lines file, a metadata YAML and a preview YAML into OUT_DIR. Each output
# format is attempted independently, so one failing format does not prevent
# the others from being written.
for raw in inputs:
    in_path = Path(raw)
    if not in_path.exists():
        print(f"[SKIP] Not found: {in_path}")
        continue

    base = basename_no_ext(in_path)
    print(f"\n[INFO] Reading: {in_path}")
    try:
        # Default C parser with the default ',' separator.
        df = pd.read_csv(in_path, low_memory=False)
    except Exception as e_csv:
        print(f"[WARN] Default CSV parse failed ({e_csv}); retrying with python engine")
        # Fallback for delimiter problems: the python engine can sniff the
        # separator (sep=None). low_memory must not be passed here because
        # the python engine rejects that option, which would make the
        # fallback fail unconditionally.
        try:
            df = pd.read_csv(in_path, engine="python", sep=None)
        except Exception as e2:
            print(f"[ERROR] Failed to read {in_path}: {e2}")
            continue

    print(f"[INFO] -> shape: {df.shape}")
    try:
        print(df.head(PRINT_HEAD))
    except Exception:
        # The console preview is purely informational; never abort on it.
        pass

    # Output paths
    pkl_path   = OUT_DIR / f"{base}.pkl"
    h5_path    = OUT_DIR / f"{base}.h5"
    jsonl_path = OUT_DIR / f"{base}.jsonl"
    yaml_meta  = OUT_DIR / f"{base}_meta.yaml"
    yaml_prev  = OUT_DIR / f"{base}_preview.yaml"

    # --------- PKL ----------
    try:
        df.to_pickle(pkl_path)
        print(f"[WRITE] Pickle  -> {pkl_path}")
    except Exception as e:
        print(f"[WARN] Could not write PKL: {e}")

    # --------- HDF5 ----------
    try:
        min_items = compute_min_itemsize(df)
        # to_hdf will error if PyTables missing.
        # data_columns is left at its default (None): the documented values
        # are a list of columns, True or None, so False is not passed.
        # NOTE(review): pandas appears to promote the min_itemsize keys to
        # data columns on its own - confirm against the installed version.
        df.to_hdf(
            h5_path, key=HDF5_KEY, mode="w",
            format="table",
            min_itemsize=min_items if min_items else None,
            **HDF5_COMP
        )
        print(f"[WRITE] HDF5    -> {h5_path}")
    except Exception as e:
        print(f"[WARN] Could not write HDF5: {e}\n       Hint: pip install tables (64-bit)")

    # --------- JSONL (streaming) ----------
    try:
        to_jsonl_stream(df, jsonl_path, chunk_rows=JSONL_CHUNK_ROWS)
        print(f"[WRITE] JSONL   -> {jsonl_path}")
    except Exception as e:
        print(f"[WARN] Could not write JSONL: {e}")

    # --------- YAMLs ----------
    # meta YAML (lightweight): shape, column names and dtypes
    meta = summarize_df(df)
    # Optional extras: per-column non-null counts and, for object columns,
    # the 99th percentile of the string length.
    try:
        meta["non_null_counts"] = df.notna().sum().astype(int).to_dict()
        obj_cols = [c for c in df.columns if df[c].dtype == object]
        obj_len = {}
        for c in obj_cols:
            try:
                obj_len[c] = int(df[c].dropna().astype(str).str.len().quantile(0.99))
            except Exception:
                # e.g. a column with no non-null values has no percentile
                obj_len[c] = None
        meta["object_length_p99"] = obj_len
    except Exception as e:
        # Still write the basic summary, but say that the extras are missing.
        print(f"[WARN] Could not compute extended metadata: {e}")
    save_yaml(meta, yaml_meta)
    print(f"[WRITE] YAML(meta) -> {yaml_meta}")

    # preview YAML (first N rows only)
    try:
        head = df.head(YAML_PREVIEW_ROWS)
        # Cast to object before masking so missing values in numeric columns
        # become None (YAML null) instead of staying NaN.
        prev = head.astype(object).where(head.notna(), None)
        # Convert preview to list of dicts to be YAML-friendly
        preview_records = prev.to_dict(orient="records")
        save_yaml({"preview_rows": preview_records}, yaml_prev)
        print(f"[WRITE] YAML(preview) -> {yaml_prev}")
    except Exception as e:
        print(f"[WARN] Could not write preview YAML: {e}")

    # Release the frame before the next (possibly large) CSV is loaded.
    del df
    gc.collect()

print("\n[DONE] Artifacts written to:", OUT_DIR)
print("Formats per file: .pkl, .h5, .jsonl, _meta.yaml, _preview.yaml")



[INFO] Reading: C:\Users\sagni\Downloads\Edu Vision\archive\Word Embeddings\MOOC_50d.csv
[INFO] -> shape: (68175, 51)
  Unnamed: 0         0         1         2         3         4         5  \
0      going  0.218232  0.997149 -0.788738  0.783706  1.412624  0.439875   
1       just  0.328883 -0.323240  0.620974  0.063740  1.963643 -1.429922   
2       like -1.443751  0.700020 -0.744764  1.168110  1.252286 -2.420375   

          6         7         8  ...        40        41        42        43  \
0  3.179963 -0.552562  1.954441  ... -0.480890  0.680463  0.062857  0.563106   
1  2.479352 -0.720906  0.302058  ... -1.106233 -0.080174  1.540606  0.702087   
2  1.260556  1.910394  0.720410  ... -0.955688 -0.727729  1.993031 -0.579122   

         44        45        46        47        48        49  
0 -0.822027  0.301155 -0.354391  0.348030  1.651381 -1.720476  
1  0.237602 -0.399902 -1.076104 -0.808779  0.899297 -0.640511  
2  0.531914 -2.246910 -0.468811 -0.434437  1.417524  0.564740  