# In [1]:
# ==========================================================
# FraudLens — CSV -> PKL / H5 / JSON / YAML  (Robust IO)
# ==========================================================
# Input CSV:
#   C:\Users\sagni\Downloads\FraudLens\archive\creditcard_2023.csv
#
# Outputs (all saved here):
#   C:\Users\sagni\Downloads\FraudLens
#     - creditcard_2023.pkl
#     - creditcard_2023.h5
#     - creditcard_2023.json
#     - creditcard_2023.yaml
#
# Notes:
# - HDF5 requires `tables` package and 64‑bit Python.
# - YAML requires `pyyaml`.
# - Script coerces types for HDF5, and converts NaN/NA -> None for JSON/YAML.
# ==========================================================

from pathlib import Path
import json
import numpy as np
import pandas as pd

# ---------- Optional YAML ----------
# PyYAML is an optional dependency: when it is missing, the YAML export is
# skipped and every other format is still written. Only ImportError is
# treated as "not installed"; any other failure while importing is a real
# problem and is allowed to surface.
try:
    import yaml
    HAVE_YAML = True
except ImportError:
    HAVE_YAML = False
    print("[WARN] PyYAML not installed; YAML output will be skipped.")

# ---------- Paths ----------
# Source CSV and the directory that receives every converted file.
CSV_PATH = Path(r"C:\Users\sagni\Downloads\FraudLens\archive\creditcard_2023.csv")
OUT_DIR  = Path(r"C:\Users\sagni\Downloads\FraudLens")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Every output shares the CSV's stem and differs only by extension.
BASE_NAME = CSV_PATH.stem  # "creditcard_2023"
PKL_PATH, H5_PATH, JSON_PATH, YAML_PATH = (
    OUT_DIR / f"{BASE_NAME}{ext}" for ext in (".pkl", ".h5", ".json", ".yaml")
)

# ---------- Load CSV ----------
# low_memory=False makes pandas read the file in one pass instead of in
# internal chunks.
print(f"[INFO] Loading CSV: {CSV_PATH}")
df = pd.read_csv(CSV_PATH, low_memory=False)
print("[INFO] Shape:", df.shape)
print("[INFO] Columns:", df.columns.tolist())

# ---------- Helpers for clean serialization ----------
def coerce_for_hdf(df_in: pd.DataFrame) -> pd.DataFrame:
    """
    Return a copy of ``df_in`` whose columns use HDF5-friendly dtypes.

    Per-column rules, checked in this order:
      - datetimes -> "%Y-%m-%d %H:%M:%S" strings (object)
      - bool      -> bool, or float64 (1.0 / 0.0 / NaN) if values are missing
      - integer   -> int64, or float64 if values are missing
      - float     -> float64
      - everything else -> str (object), missing entries masked out with
        ``where(..., None)``

    The input frame is not modified.
    """
    g = df_in.copy()

    # Convert pandas extension types to normal NumPy dtypes where reasonable
    for col in g.columns:
        s = g[col]
        if pd.api.types.is_datetime64_any_dtype(s):
            # Store datetimes as ISO strings for HDF5 simplicity
            g[col] = s.dt.strftime("%Y-%m-%d %H:%M:%S").astype(object)
        elif pd.api.types.is_bool_dtype(s):
            # A nullable "boolean" column holding <NA> cannot be cast to plain
            # bool (astype raises), so fall back to float64 exactly like the
            # integer branch does for missing values.
            if s.isna().any():
                g[col] = s.astype("float64")
            else:
                g[col] = s.astype(bool)
        elif pd.api.types.is_integer_dtype(s):
            # Use float64 if there are missing values (to avoid NA-int issues)
            if s.isna().any():
                g[col] = s.astype("float64")
            else:
                g[col] = s.astype("int64")
        elif pd.api.types.is_float_dtype(s):
            g[col] = s.astype("float64")
        else:
            # Ensure plain Python strings for object columns; the mask keeps
            # originally-missing entries from turning into the text "nan".
            g[col] = s.astype(str).where(~s.isna(), None)

    return g

def _json_scalar(v):
    """Map one cell value to a JSON-serializable Python object (missing -> None)."""
    if v is None or v is pd.NA or v is pd.NaT:
        return None
    if isinstance(v, np.bool_):
        return bool(v)
    if isinstance(v, np.integer):
        return int(v)
    if isinstance(v, (float, np.floating)):
        # NaN has no JSON representation; json.dump would emit a bare `NaN`.
        return None if np.isnan(v) else float(v)
    return v  # str and native numbers are fine


def records_for_json(df_in: pd.DataFrame) -> list[dict]:
    """
    Convert DataFrame to a list of JSON-serializable dicts (one per row):
      - NaN/NA/NaT -> None
      - NumPy scalars -> Python scalars
      - datetime columns -> "%Y-%m-%d %H:%M:%S" strings

    Missing values are replaced per scalar rather than with
    ``DataFrame.where(..., None)``: on float columns pandas casts the None
    back to NaN, and nullable columns keep ``pd.NA``, so the frame-level
    replacement leaves values that ``json`` cannot serialize correctly.

    The input frame is not modified.
    """
    g = df_in.copy()

    # If any datetime dtype exists, convert to ISO string for JSON
    for col in g.columns:
        if pd.api.types.is_datetime64_any_dtype(g[col]):
            g[col] = g[col].dt.strftime("%Y-%m-%d %H:%M:%S")

    return [
        {k: _json_scalar(v) for k, v in rec.items()}
        for rec in g.to_dict(orient="records")
    ]

# Paths that were actually written. The final summary is driven by this list
# so that a failed best-effort format (HDF5, YAML) is not reported as saved.
written_paths = []

# ---------- Save: Pickle ----------
# Pickle stores the DataFrame as loaded, with no dtype coercion.
print(f"[WRITE] Pickle  -> {PKL_PATH}")
df.to_pickle(PKL_PATH)
written_paths.append(PKL_PATH)

# ---------- Save: HDF5 ----------
# Best-effort: a failure here is reported as a warning and the remaining
# formats are still written.
try:
    df_h5 = coerce_for_hdf(df)
    print(f"[WRITE] HDF5    -> {H5_PATH}")
    df_h5.to_hdf(H5_PATH, key="data", mode="w", format="table")
except Exception as e:
    print(f"[WARN] Could not write HDF5: {e}\n       Hint: pip install tables (64-bit Python)")
else:
    written_paths.append(H5_PATH)

# ---------- Save: JSON ----------
# The same `records` list is reused for the YAML export below.
print(f"[WRITE] JSON    -> {JSON_PATH}")
records = records_for_json(df)
with open(JSON_PATH, "w", encoding="utf-8") as f:
    json.dump(records, f, ensure_ascii=False, indent=2)
written_paths.append(JSON_PATH)

# ---------- Save: YAML ----------
if HAVE_YAML:
    try:
        print(f"[WRITE] YAML    -> {YAML_PATH}")
        with open(YAML_PATH, "w", encoding="utf-8") as f:
            yaml.safe_dump(records, f, allow_unicode=True, sort_keys=False)
    except Exception as e:
        print(f"[WARN] Could not write YAML: {e}")
    else:
        written_paths.append(YAML_PATH)
else:
    print("[INFO] Skipping YAML (PyYAML not installed).")

# ---------- Summary ----------
print("\n[DONE] All files saved in:", OUT_DIR)
for path in written_paths:
    print(" -", path)


# ---------- Sample output from running the cell above ----------
# [INFO] Loading CSV: C:\Users\sagni\Downloads\FraudLens\archive\creditcard_2023.csv
# [INFO] Shape: (568630, 31)
# [INFO] Columns: ['id', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount', 'Class']
# [WRITE] Pickle  -> C:\Users\sagni\Downloads\FraudLens\creditcard_2023.pkl
# [WRITE] HDF5    -> C:\Users\sagni\Downloads\FraudLens\creditcard_2023.h5
# [WRITE] JSON    -> C:\Users\sagni\Downloads\FraudLens\creditcard_2023.json
# [WRITE] YAML    -> C:\Users\sagni\Downloads\FraudLens\creditcard_2023.yaml
#
# [DONE] All files saved in: C:\Users\sagni\Downloads\FraudLens
#  - C:\Users\sagni\Downloads\FraudLens\creditcard_2023.pkl
#  - C:\Users\sagni\Downloads\FraudLens\creditcard_2023.h5
#  - C:\Users\sagni\Downloads\FraudLens\creditcard_2023.json
#  - C:\Users\sagni\Downloads\FraudLens\creditcard_2023.yaml
