In [2]:
pip install pandas pyyaml tables


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [5]:
# ==========================================================
# NewsSense — FINAL Writers: PKL.GZ / H5 (min_itemsize) / JSONL / YAML
# ==========================================================
# Inputs:
#   "C:\Users\sagni\Downloads\News Sense\archive\News _dataset\True.csv"
#   "C:\Users\sagni\Downloads\News Sense\archive\News _dataset\Fake.csv"
#
# Outputs:
#   C:\Users\sagni\Downloads\News Sense
#     - news_dataset.pkl.gz
#     - news_dataset.h5                  (with per-column min_itemsize)
#     - news_dataset.jsonl               (streaming)
#     - news_dataset.yaml                (subset; configurable)
#     - news_dataset_meta.yaml           (schema + class balance; YAML-safe)
# ==========================================================

from pathlib import Path
import json
import math
import numpy as np
import pandas as pd

# ---------- Optional YAML ----------
# PyYAML is optional: HAVE_YAML records whether the YAML writers can run.
try:
    import yaml
    HAVE_YAML = True
except ImportError:
    # Only a missing package is expected here; the warning text says exactly
    # that, so any other failure is allowed to surface instead of being hidden.
    HAVE_YAML = False
    print("[WARN] PyYAML not installed; YAML output will be skipped.")

# ---------- Paths (edit OUT_DIR if disk is tight) ----------
# Two input CSVs (one per class) and the directory that receives every export.
TRUE_PATH = Path(r"C:\Users\sagni\Downloads\News Sense\archive\News _dataset\True.csv")
FAKE_PATH = Path(r"C:\Users\sagni\Downloads\News Sense\archive\News _dataset\Fake.csv")
OUT_DIR = Path(r"C:\Users\sagni\Downloads\News Sense")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Every export shares one stem and differs only by its suffix.
BASE = "news_dataset"
PKL_PATH = OUT_DIR.joinpath(BASE + ".pkl.gz")
H5_PATH = OUT_DIR.joinpath(BASE + ".h5")
JSONL_PATH = OUT_DIR.joinpath(BASE + ".jsonl")
YAML_PATH = OUT_DIR.joinpath(BASE + ".yaml")
META_YAML = OUT_DIR.joinpath(BASE + "_meta.yaml")

# ---------- Writer config ----------
# Rows serialized per JSONL chunk.
JSONL_CHUNK_ROWS = 20_000
# Row cap for the YAML dump; set None to dump all rows (not recommended for huge files).
YAML_MAX_ROWS = 5_000
# Compression for HDF5 where supported.
H5_COMPLIB = "blosc:zstd"
H5_COMPLEVEL = 5

# ---------- Load ----------
# Read each CSV, tag every row with its class, then stack the two frames
# under a fresh 0..n-1 index.
print("[INFO] Loading datasets...")
true_df = pd.read_csv(TRUE_PATH, low_memory=False).assign(label=0)   # 0 = True/Real
fake_df = pd.read_csv(FAKE_PATH, low_memory=False).assign(label=1)   # 1 = Fake

df = pd.concat([true_df, fake_df], ignore_index=True)
print("[INFO] Combined shape:", df.shape)
print("[INFO] Columns:", df.columns.tolist())

# ---------- Helpers ----------
def to_py(v):
    """Convert a numpy/pandas scalar to a plain Python value.

    Conversions:
      * numpy integer / floating / bool -> int / float / bool
      * NaN, NaT, pd.NA and None        -> None
      * pd.Timestamp                    -> ISO-8601 string

    Any other value (str, native numbers, containers, ...) is returned
    unchanged.
    """
    if v is None:
        return None
    if isinstance(v, np.integer):
        return int(v)
    if isinstance(v, np.floating):
        # a missing float is reported as None rather than NaN
        return None if np.isnan(v) else float(v)
    if isinstance(v, np.bool_):
        return bool(v)
    if isinstance(v, pd.Timestamp):
        return v.isoformat()
    # pd.isna() works element-wise on lists/arrays, which would make the truth
    # test ambiguous (ValueError); only scalars can be "missing" here.
    if pd.api.types.is_scalar(v) and pd.isna(v):
        return None
    return v  # str, python native, or a container passed through untouched

def compute_meta_yaml_safe(df_in: pd.DataFrame) -> dict:
    """Summarise a frame as a dict built only from plain Python values.

    Returns a dict with:
      * "rows" / "columns": the frame's dimensions
      * "label_counts": value counts of the "label" column (NaN included),
        keyed by str(label); empty when there is no "label" column
      * "columns_info": one entry per column with its name, dtype string,
        non-null and null counts, and an example value taken from the first
        non-null entry among the leading 100 rows (None if there is none)
    """
    yaml_scalars = (str, int, float, bool)
    columns_info = []
    for col_name in df_in.columns:
        column = df_in[col_name]
        # first non-null value in the leading rows, converted through to_py
        sample = next(
            (to_py(item) for item in column.head(100).tolist() if pd.notna(item)),
            None,
        )
        # anything that is not a basic scalar is stored as its string form
        if sample is not None and not isinstance(sample, yaml_scalars):
            sample = str(sample)
        columns_info.append({
            "name": str(col_name),
            "dtype": str(column.dtype),
            "non_null": int(column.notna().sum()),
            "nulls": int(column.isna().sum()),
            "example": sample,
        })

    if "label" in df_in.columns:
        raw_counts = df_in["label"].value_counts(dropna=False).to_dict()
    else:
        raw_counts = {}
    label_counts = {str(label): int(n) for label, n in raw_counts.items()}

    return {
        "rows": int(len(df_in)),
        "columns": int(len(df_in.columns)),
        "label_counts": label_counts,
        "columns_info": columns_info,
    }

def build_min_itemsize(df_in: pd.DataFrame, cap: int = 65500, headroom: int = 64) -> dict:
    """
    Compute min_itemsize per object column for HDF5(table). Adds small headroom.

    Lengths are measured in UTF-8 encoded bytes rather than characters: the
    string width of an HDF5 table column is a byte count, so counting
    characters under-sizes any column that holds non-ASCII text.

    Parameters:
        df_in:    frame whose object-dtype columns are measured.
        cap:      upper bound for any returned size (protects against absurd
                  lengths; PyTables practical limit ~65k).
        headroom: extra bytes added on top of the longest value.

    Returns:
        dict mapping column name -> size, for every object column whose
        computed size is positive.
    """
    mi = {}
    for c in df_in.select_dtypes(include=["object"]).columns:
        try:
            longest = df_in[c].astype(str).str.encode("utf-8").str.len().max()
        except (TypeError, ValueError, AttributeError):
            # best effort: a column that cannot be measured (e.g. text that
            # cannot be encoded) falls back to headroom only
            longest = 0
        # .max() of an empty column is NaN; treat that as "no text"
        max_len = 0 if pd.isna(longest) else int(longest)
        size = min(cap, max_len + headroom)
        if size > 0:
            mi[c] = size
    return mi

def coerce_for_hdf(df_in: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df_in`` with HDF5-friendly dtypes.

    Object columns are cast to str; integer columns to int64; float columns
    to float64; boolean columns to bool. Other dtypes are left untouched and
    the input frame is not modified.
    """
    out = df_in.copy()

    # text first: every object column becomes a column of str values
    for name in out.select_dtypes(include=["object"]).columns:
        out[name] = out[name].astype(str)

    # then widen / normalise the numeric and boolean columns
    dtype_checks = pd.api.types
    for name in out.columns:
        col = out[name]
        if dtype_checks.is_integer_dtype(col):
            target = "int64"
        elif dtype_checks.is_float_dtype(col):
            target = "float64"
        elif dtype_checks.is_bool_dtype(col):
            target = bool
        else:
            continue
        out[name] = col.astype(target, copy=False)
    return out

# ---------- 1) Compressed Pickle ----------
# gzip-compressed pickle of the combined frame. An OS-level failure
# is reported and the remaining writers still run.
try:
    print(f"[WRITE] Pickle (gzip) -> {PKL_PATH}")
    df.to_pickle(PKL_PATH, compression="gzip")
except OSError as pickle_err:
    print(f"[ERROR] Pickle write failed: {pickle_err}")

# ---------- 2) HDF5 with per-column min_itemsize ----------
# Written in "table" format (rather than "fixed") for queryability.
# The whole step is best-effort: any failure is reported as a warning and the
# remaining writers still run.
try:
    df_h5 = coerce_for_hdf(df)
    min_itemsize = build_min_itemsize(df_h5, cap=65500, headroom=64)
    print("[INFO] HDF5 min_itemsize:", min_itemsize)

    # Remove a prior file to avoid schema conflicts. A failed delete is
    # reported instead of being silently ignored; the write below is still
    # attempted with mode="w".
    if H5_PATH.exists():
        try:
            H5_PATH.unlink()
        except OSError as unlink_err:
            print(f"[WARN] Could not remove existing HDF5 file {H5_PATH}: {unlink_err}")

    print(f"[WRITE] HDF5 (table, compressed) -> {H5_PATH}")
    df_h5.to_hdf(
        H5_PATH,
        key="news",
        mode="w",
        format="table",            # appendable/queryable
        complib=H5_COMPLIB,
        complevel=H5_COMPLEVEL,
        min_itemsize=min_itemsize  # per-column string widths computed above
    )
except Exception as e:
    print(f"[WARN] HDF5 write skipped/failed: {e}\n       Hint: ensure 64‑bit Python, `pip install tables`, and ample free disk.")

# ---------- 3) JSONL (streaming, robust) ----------
# One JSON object per line, produced JSONL_CHUNK_ROWS rows at a time so only a
# single chunk is converted to Python records at once.
try:
    print(f"[WRITE] JSONL (streaming) -> {JSONL_PATH}")
    total = len(df)
    n_chunks = math.ceil(total / JSONL_CHUNK_ROWS)
    with open(JSONL_PATH, "w", encoding="utf-8") as f:
        for i in range(n_chunks):
            start = i * JSONL_CHUNK_ROWS
            stop = min(start + JSONL_CHUNK_ROWS, total)
            rows = df.iloc[start:stop]          # slice once, reuse for the mask
            chunk = rows.where(pd.notna(rows), None)
            for rec in chunk.to_dict(orient="records"):
                # normalize to python types
                for k, v in list(rec.items()):
                    if isinstance(v, (np.integer,)):
                        rec[k] = int(v)
                    elif isinstance(v, (float, np.floating)):
                        # Covers native Python floats too: a NaN left in place
                        # would be written as the bare token NaN, which is not
                        # valid JSON.
                        rec[k] = None if math.isnan(v) else float(v)
                    elif isinstance(v, (np.bool_,)):
                        rec[k] = bool(v)
                f.write(json.dumps(rec, ensure_ascii=False) + "\n")
            print(f"  - JSONL wrote chunk {i+1}/{n_chunks} ({stop - start} rows)")
except OSError as write_err:
    print(f"[ERROR] JSONL write failed: {write_err}\n       Tip: change OUT_DIR to a drive with more free space.")

# ---------- 4) YAML (meta + subset) ----------
# Two files: a schema / class-balance summary, then the rows themselves
# (only the first YAML_MAX_ROWS rows unless that setting is None or not
# smaller than the frame).
if not HAVE_YAML:
    print("[INFO] Skipping YAML (PyYAML not installed).")
else:
    try:
        meta = compute_meta_yaml_safe(df)
        print(f"[WRITE] Meta YAML -> {META_YAML}")
        with open(META_YAML, "w", encoding="utf-8") as meta_file:
            yaml.safe_dump(meta, meta_file, allow_unicode=True, sort_keys=False)

        dump_everything = YAML_MAX_ROWS is None or YAML_MAX_ROWS >= len(df)
        if dump_everything:
            subset = df
            print(f"[INFO] Writing full YAML with {len(df)} rows (may be large)")
        else:
            subset = df.iloc[:YAML_MAX_ROWS]
            print(f"[INFO] Writing YAML subset of {len(subset)} rows (YAML_MAX_ROWS={YAML_MAX_ROWS})")

        # Convert every cell to a python-native value; anything that is still
        # not a basic scalar afterwards is stored as its string form.
        yaml_scalars = (str, int, float, bool)
        subset_py = []
        for row in subset.where(pd.notna(subset), None).to_dict(orient="records"):
            clean = {}
            for key, raw in row.items():
                val = to_py(raw)
                if val is not None and not isinstance(val, yaml_scalars):
                    val = str(val)
                clean[key] = val
            subset_py.append(clean)

        print(f"[WRITE] YAML -> {YAML_PATH}")
        with open(YAML_PATH, "w", encoding="utf-8") as rows_file:
            yaml.safe_dump(subset_py, rows_file, allow_unicode=True, sort_keys=False)

    except OSError as yaml_err:
        print(f"[WARN] YAML write failed: {yaml_err}\n       Tip: reduce YAML_MAX_ROWS or move OUT_DIR to a larger drive.")

# ---------- Done ----------
# List every target path with a short description; the YAML entries are
# listed only when PyYAML was available.
print("\n[DONE] Outputs in:", OUT_DIR)
written = [
    (PKL_PATH, "(compressed pickle)"),
    (H5_PATH, "(HDF5; if present)"),
    (JSONL_PATH, "(JSON Lines)"),
]
if HAVE_YAML:
    written.append((YAML_PATH, "(YAML subset)"))
    written.append((META_YAML, "(metadata YAML)"))
for target, note in written:
    print(" -", target, note)


[INFO] Loading datasets...
[INFO] Combined shape: (44898, 5)
[INFO] Columns: ['title', 'text', 'subject', 'date', 'label']
[WRITE] Pickle (gzip) -> C:\Users\sagni\Downloads\News Sense\news_dataset.pkl.gz
[INFO] HDF5 min_itemsize: {'title': 350, 'text': 51858, 'subject': 79, 'date': 213}
[WRITE] HDF5 (table, compressed) -> C:\Users\sagni\Downloads\News Sense\news_dataset.h5


  expected_mb = (expectedrows * rowsize) // MB


[WRITE] JSONL (streaming) -> C:\Users\sagni\Downloads\News Sense\news_dataset.jsonl
  - JSONL wrote chunk 1/3 (20000 rows)
  - JSONL wrote chunk 2/3 (20000 rows)
  - JSONL wrote chunk 3/3 (4898 rows)
[WRITE] Meta YAML -> C:\Users\sagni\Downloads\News Sense\news_dataset_meta.yaml
[INFO] Writing YAML subset of 5000 rows (YAML_MAX_ROWS=5000)
[WRITE] YAML -> C:\Users\sagni\Downloads\News Sense\news_dataset.yaml

[DONE] Outputs in: C:\Users\sagni\Downloads\News Sense
 - C:\Users\sagni\Downloads\News Sense\news_dataset.pkl.gz (compressed pickle)
 - C:\Users\sagni\Downloads\News Sense\news_dataset.h5 (HDF5; if present)
 - C:\Users\sagni\Downloads\News Sense\news_dataset.jsonl (JSON Lines)
 - C:\Users\sagni\Downloads\News Sense\news_dataset.yaml (YAML subset)
 - C:\Users\sagni\Downloads\News Sense\news_dataset_meta.yaml (metadata YAML)
