In [2]:
import os
import pickle
import pandas as pd
import h5py
import json
import yaml
import numpy as np

# ==============================
# Input / output locations
# ==============================
input_path = r"C:\Users\sagni\Downloads\RAG based Medical FAQ Chatbot\archive\train.csv"
output_dir = r"C:\Users\sagni\Downloads\RAG based Medical FAQ Chatbot"

# Make sure the destination folder exists; every export below writes into it.
os.makedirs(output_dir, exist_ok=True)

print(f"Loading CSV from: {input_path}")
df = pd.read_csv(input_path)

# DataFrame.shape is (number of rows, number of columns).
print(f"Loaded {df.shape[0]} rows and {df.shape[1]} columns")

# ==============================
# Pickle export
# ==============================
pkl_path = os.path.join(output_dir, "train.pkl")
with open(pkl_path, "wb") as pkl_file:
    # Pickler(file).dump(obj) is the documented equivalent of
    # pickle.dump(obj, file); the whole DataFrame is serialized as-is.
    pickle.Pickler(pkl_file).dump(df)
print(f"✅ Saved PKL -> {pkl_path}")

# ==============================
# HDF5 export (one dataset per column, text stored as UTF-8)
# ==============================
h5_path = os.path.join(output_dir, "train.h5")

# Variable-length UTF-8 string dtype used for every non-numeric column
utf8_str_dtype = h5py.string_dtype(encoding="utf-8")

# Filter settings shared by every dataset written below
h5_filters = {"compression": "gzip", "compression_opts": 9, "shuffle": True}

with h5py.File(h5_path, "w") as h5_file:
    for column_name in df.columns:
        column = df[column_name]

        if pd.api.types.is_numeric_dtype(column):
            # Numeric columns are written as their plain NumPy array
            dataset_kwargs = {"data": column.to_numpy()}
        else:
            # Everything else is treated as text: cast to pandas' string
            # dtype, replace missing values with "", and hand h5py an
            # object array so the strings are stored with the UTF-8
            # variable-length dtype rather than a fixed-width NumPy one.
            as_text = column.astype("string").fillna("").to_numpy(dtype=object)
            dataset_kwargs = {"data": as_text, "dtype": utf8_str_dtype}

        h5_file.create_dataset(name=column_name, **dataset_kwargs, **h5_filters)

print(f"✅ Saved H5  -> {h5_path}")

# ==============================
# JSON export (UTF-8, line-delimited)
# ==============================
json_path = os.path.join(output_dir, "train.json")

# One JSON object per row, one row per line; non-ASCII characters are
# written literally instead of being \u-escaped.
json_options = {"orient": "records", "lines": True, "force_ascii": False}
df.to_json(json_path, **json_options)
print(f"✅ Saved JSON -> {json_path}")

# ==============================
# YAML export (UTF-8)
# ==============================
yaml_path = os.path.join(output_dir, "train.yaml")
with open(yaml_path, "w", encoding="utf-8") as yaml_file:
    # The frame is dumped as a list of per-row dicts; allow_unicode keeps
    # non-ASCII text readable instead of escaping it.
    row_records = df.to_dict(orient="records")
    yaml.dump(row_records, stream=yaml_file, allow_unicode=True)
print(f"✅ Saved YAML -> {yaml_path}")

print("\n🎉 Conversion complete!")


Loading CSV from: C:\Users\sagni\Downloads\RAG based Medical FAQ Chatbot\archive\train.csv
Loaded 16407 rows and 3 columns
✅ Saved PKL -> C:\Users\sagni\Downloads\RAG based Medical FAQ Chatbot\train.pkl
✅ Saved H5  -> C:\Users\sagni\Downloads\RAG based Medical FAQ Chatbot\train.h5
✅ Saved JSON -> C:\Users\sagni\Downloads\RAG based Medical FAQ Chatbot\train.json
✅ Saved YAML -> C:\Users\sagni\Downloads\RAG based Medical FAQ Chatbot\train.yaml

🎉 Conversion complete!
