In [2]:
import os
import json
import yaml
import pickle
import h5py
import pandas as pd
import numpy as np
from datetime import datetime

# ----------------------------------------------------
# PATH SETTINGS
# ----------------------------------------------------
# Machine-specific locations: the workbook that is read, and the folder that
# receives every exported artifact.
INPUT_EXCEL = r"C:\Users\NXTWAVE\Downloads\Infectious Pathogen Identifier\archive\Virus_Genome.xlsx"
OUTPUT_DIR  = r"C:\Users\NXTWAVE\Downloads\Infectious Pathogen Identifier"

# ----------------------------------------------------
# 1. Load dataset
# ----------------------------------------------------
# NOTE(review): read_excel takes the first sheet row as the header by default.
# The recorded run of this cell lists sequence-like column names plus the
# integer 50, so the sheet may not have a header row at all -- confirm whether
# header=None is what is actually wanted here.
print("[INFO] Loading dataset from Excel...")
df = pd.read_excel(INPUT_EXCEL)
print(f"[INFO] Loaded dataset shape: {df.shape}")
print(f"[INFO] Columns: {list(df.columns)}")

# ----------------------------------------------------
# 2. Basic cleaning (safe for numeric headers)
# ----------------------------------------------------
# Rows without a single value are discarded first; every missing cell that is
# still left afterwards becomes the literal string "NA".
df = df.dropna(how="all").fillna("NA")

# Headers are stringified before trimming (they may be numeric), then inner
# spaces are replaced with underscores.
df = df.rename(columns=lambda label: str(label).strip().replace(" ", "_"))

# Prepend a 1-based, zero-padded identifier column (S00001, S00002, ...)
# unless the sheet already provides one under that name.
if "Sample_ID" not in df.columns:
    df.insert(0, "Sample_ID", [f"S{n:05d}" for n in range(1, len(df) + 1)])

# ----------------------------------------------------
# 3. Convert to dictionary for export
# ----------------------------------------------------
# "meta" describes this conversion run; "data" maps each column name to the
# list of that column's values.
dataset_dict = {
    "meta": {
        "project": "PathoNet — Infectious Pathogen Identifier",
        "source_file": INPUT_EXCEL,
        "n_rows": df.shape[0],
        "n_cols": df.shape[1],
        "created_on": f"{datetime.now():%Y-%m-%d %H:%M:%S}",
        "notes": "Virus genome dataset converted to multiple formats (.h5, .pkl, .yaml, .json)",
    },
    "data": df.to_dict(orient="list"),
}

# ----------------------------------------------------
# 4. Define save functions
# ----------------------------------------------------
def save_json(obj, path):
    """Write *obj* to *path* as JSON indented by two spaces.

    The file is opened in text write mode, so an existing file is replaced.
    A ``[SAVED]`` line naming the path is printed once the file is closed.
    """
    with open(path, "w") as handle:
        # Render the whole document first, then emit it in a single write.
        handle.write(json.dumps(obj, indent=2))
    print(f"[SAVED] {path}")

def save_yaml(obj, path):
    """Write *obj* to *path* as YAML, keeping mapping keys in insertion order.

    The file is opened in text write mode, so an existing file is replaced.
    A ``[SAVED]`` line naming the path is printed once the file is closed.
    """
    with open(path, "w") as handle:
        # sort_keys=False stops PyYAML from reordering the mapping keys.
        rendered = yaml.dump(obj, sort_keys=False)
        handle.write(rendered)
    print(f"[SAVED] {path}")

def save_pickle(obj, path):
    """Pickle *obj* into the binary file at *path* using the default protocol.

    The file is opened in binary write mode, so an existing file is replaced.
    A ``[SAVED]`` line naming the path is printed once the file is closed.
    """
    with open(path, "wb") as sink:
        pickle.Pickler(sink).dump(obj)
    print(f"[SAVED] {path}")

def save_h5(df, meta, path):
    """Write *df* and *meta* into a freshly created HDF5 file at *path*.

    Layout of the file:
      * group ``data`` -- one dataset per column of *df*, named after the
        column, with every value converted to ``str`` and stored using
        h5py's UTF-8 string dtype;
      * group ``meta`` -- one attribute per item of *meta*, each value stored
        as ``str(value)``.

    The file is opened in "w" mode, so an existing file is replaced.
    A ``[SAVED]`` line naming the path is printed once the file is closed.
    """
    with h5py.File(path, "w") as store:
        # The string dtype is identical for every column, so build it once.
        utf8_text = h5py.string_dtype(encoding='utf-8')

        # Column datasets
        columns_group = store.create_group("data")
        for column in df.columns:
            as_text = df[column].astype(str).values
            columns_group.create_dataset(column, data=as_text, dtype=utf8_text)

        # Metadata attributes
        info_group = store.create_group("meta")
        for key, value in meta.items():
            info_group.attrs[key] = str(value)
    print(f"[SAVED] {path}")

# ----------------------------------------------------
# 5. Save all formats
# ----------------------------------------------------
# The target folder is created on demand; an existing folder is accepted.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# One artifact per format, all placed directly inside OUTPUT_DIR.
json_path, yaml_path, pkl_path, h5_path = (
    os.path.join(OUTPUT_DIR, filename)
    for filename in ("dataset.json", "dataset.yaml", "dataset.pkl", "dataset.h5")
)

# JSON, YAML and pickle receive the whole dictionary; the HDF5 writer takes
# the DataFrame together with the metadata mapping.
save_json(dataset_dict, json_path)
save_yaml(dataset_dict, yaml_path)
save_pickle(dataset_dict, pkl_path)
save_h5(df, dataset_dict["meta"], h5_path)

# ----------------------------------------------------
# 6. Summary
# ----------------------------------------------------
print("\n✅ [DONE] Phase 1 complete.")
print(f"All 4 artifacts saved under:\n{OUTPUT_DIR}")


[INFO] Loading dataset from Excel...
[INFO] Loaded dataset shape: (598, 6)
[INFO] Columns: ['ATTAAAGGTT', 'TATACCTTCC', 'CAGGTAACAA', 'ACCAACCAAC', 'TTTCGATCTC', 50]
[SAVED] C:\Users\NXTWAVE\Downloads\Infectious Pathogen Identifier\dataset.json
[SAVED] C:\Users\NXTWAVE\Downloads\Infectious Pathogen Identifier\dataset.yaml
[SAVED] C:\Users\NXTWAVE\Downloads\Infectious Pathogen Identifier\dataset.pkl
[SAVED] C:\Users\NXTWAVE\Downloads\Infectious Pathogen Identifier\dataset.h5

✅ [DONE] Phase 1 complete.
All 4 artifacts saved under:
C:\Users\NXTWAVE\Downloads\Infectious Pathogen Identifier
