In [1]:
import os
import json
import yaml
import joblib
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from tensorflow.keras.models import load_model

# ======================================================
# 📂 PATH CONFIGURATION
# ======================================================
# Project root; every other location in this script is derived from it.
BASE_DIR = r"C:\Users\NXTWAVE\Downloads\Genomic Disease Risk Predictor"

# Folder that holds the model artifacts and receives the prediction files.
OUT_DIR = os.path.join(BASE_DIR, "outputs")
os.makedirs(OUT_DIR, exist_ok=True)

# CSV with the samples to score.
TEST_CSV = os.path.join(BASE_DIR, "archive", "test.csv")

# Model and metadata artifacts, all located directly inside OUT_DIR.
MODEL_PKL, MODEL_H5, META_YAML = (
    os.path.join(OUT_DIR, artifact_name)
    for artifact_name in (
        "geneguard_model.pkl",
        "geneguard_model.h5",
        "geneguard_metadata.yaml",
    )
)

# ======================================================
# 📘 LOAD MODELS & METADATA
# ======================================================
print("[INFO] Loading models and metadata...")
# NOTE(security): joblib.load unpickles the file, which can execute arbitrary
# code. Only point MODEL_PKL at artifacts you produced yourself or trust.
rf_model = joblib.load(MODEL_PKL)
dnn_model = load_model(MODEL_H5)

# Read the metadata with an explicit encoding so the result does not depend
# on the platform's default code page.
with open(META_YAML, "r", encoding="utf-8") as f:
    # safe_load returns None for an empty document; fall back to an empty
    # mapping so the lookup below cannot fail with AttributeError.
    meta = yaml.safe_load(f) or {}

# Ordered list of feature column names the test data is aligned to.
# `or []` also covers a "features_used" key that is present but null.
feature_cols = meta.get("features_used") or []
if not feature_cols:
    # Fail here with a clear message instead of much later inside the scaler.
    raise ValueError(
        f"No 'features_used' entries found in metadata file: {META_YAML}"
    )
print(f"[INFO] Loaded {len(feature_cols)} features for prediction")

# ======================================================
# 📄 DEFINE DISEASE MAPPING (EDITABLE)
# ======================================================
# Disease names listed in class-ID order: the name at position k is the label
# reported for numeric class k (0-based). Edit the entries, keeping the order
# aligned with your dataset's class IDs, to change the mapping.
disease_map = dict(enumerate((
    "Diabetes",
    "Cardiac Disorder",
    "Cancer",
    "Obesity",
    "Hypertension",
    "Neurological Disorder",
    "Liver Disease",
    "Kidney Disorder",
    "Respiratory Disease",
    "No Disease Detected",
)))

# ======================================================
# 📘 LOAD TEST DATA
# ======================================================
print("[INFO] Reading test data...")
df_test = pd.read_csv(TEST_CSV)
print("[INFO] Test shape:", df_test.shape)

# Ensure feature consistency: the frame handed to the models must contain
# exactly the columns in feature_cols, in that order.
missing_cols = [c for c in feature_cols if c not in df_test.columns]
if missing_cols:
    # Zero-filling fabricates a constant feature, so report it rather than
    # doing it silently.
    print(
        f"[WARN] {len(missing_cols)} expected feature column(s) missing from "
        f"test data; filling with 0: {missing_cols}"
    )

# reindex selects and orders the expected columns, creates the missing ones
# filled with 0, and drops any extras, returning a new DataFrame rather than
# a slice of the one read from disk.
df_test = df_test.reindex(columns=feature_cols, fill_value=0)

# ======================================================
# ⚙️ PREPROCESS TEST DATA
# ======================================================
# Replace every object-dtype (string) column with integer codes.
# NOTE(review): factorize numbers values in order of first appearance in THIS
# file, so the codes presumably differ from whatever encoding was used at
# training time — confirm against the training script.
categorical_cols = df_test.select_dtypes(include=["object"]).columns
for col in categorical_cols:
    codes, _uniques = pd.factorize(df_test[col])
    df_test[col] = codes

# Standardise each feature to zero mean / unit variance.
# NOTE(review): the scaler is fitted on the test data itself; if the models
# were trained on data scaled with training-set statistics, that fitted
# scaler should be persisted and reused here — TODO confirm.
scaler = StandardScaler()
X_test = scaler.fit_transform(df_test)

# ======================================================
# 🌲 RANDOM FOREST PREDICTION
# ======================================================
print("[INFO] Predicting using RandomForest...")
# Per-class probabilities for every sample, plus the model's own predicted label.
rf_prob = rf_model.predict_proba(X_test)
rf_pred = rf_model.predict(X_test)
# Confidence = highest class probability in each row.
rf_conf = np.max(rf_prob, axis=1)

# ======================================================
# 🧠 DNN PREDICTION
# ======================================================
print("[INFO] Predicting using DNN...")
# Network output for every sample; progress output is suppressed with verbose=0.
dnn_prob = dnn_model.predict(X_test, verbose=0)
# Predicted class = position of the largest output in each row;
# confidence = the value of that largest output.
dnn_pred = dnn_prob.argmax(axis=1)
dnn_conf = np.max(dnn_prob, axis=1)

# ======================================================
# 🧩 COMBINE & MAP TO DISEASES
# ======================================================
print("[INFO] Combining model outputs...")
# Threshold fallback: keep the DNN's class when its top score is at least 0.7,
# otherwise use the RandomForest's class for that sample.
# NOTE(review): dnn_pred holds argmax positions while rf_pred holds whatever
# labels the forest returns; this presumably relies on both using the same
# 0..N-1 class encoding — confirm.
final_pred = [
    dnn_class if dnn_score >= 0.7 else rf_class
    for dnn_class, dnn_score, rf_class in zip(dnn_pred, dnn_conf, rf_pred)
]


def _to_disease_names(class_ids):
    """Translate numeric class IDs to disease names; unmapped IDs become "Unknown"."""
    return [disease_map.get(int(class_id), "Unknown") for class_id in class_ids]


# Map numeric predictions to disease names
rf_disease = _to_disease_names(rf_pred)
dnn_disease = _to_disease_names(dnn_pred)
final_disease = _to_disease_names(final_pred)

# ======================================================
# 💾 SAVE OUTPUTS
# ======================================================
# One row per test sample: 1-based sample number, each model's predicted
# disease with its confidence rounded to 4 decimals, and the combined result.
result_columns = {
    "Sample_ID": range(1, len(df_test) + 1),
    "RF_Prediction": rf_disease,
    "RF_Confidence": np.round(rf_conf, 4),
    "DNN_Prediction": dnn_disease,
    "DNN_Confidence": np.round(dnn_conf, 4),
    "Final_Predicted_Disease": final_disease,
}
result_df = pd.DataFrame(result_columns)

# Write the same table as CSV and as a JSON list of records.
csv_path = os.path.join(OUT_DIR, "disease_predictions.csv")
result_df.to_csv(csv_path, index=False)

json_path = os.path.join(OUT_DIR, "disease_predictions.json")
result_df.to_json(json_path, orient="records", indent=4)

print("\n✅ Prediction completed!")
print("Saved results:")
for saved_path in (csv_path, json_path):
    print(f" - {saved_path}")



[INFO] Loading models and metadata...

[INFO] Loaded 44 features for prediction
[INFO] Reading test data...
[INFO] Test shape: (9465, 43)
[INFO] Predicting using RandomForest...
[INFO] Predicting using DNN...
[INFO] Combining model outputs...

✅ Prediction completed!
Saved results:
 - C:\Users\NXTWAVE\Downloads\Genomic Disease Risk Predictor\outputs\disease_predictions.csv
 - C:\Users\NXTWAVE\Downloads\Genomic Disease Risk Predictor\outputs\disease_predictions.json
