In [1]:
import os
import json
import yaml
import joblib
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from tensorflow.keras.models import load_model

# ======================================================
# 📂 PATH CONFIGURATION
# ======================================================
# Project root. Defaults to the original local folder; set the
# GENEGUARD_BASE_DIR environment variable to run on another machine
# without editing this file.
BASE_DIR = os.environ.get(
    "GENEGUARD_BASE_DIR",
    r"C:\Users\NXTWAVE\Downloads\Genomic Disease Risk Predictor",
)

# Input: the CSV to score.
TEST_CSV = os.path.join(BASE_DIR, "archive", "test.csv")

# The output folder is also where the model and metadata files are read from.
OUT_DIR = os.path.join(BASE_DIR, "outputs")
META_YAML = os.path.join(OUT_DIR, "geneguard_metadata.yaml")
MODEL_PKL = os.path.join(OUT_DIR, "geneguard_model.pkl")
MODEL_H5 = os.path.join(OUT_DIR, "geneguard_model.h5")

# Make sure the prediction files written later have somewhere to go.
os.makedirs(OUT_DIR, exist_ok=True)

# ======================================================
# 📘 LOAD MODELS + METADATA
# ======================================================
print("[INFO] Loading trained models and metadata...")
# NOTE: joblib.load unpickles arbitrary Python objects; only point MODEL_PKL
# at artifacts from a trusted source.
rf_model = joblib.load(MODEL_PKL)
dnn_model = load_model(MODEL_H5)

# Explicit encoding so the metadata parses identically regardless of the
# platform's default locale encoding.
with open(META_YAML, "r", encoding="utf-8") as f:
    meta = yaml.safe_load(f)

# yaml.safe_load returns None for an empty file; fail with a message that
# names the file instead of an opaque TypeError/KeyError.
if not isinstance(meta, dict) or "features_used" not in meta:
    raise KeyError(f"'features_used' not found in metadata file: {META_YAML}")

# Ordered list of feature column names the models are fed with.
feature_cols = meta["features_used"]
print(f"[INFO] Loaded {len(feature_cols)} features from metadata")

# ======================================================
# 🧾 LOAD TEST DATA
# ======================================================
print("[INFO] Reading test dataset...")
df_test = pd.read_csv(TEST_CSV)
print("[INFO] Shape:", df_test.shape)

# Features listed in the metadata but absent from the CSV. They are still
# filled with 0 so prediction can proceed, but the substitution is reported
# because a constant column can noticeably distort the predictions.
missing_cols = [c for c in feature_cols if c not in df_test.columns]
if missing_cols:
    print(
        f"[WARN] {len(missing_cols)} feature column(s) missing from test data; "
        f"filling with 0: {missing_cols}"
    )

# Select the metadata's columns in the metadata's order, creating the missing
# ones as 0. reindex returns a fresh frame, so later column assignments do
# not operate on a slice of the raw CSV frame.
df_test = df_test.reindex(columns=feature_cols, fill_value=0)

# ======================================================
# ⚙️ PREPROCESS TEST DATA (ENCODING + SCALING)
# ======================================================
# df_test is derived from another frame upstream; take an explicit copy so the
# column assignments below never write into a slice (avoids pandas'
# SettingWithCopyWarning).
df_test = df_test.copy()

# Encode categorical features
# NOTE(review): factorize numbers categories by order of first appearance in
# THIS file, so the integer codes are only meaningful if training produced the
# same mapping — confirm against the training script, or persist and reuse the
# training-time encoders.
for col in df_test.select_dtypes(include=["object"]).columns:
    df_test[col] = pd.factorize(df_test[col])[0]

# Scale numeric features
# NOTE(review): the scaler is fit on the test data itself, so the mean/std used
# here are not the training statistics. If the models were trained on data
# scaled by a training-time scaler, that scaler should be saved and applied
# with transform() instead — TODO confirm and replace.
scaler = StandardScaler()
X_test_scaled = scaler.fit_transform(df_test)

# ======================================================
# 🌲 RANDOM FOREST PREDICTIONS
# ======================================================
print("[INFO] Predicting using RandomForest...")
# Per-class probabilities and the predicted label for every row.
rf_prob = rf_model.predict_proba(X_test_scaled)
rf_pred = rf_model.predict(X_test_scaled)

# ======================================================
# 🧠 DNN PREDICTIONS
# ======================================================
print("[INFO] Predicting using DNN...")
# Raw network outputs; the predicted class is the index of the largest output
# in each row.
# NOTE(review): assumes the network emits one score per class (2-D output
# with more than one column) — confirm against the model definition.
dnn_prob = dnn_model.predict(X_test_scaled, verbose=0)
dnn_pred = dnn_prob.argmax(axis=1)

# ======================================================
# 🧩 COMBINE RESULTS
# ======================================================
print("[INFO] Combining predictions...")

# Minimum top-class DNN score required to trust the DNN over the forest.
DNN_CONFIDENCE_THRESHOLD = 0.7

# Combine both models' predictions: take the DNN's class when its highest
# score exceeds the threshold, otherwise fall back to the RandomForest label.
# NOTE(review): this mixes DNN argmax indices with RF labels in one column, so
# it assumes both use the same class encoding — confirm.
dnn_top_score = np.max(dnn_prob, axis=1)
combined_pred = [
    dnn_label if score > DNN_CONFIDENCE_THRESHOLD else rf_label
    for score, dnn_label, rf_label in zip(dnn_top_score, dnn_pred, rf_pred)
]

# ======================================================
# 💾 SAVE OUTPUTS
# ======================================================
# One row per test sample: each model's prediction, the highest probability
# it assigned, and the combined decision. Sample_ID is a 1-based row counter.
n_samples = len(df_test)
result_columns = {
    "Sample_ID": range(1, n_samples + 1),
    "RF_Prediction": rf_pred,
    "RF_Confidence": rf_prob.max(axis=1),
    "DNN_Prediction": dnn_pred,
    "DNN_Confidence": dnn_prob.max(axis=1),
    "Final_Prediction": combined_pred,
}
pred_df = pd.DataFrame(result_columns)

# The same table is written twice: CSV and a JSON array of records.
csv_path = os.path.join(OUT_DIR, "final_predictions.csv")
json_path = os.path.join(OUT_DIR, "final_predictions.json")
pred_df.to_csv(csv_path, index=False)
pred_df.to_json(json_path, orient="records", indent=4)

print("\n✅ Predictions completed and saved:")
for saved_path in (csv_path, json_path):
    print(" -", saved_path)



[INFO] Loading trained models and metadata...

[INFO] Loaded 44 features from metadata
[INFO] Reading test dataset...
[INFO] Shape: (9465, 43)
[INFO] Predicting using RandomForest...
[INFO] Predicting using DNN...
[INFO] Combining predictions...

✅ Predictions completed and saved:
 - C:\Users\NXTWAVE\Downloads\Genomic Disease Risk Predictor\outputs\final_predictions.csv
 - C:\Users\NXTWAVE\Downloads\Genomic Disease Risk Predictor\outputs\final_predictions.json
