In [1]:
import os
import json
import yaml
import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

# ======================
# 📂 PATH CONFIGURATION
# ======================
BASE_DIR = r"C:\Users\NXTWAVE\Downloads\Genomic Disease Risk Predictor"
TRAIN_CSV = os.path.join(BASE_DIR, "archive", "train.csv")
SAMPLE_SUB = os.path.join(BASE_DIR, "archive", "sample_submission.csv")
OUT_DIR = os.path.join(BASE_DIR, "outputs")

os.makedirs(OUT_DIR, exist_ok=True)

# ======================
# 📘 LOAD DATA
# ======================
print("[INFO] Loading data...")
df = pd.read_csv(TRAIN_CSV)
print("[INFO] Shape:", df.shape)
print("[INFO] Columns:", df.columns.tolist())

# ======================
# ⚙️ PREPROCESSING
# ======================
# Detect label column automatically
target_col = "Disease" if "Disease" in df.columns else df.columns[-1]

X = df.drop(columns=[target_col])
y = df[target_col]

# Encode categorical columns
for col in X.select_dtypes(include=["object"]).columns:
    X[col] = LabelEncoder().fit_transform(X[col].astype(str))

# Scale numeric features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Encode target
le = LabelEncoder()
y_enc = le.fit_transform(y)

# Split
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y_enc, test_size=0.2, random_state=42, stratify=y_enc
)

# ======================
# 🌲 RANDOM FOREST MODEL
# ======================
print("[INFO] Training RandomForest model...")
rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
acc_rf = accuracy_score(y_test, y_pred)
print(f"[RESULT] RandomForest Accuracy: {acc_rf:.4f}")

# ======================
# 🧠 DEEP NEURAL NETWORK
# ======================
print("[INFO] Training DNN model...")
dnn = Sequential([
    Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(len(np.unique(y_enc)), activation='softmax')
])
dnn.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
es = EarlyStopping(patience=10, restore_best_weights=True)
history = dnn.fit(X_train, y_train, validation_data=(X_test, y_test),
                  epochs=50, batch_size=32, callbacks=[es], verbose=0)

# Evaluate
dnn_eval = dnn.evaluate(X_test, y_test, verbose=0)
print(f"[RESULT] DNN Accuracy: {dnn_eval[1]:.4f}")

# ======================
# 📊 EVALUATION VISUALS
# ======================
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(6,5))
sns.heatmap(cm, annot=True, fmt='d', cmap='coolwarm')
plt.title("Disease Prediction Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.tight_layout()
plt.savefig(os.path.join(OUT_DIR, "disease_risk_heatmap.png"))
plt.close()

# Feature importance plot
feat_imp = pd.Series(rf.feature_importances_, index=X.columns).sort_values(ascending=False)[:20]
plt.figure(figsize=(8,6))
sns.barplot(x=feat_imp.values, y=feat_imp.index)
plt.title("Top 20 Predictive Features / Genes")
plt.xlabel("Importance")
plt.ylabel("Feature")
plt.tight_layout()
plt.savefig(os.path.join(OUT_DIR, "gene_importance.png"))
plt.close()

# ======================
# 💾 SAVE ARTIFACTS
# ======================
# 1️⃣ Save RF model
joblib.dump(rf, os.path.join(OUT_DIR, "geneguard_model.pkl"))

# 2️⃣ Save Keras model
dnn.save(os.path.join(OUT_DIR, "geneguard_model.h5"))

# 3️⃣ Save predictions
pred_df = pd.DataFrame({
    "TrueLabel": le.inverse_transform(y_test),
    "PredLabel": le.inverse_transform(y_pred),
    "RF_Prob": rf.predict_proba(X_test).max(axis=1)
})
pred_json_path = os.path.join(OUT_DIR, "prediction_results.json")
pred_df.to_json(pred_json_path, orient='records', indent=4)

# 4️⃣ Save metadata YAML
meta = {
    "model_randomforest_acc": float(acc_rf),
    "model_dnn_acc": float(dnn_eval[1]),
    "features_used": list(X.columns),
    "n_samples": int(len(df)),
    "label_column": target_col
}
with open(os.path.join(OUT_DIR, "geneguard_metadata.yaml"), "w") as f:
    yaml.dump(meta, f)

print(f"\n✅ All outputs saved to: {OUT_DIR}")
print("Files generated:")
print(" - geneguard_model.pkl")
print(" - geneguard_model.h5")
print(" - gene_importance.png")
print(" - disease_risk_heatmap.png")
print(" - prediction_results.json")
print(" - geneguard_metadata.yaml")



[INFO] Loading data...
[INFO] Shape: (22083, 45)
[INFO] Columns: ['Patient Id', 'Patient Age', "Genes in mother's side", 'Inherited from father', 'Maternal gene', 'Paternal gene', 'Blood cell count (mcL)', 'Patient First Name', 'Family Name', "Father's name", "Mother's age", "Father's age", 'Institute Name', 'Location of Institute', 'Status', 'Respiratory Rate (breaths/min)', 'Heart Rate (rates/min', 'Test 1', 'Test 2', 'Test 3', 'Test 4', 'Test 5', 'Parental consent', 'Follow-up', 'Gender', 'Birth asphyxia', 'Autopsy shows birth defect (if applicable)', 'Place of birth', 'Folic acid details (peri-conceptional)', 'H/O serious maternal illness', 'H/O radiation exposure (x-ray)', 'H/O substance abuse', 'Assisted conception IVF/ART', 'History of anomalies in previous pregnancies', 'No. of previous abortion', 'Birth defects', 'White Blood cell count (thousand per microliter)', 'Blood test result', 'Symptom 1', 'Symptom 2', 'Symptom 3', 'Symptom 4', 'Symptom 5', 'Genetic Disorder', 'Disord

  saving_api.save_model(



✅ All outputs saved to: C:\Users\NXTWAVE\Downloads\Genomic Disease Risk Predictor\outputs
Files generated:
 - geneguard_model.pkl
 - geneguard_model.h5
 - gene_importance.png
 - disease_risk_heatmap.png
 - prediction_results.json
 - geneguard_metadata.yaml
