In [4]:
import os
import json
import yaml
import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

# ---------------------------------------------------------------------
# üß¨ Paths
# ---------------------------------------------------------------------
BASE_DIR = r"C:\Users\NXTWAVE\Downloads\Pathogen Mutation Predictor"
DATA_PATHS = [
    os.path.join(BASE_DIR, "archive", "azm_sr_gwas_filtered_unitigs.Rtab"),
    os.path.join(BASE_DIR, "archive", "cfx_sr_gwas_filtered_unitigs.Rtab"),
    os.path.join(BASE_DIR, "archive", "cip_sr_gwas_filtered_unitigs.Rtab"),
]
META_PATH = os.path.join(BASE_DIR, "archive", "metadata.csv")
OUT_DIR = BASE_DIR
os.makedirs(OUT_DIR, exist_ok=True)

# ---------------------------------------------------------------------
# üß© 1Ô∏è‚É£ Load Mutation Data
# ---------------------------------------------------------------------
print("[INFO] Loading mutation data...")

dfs = []
for path in DATA_PATHS:
    df = pd.read_csv(path, sep="\t")
    df.reset_index(inplace=True)

    # Rtab tables have features as rows and samples as columns -> transpose
    if df.shape[1] > 5:
        df = df.T
        df.columns = df.iloc[0]
        df = df[1:]
    df.index.name = "SampleID"
    df.reset_index(inplace=True)
    dfs.append(df)

merged = dfs[0]
for i in range(1, len(dfs)):
    merged = pd.merge(merged, dfs[i], on="SampleID", how="outer")

# Force SampleID to string
merged["SampleID"] = merged["SampleID"].astype(str)

print(f"[INFO] Combined mutation shape: {merged.shape}")

# ---------------------------------------------------------------------
# üß´ 2Ô∏è‚É£ Load Metadata
# ---------------------------------------------------------------------
meta = pd.read_csv(META_PATH)
meta.columns = (
    meta.columns.str.strip()
    .str.replace(" ", "_")
    .str.replace("-", "_")
    .str.lower()
)
print(f"[INFO] Metadata shape: {meta.shape}")
print(f"[INFO] Metadata columns: {list(meta.columns)[:10]} ...")

# Find sample key (case-insensitive)
possible_ids = ["sampleid", "sample_id", "id", "isolate", "run", "strain"]
meta_key = None
for key in possible_ids:
    if key in meta.columns:
        meta_key = key
        break

if meta_key is None:
    raise KeyError(
        f"‚ùå No ID column found. Available: {list(meta.columns)}. "
        "Expected something like sample_id or isolate."
    )

meta.rename(columns={meta_key: "SampleID"}, inplace=True)
meta["SampleID"] = meta["SampleID"].astype(str)
print(f"[INFO] Using '{meta_key}' as merge key (converted to string).")

# ---------------------------------------------------------------------
# üß¨ 3Ô∏è‚É£ Merge
# ---------------------------------------------------------------------
data = pd.merge(meta, merged, on="SampleID", how="inner")
print(f"[INFO] Final merged shape: {data.shape}")

# ---------------------------------------------------------------------
# üßÆ 4Ô∏è‚É£ Target Column Detection
# ---------------------------------------------------------------------
target_candidates = [
    c for c in data.columns if any(x in c.lower() for x in ["resistance", "phenotype", "label", "response"])
]
if not target_candidates:
    # fallback: maybe "azithromycin", "ciprofloxacin" etc are resistance status
    possible_targets = [c for c in data.columns if data[c].nunique() <= 3]
    if possible_targets:
        label_col = possible_targets[0]
        print(f"[WARN] Auto-selected potential target column: {label_col}")
    else:
        raise ValueError(f"‚ùå No suitable resistance column found. Columns: {list(data.columns)[:15]}")
else:
    label_col = target_candidates[0]

print(f"[INFO] Target column detected: {label_col}")

# ---------------------------------------------------------------------
# üî¨ 5Ô∏è‚É£ Preprocess
# ---------------------------------------------------------------------
y = (
    data[label_col]
    .astype(str)
    .str.lower()
    .map({"resistant": 1, "r": 1, "susceptible": 0, "s": 0})
)
# if still NaN, fill binary numeric or 0/1 fallback
if y.isna().all():
    y = pd.to_numeric(data[label_col], errors="coerce").fillna(0).astype(int)

X = (
    data.drop(columns=[label_col, "SampleID"], errors="ignore")
    .select_dtypes(include=[np.number, np.float64, np.int64])
    .fillna(0)
)
print(f"[INFO] Feature matrix shape: {X.shape}")

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.25, random_state=42, stratify=y
)
print(f"[INFO] Train/Test split: {X_train.shape}, {X_test.shape}")

# ---------------------------------------------------------------------
# üå≤ 6Ô∏è‚É£ RandomForest Model
# ---------------------------------------------------------------------
print("[INFO] Training RandomForest...")
rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)
rf_acc = accuracy_score(y_test, rf_pred)
print(f"[RESULT] RandomForest Accuracy: {rf_acc:.4f}")

# ---------------------------------------------------------------------
# ü§ñ 7Ô∏è‚É£ Deep Learning Model
# ---------------------------------------------------------------------
print("[INFO] Training Deep Learning Model...")
model = Sequential([
    Dense(256, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.3),
    Dense(128, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid')
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
es = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
history = model.fit(
    X_train, y_train,
    validation_split=0.2,
    epochs=30,
    batch_size=32,
    callbacks=[es],
    verbose=1
)
dl_acc = model.evaluate(X_test, y_test, verbose=0)[1]
print(f"[RESULT] Deep Learning Accuracy: {dl_acc:.4f}")

# ---------------------------------------------------------------------
# üìä 8Ô∏è‚É£ Plots
# ---------------------------------------------------------------------
y_pred = (model.predict(X_test) > 0.5).astype(int)
cm = confusion_matrix(y_test, y_pred)

plt.figure(figsize=(7,5))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.tight_layout()
plt.savefig(os.path.join(OUT_DIR, "mutation_heatmap.png"))
plt.close()

plt.figure(figsize=(6,4))
plt.bar(["RandomForest", "DeepLearning"], [rf_acc, dl_acc], color=['skyblue','orange'])
plt.ylabel("Accuracy")
plt.title("Model Accuracy Comparison")
plt.tight_layout()
plt.savefig(os.path.join(OUT_DIR, "accuracy_graph.png"))
plt.close()

# ---------------------------------------------------------------------
# üíæ 9Ô∏è‚É£ Save Artifacts
# ---------------------------------------------------------------------
joblib.dump(rf, os.path.join(OUT_DIR, "biomind_model.pkl"))
model.save(os.path.join(OUT_DIR, "biomind_dl.h5"))

metadata = {
    "project": "BioMind - Pathogen Mutation Predictor",
    "models": {"RandomForest": float(rf_acc), "DeepLearning": float(dl_acc)},
    "merge_key": meta_key,
    "target_column": label_col,
    "train_shape": X_train.shape,
    "test_shape": X_test.shape,
}
with open(os.path.join(OUT_DIR, "biomind_metadata.yaml"), "w") as f:
    yaml.dump(metadata, f)

pred_df = pd.DataFrame({
    "TrueLabel": y_test.values,
    "Pred_RF": rf_pred,
    "Pred_DL": y_pred.flatten()
})
pred_df.to_json(os.path.join(OUT_DIR, "resistance_prediction.json"), orient="records", indent=4)

print("\n‚úÖ All results saved in:", OUT_DIR)
for f in ["biomind_model.pkl", "biomind_dl.h5", "biomind_metadata.yaml", "resistance_prediction.json", "accuracy_graph.png", "mutation_heatmap.png"]:
    print(" ‚îú‚îÄ‚îÄ", f)


[INFO] Loading mutation data...
[INFO] Combined mutation shape: (8873, 7)
[INFO] Metadata shape: (3786, 31)
[INFO] Metadata columns: ['sample_id', 'year', 'country', 'continent', 'beta.lactamase', 'azithromycin', 'ciprofloxacin', 'ceftriaxone', 'cefixime', 'tetracycline'] ...
[INFO] Using 'sample_id' as merge key (converted to string).
[INFO] Final merged shape: (17, 37)
[WARN] Auto-selected potential target column: country
[INFO] Target column detected: country
[INFO] Feature matrix shape: (17, 23)
[INFO] Train/Test split: (12, 23), (5, 23)
[INFO] Training RandomForest...
[RESULT] RandomForest Accuracy: 1.0000
[INFO] Training Deep Learning Model...


Epoch 1/30


Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30




‚úÖ All results saved in: C:\Users\NXTWAVE\Downloads\Pathogen Mutation Predictor
 ‚îú‚îÄ‚îÄ biomind_model.pkl
 ‚îú‚îÄ‚îÄ biomind_dl.h5
 ‚îú‚îÄ‚îÄ biomind_metadata.yaml
 ‚îú‚îÄ‚îÄ resistance_prediction.json
 ‚îú‚îÄ‚îÄ accuracy_graph.png
 ‚îú‚îÄ‚îÄ mutation_heatmap.png


  saving_api.save_model(
