In [4]:
# ================================
# Validate Breast Model on Ovarian Dataset
# ================================

import pandas as pd
import joblib
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score

# ----------------------------
# Step 1Ô∏è‚É£: Load breast model
# ----------------------------
model = joblib.load("../../models/breast_model.pkl")
print("‚úÖ Loaded model: breast_model.pkl")

# ----------------------------
# Step 2Ô∏è‚É£: Load ovarian dataset
# ----------------------------
df_ov = pd.read_csv("../../data/labeled/ovarian_labeled.csv")
print(f"‚úÖ Ovarian dataset loaded: {df_ov.shape[0]} samples, {df_ov.shape[1]-1} genes")

X_ov = df_ov.drop(columns=["status"])
y_ov = df_ov["status"]

# ----------------------------
# Step 3Ô∏è‚É£: Match genes with breast model
# ----------------------------
# Load top genes from breast biomarkers file
top_genes = pd.read_csv("../../outputs/metrics/breast_top_genes.csv")["Gene"].tolist()

# Select only common genes
common_genes = [g for g in top_genes if g in X_ov.columns]
print(f"üß¨ Using {len(common_genes)} common top genes for cross-prediction")

X_ov_subset = X_ov[common_genes]

# ----------------------------
# Step 4Ô∏è‚É£: Predict using breast model
# ----------------------------
y_pred = model.predict(X_ov_subset)
y_prob = model.predict_proba(X_ov_subset)[:, 1]

# ----------------------------
# Step 5Ô∏è‚É£: Evaluate performance
# ----------------------------
acc = accuracy_score(y_ov, y_pred)
roc = roc_auc_score(y_ov, y_prob)

print(f"\n‚úÖ Cross-disease Prediction (Breast ‚Üí Ovarian):")
print(f"Accuracy: {acc:.4f}")
print(f"ROC-AUC: {roc:.4f}")
print("\nClassification Report:")
print(classification_report(y_ov, y_pred))

# ----------------------------
# Step 6Ô∏è‚É£: Save results
# ----------------------------
results = pd.DataFrame({
    "true_label": y_ov,
    "predicted_label": y_pred,
    "predicted_prob": y_prob
})
results.to_csv("../outputs/metrics/breast_to_ovarian_predictions.csv", index=False)
print("üíæ Saved predictions to ../outputs/metrics/breast_to_ovarian_predictions.csv")


‚úÖ Loaded model: breast_model.pkl
‚úÖ Ovarian dataset loaded: 63 samples, 22189 genes
üß¨ Using 20 common top genes for cross-prediction


ValueError: The feature names should match those that were passed during fit.
Feature names seen at fit time, yet now missing:
- A1CF
- A2M
- A4GALT
- A4GNT
- AAAS
- ...


In [5]:
# =======================================
# Cross-Disease Validation: Breast ‚Üí Ovarian
# =======================================
import pandas as pd
import joblib
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score

# ----------------------------
# Step 1Ô∏è‚É£: Load model & dataset
# ----------------------------
model = joblib.load("../../models/breast_model.pkl")
print("‚úÖ Loaded model: breast_model.pkl")

df_ov = pd.read_csv("../../data/labeled/ovarian_labeled.csv")
print(f"‚úÖ Ovarian dataset loaded: {df_ov.shape[0]} samples, {df_ov.shape[1]-1} genes")

X_ov = df_ov.drop(columns=["status"])
y_ov = df_ov["status"]

# ----------------------------
# Step 2Ô∏è‚É£: Load top genes (from breast)
# ----------------------------
top_genes = pd.read_csv("../../outputs/metrics/breast_top_genes.csv")["Gene"].tolist()
print(f"üß¨ Found {len(top_genes)} top genes from breast model")

# Normalize column names (case-insensitive)
X_ov.columns = [c.strip().upper() for c in X_ov.columns]
top_genes_upper = [g.strip().upper() for g in top_genes]

# Find overlapping and missing genes
common_genes = [g for g in top_genes_upper if g in X_ov.columns]
missing_genes = [g for g in top_genes_upper if g not in X_ov.columns]
print(f"‚úÖ Using {len(common_genes)} overlapping genes")
print(f"‚ö†Ô∏è Missing {len(missing_genes)} genes (filled with zeros)")

# Create aligned feature set
X_aligned = pd.DataFrame()

# Add all top genes in correct order
for gene in top_genes_upper:
    if gene in X_ov.columns:
        X_aligned[gene] = X_ov[gene]
    else:
        X_aligned[gene] = 0  # Fill missing genes with zero expression

print(f"üß© Final aligned shape for prediction: {X_aligned.shape}")

# ----------------------------
# Step 3Ô∏è‚É£: Predict
# ----------------------------
y_pred = model.predict(X_aligned)
y_prob = model.predict_proba(X_aligned)[:, 1]

# ----------------------------
# Step 4Ô∏è‚É£: Evaluate
# ----------------------------
acc = accuracy_score(y_ov, y_pred)
roc = roc_auc_score(y_ov, y_prob)

print(f"\n‚úÖ Cross-disease Prediction (Breast ‚Üí Ovarian):")
print(f"Accuracy: {acc:.4f}")
print(f"ROC-AUC: {roc:.4f}")
print("\nClassification Report:")
print(classification_report(y_ov, y_pred))

# ----------------------------
# Step 5Ô∏è‚É£: Save results
# ----------------------------
import os
os.makedirs("../../outputs/metrics", exist_ok=True)

results = pd.DataFrame({
    "true_label": y_ov,
    "predicted_label": y_pred,
    "predicted_prob": y_prob
})
results.to_csv("../../outputs/metrics/breast_to_ovarian_predictions.csv", index=False)
print("üíæ Saved predictions to ../../outputs/metrics/breast_to_ovarian_predictions.csv")


‚úÖ Loaded model: breast_model.pkl
‚úÖ Ovarian dataset loaded: 63 samples, 22189 genes
üß¨ Found 20 top genes from breast model
‚úÖ Using 20 overlapping genes
‚ö†Ô∏è Missing 0 genes (filled with zeros)
üß© Final aligned shape for prediction: (63, 20)


ValueError: The feature names should match those that were passed during fit.
Feature names seen at fit time, yet now missing:
- A1CF
- A2M
- A4GALT
- A4GNT
- AAAS
- ...
