In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    f1_score, precision_score, recall_score, roc_auc_score,
    average_precision_score, confusion_matrix, roc_curve, precision_recall_curve
)
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

# Load the preprocessed clinical data.
# NOTE(review): the file carries a .csv extension but is parsed as
# tab-separated — confirm this matches how the preprocessing step wrote it.
clinical_data_path = "preprocessed_clinical_data.csv"
clinical_df = pd.read_csv(clinical_data_path, sep="\t")

# Load the mutation data. Everything after a "#" on a line is ignored and
# malformed rows are dropped (on_bad_lines="skip") instead of aborting.
mutation_data_path = "data_mutations.txt"
try:
    mutation_df = pd.read_csv(
        mutation_data_path,
        sep="\t",
        comment="#",
        on_bad_lines="skip",
    )
except (OSError, pd.errors.ParserError, pd.errors.EmptyDataError) as e:
    # Report and re-raise: carrying on would leave `mutation_df` undefined and
    # only defer the failure to a misleading NameError at the merge below.
    print(f"Error loading file: {e}")
    raise

# Merge clinical and mutation data. The inner join keeps only samples present
# in both tables; a sample with several rows in the mutation table appears
# once per such row in the result.
merged_df = pd.merge(
    clinical_df,
    mutation_df,
    left_on="SAMPLE_ID",
    right_on="Tumor_Sample_Barcode",
    how="inner",
)

# Prefixes of the indicator-column families to keep, in output order.
one_hot_prefixes = (
    "PRIMARY_DIAGNOSIS_",
    "PATH_STAGE_",
    "PATH_T_STAGE_",
    "PATH_N_STAGE_",
    "PATH_M_STAGE_",
)

# Define relevant columns: fixed clinical fields, then every column of each
# prefix family (grouped by prefix), then the gene symbol.
relevant_columns = [
    "AGE", "OS_MONTHS", "DFS_MONTHS",
    "SEX_Male", "RACE_ASIAN", "RACE_BLACK OR AFRICAN AMERICAN",
    "RACE_WHITE", "ETHNICITY_NOT HISPANIC OR LATINO",
    *[
        col
        for prefix in one_hot_prefixes
        for col in merged_df.columns
        if col.startswith(prefix)
    ],
    "Hugo_Symbol",
]

# Final dataset. Copy so later column assignments operate on an independent
# frame rather than on a slice of `merged_df`.
final_df = merged_df[relevant_columns].copy()

# One target column per cancer type.
cancer_types = final_df.filter(like="PRIMARY_DIAGNOSIS_").columns

# Function to evaluate model performance
def evaluate_model(y_true, y_pred, y_prob):
    """Compute binary-classification metrics for one set of predictions.

    Args:
        y_true: Ground-truth labels. Assumed to be encoded as 0/1 (or
            booleans), with 1 as the positive class.
        y_pred: Predicted hard labels, same encoding as ``y_true``.
        y_prob: Predicted scores/probabilities for the positive class.

    Returns:
        dict with the keys "F1-Score", "Sensitivity", "Specificity",
        "Precision", "ROC-AUC" and "PR-AUC". A metric whose denominator is
        empty, or which needs both classes in ``y_true`` when only one is
        present, is reported as NaN instead of raising.
    """
    # Pin the label order so the matrix is always 2x2; without `labels`, a
    # split containing a single class yields a 1x1 matrix and the 4-way
    # unpack below raises ValueError.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()

    nan = float("nan")
    positives = tp + fn
    negatives = tn + fp
    sensitivity = tp / positives if positives else nan
    specificity = tn / negatives if negatives else nan

    f1 = f1_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)

    # ROC-AUC is undefined with a single class in y_true, and a PR-AUC
    # computed on such a split is not informative either.
    if np.unique(y_true).size < 2:
        roc_auc = nan
        pr_auc = nan
    else:
        roc_auc = roc_auc_score(y_true, y_prob)
        pr_auc = average_precision_score(y_true, y_prob)

    return {
        "F1-Score": f1,
        "Sensitivity": sensitivity,
        "Specificity": specificity,
        "Precision": precision,
        "ROC-AUC": roc_auc,
        "PR-AUC": pr_auc,
    }

# Function to plot ROC and PR curves
def plot_curves(y_true, y_prob, cancer_type, model_name):
    """Save side-by-side ROC and precision-recall plots for one model.

    Args:
        y_true: Ground-truth binary labels.
        y_prob: Predicted scores for the positive class.
        cancer_type: Used in both plot titles and in the output file name.
        model_name: Used in the legend labels and in the output file name.

    The figure is written to ``{cancer_type}_{model_name}_curves.png`` and
    then closed; nothing is returned.
    """
    false_pos_rate, true_pos_rate, _ = roc_curve(y_true, y_prob)
    precision, recall, _ = precision_recall_curve(y_true, y_prob)

    fig, (roc_ax, pr_ax) = plt.subplots(1, 2, figsize=(12, 5))

    # Left panel: ROC curve plus the dashed chance diagonal.
    roc_label = f"{model_name} (AUC={roc_auc_score(y_true, y_prob):.2f})"
    roc_ax.plot(false_pos_rate, true_pos_rate, label=roc_label)
    roc_ax.plot([0, 1], [0, 1], linestyle="--", color="gray")
    roc_ax.set_title(f"ROC Curve - {cancer_type}")
    roc_ax.set_xlabel("False Positive Rate")
    roc_ax.set_ylabel("True Positive Rate")
    roc_ax.legend()

    # Right panel: precision-recall curve; the legend value is the average
    # precision score.
    pr_label = f"{model_name} (AUC={average_precision_score(y_true, y_prob):.2f})"
    pr_ax.plot(recall, precision, label=pr_label)
    pr_ax.set_title(f"Precision-Recall Curve - {cancer_type}")
    pr_ax.set_xlabel("Recall")
    pr_ax.set_ylabel("Precision")
    pr_ax.legend()

    fig.tight_layout()
    fig.savefig(f"{cancer_type}_{model_name}_curves.png")
    plt.close(fig)

def _select_model(cancer_type):
    """Return ``(estimator, display_name)`` chosen from the target column name."""
    if "Breast" in cancer_type:
        return LogisticRegression(), "Logistic Regression"
    if "Colon" in cancer_type:
        return RandomForestClassifier(n_estimators=100, random_state=42), "Random Forest"
    if "Lung" in cancer_type:
        return XGBClassifier(random_state=42), "XGBoost"
    if "Kidney" in cancer_type:
        return LGBMClassifier(random_state=42), "LightGBM"
    return CatBoostClassifier(verbose=0, random_state=42), "CatBoost"


# Dictionary to store results
results = {}

# The feature matrix is the same for every target (all cancer-type indicator
# columns are removed), so it is prepared once instead of once per target.
X = final_df.drop(columns=cancer_types)

# One-hot encode non-numeric feature columns (e.g. the gene symbol); the
# estimators below cannot be fitted on raw string values.
# NOTE(review): a high-cardinality column makes this matrix very wide —
# confirm this is acceptable for the data size at hand.
categorical_cols = X.select_dtypes(exclude=["number", "bool"]).columns.tolist()
if categorical_cols:
    X = pd.get_dummies(X, columns=categorical_cols, dtype=np.uint8)

# Train-test-validation split (70/15/15). Splitting row positions once yields
# the same partition as splitting X and y with the same random_state on every
# iteration.
# NOTE(review): rows are split independently; if several rows belong to the
# same sample, that sample can land in both train and test — confirm whether
# a group-wise split is needed.
row_positions = np.arange(len(X))
train_idx, temp_idx = train_test_split(row_positions, test_size=0.3, random_state=42)
val_idx, test_idx = train_test_split(temp_idx, test_size=0.5, random_state=42)

# Copy each split so the scaled columns are written to independent frames.
X_train = X.iloc[train_idx].copy()
X_val = X.iloc[val_idx].copy()
X_test = X.iloc[test_idx].copy()

# Normalize numerical features; the scaler is fitted on the training rows only.
scaler = StandardScaler()
numerical_cols = ["AGE", "OS_MONTHS", "DFS_MONTHS"]
X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
X_val[numerical_cols] = scaler.transform(X_val[numerical_cols])
X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])

# Loop through each cancer type
for cancer_type in cancer_types:
    print(f"Processing {cancer_type}...")

    # Target for this cancer type, partitioned with the shared row positions.
    y = final_df[cancer_type]
    y_train = y.iloc[train_idx]
    y_val = y.iloc[val_idx]
    y_test = y.iloc[test_idx]

    # A split with a single class cannot be trained on or scored (no
    # positive-class probability column, undefined ROC-AUC), so skip it.
    if any(part.nunique() < 2 for part in (y_train, y_val, y_test)):
        print(f"Skipping {cancer_type}: a data split contains only one class.")
        continue

    # Choose a model based on cancer type
    model, model_name = _select_model(cancer_type)

    # Train the model
    model.fit(X_train, y_train)

    # Evaluate on validation set
    y_val_pred = model.predict(X_val)
    y_val_prob = model.predict_proba(X_val)[:, 1]
    val_metrics = evaluate_model(y_val, y_val_pred, y_val_prob)

    # Evaluate on test set
    y_test_pred = model.predict(X_test)
    y_test_prob = model.predict_proba(X_test)[:, 1]
    test_metrics = evaluate_model(y_test, y_test_pred, y_test_prob)

    # Plot curves
    plot_curves(y_test, y_test_prob, cancer_type, model_name)

    # Store results
    results[cancer_type] = {
        "Model": model_name,
        "Validation Metrics": val_metrics,
        "Test Metrics": test_metrics,
    }

# Print results
for cancer_type, result in results.items():
    print(f"\nResults for {cancer_type}:")
    print(f"Model: {result['Model']}")
    print("Validation Metrics:", result["Validation Metrics"])
    print("Test Metrics:", result["Test Metrics"])

ModuleNotFoundError: No module named 'xgboost'