In [10]:
# LATE FUSION  

In [None]:
import joblib
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd
import duckdb
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, f1_score
from tqdm import tqdm
import os
import warnings
warnings.filterwarnings("ignore", category=UserWarning)
# Suppress the FutureWarning related to is_copy/inplace operations
warnings.filterwarnings("ignore", category=FutureWarning)
# Suppress the generic SettingWithCopyWarning
# warnings.filterwarnings("ignore", category=pd.SettingWithCopyWarning)

In [12]:
# --- Blood Test Model (XGBoost) ---
BLOOD_MODEL_PATH = 'xgb_cad_severity_model-7.joblib'
SCALER_PATH = 'scaler_cad_severity-7.joblib'
FEATURES_LIST_PATH = 'selected_features-7.joblib'
DUCKDB_PATH = '../../final_db/mimic_analysis.db'

# --- ECG Model (PyTorch) ---
ECG_MODEL_PATH = './cnn_lstm_hybrid_checkpoints/hybrid_model_epoch60.pt' 
CLEANED_CSV_PATH = 'metadata_with_features.csv'

FUSION_DIR = 'fusion_1_checkpoints'

In [13]:
# --- Shared Config ---
BATCH_SIZE = 64
LEAD_COUNT = 12
SAMPLES = 2500
NUM_CLASSES = 3 
NUM_CLINICAL_FEATURES = 8 
HIDDEN_SIZE = 64
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

DEVICE

device(type='cuda')

In [None]:
# ============================================
# 1. ECG MODEL ARCHITECTURE (Copied from ECG script)
# ============================================

class CNNLSTM_FeatureExtractor(nn.Module):
    """Processes the raw ECG signal to generate a deep feature vector."""
    def __init__(self, hidden_size=HIDDEN_SIZE, num_layers=1):
        super().__init__()
        self.hidden_size = hidden_size
        
        # CNN (Feature Extraction)
        self.cnn = nn.Sequential(
            nn.Conv1d(LEAD_COUNT, 32, kernel_size=15, stride=2, padding=7),
            nn.BatchNorm1d(32), nn.ReLU(inplace=True),
            nn.MaxPool1d(kernel_size=3, stride=2, padding=1),
            
            nn.Conv1d(32, 64, kernel_size=11, stride=2, padding=5),
            nn.BatchNorm1d(64), nn.ReLU(inplace=True),
            nn.MaxPool1d(kernel_size=3, stride=2, padding=1),
            
            nn.Conv1d(64, 128, kernel_size=7, stride=2, padding=3),
            nn.BatchNorm1d(128), nn.ReLU(inplace=True)
        )
        
        # LSTM (Temporal Context)
        self.lstm = nn.LSTM(
            input_size=128, hidden_size=hidden_size, num_layers=num_layers,
            batch_first=True, bidirectional=True
        )

    def forward(self, x):
        cnn_out = self.cnn(x) 
        lstm_input = cnn_out.transpose(1, 2) 
        
        _, (h_n, _) = self.lstm(lstm_input)
        
        # Concatenate final hidden states from both directions
        final_state = torch.cat((h_n[-2, :, :], h_n[-1, :, :]), dim=1)
        return final_state
        
class FinalMultiInputHybridModel(nn.Module):
    """Combines deep features (CNN-LSTM output) and 8 handcrafted features."""
    def __init__(self, num_classes=NUM_CLASSES, num_clinical_features=NUM_CLINICAL_FEATURES):
        super().__init__()
        
        self.signal_extractor = CNNLSTM_FeatureExtractor(hidden_size=HIDDEN_SIZE)
        
        # Input size: (2 * HIDDEN_SIZE for bidirectional LSTM) + NUM_CLINICAL_FEATURES
        INPUT_SIZE = (2 * HIDDEN_SIZE) + num_clinical_features 
        
        self.final_fc = nn.Sequential(
            nn.Linear(INPUT_SIZE, 64),
            nn.ReLU(inplace=True),
            nn.Dropout(0.5),
            nn.Linear(64, num_classes)
        )

    def forward(self, signal, clinical_features):
        deep_features = self.signal_extractor(signal) 
        
        combined_features = torch.cat((deep_features, clinical_features.float()), dim=1) 
        
        return self.final_fc(combined_features)

In [15]:
# ============================================
# 2. ECG DATASET CLASS (Copied from ECG script)
# ============================================
class HybridECGDataset(Dataset):
    """Dataset for loading ECG signal and 8 handcrafted features."""
    def __init__(self, df):
        self.df = df.reset_index(drop=True)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        
        # --- Load Signal ---
        try:
            signal = np.load(row['processed_npy_path']).T
            if signal.shape[1] > SAMPLES:
                signal = signal[:, :SAMPLES]
            elif signal.shape[1] < SAMPLES:
                 # Pad if signal is too short
                 padding = np.zeros((LEAD_COUNT, SAMPLES - signal.shape[1]), dtype=np.float32)
                 signal = np.concatenate([signal, padding], axis=1)
            signal = np.nan_to_num(signal, nan=0.0)
        except Exception: 
            signal = np.zeros((LEAD_COUNT, SAMPLES), dtype=np.float32)

        # --- Load Clinical Features ---
        try:
            features = np.load(row['feature_path'])
        except Exception:
            features = np.zeros(NUM_CLINICAL_FEATURES, dtype=np.float32)

        label = {"Low": 0, "Moderate": 1, "High": 2}[row['severity_level']]
        
        # Return Signal, 8 Handcrafted Features, and Label
        return (torch.tensor(signal, dtype=torch.float32), 
                torch.tensor(features, dtype=torch.float32)), label

In [None]:
def load_and_align_test_data(cleaned_csv_path, duckdb_path):
    """
    Loads ECG metadata and blood lab data, aligns blood features to the exact
    feature order used in training (feature_names_full-7.joblib), and returns:
      - X_blood_all_features : pd.DataFrame (N x 99) in exact training order
      - X_ecg_metadata       : pd.DataFrame (contains processed_npy_path, feature_path, severity_level, etc.)
      - y_test_fusion        : np.ndarray labels (0/1/2)
      - original_feature_names_list : list of 99 feature names (order)
    """
    import joblib
    import pandas as pd
    import duckdb
    import numpy as np
    import os

    # 1) Load ECG metadata CSV
    df_ecg_full = pd.read_csv(cleaned_csv_path)

    # 2) Query DuckDB for admissions + labs (same logic as training)
    con = duckdb.connect(duckdb_path)

    df_blood_full = con.execute("""
        SELECT hadm_id, subject_id, anchor_age, gender, cad, severity_level
        FROM admissions_severity
        WHERE severity_level IS NOT NULL
    """).df()

    # lab item selection (same keyword groups as training)
    cbc_keywords = ["hemoglobin", "hematocrit", "rbc", "wbc", "platelet", "mcv", "mch", "mchc", "rdw", "neutrophil", "lymphocyte", "monocyte", "eosinophil", "basophil"]
    bmp_keywords = ["sodium", "potassium", "chloride", "bicarbonate", "co2", "urea", "bun", "creatinine", "glucose", "calcium"]
    lft_keywords = ["albumin", "protein", "bilirubin", "alkaline phosphatase", "ast", "sgot", "alt", "sgpt", "lactate"]
    lipid_keywords = ["cholesterol", "hdl", "ldl", "triglyceride"]
    cardiac_keywords = ["troponin", "ck-mb", "creatine kinase", "ck", "bnp", "nt-probnp", "hs-crp"]
    all_keywords = cbc_keywords + bmp_keywords + lft_keywords + lipid_keywords + cardiac_keywords
    pattern = '|'.join(all_keywords)

    df_labitems = con.execute("SELECT itemid, label, fluid FROM d_labitems WHERE LOWER(fluid) = 'blood'").df()
    mask = df_labitems['label'].str.lower().str.contains(pattern, na=False)
    blood_itemids = df_labitems[mask]['itemid'].tolist()
    if len(blood_itemids) == 0:
        con.close()
        raise RuntimeError("No matching blood lab items found. Check df_labitems and keyword list.")

    blood_itemids_str = ', '.join(map(str, blood_itemids))

    df_labs = con.execute(f"""
        SELECT l.subject_id, d.label, AVG(l.valuenum) as mean_value
        FROM labevents l JOIN d_labitems d ON l.itemid = d.itemid
        WHERE l.itemid IN ({blood_itemids_str})
        GROUP BY l.subject_id, d.label
    """).df()
    df_labs_pivot = df_labs.pivot(index='subject_id', columns='label', values='mean_value').reset_index()

    # Merge lab pivot with admission-level demographics
    df_blood_full = pd.merge(df_blood_full, df_labs_pivot, on='subject_id', how='left')

    # Add comorbidities
    comorb_tables = {
        'diabetes': 'diabetes_adm',
        'hypertension': 'hypertension_adm',
        'renal': 'renal_adm',
        'obesity': 'obesity_icd_adm',
        'smoker': 'smokers_adm'
    }
    # ensure hadm_id present as string
    df_blood_full['hadm_id'] = df_blood_full['hadm_id'].astype(str)

    for comorb, table in comorb_tables.items():
        df_comorb = con.execute(f"SELECT DISTINCT hadm_id, 1 AS {comorb} FROM {table}").df()
        if not df_comorb.empty:
            df_comorb['hadm_id'] = df_comorb['hadm_id'].astype(str)
            df_blood_full = pd.merge(df_blood_full, df_comorb, on='hadm_id', how='left')
            df_blood_full[comorb] = df_blood_full[comorb].fillna(0).astype(int)
        else:
            # If table empty or absent, create column of zeros
            df_blood_full[comorb] = 0

    con.close()

    # Guarantee expected comorb columns exist
    expected_comorb_cols = list(comorb_tables.keys())
    for col in expected_comorb_cols:
        if col not in df_blood_full.columns:
            df_blood_full[col] = 0

    # Encode gender
    df_blood_full['gender'] = df_blood_full['gender'].map({'M': 0, 'F': 1})
    # If any unknown values remain, fill with median later

    # Compute numeric medians (used to fill missing numeric values as a robust proxy)
    numeric_cols = df_blood_full.select_dtypes(include=['number']).columns.tolist()
    median_values = df_blood_full[numeric_cols].median().to_dict()

    # Fill numeric NaNs with medians computed from df_blood_full
    df_blood_full.fillna(median_values, inplace=True)

    # ----- Load training artifacts (must exist in working directory) -----
    try:
        feature_names_full = joblib.load("feature_names_full-7.joblib")   # list of 99 feature names used for scaler
        selected_features = joblib.load("selected_features-7.joblib")     # final selected 30 features
        top_mi_features = joblib.load("top_mi_features-7.joblib")         # optional debug list
    except Exception as e:
        raise RuntimeError(f"Required joblib artifact missing or unreadable: {e}")

    if not isinstance(feature_names_full, list):
        feature_names_full = list(feature_names_full)

    print(f"Loaded artifacts: feature_names_full ({len(feature_names_full)}), selected_features ({len(selected_features)})")

    # --------------------------------------------------------------------------------
    # Merge ECG metadata (paths) -> keep only patients with ECG metadata and severity labels
    # --------------------------------------------------------------------------------
    # We only need subject_id + processed_npy_path + feature_path from ECG CSV
    if 'processed_npy_path' not in df_ecg_full.columns or 'feature_path' not in df_ecg_full.columns:
        raise RuntimeError("ECG metadata CSV must contain 'processed_npy_path' and 'feature_path' columns")

    df_common = pd.merge(
        df_blood_full,
        df_ecg_full[['subject_id', 'processed_npy_path', 'feature_path']],
        on='subject_id', how='inner'
    )
    # Keep only the severity levels used in model training
    df_common = df_common[df_common['severity_level'].isin(['Low', 'Moderate', 'High'])].reset_index(drop=True)

    # Map severity to numeric label for stratification
    df_common['severity_class'] = df_common['severity_level'].map({'Low': 0, 'Moderate': 1, 'High': 2})

    # Train/test split to create an independent test set for fusion (same random state as training code)
    from sklearn.model_selection import train_test_split
    df_train, df_test_fusion = train_test_split(
        df_common,
        test_size=0.2,
        random_state=42,
        stratify=df_common['severity_class']
    )

    # Ensure comorbidity columns exist in test
    for col in expected_comorb_cols:
        if col not in df_test_fusion.columns:
            df_test_fusion[col] = 0

    # Prepare the final blood matrix: drop identifiers and ECG columns
    columns_to_drop_final = ['subject_id', 'hadm_id', 'cad', 'severity_level', 'severity_class', 'processed_npy_path', 'feature_path']
    X_blood_all_features = df_test_fusion.drop(columns=[c for c in columns_to_drop_final if c in df_test_fusion.columns], errors='ignore')

    # Reindex to exactly the training 99-feature order; missing columns become NaN
    X_blood_all_features = X_blood_all_features.reindex(columns=feature_names_full)

    # Fill any remaining NaNs with the numeric medians computed earlier (or 0 fallback)
    for col in feature_names_full:
        if col not in X_blood_all_features.columns:
            # safety: add column of zeros (shouldn't happen because reindex added them)
            X_blood_all_features[col] = 0
    # Now replace NaNs with median_values (for numeric columns) or 0
    for col in X_blood_all_features.columns:
        if X_blood_all_features[col].isna().any():
            if col in median_values:
                X_blood_all_features[col].fillna(median_values[col], inplace=True)
            else:
                # Non-numeric or missing median - fill with 0
                X_blood_all_features[col].fillna(0, inplace=True)

    # Final sanity checks
    print(f"Blood test features shape: {X_blood_all_features.shape}")
    if X_blood_all_features.shape[1] != len(feature_names_full):
        raise RuntimeError(f"Feature count mismatch after reindexing: expected {len(feature_names_full)}, got {X_blood_all_features.shape[1]}")

    # Prepare outputs
    y_test_fusion = df_test_fusion['severity_class'].values
    X_ecg_metadata = df_test_fusion  # contains processed_npy_path, feature_path, severity_level etc.
    original_feature_names_list = feature_names_full

    return X_blood_all_features, X_ecg_metadata, y_test_fusion, original_feature_names_list


In [None]:
# ============================================
# 4. PREDICTION GENERATION
# ============================================

def generate_predictions(X_blood_raw, X_ecg_metadata, y_true, original_feature_names):
    """Generates P_A and P_B and creates the fusion dataset."""
    
    print("Generating Predictions")
    print(f"Original Feature List Size: {len(original_feature_names)}")
    
    # --- Load XGBoost Components (Model A) ---
    model_blood = joblib.load(BLOOD_MODEL_PATH)
    scaler = joblib.load(SCALER_PATH)
    selected_features = joblib.load(FEATURES_LIST_PATH)
    
    # --- Load ECG Model (Model B) ---
    model_ecg = FinalMultiInputHybridModel(num_clinical_features=NUM_CLINICAL_FEATURES).to(DEVICE)
    model_ecg.load_state_dict(torch.load(ECG_MODEL_PATH, map_location=DEVICE))
    model_ecg.eval()
    
    # --- A. Model A: XGBoost Predictions (P_A) ---
    X_test_blood_df = X_blood_raw.copy()
    
    # Get the names of the 99 features the scaler expects (passed from data loading)
    expected_99_features = set(original_feature_names)
    
    # Get the names of the 104 features currently in the test data
    current_104_features = set(X_test_blood_df.columns)
    
    # Find columns that are in the test data but NOT in the expected 99 features
    extra_columns_to_drop = list(current_104_features - expected_99_features)
    
    if extra_columns_to_drop:
        print("\nFeature Mismatch Detected!")
        print(f"Expected Features: {len(expected_99_features)}")
        print(f"Received Features: {len(current_104_features)}")
        print(f"Extra Columns Found (must be dropped): {extra_columns_to_drop}")
        
        # Drop the extra columns
        X_test_blood_df = X_test_blood_df.drop(columns=extra_columns_to_drop)
        print(f"Successfully dropped {len(extra_columns_to_drop)} columns.")
    
    # Check for missing columns (should be empty if data loading is correct)
    missing_columns = list(expected_99_features - set(X_test_blood_df.columns))
    if missing_columns:
        print(f"FATAL ERROR: Missing columns required by scaler: {missing_columns}")
        return None, None # Stop execution if features are missing!
    
    # 1. Align the columns and order them correctly for the scaler (99 features)
    # This step is crucial because the scaler relies on positional ordering.
    X_test_full = X_test_blood_df[original_feature_names]
    
    # Forcing NumPy conversion ensures no residual Pandas metadata confuses the scaler.
    X_test_full_np = X_test_full.values 
    
    # DEBUG CHECK: What is the size immediately before the crash?
    print(f"DEBUG CHECK: X_test_full_np shape: {X_test_full_np.shape}")
    
    # 2. Scale the full 99 features
    X_test_scaled_full = scaler.transform(X_test_full_np)      #failing but why
    
    # 3. Select the final 30 features from the scaled array for the XGBoost model
    
    X_test_scaled_df = pd.DataFrame(X_test_scaled_full, columns=original_feature_names)
    X_test_sel = X_test_scaled_df[selected_features]
    
    # Predict probabilities
    P_A = model_blood.predict_proba(X_test_sel)
    print(f"P_A (XGBoost) probabilities shape: {P_A.shape}")

    # --- B. Model B: CNN-LSTM Predictions (P_B) ---
    ecg_dataset = HybridECGDataset(X_ecg_metadata)
    ecg_loader = DataLoader(ecg_dataset, batch_size=BATCH_SIZE, shuffle=False)
    
    P_B_list = []
    with torch.no_grad():
        for (x_signal, x_features), _ in tqdm(ecg_loader, desc="ECG Model Prediction"):
            x_signal, x_features = x_signal.to(DEVICE), x_features.to(DEVICE)
            logits = model_ecg(x_signal, x_features)
            probabilities = F.softmax(logits, dim=1).cpu().numpy()
            P_B_list.append(probabilities)
            
    P_B = np.concatenate(P_B_list, axis=0)
    print(f"P_B (ECG-LSTM) probabilities shape: {P_B.shape}")
    
    # --- C. Create Fusion Dataset ---
    X_fusion = np.hstack((P_A, P_B)) # Shape N x 6
    Y_fusion = y_true
    
    return X_fusion, Y_fusion

In [21]:
# ============================================
# 5. LATE FUSION (META-MODEL TRAINING)
# ============================================

def perform_late_fusion(X_fusion, Y_fusion):
    """Trains a Logistic Regression Meta-Model on the combined probabilities."""
    print("\nTraining Logistic Regression Meta-Model")
    
    # Splitting the test set predictions into a Meta-Train and Meta-Test set
    # This allows us to train the fusion rule and evaluate the final system fairly.
    X_meta_train, X_meta_test, y_meta_train, y_meta_test = train_test_split(
        X_fusion, Y_fusion, test_size=0.5, random_state=42, stratify=Y_fusion
    )

    # Train the Meta-Model (Logistic Regression)
    meta_model = LogisticRegression(solver='liblinear', multi_class='auto', random_state=42)
    meta_model.fit(X_meta_train, y_meta_train)
    
    # --- SAVE THE FUSION MODEL CHECKPOINT ---
    
    # 1. Create the directory if it doesn't exist
    os.makedirs(FUSION_DIR, exist_ok=True)
    
    # 2. Define the full path for the checkpoint file
    FUSION_MODEL_FILENAME = os.path.join(FUSION_DIR, 'fusion_meta_model.joblib')
    
    try:
        joblib.dump(meta_model, FUSION_MODEL_FILENAME)
        print(f"\nFusion Meta-Model successfully saved to {FUSION_MODEL_FILENAME}")
    except Exception as e:
        print(f"Error saving fusion model: {e}")

    # Evaluate Fusion Performance on the Meta-Test Set
    y_fusion_pred = meta_model.predict(X_meta_test)
    
    acc_fusion = accuracy_score(y_meta_test, y_fusion_pred)
    f1_w = f1_score(y_meta_test, y_fusion_pred, average='weighted')

    print("\n")
    print("FINAL LATE FUSION PERFORMANCE (Meta-Model)")
    print(f"Accuracy on Meta-Test Set: {acc_fusion:.4f}")
    print(f"Weighted F1-Score: {f1_w:.4f}")
    print("\nClassification Report:")
    print(classification_report(y_meta_test, y_fusion_pred, target_names=['Low','Moderate','High']))
    print('\n')

    # Optionally, you can also test the individual model performance on the same set for comparison
    P_A_test = X_meta_test[:, 0:NUM_CLASSES]
    P_B_test = X_meta_test[:, NUM_CLASSES:2*NUM_CLASSES]
    
    acc_a = accuracy_score(y_meta_test, np.argmax(P_A_test, axis=1))
    acc_b = accuracy_score(y_meta_test, np.argmax(P_B_test, axis=1))
    

In [22]:
#main exec
print(f"Starting fusion")

# Step 1: Load and Align the Common Test Data, Successfully loads the 99 feature names list
X_blood_raw, X_ecg_metadata, y_test_fusion, original_feature_names_list = load_and_align_test_data(CLEANED_CSV_PATH, DUCKDB_PATH)

# Step 2: Generate Predictions (P_A and P_B)
X_fusion, Y_fusion = generate_predictions(X_blood_raw, X_ecg_metadata, y_test_fusion, original_feature_names_list)

# Check if prediction generation was successful before fusion
if X_fusion is not None:
    perform_late_fusion(X_fusion, Y_fusion)

Starting fusion
Loaded artifacts: feature_names_full (99), selected_features (30)
Blood test features shape: (11393, 99)
Generating Predictions
Original Feature List Size: 99
DEBUG CHECK: X_test_full_np shape: (11393, 99)
P_A (XGBoost) probabilities shape: (11393, 3)


ECG Model Prediction: 100%|██████████| 179/179 [05:00<00:00,  1.68s/it]

P_B (ECG-LSTM) probabilities shape: (11393, 3)

Training Logistic Regression Meta-Model

Fusion Meta-Model successfully saved to fusion_1_checkpoints\fusion_meta_model.joblib


FINAL LATE FUSION PERFORMANCE (Meta-Model)
Accuracy on Meta-Test Set: 0.9326
Weighted F1-Score: 0.9332

Classification Report:
              precision    recall  f1-score   support

         Low       0.81      0.89      0.85       798
    Moderate       0.95      0.94      0.94      3068
        High       0.97      0.94      0.95      1831

    accuracy                           0.93      5697
   macro avg       0.91      0.92      0.92      5697
weighted avg       0.93      0.93      0.93      5697








In [20]:
# import duckdb
# import os

# DB_PATH = "../../final_db/mimic_analysis.db"

# try:
#     # 1. Attempt to connect to the locked database file path
#     # This often forces DuckDB to acknowledge the old lock or cleanup resources.
#     cleanup_con = duckdb.connect(DB_PATH) 
    
#     # 2. Immediately close the connection in a controlled way
#     cleanup_con.close()
    
#     print(f"Successfully connected to and closed the connection to: {DB_PATH}")
#     print("The file lock should now be released.")

# except Exception as e:
#     print(f"Failed to connect and close the database at {DB_PATH}.")
#     print(f"Error: {e}")
#     # In rare cases, you might need to restart your terminal/IDE or the machine

In [None]:
# import joblib
# import torch
# import torch.nn as nn
# import torch.nn.functional as F
# from torch.utils.data import Dataset, DataLoader
# import numpy as np
# import pandas as pd
# import duckdb
# from sklearn.linear_model import LogisticRegression
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import accuracy_score, classification_report, f1_score
# from tqdm import tqdm
# import os
# import warnings
# import sys 

# # Suppress warnings
# warnings.filterwarnings("ignore", category=UserWarning)
# warnings.filterwarnings("ignore", category=FutureWarning)

# # --- CONFIGURATION ---
# BLOOD_MODEL_PATH = 'xgb_cad_severity_model-7.joblib'
# SCALER_PATH = 'scaler_cad_severity-7.joblib'
# FEATURES_LIST_PATH = 'selected_features-7.joblib'
# DUCKDB_PATH = '../../final_db/mimic_analysis.db'

# ECG_MODEL_PATH = './cnn_lstm_hybrid_checkpoints/hybrid_model_epoch60.pt' 
# CLEANED_CSV_PATH = 'metadata_with_features.csv'

# FUSION_DIR = 'fusion_1_checkpoints'

# BATCH_SIZE = 64
# LEAD_COUNT = 12
# SAMPLES = 2500
# NUM_CLASSES = 3 
# NUM_CLINICAL_FEATURES = 8 
# HIDDEN_SIZE = 64
# DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# # ============================================
# # 1. ECG MODEL ARCHITECTURE 
# # ============================================

# class CNNLSTM_FeatureExtractor(nn.Module):
#     """Processes the raw ECG signal to generate a deep feature vector."""
#     def __init__(self, hidden_size=HIDDEN_SIZE, num_layers=1):
#         super().__init__()
#         self.hidden_size = hidden_size
        
#         # CNN (Feature Extraction)
#         self.cnn = nn.Sequential(
#             nn.Conv1d(LEAD_COUNT, 32, kernel_size=15, stride=2, padding=7),
#             nn.BatchNorm1d(32), nn.ReLU(inplace=True),
#             nn.MaxPool1d(kernel_size=3, stride=2, padding=1),
            
#             nn.Conv1d(32, 64, kernel_size=11, stride=2, padding=5),
#             nn.BatchNorm1d(64), nn.ReLU(inplace=True),
#             nn.MaxPool1d(kernel_size=3, stride=2, padding=1),
            
#             nn.Conv1d(64, 128, kernel_size=7, stride=2, padding=3),
#             nn.BatchNorm1d(128), nn.ReLU(inplace=True)
#         )
        
#         # LSTM (Temporal Context)
#         self.lstm = nn.LSTM(
#             input_size=128, hidden_size=hidden_size, num_layers=num_layers,
#             batch_first=True, bidirectional=True
#         )

#     def forward(self, x):
#         cnn_out = self.cnn(x) 
#         lstm_input = cnn_out.transpose(1, 2) 
        
#         _, (h_n, _) = self.lstm(lstm_input)
        
#         # Concatenate final hidden states from both directions
#         final_state = torch.cat((h_n[-2, :, :], h_n[-1, :, :]), dim=1)
#         return final_state
        
# class FinalMultiInputHybridModel(nn.Module):
#     """Combines deep features (CNN-LSTM output) and 8 handcrafted features."""
#     def __init__(self, num_classes=NUM_CLASSES, num_clinical_features=NUM_CLINICAL_FEATURES):
#         super().__init__()
        
#         self.signal_extractor = CNNLSTM_FeatureExtractor(hidden_size=HIDDEN_SIZE)
        
#         # Input size: (2 * HIDDEN_SIZE for bidirectional LSTM) + NUM_CLINICAL_FEATURES
#         INPUT_SIZE = (2 * HIDDEN_SIZE) + num_clinical_features 
        
#         self.final_fc = nn.Sequential(
#             nn.Linear(INPUT_SIZE, 64),
#             nn.ReLU(inplace=True),
#             nn.Dropout(0.5),
#             nn.Linear(64, num_classes)
#         )

#     def forward(self, signal, clinical_features):
#         deep_features = self.signal_extractor(signal) 
        
#         combined_features = torch.cat((deep_features, clinical_features.float()), dim=1) 
        
#         return self.final_fc(combined_features)
		
# # ============================================
# # 2. ECG DATASET CLASS 
# # ============================================
# class HybridECGDataset(Dataset):
#     """Dataset for loading ECG signal and 8 handcrafted features."""
#     def __init__(self, df):
#         self.df = df.reset_index(drop=True)

#     def __len__(self):
#         return len(self.df)

#     def __getitem__(self, idx):
#         row = self.df.iloc[idx]
        
#         # --- Load Signal ---
#         try:
#             signal = np.load(row['processed_npy_path']).T
#             if signal.shape[1] > SAMPLES:
#                 signal = signal[:, :SAMPLES]
#             elif signal.shape[1] < SAMPLES:
#                  # Pad if signal is too short
#                  padding = np.zeros((LEAD_COUNT, SAMPLES - signal.shape[1]), dtype=np.float32)
#                  signal = np.concatenate([signal, padding], axis=1)
#             signal = np.nan_to_num(signal, nan=0.0)
#         except Exception: 
#             signal = np.zeros((LEAD_COUNT, SAMPLES), dtype=np.float32)

#         # --- Load Clinical Features ---
#         try:
#             features = np.load(row['feature_path'])
#         except Exception:
#             features = np.zeros(NUM_CLINICAL_FEATURES, dtype=np.float32)

#         label = {"Low": 0, "Moderate": 1, "High": 2}[row['severity_level']]
        
#         # Return Signal, 8 Handcrafted Features, and Label
#         return (torch.tensor(signal, dtype=torch.float32), 
#                 torch.tensor(features, dtype=torch.float32)), label
				
# # ============================================
# # 3. DATA ACQUISITION & ALIGNMENT
# # ============================================

# def load_and_align_test_data(cleaned_csv_path, duckdb_path):
#     """
#     Loads ECG metadata and blood lab data, aligns blood features to the exact
#     99-feature schema used in training, and returns the test set components.
#     """
    
#     # 1) Load ECG metadata CSV
#     df_ecg_full = pd.read_csv(cleaned_csv_path)

#     # 2) Query DuckDB for admissions + labs (same logic as training)
#     con = duckdb.connect(duckdb_path)

#     df_blood_full = con.execute("""
#         SELECT hadm_id, subject_id, anchor_age, gender, cad, severity_level
#         FROM admissions_severity
#         WHERE severity_level IS NOT NULL
#     """).df()

#     # lab item selection (same keyword groups as training)
#     cbc_keywords = ["hemoglobin", "hematocrit", "rbc", "wbc", "platelet", "mcv", "mch", "mchc", "rdw", "neutrophil", "lymphocyte", "monocyte", "eosinophil", "basophil"]
#     bmp_keywords = ["sodium", "potassium", "chloride", "bicarbonate", "co2", "urea", "bun", "creatinine", "glucose", "calcium"]
#     lft_keywords = ["albumin", "protein", "bilirubin", "alkaline phosphatase", "ast", "sgot", "alt", "sgpt", "lactate"]
#     lipid_keywords = ["cholesterol", "hdl", "ldl", "triglyceride"]
#     cardiac_keywords = ["troponin", "ck-mb", "creatine kinase", "ck", "bnp", "nt-probnp", "hs-crp"]
#     all_keywords = cbc_keywords + bmp_keywords + lft_keywords + lipid_keywords + cardiac_keywords
#     pattern = '|'.join(all_keywords)

#     df_labitems = con.execute("SELECT itemid, label, fluid FROM d_labitems WHERE LOWER(fluid) = 'blood'").df()
#     mask = df_labitems['label'].str.lower().str.contains(pattern, na=False)
#     blood_itemids = df_labitems[mask]['itemid'].tolist()
    
#     if len(blood_itemids) == 0:
#         con.close()
#         raise RuntimeError("No matching blood lab items found. Check df_labitems and keyword list.")
    
#     blood_itemids_str = ', '.join(map(str, blood_itemids))

#     df_labs = con.execute(f"""
#         SELECT l.subject_id, d.label, AVG(l.valuenum) as mean_value
#         FROM labevents l JOIN d_labitems d ON l.itemid = d.itemid
#         WHERE l.itemid IN ({blood_itemids_str})
#         GROUP BY l.subject_id, d.label
#     """).df()
#     df_labs_pivot = df_labs.pivot(index='subject_id', columns='label', values='mean_value').reset_index()

#     # Merge lab pivot with admission-level demographics
#     df_blood_full = pd.merge(df_blood_full, df_labs_pivot, on='subject_id', how='left')

#     # Add comorbidities
#     comorb_tables = {
#         'diabetes': 'diabetes_adm',
#         'hypertension': 'hypertension_adm',
#         'renal': 'renal_adm',
#         'obesity': 'obesity_icd_adm',
#         'smoker': 'smokers_adm'
#     }
#     # ensure hadm_id present as string
#     df_blood_full['hadm_id'] = df_blood_full['hadm_id'].astype(str)

#     for comorb, table in comorb_tables.items():
#         df_comorb = con.execute(f"SELECT DISTINCT hadm_id, 1 AS {comorb} FROM {table}").df()
#         if not df_comorb.empty:
#             df_comorb['hadm_id'] = df_comorb['hadm_id'].astype(str)
#             df_blood_full = pd.merge(df_blood_full, df_comorb, on='hadm_id', how='left')
#             df_blood_full[comorb] = df_blood_full[comorb].fillna(0).astype(int)
#         else:
#             # If table empty or absent, create column of zeros
#             df_blood_full[comorb] = 0

#     con.close()

#     # Guarantee expected comorb columns exist (Post-merge cleanup)
#     expected_comorb_cols = list(comorb_tables.keys())
#     for col in expected_comorb_cols:
#         if col not in df_blood_full.columns:
#             df_blood_full[col] = 0

#     # Encode gender
#     df_blood_full['gender'] = df_blood_full['gender'].map({'M': 0, 'F': 1})
    
#     # Compute numeric medians (used to fill missing numeric values as a robust proxy)
#     numeric_cols = df_blood_full.select_dtypes(include=['number']).columns.tolist()
#     median_values = df_blood_full[numeric_cols].median().to_dict()

#     # Fill numeric NaNs with medians computed from df_blood_full
#     df_blood_full.fillna(median_values, inplace=True)

#     # ----- Load training artifacts -----
#     try:
#         feature_names_full = joblib.load(f"{SCALER_PATH[:-7]}feature_names_full-7.joblib")   # list of 99 feature names used for scaler
#         selected_features = joblib.load(FEATURES_LIST_PATH)     # final selected 30 features
#     except Exception as e:
#         raise RuntimeError(f"Required joblib artifact missing or unreadable: {e}")

#     if not isinstance(feature_names_full, list):
#         feature_names_full = list(feature_names_full)

#     # --------------------------------------------------------------------------------
#     # FIX: ESTABLISH GROUND TRUTH 99-FEATURE LIST 
#     # --------------------------------------------------------------------------------
    
#     columns_to_drop_for_X = ['subject_id', 'hadm_id', 'cad', 'severity_level', 'severity_class'] 
    
#     X_initial_structure = df_blood_full.drop(
#         columns=[col for col in columns_to_drop_for_X if col in df_blood_full.columns]
#     )
    
#     # Store the list of 99 feature names derived from the initial training data structure
#     original_feature_names_list = X_initial_structure.columns.tolist() 
    
#     # --------------------------------------------------------------------------------
    
#     # Merge ECG metadata (paths) -> keep only patients with ECG metadata and severity labels
#     df_common = pd.merge(
#         df_blood_full,
#         df_ecg_full[['subject_id', 'processed_npy_path', 'feature_path']],
#         on='subject_id', how='inner'
#     )
    
#     # Keep only the severity levels used in model training
#     df_common = df_common[df_common['severity_level'].isin(['Low', 'Moderate', 'High'])].reset_index(drop=True)

#     # Map severity to numeric label for stratification
#     df_common['severity_class'] = df_common['severity_level'].map({'Low': 0, 'Moderate': 1, 'High': 2})

#     # Train/test split to create an independent test set for fusion (same random state as training code)
#     df_train, df_test_fusion = train_test_split(
#         df_common,
#         test_size=0.2,
#         random_state=42,
#         stratify=df_common['severity_class']
#     )

#     # Final check to ensure comorbidity columns exist in test
#     for col in expected_comorb_cols:
#         if col not in df_test_fusion.columns:
#             df_test_fusion[col] = 0

#     # Prepare the final blood matrix: drop identifiers and ECG columns
#     columns_to_drop_final = ['subject_id', 'hadm_id', 'cad', 'severity_level', 'severity_class', 'processed_npy_path', 'feature_path']
#     X_blood_all_features = df_test_fusion.drop(columns=[c for c in columns_to_drop_final if c in df_test_fusion.columns], errors='ignore')

#     # Reindex to exactly the training 99-feature order; this line enforces the correct schema and order
#     X_blood_all_features = X_blood_all_features[original_feature_names_list]

#     # Final sanity check on the data size before returning (Should print 99)
#     print(f"Blood test features shape: {X_blood_all_features.shape}")
#     print(f"Expected 99 features, got {X_blood_all_features.shape[1]} features")

#     # Prepare outputs
#     y_test_fusion = df_test_fusion['severity_class'].values
#     X_ecg_metadata = df_test_fusion  # contains processed_npy_path, feature_path, severity_level etc.
    
#     return X_blood_all_features, X_ecg_metadata, y_test_fusion, original_feature_names_list

# # ============================================
# # 4. PREDICTION GENERATION
# # ============================================

# def generate_predictions(X_blood_raw, X_ecg_metadata, y_true, original_feature_names):
#     """Generates P_A and P_B and creates the fusion dataset."""
    
#     print("Generating Predictions")
#     print(f"Original Feature List Size: {len(original_feature_names)}")
    
#     # --- Load XGBoost Components (Model A) ---
#     model_blood = joblib.load(BLOOD_MODEL_PATH)
#     scaler = joblib.load(SCALER_PATH)
#     selected_features = joblib.load(FEATURES_LIST_PATH)
    
#     # --- Load ECG Model (Model B) ---
#     model_ecg = FinalMultiInputHybridModel(num_clinical_features=NUM_CLINICAL_FEATURES).to(DEVICE)
#     model_ecg.load_state_dict(torch.load(ECG_MODEL_PATH, map_location=DEVICE))
#     model_ecg.eval()
    
#     # --- A. Model A: XGBoost Predictions (P_A) ---
#     X_test_blood_df = X_blood_raw.copy()
    
#     # 1. Align the columns and order them correctly for the scaler (99 features)
#     X_test_full = X_test_blood_df[original_feature_names]
    
#     # Forcing NumPy conversion ensures no residual Pandas metadata confuses the scaler.
#     X_test_full_np = X_test_full.values 
    
#     print(f"DEBUG CHECK: X_test_full_np shape: {X_test_full_np.shape}")
    
#     # 2. Scale the full 99 features
#     X_test_scaled_full = scaler.transform(X_test_full_np)
    
#     # 3. Select the final 30 features from the scaled array for the XGBoost model
#     X_test_scaled_df = pd.DataFrame(X_test_scaled_full, columns=original_feature_names)
#     X_test_sel = X_test_scaled_df[selected_features].values
    
#     # Predict probabilities
#     P_A = model_blood.predict_proba(X_test_sel)
#     print(f"P_A (XGBoost) probabilities shape: {P_A.shape}")

#     # --- B. Model B: CNN-LSTM Predictions (P_B) ---
#     ecg_dataset = HybridECGDataset(X_ecg_metadata)
#     ecg_loader = DataLoader(ecg_dataset, batch_size=BATCH_SIZE, shuffle=False)
    
#     P_B_list = []
#     with torch.no_grad():
#         for (x_signal, x_features), _ in tqdm(ecg_loader, desc="ECG Model Prediction"):
#             x_signal, x_features = x_signal.to(DEVICE), x_features.to(DEVICE)
#             logits = model_ecg(x_signal, x_features)
#             probabilities = F.softmax(logits, dim=1).cpu().numpy()
#             P_B_list.append(probabilities)
            
#     P_B = np.concatenate(P_B_list, axis=0)
#     print(f"P_B (ECG-LSTM) probabilities shape: {P_B.shape}")
    
#     # --- C. Create Fusion Dataset ---
#     X_fusion = np.hstack((P_A, P_B)) # Shape N x 6
#     Y_fusion = y_true
    
#     return X_fusion, Y_fusion

# # ============================================
# # 5. LATE FUSION (META-MODEL TRAINING)
# # ============================================

# def perform_late_fusion(X_fusion, Y_fusion):
#     """Trains a Logistic Regression Meta-Model on the combined probabilities."""
#     print("\nTraining Logistic Regression Meta-Model")
    
#     # Splitting the test set predictions into a Meta-Train and Meta-Test set
#     X_meta_train, X_meta_test, y_meta_train, y_meta_test = train_test_split(
#         X_fusion, Y_fusion, test_size=0.5, random_state=42, stratify=Y_fusion
#     )

#     # Train the Meta-Model (Logistic Regression)
#     meta_model = LogisticRegression(solver='liblinear', multi_class='auto', random_state=42)
#     meta_model.fit(X_meta_train, y_meta_train)
    
#     # --- SAVE THE FUSION MODEL CHECKPOINT ---
#     os.makedirs(FUSION_DIR, exist_ok=True)
#     FUSION_MODEL_FILENAME = os.path.join(FUSION_DIR, 'fusion_meta_model.joblib')
    
#     try:
#         joblib.dump(meta_model, FUSION_MODEL_FILENAME)
#         print(f"\nFusion Meta-Model successfully saved to {FUSION_MODEL_FILENAME}")
#     except Exception as e:
#         print(f"Error saving fusion model: {e}")

#     # Evaluate Fusion Performance on the Meta-Test Set
#     y_fusion_pred = meta_model.predict(X_meta_test)
    
#     acc_fusion = accuracy_score(y_meta_test, y_fusion_pred)
#     f1_w = f1_score(y_meta_test, y_fusion_pred, average='weighted')

#     print("\n")
#     print("FINAL LATE FUSION PERFORMANCE (Meta-Model)")
#     print(f"Accuracy on Meta-Test Set: {acc_fusion:.4f}")
#     print(f"Weighted F1-Score: {f1_w:.4f}")
#     print("\nClassification Report:")
#     print(classification_report(y_meta_test, y_fusion_pred, target_names=['Low','Moderate','High']))
#     print('\n')

#     # Optionally, you can also test the individual model performance on the same set for comparison
#     P_A_test = X_meta_test[:, 0:NUM_CLASSES]
#     P_B_test = X_meta_test[:, NUM_CLASSES:2*NUM_CLASSES]
    
#     acc_a = accuracy_score(y_meta_test, np.argmax(P_A_test, axis=1))
#     acc_b = accuracy_score(y_meta_test, np.argmax(P_B_test, axis=1))
#     print(f"Individual XGBoost (A) Accuracy: {acc_a:.4f}")
#     print(f"Individual ECG-LSTM (B) Accuracy: {acc_b:.4f}")

# # ============================================
# # MAIN EXECUTION
# # ============================================

# if __name__ == "__main__":
#     print(f"Starting fusion")

#     # Step 1: Load and Align the Common Test Data, Successfully loads the 99 feature names list
#     X_blood_raw, X_ecg_metadata, y_test_fusion, original_feature_names_list = load_and_align_test_data(CLEANED_CSV_PATH, DUCKDB_PATH)

#     # Step 2: Generate Predictions (P_A and P_B)
#     X_fusion, Y_fusion = generate_predictions(X_blood_raw, X_ecg_metadata, y_test_fusion, original_feature_names_list)

#     # Check if prediction generation was successful before fusion
#     if X_fusion is not None:
#         perform_late_fusion(X_fusion, Y_fusion)