# 1. Loading in Required Packages

In [None]:
# ===============================
# Standard Library Imports
# ===============================
import os
import gc

# ===============================
# Third-Party Imports
# ===============================
import numpy as np
import pandas as pd
import importlib.util
from IPython.display import clear_output

# TensorFlow / Keras
import tensorflow as tf
import keras
from tensorflow.keras import backend as K
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, GlobalAveragePooling2D
from tensorflow.keras.optimizers import Adam

# Scikit-learn
from sklearn.model_selection import train_test_split

# ===============================
# Local Module Imports
# ===============================
from QHETI_Transformer import *
from model_evaluation import *
from LR_scheduler import *

# ... MORE IMPORTS COMMENTED OUT

# 2. Checking if GPU is present

In [None]:
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        # Set memory growth to avoid allocating all GPU memory upfront
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        print(f"✅ Using GPU: {[gpu.name for gpu in gpus]}")
    except RuntimeError as e:
        print(f"❌ RuntimeError: {e}")
else:
    print("⚠️ No GPU found. Running on CPU.")

# 3. Version Check

In [5]:
# CHECK FOR CORRECT KERAS AND PANDAS VERSIONS
print("keras.__version__ = ", keras.__version__) # 2.14.0

# Error will occur if pandas greater than specified due to loss of backward compatibility
# https://stackoverflow.com/questions/75953279/modulenotfounderror-no-module-named-pandas-core-indexes-numeric-using-metaflo
# pip install "pandas<2.0.0"
print("pd.__version__ = ", pd.__version__) # 1.5.3
print("np.__version__ = ", np.__version__) # 1.24.4
print("tf.__version__ = ", tf.__version__) # 2.14.0
# [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
print("tf.config.list_physical_devices('GPU') = ", tf.config.list_physical_devices())
print("tf.test.is_built_with_cuda() = ", tf.test.is_built_with_cuda())  # True
device = "cuda" if tf.test.is_built_with_cuda() else "cpu"
print("device = ", device)

keras.__version__ =  2.14.0
pd.__version__ =  1.5.3
np.__version__ =  1.24.4
tf.__version__ =  2.14.0
tf.config.list_physical_devices('GPU') =  [PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'), PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
tf.test.is_built_with_cuda() =  True
device =  cuda


# 4. Data Settings (Individual)

In [None]:
# ===============================
# File Paths
# ===============================
source_file_path = ""   # specify your data source path
OUTPUT_PATH = ""        # specify your output path

# ===============================
# Feature Info
# ===============================
quadrant_features = {
    'Q1': [],
    'Q2': [],
    'Q3': [],
    'Q4': []
}

FEATURES_DROPPED = []

# ===============================
# Model Training Config
# ===============================
CLASS_VAR = "class"         
MINORITY_CLASS = 0          

NUM_CV_FOLDS = 3
FIRST_EPOCHS = ""
LAST_EPOCHS = [1000]
BATCH_SIZE = ""

# Layers to unfreeze during transfer learning
NUM_LAYERS_UNFROZEN_SOURCE = []
NUM_LAYERS_UNFROZEN_IND = []

bool_remove_target = False
augmentation_algo = ""   # placeholder

# ===============================
# Evaluation Metrics
# ===============================
EVALUATION_METRICS = [
    "Weighted Accuracy", "Sensitivity/Recall", "Specificity",
    "Precision_class0", "Precision_class1", "Precision_avg",
    "F1_class0", "F1_class1", "F1_avg", "auc_roc_score",
    "False_Discovery_Rate", "False_Negative_Rate",
    "False_Omission_Rate", "False_Positive_Rate", "Jaccard"
]

# ===============================
# Experiment Groups
# ===============================
patient_grp = [
    # Patients list removed for privacy
]

# 5. Load & Process Patient Data

In [None]:
# Function to load data from a file and convert it to a NumPy array (if applicable)
def load_data(source_file_path, allow_pickle=True):
    try:
        # Load the .npy file; expected to contain a single dictionary object
        data_ndarr = np.load(source_file_path, allow_pickle=allow_pickle)
        print(f"[INFO] Loaded object of type: {type(data_ndarr)}")

        # Extract the dictionary (assumes it's the only item in the array)
        datadict = data_ndarr.item()
        print(f"[INFO] Extracted dictionary of type: {type(datadict)}")

        return datadict

    except IOError as e:
        raise IOError(f"[ERROR] Failed to load data from {source_file_path}") from e
    
datadict = load_data(source_file_path)

In [None]:
def process_patient_data(datadict):
    p_ids = datadict.keys()
    sample_size_dict = {}
    print("patients: n =", len(p_ids), end="\n\n")

    for p_id in p_ids:
        df = datadict[p_id]
        df.columns = df.columns.str.lower()
        # Drop common unnecessary columns
        df.drop(FEATURES_DROPPED, axis=1, inplace=True)
        
        # Convert data frame to NumPy array and cast to float32
        df = np.asarray(df).astype(np.float32)
        print(p_id, "shape:", df.shape)
        sample_size_dict[p_id] = df.shape[0]

    return sample_size_dict, p_ids, df


sample_size_dict, p_ids, df = process_patient_data(datadict)

In [10]:
def split_data(target, data, bool_remove_target, transfer=False) -> (pd.DataFrame, pd.DataFrame):
    dataset = pd.DataFrame()

    if transfer:
        if bool_remove_target:
            dataset = pd.concat([data[key].copy() for key in data.keys() if key != target])
        else:
            dataset = pd.concat([data[key].copy() for key in data.keys()])
    else:
        dataset = data[target].copy()

    X = dataset.drop([CLASS_VAR], axis=1)
    Y = dataset[[CLASS_VAR]]
    return X, Y

# 6. Data Augmentation

In [None]:
# Dynamically load external data augmentation module
spec_file = "../DataAugmentation.py"
spec = importlib.util.spec_from_file_location("DataAugmentation", spec_file)
balance_method = importlib.util.module_from_spec(spec)
spec.loader.exec_module(balance_method)

def data_augmentation(train_dataset, classIndex, minorityLabel, printDebug=False):
    # Apply augmentation (placeholder: algorithm intentionally not specified)
    train_dataset = balance_method.augment_method(
        train_dataset,
        numIterations=5,
        printDebug=printDebug,
    )
    
    if printDebug:
        print(f"[INFO] Augmented data size: {train_dataset.shape}")
        print("~~~~~~~ Class Distribution After Augmentation ~~~~~~~")
        print(train_dataset[classIndex].value_counts())

    return train_dataset

# 7. Model Training Utilities

In [None]:
# -----------------------------------
# Build Model
# -----------------------------------
def build_individual_model(
    pretrained_model, 
    input_shape=(224, 224, 3), 
    individual_learning_rate=0.0001, 
    unfreeze_base_model=False
):
    inputs = Input(shape=input_shape)
    base_model = MobileNetV2(input_tensor=inputs, include_top=False, weights='imagenet')
    base_model.trainable = unfreeze_base_model

    # NOTE: Custom proprietary architecture is applied here (not shown for confidentiality)
    x = GlobalAveragePooling2D()(base_model.output)
    outputs = Dense(1, activation='sigmoid')(x)

    individual_model = Model(inputs=inputs, outputs=outputs)
    individual_model.compile(optimizer=Adam(learning_rate=individual_learning_rate),
                          loss='binary_crossentropy',
                          metrics=['accuracy'])

    # NOTE: Proprietary weight transfer logic is applied here (not shown for confidentiality)
    return individual_model


# -----------------------------------
# Build Source Model
# -----------------------------------
def build_source_model(
    input_shape=(224, 224, 3), 
    source_learning_rate=0.0001, 
    unfreeze_base_model=False
):
    inputs = Input(shape=input_shape)
    base_model = MobileNetV2(input_tensor=inputs, include_top=False, weights='imagenet')
    base_model.trainable = unfreeze_base_model

    # NOTE: Custom proprietary architecture is applied here (not shown for confidentiality)
    x = GlobalAveragePooling2D()(base_model.output)
    outputs = Dense(1, activation='sigmoid')(x)

    source_model = Model(inputs=inputs, outputs=outputs)
    source_model.compile(optimizer=Adam(learning_rate=source_learning_rate),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return source_model


# -----------------------------------
# Find Source Model
# -----------------------------------
def find_source_model(OUTPUT_PATH, num_layers_unfrozen):
    model_dir = ""
    pattern = ""

    # NOTE: Custom logic for selecting models is proprietary/confidential (pattern matching kept minimal)
    matching_files = [
        file_name
        for file_name in os.listdir(model_dir)
        if file_name.endswith(".h5") and pattern in file_name
    ]

    return sorted(matching_files)

# -----------------------------------
# Compare Model Weights (Individual vs. Source) for double check
# -----------------------------------

def compare_all_weights(individual_model, source_model):
    # Ensure both models have the same number of layers
    if len(individual_model.layers) != len(source_model.layers):
        print(f"Models have different number of layers: "
              f"individual={len(individual_model.layers)}, source={len(source_model.layers)}")
        return False

    for i, (s_layer, t_layer) in enumerate(zip(individual_model.layers, source_model.layers)):
        s_weights = s_layer.get_weights()
        t_weights = t_layer.get_weights()

        # If both layers have weights, compare them
        if s_weights and t_weights:
            if len(s_weights) != len(t_weights):
                print(f"Layer {i} ('{s_layer.name}') weights count mismatch "
                      f"(individual: {len(s_weights)}, source: {len(t_weights)})")
                return False

            for j, (sw, tw) in enumerate(zip(s_weights, t_weights)):
                if not np.array_equal(sw, tw):
                    print(f"Layer {i} ('{s_layer.name}') weight #{j} differs.")
                    return False
        # If one layer has weights and the other does not
        elif s_weights or t_weights:
            print(f"Layer {i} ('{s_layer.name}') weights presence mismatch.")
            return False

    print("All corresponding layers' weights match exactly.")
    return True

# 8. individual Model Run Overview

In [None]:
# ---------------------------------------------------------
# Train individual Model for One Fold
# ---------------------------------------------------------
def train_single_fold(
    images_train, images_test, Y_train, Y_test,
    source_model_path, layer_ind, layer_source, 
    target_id, fold, first_epochs, last_epochs, 
    patient_source_layer_dir, patient_source_layer_eval_dir,
    version_tag
):
    """
    Trains a individual model for one fold using a source-trained model as initialization.
    """
    try:
        # NOTE: Proprietary model loading, architecture, and training logic is hidden
        # pretrained_model = build_source_model(input_shape=(224, 224, 3))
        # pretrained_model.load_weights(source_model_path)
        # individual_model = build_individual_model(pretrained_model=pretrained_model, input_shape=images_train.shape[1:])
        # ... Training phase 1 ...
        # ... Fine-tuning phase ...
        # ... Saving model and training curves ...
        # evaluator = Evaluation(individual_model)
        # results, confusion_matrix = evaluator.model_test(images_test, Y_test)
        # print(confusion_matrix)

        # Placeholder return for demonstration
        return ["metric1", "metric2", "metric3"]

    finally:
        # Clear session and memory (safe to keep)
        K.clear_session()
        # del individual_model
        gc.collect()

## 9. Patient-Specific Model Training Pipeline

In [None]:
# ---------------------------------------------------------
# Main Training Loop: Patients × Layers × Folds
# ---------------------------------------------------------
for last_epochs in LAST_EPOCHS:
    for layer_source in NUM_LAYERS_UNFROZEN_source:
        print(f"🧪 source Model Unfrozen Layers: {layer_source}")
        print(f"🧪 Last Epochs (Fine-tuning stage): {last_epochs}")

        for target_id in patient_grp:
            # --- Create directories ---
            # --- Split data for this target patient ---
            X, Y = split_data(
                target=target_id,
                data=datadict,
                bool_remove_target=bool_remove_target,
                transfer=False,
            )

            for layer_ind in NUM_LAYERS_UNFROZEN_IND:
                print(f"🧪 individual Model Unfrozen Layers: {layer_ind}")
                source_evaluation_results = pd.DataFrame(columns=EVALUATION_METRICS)

                # --- K-Fold Setup ---
                if NUM_CV_FOLDS > 1:
                    kf = "" # NOTE: KFold setup here

                for fold in range(1, NUM_CV_FOLDS + 1):
                    print(f"📂 Fold: {fold}")

                    # --- Train/test split ---
                    if NUM_CV_FOLDS == 1:
                        X_train, X_test, Y_train, Y_test = train_test_split(
                            X.to_numpy(), Y.to_numpy(), test_size=0.2, shuffle=True
                        )
                    else:
                        train_idx, test_idx = # fold_indices
                        X_train, X_test = X.to_numpy()[train_idx], X.to_numpy()[test_idx]
                        Y_train, Y_test = Y.to_numpy()[train_idx], Y.to_numpy()[test_idx]

                    # --- Combine & normalize features ---
                    df_X_train = pd.DataFrame(X_train, columns=X.columns)
                    df_Y_train = pd.DataFrame(Y_train, columns=Y.columns)
                    joined_train_dataset = df_X_train.join(df_Y_train)

                    # NOTE: Proprietary data augmentation method is hidden
                    # NOTE: Proprietary feature-to-image transformation (QHETI) is hidden

                    # --- Find source-trained models ---
                    source_model_files = find_source_model(OUTPUT_PATH, layer_source)

                    for source_model_file in source_model_files:
                        print(f" ~~~ Patient: {target_id} | Fold: {fold} | source Model: {source_model_file} ~~~")

                        VERSION_TAG = ""
                        source_model_path = ""

                        # NOTE: Proprietary training function internals hidden
                        result_list = train_single_fold(
                            images_train=None,   # placeholder
                            images_test=None,    # placeholder
                            Y_train=Y_train,
                            Y_test=Y_test,
                            source_model_path=source_model_path,
                            layer_ind=layer_ind,
                            layer_source=layer_source,
                            target_id=target_id,
                            fold=fold,
                            first_epochs=FIRST_EPOCHS,
                            last_epochs=last_epochs,
                            patient_source_layer_dir="",        # placeholder
                            patient_source_layer_eval_dir="",   # placeholder
                            version_tag=VERSION_TAG
                        )

                        source_evaluation_results.loc[VERSION_TAG] = result_list

                        # --- Save results ---
                        output_csv = ""
                        single_result_df = pd.DataFrame(source_evaluation_results.loc[[VERSION_TAG]])
                        if output_csv.exists():
                            single_result_df.to_csv(output_csv, mode="a", header=False)
                        else:
                            single_result_df.to_csv(output_csv, mode="a", header=True)

                        clear_output(wait=True)
                        K.clear_session()
                        gc.collect()