# 1. Loading in Required Packages

In [None]:
# ===============================
# Standard Library Imports
# ===============================
import gc
import importlib.util
import os

# ===============================
# Third-Party Imports
# ===============================
import numpy as np
import pandas as pd
from IPython.display import clear_output

# TensorFlow / Keras
import tensorflow as tf
import keras
from tensorflow.keras import backend as K
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, GlobalAveragePooling2D
from tensorflow.keras.optimizers import Adam

# Scikit-learn
from sklearn.model_selection import KFold, train_test_split

# ===============================
# Local Module Imports
# ===============================
from QHETI_Transformer import *
from model_evaluation import *
from LR_scheduler import *

# ... MORE IMPORTS COMMENTED OUT

# 2. Data Settings (Source)

In [None]:
# ===============================
# File Paths
# ===============================
# Both values are blank placeholders and must be filled in before running.
source_file_path = ""   # specify your data source path (handed to load_data())
OUTPUT_PATH = ""        # specify your output path

# ===============================
# Feature Info
# ===============================
# Feature names grouped under the keys 'Q1'..'Q4'; the lists are empty
# placeholders here.
# NOTE(review): presumably consumed by the QHETI feature-to-image step, which
# is not shown in this notebook - confirm.
quadrant_features = {
    'Q1': [],
    'Q2': [],
    'Q3': [],
    'Q4': []
}

# Columns removed from every patient frame by process_patient_data().
# Column names are lower-cased before the drop, so entries must be lower-case.
FEATURES_DROPPED = []

# ===============================
# Model Training Config
# ===============================
# Label column that split_data() separates from the features. Column names are
# lower-cased during preprocessing, so this must be lower-case as well.
CLASS_VAR = "class"         
# Not referenced by the code visible in this notebook.
# NOTE(review): presumably the label value treated as the minority class by
# the augmentation step - confirm.
MINORITY_CLASS = 0          

# Number of cross-validation folds. With a value of 1 the training loop falls
# back to a single shuffled 80/20 train/test split.
NUM_CV_FOLDS = 5
# The three settings below are empty-string placeholders that are not read by
# the code visible here.
# NOTE(review): presumably integers (epoch counts / batch size) - set before training.
FIRST_EPOCHS = ""
LAST_EPOCHS = ""
BATCH_SIZE = ""
# The source-model loop runs one full cross-validation pass per entry.
NUM_LAYERS_UNFROZEN = [10, 15, 20, 25, 30]

# Layers to unfreeze during transfer learning
# (empty placeholders; not read by the code visible here)
NUM_LAYERS_UNFROZEN_SOURCE = []
NUM_LAYERS_UNFROZEN_IND = []

# Forwarded to split_data(): when True (and transfer=True) the target's own
# frame is left out of the pooled dataset.
bool_remove_target = False
augmentation_algo = ""   # placeholder

# ===============================
# Evaluation Metrics
# ===============================
# Used as the column names of the per-configuration results DataFrame.
EVALUATION_METRICS = [
    "Weighted Accuracy", "Sensitivity/Recall", "Specificity",
    "Precision_class0", "Precision_class1", "Precision_avg",
    "F1_class0", "F1_class1", "F1_avg", "auc_roc_score",
    "False_Discovery_Rate", "False_Negative_Rate",
    "False_Omission_Rate", "False_Positive_Rate", "Jaccard"
]

# ===============================
# Experiment Groups
# ===============================
# Only the first entry is read, as the `target` of the source-model run.
pop_target_grp = ["source"]
patient_grp = [
    # Patients list removed for privacy
]

# 3. Load & Process Patient Data

In [None]:
# Load the single (pickled) dictionary object stored in a .npy file
def load_data(source_file_path, allow_pickle=True):
    """Return the one Python object stored in a ``.npy`` file.

    The file is expected to hold an array with exactly one element (the
    pickled dictionary); that element is unwrapped and returned.

    Parameters
    ----------
    source_file_path : str or path-like
        Location of the ``.npy`` file.
    allow_pickle : bool, default True
        Forwarded to ``np.load``.

    Returns
    -------
    object
        The element held by the loaded array (expected to be a dict).

    Raises
    ------
    IOError
        If the file cannot be opened or read; the original error is chained.
    ValueError
        If the loaded array does not hold exactly one element.
    """
    # SECURITY: with allow_pickle=True, np.load unpickles the file content and
    # unpickling can execute arbitrary code. Only load files from a trusted source.
    try:
        data_ndarr = np.load(source_file_path, allow_pickle=allow_pickle)
    except IOError as e:
        raise IOError(f"[ERROR] Failed to load data from {source_file_path}") from e
    print(f"[INFO] Loaded object of type: {type(data_ndarr)}")

    # .item() only works on one-element arrays; fail with a message that names
    # the file and shape instead of NumPy's generic conversion error.
    if data_ndarr.size != 1:
        raise ValueError(
            f"[ERROR] Expected exactly one object in {source_file_path}, "
            f"found an array of shape {data_ndarr.shape}"
        )

    datadict = data_ndarr.item()
    print(f"[INFO] Extracted dictionary of type: {type(datadict)}")
    return datadict
    
# Load the dictionary of per-patient tables from the configured source path
datadict = load_data(source_file_path)

In [None]:
def process_patient_data(datadict, features_dropped=None):
    """Normalise every patient frame in place and report its sample count.

    For each entry the column names are lower-cased and the unwanted columns
    are dropped *in place*, so the frames stored in ``datadict`` are modified.
    This is deliberate: the same ``datadict`` is handed on to later steps,
    which therefore see the cleaned frames.

    Parameters
    ----------
    datadict : dict
        Maps a patient id to a ``pd.DataFrame``.
    features_dropped : list of str, optional
        Column names to remove, compared after lower-casing. Defaults to the
        module-level ``FEATURES_DROPPED``.

    Returns
    -------
    sample_size_dict : dict
        Patient id -> number of rows.
    p_ids : dict_keys
        Key view of ``datadict``.
    df : np.ndarray or None
        float32 array of the *last* patient processed; ``None`` when
        ``datadict`` is empty.

    Raises
    ------
    KeyError
        If a name in ``features_dropped`` is not a column of a frame. This
        includes running the function a second time on the same frames.
    """
    if features_dropped is None:
        features_dropped = FEATURES_DROPPED

    p_ids = datadict.keys()
    sample_size_dict = {}
    # Pre-bind the result so an empty datadict returns None instead of
    # raising UnboundLocalError at the return statement.
    last_array = None
    print("patients: n =", len(p_ids), end="\n\n")

    for p_id in p_ids:
        frame = datadict[p_id]
        frame.columns = frame.columns.str.lower()
        # Drop common unnecessary columns (in place, see docstring)
        frame.drop(columns=features_dropped, inplace=True)

        # Cast to float32 only to report the shape; the stored frame keeps its dtypes
        last_array = np.asarray(frame).astype(np.float32)
        print(p_id, "shape:", last_array.shape)
        sample_size_dict[p_id] = last_array.shape[0]

    return sample_size_dict, p_ids, last_array


# Cleans the frames inside `datadict` in place; `df` receives only the array
# built from the last patient in the dictionary.
sample_size_dict, p_ids, df = process_patient_data(datadict)

In [None]:
def split_data(target, data, bool_remove_target, transfer=False, class_var=None) -> "tuple[pd.DataFrame, pd.DataFrame]":
    """Build the feature and label frames for one experiment.

    Parameters
    ----------
    target : hashable
        Key of the target entry in ``data``. With ``transfer=False`` it selects
        the single frame to use. With ``transfer=True`` it is only consulted
        when ``bool_remove_target`` is True.
    data : dict
        Maps a key to a ``pd.DataFrame`` that contains the label column.
    bool_remove_target : bool
        With ``transfer=True``, leave the target's own frame out of the pool.
    transfer : bool, default False
        If True, pool the frames of all entries; otherwise use only ``target``.
    class_var : str, optional
        Name of the label column. Defaults to the module-level ``CLASS_VAR``.

    Returns
    -------
    (X, Y) : tuple of pd.DataFrame
        ``X`` holds every column except the label; ``Y`` is a one-column frame
        with the label. Pooled frames keep their original row index.

    Raises
    ------
    KeyError
        If ``target`` is needed but missing, or the label column is absent.
    ValueError
        If pooling leaves no frame to concatenate.
    """
    if class_var is None:
        class_var = CLASS_VAR

    if transfer:
        # pd.concat already builds a new frame, so the inputs need no copy.
        pooled = [
            frame
            for key, frame in data.items()
            if not (bool_remove_target and key == target)
        ]
        dataset = pd.concat(pooled)
    else:
        dataset = data[target].copy()

    X = dataset.drop(columns=[class_var])
    Y = dataset[[class_var]]
    return X, Y

# 4. Data Augmentation


In [None]:
# Dynamically load external data augmentation module
# The module is loaded from an explicit file path rather than by import name.
# The path is relative, so it is resolved against the kernel's current working
# directory.
spec_file = "../DataAugmentation.py"
# Describe the module to load: (name to register it under, file to read)
spec = importlib.util.spec_from_file_location("DataAugmentation", spec_file)
# Create the empty module object, then execute the file's code inside it
balance_method = importlib.util.module_from_spec(spec)
spec.loader.exec_module(balance_method)

def data_augmentation(train_dataset, classIndex, minorityLabel, printDebug=False):
    """Run the external augmentation routine on a training set.

    Parameters
    ----------
    train_dataset
        Training data handed to ``balance_method.augment_method``.
    classIndex
        Column key of the class label; only used for the debug report.
    minorityLabel
        Accepted for interface compatibility; not used by this function body.
    printDebug : bool, default False
        Forwarded to the augmentation routine. When True, the size and class
        distribution of the augmented data are printed as well.

    Returns
    -------
    Whatever ``balance_method.augment_method`` returns.
    """
    # Apply augmentation (placeholder: algorithm intentionally not specified)
    augmented = balance_method.augment_method(
        train_dataset, numIterations=5, printDebug=printDebug
    )

    # Quiet mode: nothing to report
    if not printDebug:
        return augmented

    print(f"[INFO] Augmented data size: {augmented.shape}")
    print("~~~~~~~ Class Distribution After Augmentation ~~~~~~~")
    print(augmented[classIndex].value_counts())
    return augmented

# 5. Source Model

In [None]:
# -----------------------------------
# Build Source Model
# -----------------------------------
def build_source_model(
    input_shape=(224, 224, 3),
    source_learning_rate=0.0001,
    unfreeze_base_model=False
):
    """Assemble and compile the source model on a MobileNetV2 backbone.

    Parameters
    ----------
    input_shape : tuple, default (224, 224, 3)
        Shape passed to the Keras ``Input`` layer.
    source_learning_rate : float, default 0.0001
        Learning rate of the Adam optimizer.
    unfreeze_base_model : bool, default False
        Value assigned to the backbone's ``trainable`` flag.

    Returns
    -------
    The compiled Keras ``Model`` (binary cross-entropy loss, accuracy metric).
    """
    image_input = Input(shape=input_shape)
    backbone = MobileNetV2(
        weights='imagenet',
        include_top=False,
        input_tensor=image_input,
    )
    backbone.trainable = unfreeze_base_model

    # NOTE: Custom proprietary architecture is applied here (not shown for confidentiality)
    pooled = GlobalAveragePooling2D()(backbone.output)
    prediction = Dense(1, activation='sigmoid')(pooled)

    source_model = Model(inputs=image_input, outputs=prediction)
    optimizer = Adam(learning_rate=source_learning_rate)
    source_model.compile(
        optimizer=optimizer,
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return source_model

In [None]:
# Train and evaluate the source model: one cross-validation pass per entry in
# NUM_LAYERS_UNFROZEN. Sections marked NOTE are redacted placeholders.
target = pop_target_grp[0]

# Different from Individual Model as we use all of the patient data for source model training
X, Y = split_data(
    target=target,
    data=datadict,
    bool_remove_target=bool_remove_target,
    transfer=True,
)

# Convert once: the arrays are identical for every layer setting and fold
X_values = X.to_numpy()
Y_values = Y.to_numpy()

for layer in NUM_LAYERS_UNFROZEN:
    number_of_layers_unfrozen = layer
    source_evaluation_results = pd.DataFrame(columns=EVALUATION_METRICS)

    # --- K-Fold Setup ---
    if NUM_CV_FOLDS > 1:
        # Shuffled to mirror the single-split branch below, which also shuffles.
        # TODO(review): the original setup was redacted - confirm that plain
        # KFold (rather than e.g. a stratified or seeded variant) is intended.
        kf = KFold(n_splits=NUM_CV_FOLDS, shuffle=True)
        fold_indices = list(kf.split(X_values))

    for fold in range(1, NUM_CV_FOLDS + 1):
        if NUM_CV_FOLDS == 1:
            # Single hold-out split instead of cross-validation
            X_train, X_test, Y_train, Y_test = train_test_split(
                X_values, Y_values, test_size=0.2, shuffle=True
            )
        else:
            # `fold` is 1-based, the index list is 0-based
            train_index, test_index = fold_indices[fold - 1]
            X_train, X_test = X_values[train_index], X_values[test_index]
            Y_train, Y_test = Y_values[train_index], Y_values[test_index]

        # --- Combine & normalize features ---
        df_X_train = pd.DataFrame(X_train, columns=X.columns)
        df_Y_train = pd.DataFrame(Y_train, columns=Y.columns)
        joined_train_dataset = df_X_train.join(df_Y_train)

        # NOTE: Proprietary data augmentation method is hidden
        # NOTE: Proprietary feature-to-image transformation (QHETI) is hidden

        source_model = build_source_model() # params

        # NOTE: Proprietary training function internals hidden
        # ... Training phase 1 ...
        # ... Fine-tuning phase ...
        # ... Saving model and training curves ...
        # evaluator = Evaluation(individual_model)
        # results, confusion_matrix = evaluator.model_test(images_test, Y_test)
        # print(confusion_matrix)

        # --- Save results ---
        output_csv = ""

        # Free notebook output and Keras/TensorFlow state before the next fold
        clear_output(wait=True)
        K.clear_session()
        gc.collect()
