In [None]:
from tensorflow.python.util import deprecation
deprecation._PRINT_DEPRECATION_WARNINGS = False   #ignore FutureWarning

import tensorflow as tf
tf.__version__
import random
from tqdm import tqdm
import json
import os
import gc
gc.collect()

import pandas as pd
import numpy as np
import numpy.random as nr
from numpy.random import seed
import matplotlib.pyplot as plt
from glob import glob
from pathlib import Path
import tensorflow.keras as keras
from tensorflow.keras import backend as K

from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.model_selection import train_test_split
from tensorflow.keras.mixed_precision import set_global_policy

# Enable Keras mixed precision globally for every layer created after this point.
set_global_policy('mixed_float16')

gpus = tf.config.list_physical_devices('GPU')

# Request on-demand GPU memory growth for each visible GPU.
# The loop simply does nothing when no GPU is present.
for gpu in gpus:
    try:
        tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as e:
        # Memory growth must be set before initializing GPUs
        print(f'Could not set memory growth for {gpu.name}: {e}')
    else:
        print(f'Memory growth enabled for {gpu.name}')


from tensorflow.keras.models import Sequential, Model, load_model
#from tensorflow.keras.applications.vgg16 import preprocess_input
from tensorflow.keras.applications.resnet50 import preprocess_input
#from tensorflow.keras.applications.densenet import preprocess_input
from tensorflow.keras.applications.efficientnet import preprocess_input
from tensorflow.keras.layers import Conv2D,Dense,Flatten,Dropout,MaxPooling2D, Activation, BatchNormalization, GlobalAveragePooling2D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import categorical_crossentropy
from tensorflow.keras.preprocessing import image
from tensorflow.keras.initializers import glorot_uniform
from tensorflow.keras.callbacks import Callback
from PIL import ImageFile
# Allow PIL to load image files that are truncated.
ImageFile.LOAD_TRUNCATED_IMAGES = True
#clear memory in case of OOM
K.clear_session()
#set dictionary for disease and its index
# Disease label -> 1-based class index, numbered in the order listed here.
disease_class = {
    name: index
    for index, name in enumerate(
        (
            'Atelectasis',
            'Cardiomegaly',
            'Effusion',
            'Infiltration',
            'Mass',
            'Nodule',
            'Pneumonia',
            'Pneumothorax',
            'Consolidation',
            'Edema',
            'Emphysema',
            'Fibrosis',
            'Pleural_Thickening',
            'Hernia',
            'No Finding',
        ),
        start=1,
    )
}

# Reverse lookup: 1-based class index -> disease label.
disease_rev = dict(zip(disease_class.values(), disease_class.keys()))

# Disease label -> list of image file names; one independent empty list per disease.
disease_img = {name: [] for name in disease_class}
#import labels of the images
data_ref = pd.read_csv("/media/ntu/volume1/home/s123md305_01/Documents/CXR8/Data_Entry_2017_v2020.csv")
pd.options.mode.chained_assignment = None        #ignore the SettingWithCopyWarning

# Group the single-finding images by disease. Rows whose label contains "|"
# list several findings and are skipped. Iterating the two columns directly
# avoids one pandas scalar lookup per cell.
for img_name, finding in tqdm(zip(data_ref['Image Index'], data_ref['Finding Labels']), total=len(data_ref)):
    if "|" not in finding:
        disease_img[finding].append(img_name)

# Image name -> finding label lookup table for the first dataset.
simp_data_ref = data_ref[["Image Index", "Finding Labels"]].set_index("Image Index")

# Labels of the second dataset (read from the "Generated" directory).
data_ref_2 = pd.read_csv("/media/ntu/volume1/home/s123md305_01/Documents/Generated/reconstructed_labels.csv")  # Update the path to the second dataset

# Same grouping as above, kept in a separate dictionary for the second dataset.
disease_img_2 = {disease: [] for disease in disease_class}
for img_name, finding in tqdm(zip(data_ref_2['Image Index'], data_ref_2['Finding Labels']), total=len(data_ref_2)):
    if "|" not in finding:
        disease_img_2[finding].append(img_name)

# Image name -> finding label lookup table for the second dataset.
simp_data_ref_2 = data_ref_2[["Image Index", "Finding Labels"]].set_index("Image Index")
from tensorflow.keras.preprocessing import image
from tqdm import tqdm
import numpy as np

# Assuming disease_img, disease_class, and simp_data_ref are predefined
from tensorflow.keras.preprocessing import image
from tqdm import tqdm
import numpy as np

# Assuming disease_img, disease_class, and simp_data_ref are predefined
def _select_fraction(images_by_disease, fraction):
    """Return the first ``fraction`` of each disease's image list, in dict order."""
    selected = []
    for names in images_by_disease.values():
        selected.extend(names[:round(fraction * len(names))])
    return selected


def _load_images_and_labels(img_names, image_dir, labels_by_image, class_index):
    """Load images as 112x112 RGB float32 arrays and build their one-hot labels.

    Args:
        img_names: image file names, relative to ``image_dir``.
        image_dir: directory prefix (with trailing slash) prepended to each name.
        labels_by_image: mapping/Series from image name to its finding label.
        class_index: mapping from finding label to its 1-based class index.

    Returns:
        ``(images, onehot)`` with shapes ``(n, 112, 112, 3)`` and
        ``(n, len(class_index))``.
    """
    count = len(img_names)
    # Preallocate instead of building a Python list and copying it with np.array,
    # which would hold two full copies of the image data at once.
    # NOTE(review): float32 assumes Keras' default floatx, which img_to_array uses - confirm.
    images = np.empty((count, 112, 112, 3), dtype="float32")
    onehot = np.zeros(shape=(count, len(class_index)))
    for row, name in enumerate(tqdm(img_names)):
        img = image.load_img(image_dir + name, target_size=(112, 112))
        images[row] = image.img_to_array(img)
        # Class indices are 1-based, array columns 0-based.
        onehot[row, class_index[labels_by_image[name]] - 1] = 1
    return images, onehot


# range(91324, 92324, 10000) yields the single value 91324, so every image is used.
# NOTE(review): 91324 is presumably the number of single-finding images - confirm.
for number in range(91324, 92324, 10000):
    fraction = number / 91324

    print(number)
    img_names = _select_fraction(disease_img, fraction)
    X, y = _load_images_and_labels(
        img_names,
        '/media/ntu/volume1/home/s123md305_01/Documents/CombinedResized/Resized112/',
        simp_data_ref['Finding Labels'],
        disease_class,
    )

    # Same selection and loading for the second dataset.
    print(number, 'augmented')
    img_names_new = _select_fraction(disease_img_2, fraction)
    X_new, y_new = _load_images_and_labels(
        img_names_new,
        '/media/ntu/volume1/home/s123md305_01/Documents/Generated/ComGenerated112/',
        simp_data_ref_2['Finding Labels'],
        disease_class,
    )

    gc.collect()


    import numpy as np
    import os
    from tensorflow.keras.models import Sequential
    from tensorflow.keras.layers import GlobalAveragePooling2D, Dense
    from tensorflow.keras.optimizers import Adam
    from tensorflow.keras.applications import VGG16, ResNet50, DenseNet121
    #from tensorflow.keras.applications.vgg16 import preprocess_input
    from tensorflow.keras.applications.resnet50 import preprocess_input
    #from tensorflow.keras.applications.densenet import preprocess_input
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import roc_auc_score
    from tensorflow.keras.callbacks import EarlyStopping
    #from tensorflow.keras.mixed_precision import set_global_policy

    #set_global_policy('mixed_float16')
    # Model input is a 112x112 RGB image; one output unit per entry of disease_class.
    input_shape = (112, 112, 3)
    num_classes = 15

    # Distribution strategy; the model is built inside strategy.scope() in the run loop.
    strategy = tf.distribute.MirroredStrategy()
    print(f'Number of devices: {strategy.num_replicas_in_sync}')

    # Model construction and the focal loss it is compiled with are defined next.

    def build_model(input_shape, num_classes):
        """Build and compile a frozen ImageNet ResNet50 with a softmax head.

        Args:
            input_shape: shape of one input image, passed to ResNet50.
            num_classes: number of units in the final softmax layer.

        Returns:
            A compiled Sequential model using Adam (lr 1e-4), the focal loss
            and an AUC metric reported under the name 'auc'.
        """
        backbone = ResNet50(weights='imagenet', include_top=False, input_shape=input_shape)
        # Freeze every backbone layer so that only the new Dense head is trained.
        for backbone_layer in backbone.layers:
            backbone_layer.trainable = False

        model = Sequential()
        model.add(backbone)
        model.add(GlobalAveragePooling2D())
        # The output layer is pinned to float32 regardless of the global dtype policy.
        model.add(Dense(num_classes, activation='softmax', kernel_initializer='glorot_uniform', dtype='float32'))

        model.compile(
            optimizer=Adam(learning_rate=0.0001),
            loss=focal_loss(),
            metrics=[tf.keras.metrics.AUC(name='auc')],
        )
        return model
    # Function for Focal Loss
    #https://www.programmersought.com/article/60001511310/
    def focal_loss(alpha = 0.5, beta = 2.0):
        """Create an element-wise focal loss function.

        For every label entry the loss is ``alpha_t * (1 - p_t)**beta * -log(p_t)``,
        where ``p_t`` is the predicted probability of the true state of that
        entry; the result is averaged over all entries.

        Args:
            alpha: weight of positive entries; negatives are weighted ``1 - alpha``.
            beta: focusing exponent applied to ``1 - p_t``.

        Returns:
            A callable ``loss_fn2(y_true, y_pred)`` returning a scalar tensor.
        """
        epsilon = 1.e-7
        def loss_fn2(y_true, y_pred):
            y_true = tf.cast(y_true, tf.float32)
            # Compute in float32 whatever dtype the model emits: float16 predictions
            # would not multiply with the float32 labels, and 1 - epsilon rounds to
            # 1.0 in float16, which would defeat the clipping below.
            y_pred = tf.cast(y_pred, tf.float32)
            # Keep probabilities away from 0 and 1 so the log stays finite.
            y_pred = tf.clip_by_value(y_pred, epsilon, 1. - epsilon)

            alpha_t = y_true*alpha + (tf.ones_like(y_true)-y_true)*(1-alpha)
            # Probability assigned to the true state of each entry.
            y_t = tf.multiply(y_true, y_pred) + tf.multiply(1-y_true, 1-y_pred)
            ce = -tf.math.log(y_t)
            # Down-weight entries that are already predicted well.
            weight = tf.pow(tf.subtract(1., y_t), beta)
            fl = tf.multiply(tf.multiply(weight, ce), alpha_t)
            loss = tf.reduce_mean(fl)
            return loss

        return loss_fn2

    # Repeat the split / train / evaluate cycle to estimate the mean and the
    # standard deviation of the test AUC.
    n_runs = 3
    runno=42  # base random seed; run k splits the data with seed runno + k
    auc_scores = []

    for run in range(n_runs):
        tf.keras.backend.clear_session()
        split_seed = runno + run

        # Hold out 30% of the first dataset for testing, then 42.857% of the
        # remainder (about 30% of the total) for validation; about 40% is left
        # for training.
        X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.3, random_state=split_seed)
        X_train_bef, X_val, y_train_bef, y_val = train_test_split(X_train_val, y_train_val, test_size=0.42857, random_state=split_seed)

        # Preprocess the test set
        X_test = preprocess_input(X_test)

        # The second dataset is appended to the training split only, so the
        # validation and test splits contain images of the first dataset only.
        # NOTE(review): the appended images sit at the end of X_train in class
        # order and the dataset below is batched without shuffling - confirm
        # this ordering is intended.
        X_train = np.concatenate((X_train_bef, X_new), axis=0)
        y_train = np.concatenate((y_train_bef, y_new), axis=0)

        # Preprocess the training and validation sets
        X_train = preprocess_input(X_train)
        X_val = preprocess_input(X_val)

        # Convert the numpy arrays into tf.data.Dataset
        with tf.device("CPU"):
            train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))
            val_dataset = tf.data.Dataset.from_tensor_slices((X_val, y_val))
            test_dataset = tf.data.Dataset.from_tensor_slices((X_test, y_test))
            batch_size = 10  # You can adjust this according to your specific requirements
            train_dataset = train_dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)
            val_dataset = val_dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)
            test_dataset = test_dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)

        gc.collect()

        with strategy.scope():
            model = build_model(input_shape, num_classes)

        # Stop when the validation AUC has not improved for 3 consecutive epochs.
        early_stopping = EarlyStopping(monitor='val_auc', patience=3, mode='max', verbose=1)

        model.fit(
            train_dataset,  # Use the batched and prefetched dataset
            epochs=20,  # Adjust based on your dataset and model's performance
            validation_data=val_dataset,  # Use the validation dataset
            callbacks=[early_stopping],
            verbose=1  # Set to 0 to reduce log messages
        )
        gc.collect()

        # Evaluate on the held-out test split.
        y_pred = model.predict(test_dataset)
        # NOTE(review): y_test is one-hot, so scikit-learn appears to treat it as a
        # multilabel indicator and ignore multi_class='ovo' (macro average of the
        # per-class AUCs) - confirm this is the intended metric.
        # Stored as test_auc so the `auc` function imported from sklearn.metrics
        # is not overwritten.
        test_auc = roc_auc_score(y_test, y_pred, multi_class='ovo')

        auc_scores.append(test_auc)
        print(f"Run {run+1}/{n_runs}, Test AUC: {test_auc:.4f}")
        gc.collect()

    # Summarise the AUC over all runs.
    auc_std_dev = np.std(auc_scores)
    aauc=np.mean(auc_scores)
    print(f"Standard Deviation of AUC over {n_runs} runs: {auc_std_dev:.4f}")
    print("aauc=",aauc)
    gc.collect()

In [None]:
from tensorflow.python.util import deprecation
deprecation._PRINT_DEPRECATION_WARNINGS = False   #ignore FutureWarning

import tensorflow as tf
tf.__version__
import random
from tqdm import tqdm
import json
import os
import gc
gc.collect()

import pandas as pd
import numpy as np
import numpy.random as nr
from numpy.random import seed
import matplotlib.pyplot as plt
from glob import glob
from pathlib import Path
import tensorflow.keras as keras
from tensorflow.keras import backend as K

from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.model_selection import train_test_split
from tensorflow.keras.mixed_precision import set_global_policy

# Enable Keras mixed precision globally for every layer created after this point.
set_global_policy('mixed_float16')

gpus = tf.config.list_physical_devices('GPU')

# Request on-demand GPU memory growth for each visible GPU.
# The loop simply does nothing when no GPU is present.
for gpu in gpus:
    try:
        tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as e:
        # Memory growth must be set before initializing GPUs
        print(f'Could not set memory growth for {gpu.name}: {e}')
    else:
        print(f'Memory growth enabled for {gpu.name}')


from tensorflow.keras.models import Sequential, Model, load_model
#from tensorflow.keras.applications.vgg16 import preprocess_input
from tensorflow.keras.applications.resnet50 import preprocess_input
#from tensorflow.keras.applications.densenet import preprocess_input
from tensorflow.keras.applications.efficientnet import preprocess_input
from tensorflow.keras.layers import Conv2D,Dense,Flatten,Dropout,MaxPooling2D, Activation, BatchNormalization, GlobalAveragePooling2D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import categorical_crossentropy
from tensorflow.keras.preprocessing import image
from tensorflow.keras.initializers import glorot_uniform
from tensorflow.keras.callbacks import Callback
from PIL import ImageFile
# Allow PIL to load image files that are truncated.
ImageFile.LOAD_TRUNCATED_IMAGES = True
#clear memory in case of OOM
K.clear_session()
#set dictionary for disease and its index
# Disease label -> 1-based class index, numbered in the order listed here.
disease_class = {
    name: index
    for index, name in enumerate(
        (
            'Atelectasis',
            'Cardiomegaly',
            'Effusion',
            'Infiltration',
            'Mass',
            'Nodule',
            'Pneumonia',
            'Pneumothorax',
            'Consolidation',
            'Edema',
            'Emphysema',
            'Fibrosis',
            'Pleural_Thickening',
            'Hernia',
            'No Finding',
        ),
        start=1,
    )
}

# Reverse lookup: 1-based class index -> disease label.
disease_rev = dict(zip(disease_class.values(), disease_class.keys()))

# Disease label -> list of image file names; one independent empty list per disease.
disease_img = {name: [] for name in disease_class}
#import labels of the images
data_ref = pd.read_csv("/media/ntu/volume1/home/s123md305_01/Documents/CXR8/Data_Entry_2017_v2020.csv")
pd.options.mode.chained_assignment = None        #ignore the SettingWithCopyWarning

# Group the single-finding images by disease. Rows whose label contains "|"
# list several findings and are skipped. Iterating the two columns directly
# avoids one pandas scalar lookup per cell.
for img_name, finding in tqdm(zip(data_ref['Image Index'], data_ref['Finding Labels']), total=len(data_ref)):
    if "|" not in finding:
        disease_img[finding].append(img_name)

# Image name -> finding label lookup table for the first dataset.
simp_data_ref = data_ref[["Image Index", "Finding Labels"]].set_index("Image Index")

# Labels of the second dataset (read from the "Generated" directory).
data_ref_2 = pd.read_csv("/media/ntu/volume1/home/s123md305_01/Documents/Generated/reconstructed_labels.csv")  # Update the path to the second dataset

# Same grouping as above, kept in a separate dictionary for the second dataset.
disease_img_2 = {disease: [] for disease in disease_class}
for img_name, finding in tqdm(zip(data_ref_2['Image Index'], data_ref_2['Finding Labels']), total=len(data_ref_2)):
    if "|" not in finding:
        disease_img_2[finding].append(img_name)

# Image name -> finding label lookup table for the second dataset.
simp_data_ref_2 = data_ref_2[["Image Index", "Finding Labels"]].set_index("Image Index")
from tensorflow.keras.preprocessing import image
from tqdm import tqdm
import numpy as np

# Assuming disease_img, disease_class, and simp_data_ref are predefined
from tensorflow.keras.preprocessing import image
from tqdm import tqdm
import numpy as np

# Assuming disease_img, disease_class, and simp_data_ref are predefined
def _select_fraction(images_by_disease, fraction):
    """Return the first ``fraction`` of each disease's image list, in dict order."""
    selected = []
    for names in images_by_disease.values():
        selected.extend(names[:round(fraction * len(names))])
    return selected


def _load_images_and_labels(img_names, image_dir, labels_by_image, class_index):
    """Load images as 112x112 RGB float32 arrays and build their one-hot labels.

    Args:
        img_names: image file names, relative to ``image_dir``.
        image_dir: directory prefix (with trailing slash) prepended to each name.
        labels_by_image: mapping/Series from image name to its finding label.
        class_index: mapping from finding label to its 1-based class index.

    Returns:
        ``(images, onehot)`` with shapes ``(n, 112, 112, 3)`` and
        ``(n, len(class_index))``.
    """
    count = len(img_names)
    # Preallocate instead of building a Python list and copying it with np.array,
    # which would hold two full copies of the image data at once.
    # NOTE(review): float32 assumes Keras' default floatx, which img_to_array uses - confirm.
    images = np.empty((count, 112, 112, 3), dtype="float32")
    onehot = np.zeros(shape=(count, len(class_index)))
    for row, name in enumerate(tqdm(img_names)):
        img = image.load_img(image_dir + name, target_size=(112, 112))
        images[row] = image.img_to_array(img)
        # Class indices are 1-based, array columns 0-based.
        onehot[row, class_index[labels_by_image[name]] - 1] = 1
    return images, onehot


# range(91324, 92324, 10000) yields the single value 91324, so every image is used.
# NOTE(review): 91324 is presumably the number of single-finding images - confirm.
for number in range(91324, 92324, 10000):
    fraction = number / 91324

    print(number)
    img_names = _select_fraction(disease_img, fraction)
    X, y = _load_images_and_labels(
        img_names,
        '/media/ntu/volume1/home/s123md305_01/Documents/CombinedResized/Resized112/',
        simp_data_ref['Finding Labels'],
        disease_class,
    )

    # Same selection and loading for the second dataset.
    print(number, 'augmented')
    img_names_new = _select_fraction(disease_img_2, fraction)
    X_new, y_new = _load_images_and_labels(
        img_names_new,
        '/media/ntu/volume1/home/s123md305_01/Documents/Generated/ComGenerated112/',
        simp_data_ref_2['Finding Labels'],
        disease_class,
    )

    gc.collect()


    import numpy as np
    import os
    from tensorflow.keras.models import Sequential
    from tensorflow.keras.layers import GlobalAveragePooling2D, Dense
    from tensorflow.keras.optimizers import Adam
    from tensorflow.keras.applications import VGG16, ResNet50, DenseNet121
    #from tensorflow.keras.applications.vgg16 import preprocess_input
    from tensorflow.keras.applications.resnet50 import preprocess_input
    #from tensorflow.keras.applications.densenet import preprocess_input
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import roc_auc_score
    from tensorflow.keras.callbacks import EarlyStopping
    #from tensorflow.keras.mixed_precision import set_global_policy

    #set_global_policy('mixed_float16')
    # Model input is a 112x112 RGB image; one output unit per entry of disease_class.
    input_shape = (112, 112, 3)
    num_classes = 15

    # Distribution strategy; the model is built inside strategy.scope() in the run loop.
    strategy = tf.distribute.MirroredStrategy()
    print(f'Number of devices: {strategy.num_replicas_in_sync}')

    # Model construction and the focal loss it is compiled with are defined next.

    def build_model(input_shape, num_classes):
        """Build and compile a frozen ImageNet ResNet50 with a softmax head.

        Args:
            input_shape: shape of one input image, passed to ResNet50.
            num_classes: number of units in the final softmax layer.

        Returns:
            A compiled Sequential model using Adam (lr 1e-4), the focal loss
            and an AUC metric reported under the name 'auc'.
        """
        backbone = ResNet50(weights='imagenet', include_top=False, input_shape=input_shape)
        # Freeze every backbone layer so that only the new Dense head is trained.
        for backbone_layer in backbone.layers:
            backbone_layer.trainable = False

        model = Sequential()
        model.add(backbone)
        model.add(GlobalAveragePooling2D())
        # The output layer is pinned to float32 regardless of the global dtype policy.
        model.add(Dense(num_classes, activation='softmax', kernel_initializer='glorot_uniform', dtype='float32'))

        model.compile(
            optimizer=Adam(learning_rate=0.0001),
            loss=focal_loss(),
            metrics=[tf.keras.metrics.AUC(name='auc')],
        )
        return model
    # Function for Focal Loss
    #https://www.programmersought.com/article/60001511310/
    def focal_loss(alpha = 0.5, beta = 2.0):
        """Create an element-wise focal loss function.

        For every label entry the loss is ``alpha_t * (1 - p_t)**beta * -log(p_t)``,
        where ``p_t`` is the predicted probability of the true state of that
        entry; the result is averaged over all entries.

        Args:
            alpha: weight of positive entries; negatives are weighted ``1 - alpha``.
            beta: focusing exponent applied to ``1 - p_t``.

        Returns:
            A callable ``loss_fn2(y_true, y_pred)`` returning a scalar tensor.
        """
        epsilon = 1.e-7
        def loss_fn2(y_true, y_pred):
            y_true = tf.cast(y_true, tf.float32)
            # Compute in float32 whatever dtype the model emits: float16 predictions
            # would not multiply with the float32 labels, and 1 - epsilon rounds to
            # 1.0 in float16, which would defeat the clipping below.
            y_pred = tf.cast(y_pred, tf.float32)
            # Keep probabilities away from 0 and 1 so the log stays finite.
            y_pred = tf.clip_by_value(y_pred, epsilon, 1. - epsilon)

            alpha_t = y_true*alpha + (tf.ones_like(y_true)-y_true)*(1-alpha)
            # Probability assigned to the true state of each entry.
            y_t = tf.multiply(y_true, y_pred) + tf.multiply(1-y_true, 1-y_pred)
            ce = -tf.math.log(y_t)
            # Down-weight entries that are already predicted well.
            weight = tf.pow(tf.subtract(1., y_t), beta)
            fl = tf.multiply(tf.multiply(weight, ce), alpha_t)
            loss = tf.reduce_mean(fl)
            return loss

        return loss_fn2

    # Repeat the split / train / evaluate cycle to estimate the mean and the
    # standard deviation of the test AUC.
    n_runs = 3
    runno=42  # base random seed; run k splits the data with seed runno + k
    auc_scores = []

    for run in range(n_runs):
        tf.keras.backend.clear_session()
        split_seed = runno + run

        # Hold out 30% of the first dataset as the test set.
        X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.3, random_state=split_seed)

        # Preprocess the test set
        X_test = preprocess_input(X_test)

        # Pool the remaining 70% with the second dataset, then split the pool
        # 82.35% / 17.65% into training and validation.
        # NOTE(review): the validation split therefore also contains images of
        # the second dataset, and early stopping monitors val_auc on that mix -
        # confirm this is intended.
        X_train_new = np.concatenate((X_train_val, X_new), axis=0)
        y_train_new = np.concatenate((y_train_val, y_new), axis=0)

        X_train, X_val, y_train, y_val = train_test_split(X_train_new, y_train_new, test_size=0.1765, random_state=split_seed)

        # Preprocess the training and validation sets
        X_train = preprocess_input(X_train)
        X_val = preprocess_input(X_val)

        # Convert the numpy arrays into tf.data.Dataset
        with tf.device("CPU"):
            train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))
            val_dataset = tf.data.Dataset.from_tensor_slices((X_val, y_val))
            test_dataset = tf.data.Dataset.from_tensor_slices((X_test, y_test))
            batch_size = 10  # You can adjust this according to your specific requirements
            train_dataset = train_dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)
            val_dataset = val_dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)
            test_dataset = test_dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)

        gc.collect()

        with strategy.scope():
            model = build_model(input_shape, num_classes)

        # Stop when the validation AUC has not improved for 3 consecutive epochs.
        early_stopping = EarlyStopping(monitor='val_auc', patience=3, mode='max', verbose=1)

        model.fit(
            train_dataset,  # Use the batched and prefetched dataset
            epochs=20,  # Adjust based on your dataset and model's performance
            validation_data=val_dataset,  # Use the validation dataset
            callbacks=[early_stopping],
            verbose=1  # Set to 0 to reduce log messages
        )
        gc.collect()

        # Evaluate on the held-out test split.
        y_pred = model.predict(test_dataset)
        # NOTE(review): y_test is one-hot, so scikit-learn appears to treat it as a
        # multilabel indicator and ignore multi_class='ovo' (macro average of the
        # per-class AUCs) - confirm this is the intended metric.
        # Stored as test_auc so the `auc` function imported from sklearn.metrics
        # is not overwritten.
        test_auc = roc_auc_score(y_test, y_pred, multi_class='ovo')

        auc_scores.append(test_auc)
        print(f"Run {run+1}/{n_runs}, Test AUC: {test_auc:.4f}")
        gc.collect()

    # Summarise the AUC over all runs.
    auc_std_dev = np.std(auc_scores)
    aauc=np.mean(auc_scores)
    print(f"Standard Deviation of AUC over {n_runs} runs: {auc_std_dev:.4f}")
    print("aauc=",aauc)
    gc.collect()

In [None]:
from tensorflow.python.util import deprecation
deprecation._PRINT_DEPRECATION_WARNINGS = False   #ignore FutureWarning

import tensorflow as tf
tf.__version__
import random
from tqdm import tqdm
import json
import os
import gc
gc.collect()

import pandas as pd
import numpy as np
import numpy.random as nr
from numpy.random import seed
import matplotlib.pyplot as plt
from glob import glob
from pathlib import Path
import tensorflow.keras as keras
from tensorflow.keras import backend as K

from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.model_selection import train_test_split
#from tensorflow.keras.mixed_precision import set_global_policy

#set_global_policy('mixed_float16')

#gpus = tf.config.list_physical_devices('GPU')

# Check if there are any GPUs available
#if gpus:
    # Iterate over all available GPUs and set memory growth
#    for gpu in gpus:
#        try:
#            tf.config.experimental.set_memory_growth(gpu, True)
#            print(f'Memory growth enabled for {gpu.name}')
#        except RuntimeError as e:
            # Memory growth must be set before initializing GPUs
#            print(f'Could not set memory growth for {gpu.name}: {e}')


from tensorflow.keras.models import Sequential, Model, load_model
#from tensorflow.keras.applications.vgg16 import preprocess_input
from tensorflow.keras.applications.resnet50 import preprocess_input
#from tensorflow.keras.applications.densenet import preprocess_input
from tensorflow.keras.applications.efficientnet import preprocess_input
from tensorflow.keras.layers import Conv2D,Dense,Flatten,Dropout,MaxPooling2D, Activation, BatchNormalization, GlobalAveragePooling2D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import categorical_crossentropy
from tensorflow.keras.preprocessing import image
from tensorflow.keras.initializers import glorot_uniform
from tensorflow.keras.callbacks import Callback
from PIL import ImageFile
# Allow PIL to load image files that are truncated.
ImageFile.LOAD_TRUNCATED_IMAGES = True
#clear memory in case of OOM
K.clear_session()
#set dictionary for disease and its index
# Disease label -> 1-based class index, numbered in the order listed here.
disease_class = {
    name: index
    for index, name in enumerate(
        (
            'Atelectasis',
            'Cardiomegaly',
            'Effusion',
            'Infiltration',
            'Mass',
            'Nodule',
            'Pneumonia',
            'Pneumothorax',
            'Consolidation',
            'Edema',
            'Emphysema',
            'Fibrosis',
            'Pleural_Thickening',
            'Hernia',
            'No Finding',
        ),
        start=1,
    )
}

# Reverse lookup: 1-based class index -> disease label.
disease_rev = dict(zip(disease_class.values(), disease_class.keys()))

# Disease label -> list of image file names; one independent empty list per disease.
disease_img = {name: [] for name in disease_class}
#import labels of the images
data_ref = pd.read_csv("/media/ntu/volume1/home/s123md305_01/Documents/CXR8/Data_Entry_2017_v2020.csv")
pd.options.mode.chained_assignment = None        #ignore the SettingWithCopyWarning

# Group the single-finding images by disease. Rows whose label contains "|"
# list several findings and are skipped. Iterating the two columns directly
# avoids one pandas scalar lookup per cell.
for img_name, finding in tqdm(zip(data_ref['Image Index'], data_ref['Finding Labels']), total=len(data_ref)):
    if "|" not in finding:
        disease_img[finding].append(img_name)

# Image name -> finding label lookup table for the first dataset.
simp_data_ref = data_ref[["Image Index", "Finding Labels"]].set_index("Image Index")

# Labels of the second dataset (read from the "Generated" directory).
data_ref_2 = pd.read_csv("/media/ntu/volume1/home/s123md305_01/Documents/Generated/reconstructed_labels.csv")  # Update the path to the second dataset

# Same grouping as above, kept in a separate dictionary for the second dataset.
disease_img_2 = {disease: [] for disease in disease_class}
for img_name, finding in tqdm(zip(data_ref_2['Image Index'], data_ref_2['Finding Labels']), total=len(data_ref_2)):
    if "|" not in finding:
        disease_img_2[finding].append(img_name)

# Image name -> finding label lookup table for the second dataset.
simp_data_ref_2 = data_ref_2[["Image Index", "Finding Labels"]].set_index("Image Index")
from tensorflow.keras.preprocessing import image
from tqdm import tqdm
import numpy as np

# Assuming disease_img, disease_class, and simp_data_ref are predefined
from tensorflow.keras.preprocessing import image
from tqdm import tqdm
import numpy as np

# Assuming disease_img, disease_class, and simp_data_ref are predefined
for number in range(91324, 92324, 10000):
    print(number)  # this range yields a single value: 91324
    # Take the first `number / 91324` share of every finding's image list.
    # NOTE(review): 91324 is presumably the total number of single-label images -- confirm.
    img_names = []
    for dis in disease_img:
        num = round(number / 91324 * len(disease_img[dis]))
        img_names.extend(disease_img[dis][:num])

    # Pre-allocate the image tensor so the pixels are stored once, instead of
    # in a Python list plus a second full copy made by np.array().
    # float32 is img_to_array's default dtype (Keras floatx) -- TODO confirm
    # floatx is not overridden elsewhere.
    X = np.empty((len(img_names), 112, 112, 3), dtype=np.float32)
    y = np.zeros(shape=(len(img_names), len(disease_class)))
    label_of = simp_data_ref['Finding Labels']  # hoisted: one label lookup per image

    for i, img_name in enumerate(tqdm(img_names)):
        # target_size is (height, width); the channel count is not part of it.
        img = image.load_img('/media/ntu/volume1/home/s123md305_01/Documents/CombinedResized/Resized112/' + img_name, target_size=(112, 112))
        X[i] = image.img_to_array(img)

        # One-hot label.  Assumes disease_class maps finding names to the
        # 1-based ids that disease_rev inverts; columns of y are 0-based.
        # Unknown labels leave the row all-zero.
        class_id = disease_class.get(label_of[img_name])
        if class_id is not None:
            y[i, class_id - 1] = 1
    # A dedicated loop variable keeps the outer loop's `number` intact.
    for number_aug in range(1000, 11000, 1000):
        print(number_aug, 'augmented')  # 1,000 to 10,000 in steps of 1,000
        # Take the first `number_aug / 91324` share of every finding's image
        # list in the second dataset.
        img_names_new = []
        for dis in disease_img_2:
            num = round(number_aug / 91324 * len(disease_img_2[dis]))
            img_names_new.extend(disease_img_2[dis][:num])

        # Pre-allocated so the pixels are stored once; an empty selection still
        # has shape (0, 112, 112, 3) and can be concatenated with the real images.
        # float32 is img_to_array's default dtype (Keras floatx) -- TODO confirm
        # floatx is not overridden elsewhere.
        X_new = np.empty((len(img_names_new), 112, 112, 3), dtype=np.float32)
        y_new = np.zeros(shape=(len(img_names_new), len(disease_class)))
        label_of_new = simp_data_ref_2['Finding Labels']  # hoisted: one label lookup per image

        for i, img_name in enumerate(tqdm(img_names_new)):
            img_path = '/media/ntu/volume1/home/s123md305_01/Documents/Generated/ComGenerated112/' + img_name
            # target_size is (height, width); the channel count is not part of it.
            img = image.load_img(img_path, target_size=(112, 112))
            X_new[i] = image.img_to_array(img)

            # One-hot label.  Assumes disease_class maps finding names to the
            # 1-based ids that disease_rev inverts; columns of y_new are 0-based.
            # Unknown labels leave the row all-zero.
            class_id = disease_class.get(label_of_new[img_name])
            if class_id is not None:
                y_new[i, class_id - 1] = 1

        # Nominal augmented-to-real ratio used for the split fractions below.
        # NOTE(review): equals the true ratio only if both datasets hold 91324
        # single-label images -- confirm.
        ratio = number_aug / 91324


        gc.collect()


        import numpy as np
        import os
        from tensorflow.keras.models import Sequential
        from tensorflow.keras.layers import GlobalAveragePooling2D, Dense
        from tensorflow.keras.optimizers import Adam
        from tensorflow.keras.applications import VGG16, ResNet50, DenseNet121
        #from tensorflow.keras.applications.vgg16 import preprocess_input
        from tensorflow.keras.applications.resnet50 import preprocess_input
        #from tensorflow.keras.applications.densenet import preprocess_input
        from sklearn.model_selection import train_test_split
        from sklearn.metrics import roc_auc_score
        from tensorflow.keras.callbacks import EarlyStopping
        #from tensorflow.keras.mixed_precision import set_global_policy

        #set_global_policy('mixed_float16')
        input_shape = (112, 112, 3)  # height, width, channels of the images loaded above
        # Derived from the label table so the output layer always has as many
        # units as the one-hot label arrays have columns.
        num_classes = len(disease_class)

        # Model construction and the loss function are defined next.

        def build_model(input_shape, num_classes):
            """Create and compile a ResNet50 classifier with a frozen backbone.

            The ImageNet-initialised ResNet50 (without its top) has every layer
            set to non-trainable; a global-average-pooling layer and a softmax
            Dense layer are stacked on top of it.

            Args:
                input_shape: shape handed to ResNet50 as ``input_shape``.
                num_classes: number of units of the softmax output layer.

            Returns:
                The compiled Sequential model (Adam with learning rate 1e-4,
                focal loss, AUC metric reported as 'auc').
            """
            backbone = ResNet50(include_top=False, weights='imagenet', input_shape=input_shape)
            for frozen_layer in backbone.layers:
                frozen_layer.trainable = False

            # dtype='float32' pins the output layer's dtype explicitly.
            output_layer = Dense(
                num_classes,
                activation='softmax',
                kernel_initializer='glorot_uniform',
                dtype='float32',
            )
            model = Sequential([backbone, GlobalAveragePooling2D(), output_layer])

            model.compile(
                optimizer=Adam(learning_rate=0.0001),
                loss=focal_loss(),
                metrics=[tf.keras.metrics.AUC(name='auc')],
            )
            return model
        # Function for Focal Loss
        #https://www.programmersought.com/article/60001511310/
        def focal_loss(alpha=0.5, beta=2.0):
            """Build an element-wise focal loss.

            Every entry of the prediction tensor is scored on its own:
            ``p_t`` is the probability assigned to the entry's target value,
            the cross entropy ``-log(p_t)`` is scaled by ``(1 - p_t) ** beta``
            and by a balance weight, and the mean over all entries is returned.

            Args:
                alpha: weight of entries whose target is 1; entries whose
                    target is 0 are weighted ``1 - alpha``.
                beta: exponent applied to ``1 - p_t``.

            Returns:
                A ``loss_fn2(y_true, y_pred)`` callable returning a scalar tensor.
            """
            epsilon = 1.e-7

            def loss_fn2(y_true, y_pred):
                targets = tf.cast(y_true, tf.float32)
                # Clip the predictions so the logarithm below stays finite.
                probs = tf.clip_by_value(y_pred, epsilon, 1. - epsilon)

                p_t = targets * probs + (1 - targets) * (1 - probs)
                balance = targets * alpha + (tf.ones_like(targets) - targets) * (1 - alpha)
                cross_entropy = -tf.math.log(p_t)
                modulation = tf.pow(1. - p_t, beta)
                per_entry = (modulation * cross_entropy) * balance
                return tf.reduce_mean(per_entry)

            return loss_fn2

        # Repeat the split / train / evaluate cycle to estimate the spread of the test AUC.
        n_runs = 3
        runno = 42  # base seed: run k splits with random_state = runno + k
        auc_scores = []

        # Split fractions, identical for every run.  With r = ratio:
        #   test  = 15% of (1 + r) * len(X), drawn from X only
        #   val   = the same number of images, drawn from the rest of X plus X_new
        #   train = everything that is left
        # seconddiv always reduces to 0.15 / 0.85 (about 17.65%) of the pooled
        # train+validation set.
        firstdiv = (1 + ratio) * (0.15)
        seconddiv = (firstdiv) / (1 + ratio - firstdiv)

        for run in range(n_runs):
            tf.keras.backend.clear_session()
            print(firstdiv)
            print(seconddiv)

            # Hold out the test set from X (the first dataset) only.
            X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=firstdiv, random_state=runno + run)
            X_test = preprocess_input(X_test)

            # Pool the remaining images with X_new, then carve out the validation set.
            X_train_new = np.concatenate((X_train_val, X_new), axis=0)
            y_train_new = np.concatenate((y_train_val, y_new), axis=0)
            X_train, X_val, y_train, y_val = train_test_split(X_train_new, y_train_new, test_size=seconddiv, random_state=runno + run)
            X_train = preprocess_input(X_train)
            X_val = preprocess_input(X_val)

            # Wrap the arrays in batched, prefetching tf.data pipelines placed on the CPU device.
            with tf.device("CPU"):
                train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))
                val_dataset = tf.data.Dataset.from_tensor_slices((X_val, y_val))
                test_dataset = tf.data.Dataset.from_tensor_slices((X_test, y_test))
                batch_size = 10
                train_dataset = train_dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)
                val_dataset = val_dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)
                test_dataset = test_dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)

            gc.collect()

            model = build_model(input_shape, num_classes)

            # NOTE(review): restore_best_weights keeps its default (False), so the
            # model evaluated below carries the weights of the last epoch, not of
            # the best val_auc epoch -- confirm this is intended.
            early_stopping = EarlyStopping(monitor='val_auc', patience=3, mode='max', verbose=1)

            model.fit(
                train_dataset,
                epochs=20,
                validation_data=val_dataset,
                callbacks=[early_stopping],
                verbose=1
            )
            gc.collect()

            # The test pipeline is not shuffled, so predictions line up with y_test.
            y_pred = model.predict(test_dataset)
            # NOTE(review): y_test is a one-hot (2-D indicator) array; for that
            # input scikit-learn appears to macro-average per-class binary AUCs
            # and ignore multi_class='ovo' -- confirm which metric is wanted.
            # Stored as test_auc so the `auc` function imported from
            # sklearn.metrics is not overwritten.
            test_auc = roc_auc_score(y_test, y_pred, multi_class='ovo')

            auc_scores.append(test_auc)
            print(f"Run {run+1}/{n_runs}, Test AUC: {test_auc:.4f}")
            gc.collect()

        # Mean and standard deviation of the test AUC over all runs.
        auc_std_dev = np.std(auc_scores)
        aauc = np.mean(auc_scores)
        print(f"Standard Deviation of AUC over {n_runs} runs: {auc_std_dev:.4f}")
        print("aauc=", aauc)
        gc.collect()

In [None]:
import matplotlib.pyplot as plt

# Recorded results: mean test AUC (AAUC) and its standard deviation for each
# number of augmented images.
augmented = [10000, 20000, 30000, 40000, 50000, 60000, 70000, 80000, 90000]
aauc = [
    0.7019529448460166, 0.6981793642807048, 0.6965962017994758,
    0.6946183463298995, 0.692164595411011, 0.6905014069274252,
    0.6901929777484542, 0.6921003819384491, 0.6874990049854891
]
sdv = [0.0070, 0.0036, 0.0059, 0.0038, 0.0028, 0.0074, 0.0039, 0.0006, 0.0043]

# Figure 1: AAUC on the left axis (red), standard deviation on the right axis (blue).
fig, ax1 = plt.subplots()

color = 'tab:red'
ax1.set_xlabel('Number of Augmented Images')
ax1.set_ylabel('AAUC', color=color)
ax1.plot(augmented, aauc, color=color)
ax1.tick_params(axis='y', labelcolor=color)

ax2 = ax1.twinx()  # second y-axis sharing the same x-axis
color = 'tab:blue'
ax2.set_ylabel('Standard Deviation', color=color)
ax2.plot(augmented, sdv, color=color)
ax2.tick_params(axis='y', labelcolor=color)

# Set the title before tight_layout() so the layout reserves room for it.
plt.title('AAUC and Standard Deviation vs. Number of Augmented Images')
fig.tight_layout()
plt.show()

import numpy as np

# Band of one standard deviation around the AAUC.
aauc_upper = np.array(aauc) + np.array(sdv)
aauc_lower = np.array(aauc) - np.array(sdv)

# Figure 2: AAUC with the standard-deviation band shaded.
plt.fill_between(augmented, aauc_lower, aauc_upper, color='gray', alpha=0.2)
plt.plot(augmented, aauc, '-o', color='red')
plt.title('AAUC with Standard Deviation Highlighted')
plt.xlabel('Number of Augmented Images')
plt.ylabel('AAUC')
plt.show()


In [None]:
from tensorflow.python.util import deprecation
deprecation._PRINT_DEPRECATION_WARNINGS = False   #ignore FutureWarning

import tensorflow as tf
tf.__version__
import random
from tqdm import tqdm
import json
import os
import gc
gc.collect()

import pandas as pd
import numpy as np
import numpy.random as nr
from numpy.random import seed
import matplotlib.pyplot as plt
from glob import glob
from pathlib import Path
import tensorflow.keras as keras
from tensorflow.keras import backend as K

from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.model_selection import train_test_split
#from tensorflow.keras.mixed_precision import set_global_policy

#set_global_policy('mixed_float16')

#gpus = tf.config.list_physical_devices('GPU')

# Check if there are any GPUs available
#if gpus:
    # Iterate over all available GPUs and set memory growth
#    for gpu in gpus:
#        try:
#            tf.config.experimental.set_memory_growth(gpu, True)
#            print(f'Memory growth enabled for {gpu.name}')
#        except RuntimeError as e:
            # Memory growth must be set before initializing GPUs
#            print(f'Could not set memory growth for {gpu.name}: {e}')


from tensorflow.keras.models import Sequential, Model, load_model
#from tensorflow.keras.applications.vgg16 import preprocess_input
from tensorflow.keras.applications.resnet50 import preprocess_input
#from tensorflow.keras.applications.densenet import preprocess_input
from tensorflow.keras.applications.efficientnet import preprocess_input
from tensorflow.keras.layers import Conv2D,Dense,Flatten,Dropout,MaxPooling2D, Activation, BatchNormalization, GlobalAveragePooling2D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import categorical_crossentropy
from tensorflow.keras.preprocessing import image
from tensorflow.keras.initializers import glorot_uniform
from tensorflow.keras.callbacks import Callback
from PIL import ImageFile
# Let PIL load image files whose data is truncated.
ImageFile.LOAD_TRUNCATED_IMAGES = True
# Reset the Keras backend session to release memory in case of OOM.
K.clear_session()
# Finding name -> 1-based class index, in the order listed here.
disease_class = {
    finding: index
    for index, finding in enumerate(
        [
            'Atelectasis', 'Cardiomegaly', 'Effusion', 'Infiltration', 'Mass',
            'Nodule', 'Pneumonia', 'Pneumothorax', 'Consolidation', 'Edema',
            'Emphysema', 'Fibrosis', 'Pleural_Thickening', 'Hernia', 'No Finding',
        ],
        start=1,
    )
}

# Reverse lookup: class index -> finding name.
disease_rev = dict(zip(disease_class.values(), disease_class.keys()))

# Image file names per finding; every finding starts with its own empty list.
disease_img = {finding: [] for finding in disease_class}
# Load the image-level labels of the first (original) dataset.
data_ref = pd.read_csv("/media/ntu/volume1/home/s123md305_01/Documents/CXR8/Data_Entry_2017_v2020.csv")
pd.options.mode.chained_assignment = None        # ignore the SettingWithCopyWarning

# Group the single-label images by finding; rows that list several findings
# ("A|B") are skipped.  Walking both columns in lockstep avoids three
# label-based Series lookups per row.
for img_name, finding in tqdm(zip(data_ref['Image Index'], data_ref['Finding Labels']),
                              total=len(data_ref)):
    if "|" not in finding:
        disease_img[finding].append(img_name)

# Lookup table: image file name -> finding label.  Built as a new frame
# instead of mutating a column slice of data_ref in place.
simp_data_ref = data_ref[["Image Index", "Finding Labels"]].set_index("Image Index")

# Labels of the second dataset (stored under Generated/).
data_ref_2 = pd.read_csv("/media/ntu/volume1/home/s123md305_01/Documents/Generated/reconstructed_labels.csv")

# One empty image list per finding, with the same keys as disease_class.
disease_img_2 = {disease: [] for disease in disease_class}

# Group the single-label images of the second dataset by finding; rows that
# list several findings ("A|B") are skipped.  Walking both columns in lockstep
# avoids three label-based Series lookups per row.
for img_name, finding in tqdm(zip(data_ref_2['Image Index'], data_ref_2['Finding Labels']),
                              total=len(data_ref_2)):
    if "|" not in finding:
        disease_img_2[finding].append(img_name)

# Lookup table for the second dataset: image file name -> finding label.
# Built as a new frame instead of mutating a column slice in place.
simp_data_ref_2 = data_ref_2[["Image Index", "Finding Labels"]].set_index("Image Index")
from tensorflow.keras.preprocessing import image
from tqdm import tqdm
import numpy as np

# Assuming disease_img, disease_class, and simp_data_ref are predefined
from tensorflow.keras.preprocessing import image
from tqdm import tqdm
import numpy as np

# Assuming disease_img, disease_class, and simp_data_ref are predefined
for number in range(90000, 92324, 10000):
    print(number)  # this range yields a single value: 90000
    # Take the first `number / 91324` share of every finding's image list.
    # NOTE(review): 91324 is presumably the total number of single-label images -- confirm.
    img_names = []
    for dis in disease_img:
        num = round(number / 91324 * len(disease_img[dis]))
        img_names.extend(disease_img[dis][:num])

    # Pre-allocate the image tensor so the pixels are stored once, instead of
    # in a Python list plus a second full copy made by np.array().
    # float32 is img_to_array's default dtype (Keras floatx) -- TODO confirm
    # floatx is not overridden elsewhere.
    X = np.empty((len(img_names), 112, 112, 3), dtype=np.float32)
    y = np.zeros(shape=(len(img_names), len(disease_class)))
    label_of = simp_data_ref['Finding Labels']  # hoisted: one label lookup per image

    for i, img_name in enumerate(tqdm(img_names)):
        # target_size is (height, width); the channel count is not part of it.
        img = image.load_img('/media/ntu/volume1/home/s123md305_01/Documents/CombinedResized/Resized112/' + img_name, target_size=(112, 112))
        X[i] = image.img_to_array(img)

        # One-hot label: class ids in disease_class are 1-based, columns of y
        # are 0-based.  Unknown labels leave the row all-zero.
        class_id = disease_class.get(label_of[img_name])
        if class_id is not None:
            y[i, class_id - 1] = 1
    
    # One augmented image per nine images requested from the first dataset.
    number2 = number // 9
    print(number2, 'augmented')
    # Take the first `number2 / 91324` share of every finding's image list in
    # the second dataset.
    img_names_new = []
    for dis in disease_img_2:
        num = round(number2 / 91324 * len(disease_img_2[dis]))
        img_names_new.extend(disease_img_2[dis][:num])

    # Pre-allocated so the pixels are stored once; an empty selection still has
    # shape (0, 112, 112, 3) and can be concatenated with the real images.
    # float32 is img_to_array's default dtype (Keras floatx) -- TODO confirm
    # floatx is not overridden elsewhere.
    X_new = np.empty((len(img_names_new), 112, 112, 3), dtype=np.float32)
    y_new = np.zeros(shape=(len(img_names_new), len(disease_class)))
    label_of_new = simp_data_ref_2['Finding Labels']  # hoisted: one label lookup per image

    for i, img_name in enumerate(tqdm(img_names_new)):
        img_path = '/media/ntu/volume1/home/s123md305_01/Documents/Generated/ComGenerated112/' + img_name
        # target_size is (height, width); the channel count is not part of it.
        img = image.load_img(img_path, target_size=(112, 112))
        X_new[i] = image.img_to_array(img)

        # One-hot label: class ids in disease_class are 1-based, columns of
        # y_new are 0-based.  Unknown labels leave the row all-zero.
        class_id = disease_class.get(label_of_new[img_name])
        if class_id is not None:
            y_new[i, class_id - 1] = 1

    # Nominal augmented-to-real ratio used for the split fractions below.
    # NOTE(review): equals the true ratio only if both datasets hold the same
    # number of single-label images -- confirm.
    ratio = number2 / number


    gc.collect()


    import numpy as np
    import os
    from tensorflow.keras.models import Sequential
    from tensorflow.keras.layers import GlobalAveragePooling2D, Dense
    from tensorflow.keras.optimizers import Adam
    from tensorflow.keras.applications import VGG16, ResNet50, DenseNet121
    #from tensorflow.keras.applications.vgg16 import preprocess_input
    from tensorflow.keras.applications.resnet50 import preprocess_input
    #from tensorflow.keras.applications.densenet import preprocess_input
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import roc_auc_score
    from tensorflow.keras.callbacks import EarlyStopping
    #from tensorflow.keras.mixed_precision import set_global_policy

    #set_global_policy('mixed_float16')
    input_shape = (112, 112, 3)  # height, width, channels of the images loaded above
    # Derived from the label table so the output layer always has as many
    # units as the one-hot label arrays have columns.
    num_classes = len(disease_class)

    # Model construction and the loss function are defined next.

    def build_model(input_shape, num_classes):
        """Create and compile a ResNet50 classifier with a frozen backbone.

        The ImageNet-initialised ResNet50 (without its top) has every layer
        set to non-trainable; a global-average-pooling layer and a softmax
        Dense layer are stacked on top of it.

        Args:
            input_shape: shape handed to ResNet50 as ``input_shape``.
            num_classes: number of units of the softmax output layer.

        Returns:
            The compiled Sequential model (Adam with learning rate 1e-4,
            focal loss, AUC metric reported as 'auc').
        """
        backbone = ResNet50(include_top=False, weights='imagenet', input_shape=input_shape)
        for frozen_layer in backbone.layers:
            frozen_layer.trainable = False

        # dtype='float32' pins the output layer's dtype explicitly.
        output_layer = Dense(
            num_classes,
            activation='softmax',
            kernel_initializer='glorot_uniform',
            dtype='float32',
        )
        model = Sequential([backbone, GlobalAveragePooling2D(), output_layer])

        model.compile(
            optimizer=Adam(learning_rate=0.0001),
            loss=focal_loss(),
            metrics=[tf.keras.metrics.AUC(name='auc')],
        )
        return model
    # Function for Focal Loss
    #https://www.programmersought.com/article/60001511310/
    def focal_loss(alpha=0.5, beta=2.0):
        """Build an element-wise focal loss.

        Every entry of the prediction tensor is scored on its own:
        ``p_t`` is the probability assigned to the entry's target value,
        the cross entropy ``-log(p_t)`` is scaled by ``(1 - p_t) ** beta``
        and by a balance weight, and the mean over all entries is returned.

        Args:
            alpha: weight of entries whose target is 1; entries whose
                target is 0 are weighted ``1 - alpha``.
            beta: exponent applied to ``1 - p_t``.

        Returns:
            A ``loss_fn2(y_true, y_pred)`` callable returning a scalar tensor.
        """
        epsilon = 1.e-7

        def loss_fn2(y_true, y_pred):
            targets = tf.cast(y_true, tf.float32)
            # Clip the predictions so the logarithm below stays finite.
            probs = tf.clip_by_value(y_pred, epsilon, 1. - epsilon)

            p_t = targets * probs + (1 - targets) * (1 - probs)
            balance = targets * alpha + (tf.ones_like(targets) - targets) * (1 - alpha)
            cross_entropy = -tf.math.log(p_t)
            modulation = tf.pow(1. - p_t, beta)
            per_entry = (modulation * cross_entropy) * balance
            return tf.reduce_mean(per_entry)

        return loss_fn2

    # Repeat the split / train / evaluate cycle to estimate the spread of the test AUC.
    n_runs = 3
    runno = 42  # base seed: run k splits with random_state = runno + k
    auc_scores = []

    # Split fractions, identical for every run.  With r = ratio:
    #   test  = 15% of (1 + r) * len(X), drawn from X only
    #   val   = the same number of images, drawn from the rest of X plus X_new
    #   train = everything that is left
    # seconddiv always reduces to 0.15 / 0.85 (about 17.65%) of the pooled
    # train+validation set.
    firstdiv = (1 + ratio) * (0.15)
    seconddiv = (firstdiv) / (1 + ratio - firstdiv)

    for run in range(n_runs):
        tf.keras.backend.clear_session()
        print(firstdiv)
        print(seconddiv)

        # Hold out the test set from X (the first dataset) only.
        X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=firstdiv, random_state=runno + run)
        X_test = preprocess_input(X_test)

        # Pool the remaining images with X_new, then carve out the validation set.
        X_train_new = np.concatenate((X_train_val, X_new), axis=0)
        y_train_new = np.concatenate((y_train_val, y_new), axis=0)
        X_train, X_val, y_train, y_val = train_test_split(X_train_new, y_train_new, test_size=seconddiv, random_state=runno + run)
        X_train = preprocess_input(X_train)
        X_val = preprocess_input(X_val)

        # Wrap the arrays in batched, prefetching tf.data pipelines placed on the CPU device.
        with tf.device("CPU"):
            train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))
            val_dataset = tf.data.Dataset.from_tensor_slices((X_val, y_val))
            test_dataset = tf.data.Dataset.from_tensor_slices((X_test, y_test))
            batch_size = 10
            train_dataset = train_dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)
            val_dataset = val_dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)
            test_dataset = test_dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)

        gc.collect()

        model = build_model(input_shape, num_classes)

        # NOTE(review): restore_best_weights keeps its default (False), so the
        # model evaluated below carries the weights of the last epoch, not of
        # the best val_auc epoch -- confirm this is intended.
        early_stopping = EarlyStopping(monitor='val_auc', patience=3, mode='max', verbose=1)

        model.fit(
            train_dataset,
            epochs=20,
            validation_data=val_dataset,
            callbacks=[early_stopping],
            verbose=1
        )
        gc.collect()

        # The test pipeline is not shuffled, so predictions line up with y_test.
        y_pred = model.predict(test_dataset)
        # NOTE(review): y_test is a one-hot (2-D indicator) array; for that
        # input scikit-learn appears to macro-average per-class binary AUCs
        # and ignore multi_class='ovo' -- confirm which metric is wanted.
        # Stored as test_auc so the `auc` function imported from
        # sklearn.metrics is not overwritten.
        test_auc = roc_auc_score(y_test, y_pred, multi_class='ovo')

        auc_scores.append(test_auc)
        print(f"Run {run+1}/{n_runs}, Test AUC: {test_auc:.4f}")
        gc.collect()

    # Mean and standard deviation of the test AUC over all runs.
    auc_std_dev = np.std(auc_scores)
    aauc = np.mean(auc_scores)
    print(f"Standard Deviation of AUC over {n_runs} runs: {auc_std_dev:.4f}")
    print("aauc=", aauc)
    gc.collect()

In [None]:
from tensorflow.python.util import deprecation
deprecation._PRINT_DEPRECATION_WARNINGS = False   #ignore FutureWarning

import tensorflow as tf
tf.__version__
import random
from tqdm import tqdm
import json
import os
import gc
gc.collect()

import pandas as pd
import numpy as np
import numpy.random as nr
from numpy.random import seed
import matplotlib.pyplot as plt
from glob import glob
from pathlib import Path
import tensorflow.keras as keras
from tensorflow.keras import backend as K

from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.model_selection import train_test_split
#from tensorflow.keras.mixed_precision import set_global_policy

#set_global_policy('mixed_float16')

#gpus = tf.config.list_physical_devices('GPU')

# Check if there are any GPUs available
#if gpus:
    # Iterate over all available GPUs and set memory growth
#    for gpu in gpus:
#        try:
#            tf.config.experimental.set_memory_growth(gpu, True)
#            print(f'Memory growth enabled for {gpu.name}')
#        except RuntimeError as e:
            # Memory growth must be set before initializing GPUs
#            print(f'Could not set memory growth for {gpu.name}: {e}')


from tensorflow.keras.models import Sequential, Model, load_model
#from tensorflow.keras.applications.vgg16 import preprocess_input
from tensorflow.keras.applications.resnet50 import preprocess_input
#from tensorflow.keras.applications.densenet import preprocess_input
from tensorflow.keras.applications.efficientnet import preprocess_input
from tensorflow.keras.layers import Conv2D,Dense,Flatten,Dropout,MaxPooling2D, Activation, BatchNormalization, GlobalAveragePooling2D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import categorical_crossentropy
from tensorflow.keras.preprocessing import image
from tensorflow.keras.initializers import glorot_uniform
from tensorflow.keras.callbacks import Callback
from PIL import ImageFile
# Let PIL load image files whose data is truncated.
ImageFile.LOAD_TRUNCATED_IMAGES = True
# Reset the Keras backend session to release memory in case of OOM.
K.clear_session()
# Finding name -> 1-based class index, in the order listed here.
disease_class = {
    finding: index
    for index, finding in enumerate(
        [
            'Atelectasis', 'Cardiomegaly', 'Effusion', 'Infiltration', 'Mass',
            'Nodule', 'Pneumonia', 'Pneumothorax', 'Consolidation', 'Edema',
            'Emphysema', 'Fibrosis', 'Pleural_Thickening', 'Hernia', 'No Finding',
        ],
        start=1,
    )
}

# Reverse lookup: class index -> finding name.
disease_rev = dict(zip(disease_class.values(), disease_class.keys()))

# Image file names per finding; every finding starts with its own empty list.
disease_img = {finding: [] for finding in disease_class}
# Load the image-level labels of the first (original) dataset.
data_ref = pd.read_csv("/media/ntu/volume1/home/s123md305_01/Documents/CXR8/Data_Entry_2017_v2020.csv")
pd.options.mode.chained_assignment = None        # ignore the SettingWithCopyWarning

# Group the single-label images by finding; rows that list several findings
# ("A|B") are skipped.  Walking both columns in lockstep avoids three
# label-based Series lookups per row.
for img_name, finding in tqdm(zip(data_ref['Image Index'], data_ref['Finding Labels']),
                              total=len(data_ref)):
    if "|" not in finding:
        disease_img[finding].append(img_name)

# Lookup table: image file name -> finding label.  Built as a new frame
# instead of mutating a column slice of data_ref in place.
simp_data_ref = data_ref[["Image Index", "Finding Labels"]].set_index("Image Index")

# Labels of the second dataset (stored under Generated/).
data_ref_2 = pd.read_csv("/media/ntu/volume1/home/s123md305_01/Documents/Generated/reconstructed_labels.csv")

# One empty image list per finding, with the same keys as disease_class.
disease_img_2 = {disease: [] for disease in disease_class}

# Group the single-label images of the second dataset by finding; rows that
# list several findings ("A|B") are skipped.  Walking both columns in lockstep
# avoids three label-based Series lookups per row.
for img_name, finding in tqdm(zip(data_ref_2['Image Index'], data_ref_2['Finding Labels']),
                              total=len(data_ref_2)):
    if "|" not in finding:
        disease_img_2[finding].append(img_name)

# Lookup table for the second dataset: image file name -> finding label.
# Built as a new frame instead of mutating a column slice in place.
simp_data_ref_2 = data_ref_2[["Image Index", "Finding Labels"]].set_index("Image Index")
from tensorflow.keras.preprocessing import image
from tqdm import tqdm
import numpy as np

# Assuming disease_img, disease_class, and simp_data_ref are predefined
from tensorflow.keras.preprocessing import image
from tqdm import tqdm
import numpy as np

# Assuming disease_img, disease_class, and simp_data_ref are predefined
# Build the image/label arrays for each requested real-image count. This range
# yields the single value 91324, so the per-class fraction below is 1.0.
for number in range(91324, 92324, 10000):
    print(number)
    # Take the same leading share (number / 91324) of every class list so the
    # subset keeps the class proportions of the single-label images.
    img_names = []
    for class_names in disease_img.values():
        num = round(number / 91324 * len(class_names))
        img_names.extend(class_names[:num])

    # Fill a preallocated float32 array (Keras' default float type, which is
    # what img_to_array returns) instead of collecting a list and converting
    # it afterwards: this avoids holding every image twice and keeps the 4-D
    # shape even when no image is selected.
    X = np.empty((len(img_names), 112, 112, 3), dtype=np.float32)
    y = np.zeros(shape=(len(img_names), len(disease_class)))  # one-hot labels
    real_labels = simp_data_ref['Finding Labels']

    for i, img_name in enumerate(tqdm(img_names)):
        # target_size is (height, width); the channel count is not part of it.
        img = image.load_img('/media/ntu/volume1/home/s123md305_01/Documents/CombinedResized/Resized112/' + img_name, target_size=(112, 112))
        X[i] = image.img_to_array(img)
        # disease_class indices are 1-based, the label columns are 0-based.
        y[i, disease_class[real_labels[img_name]] - 1] = 1

    # Images of the second dataset: 1/30 of the real-image count.
    number2 = round(number / 30)
    print(number2, 'augmented')
    # NOTE(review): the share is again taken relative to 91324 — this assumes
    # the second dataset has the same size as the first; confirm.
    img_names_new = []
    for class_names in disease_img_2.values():
        num = round(number2 / 91324 * len(class_names))
        img_names_new.extend(class_names[:num])

    X_new = np.empty((len(img_names_new), 112, 112, 3), dtype=np.float32)
    y_new = np.zeros(shape=(len(img_names_new), len(disease_class)))
    generated_labels = simp_data_ref_2['Finding Labels']

    for i, img_name in enumerate(tqdm(img_names_new)):
        img_path = '/media/ntu/volume1/home/s123md305_01/Documents/Generated/ComGenerated112/' + img_name
        img = image.load_img(img_path, target_size=(112, 112))
        X_new[i] = image.img_to_array(img)
        y_new[i, disease_class[generated_labels[img_name]] - 1] = 1

    # Nominal second-dataset / first-dataset ratio, used to size the splits.
    ratio = number2 / number

    gc.collect()


    import numpy as np
    import os
    from tensorflow.keras.models import Sequential
    from tensorflow.keras.layers import GlobalAveragePooling2D, Dense
    from tensorflow.keras.optimizers import Adam
    from tensorflow.keras.applications import VGG16, ResNet50, DenseNet121
    #from tensorflow.keras.applications.vgg16 import preprocess_input
    from tensorflow.keras.applications.resnet50 import preprocess_input
    #from tensorflow.keras.applications.densenet import preprocess_input
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import roc_auc_score
    from tensorflow.keras.callbacks import EarlyStopping
    #from tensorflow.keras.mixed_precision import set_global_policy

    #set_global_policy('mixed_float16')
    # Shape of one input image and number of output classes.
    input_shape = (112, 112, 3)
    num_classes = 15

    # No tf.distribute strategy is used in this cell; the model is built
    # directly on the default device.

    def build_model(input_shape, num_classes):
        """Return a compiled classifier: frozen ImageNet ResNet50 plus a new softmax head.

        input_shape -- shape of one input image, passed to ResNet50.
        num_classes -- number of units of the softmax output layer.

        The model is compiled with Adam (learning rate 1e-4), the focal loss
        defined in this cell, and an AUC metric reported as 'auc'.
        """
        backbone = ResNet50(weights='imagenet', include_top=False, input_shape=input_shape)
        # Freeze every backbone layer so only the new head is trained.
        for frozen_layer in backbone.layers:
            frozen_layer.trainable = False

        classifier = Sequential()
        classifier.add(backbone)
        classifier.add(GlobalAveragePooling2D())
        # The output layer is pinned to float32, which matters when a
        # mixed-precision policy is active.
        classifier.add(Dense(num_classes, activation='softmax', kernel_initializer='glorot_uniform', dtype='float32'))

        classifier.compile(
            optimizer=Adam(learning_rate=0.0001),
            loss=focal_loss(),
            metrics=[tf.keras.metrics.AUC(name='auc')],
        )
        return classifier
    # Function for Focal Loss
    #https://www.programmersought.com/article/60001511310/
    def focal_loss(alpha = 0.5, beta = 2.0):
        """Return a Keras-compatible focal loss function.

        The loss is computed element-wise on every (sample, class) entry as a
        binary focal loss and averaged over all entries.

        alpha -- weight of positive entries; negative entries get 1 - alpha.
        beta  -- exponent of the modulating factor (1 - p_t).
        """
        epsilon = 1.e-7

        def loss_fn2(y_true, y_pred):
            targets = tf.cast(y_true, tf.float32)
            # Keep probabilities away from 0 and 1 so the log stays finite.
            probs = tf.clip_by_value(y_pred, epsilon, 1. - epsilon)
            negatives = tf.ones_like(targets) - targets

            # p_t: probability the model assigns to the true state of each entry.
            p_t = tf.multiply(targets, probs) + tf.multiply(1 - targets, 1 - probs)
            class_weight = targets * alpha + negatives * (1 - alpha)
            cross_entropy = -tf.math.log(p_t)
            modulating = tf.pow(tf.subtract(1., p_t), beta)
            return tf.reduce_mean(tf.multiply(tf.multiply(modulating, cross_entropy), class_weight))

        return loss_fn2

    # Repeat split / train / evaluate n_runs times with a different split seed
    # each time, then report mean and standard deviation of the test AUC.
    n_runs=3
    runno=42  # base seed; run k uses random_state = runno + k
    auc_scores = []

    # The split fractions depend only on `ratio`, so compute them once.
    # firstdiv is the share of the real images held out for testing, and
    # seconddiv the share of (remaining real + generated) used for validation.
    # They are chosen so that test and validation each come to about 15% of
    # real + generated images combined, assuming len(X_new) is close to
    # ratio * len(X).
    firstdiv=(1+ratio)*(0.15)
    seconddiv=(firstdiv)/(1+ratio-firstdiv)

    for run in range(n_runs):
        tf.keras.backend.clear_session()
        print(firstdiv)
        print(seconddiv)

        # The test set is drawn from the real images only.
        X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=firstdiv, random_state=runno+run)
        X_test = preprocess_input(X_test)

        # Add the second dataset to the remaining real images, then split
        # the union into training and validation sets.
        X_train_new = np.concatenate((X_train_val, X_new), axis=0)
        y_train_new = np.concatenate((y_train_val, y_new), axis=0)
        del X_train_val  # already copied into X_train_new; free it early

        X_train, X_val, y_train, y_val = train_test_split(X_train_new, y_train_new, test_size=seconddiv, random_state=runno+run)
        del X_train_new
        X_train = preprocess_input(X_train)
        X_val = preprocess_input(X_val)

        # Build the input pipelines on the CPU.
        # NOTE(review): the training dataset is not shuffled between epochs
        # (only once, by train_test_split) — confirm this is intended.
        with tf.device("CPU"):
            train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))
            val_dataset = tf.data.Dataset.from_tensor_slices((X_val, y_val))
            test_dataset = tf.data.Dataset.from_tensor_slices((X_test, y_test))
            batch_size = 10
            train_dataset = train_dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)
            val_dataset = val_dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)
            test_dataset = test_dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)

        # The datasets hold the data now; drop the NumPy arrays (y_test is
        # still needed for scoring) to lower peak host memory during training.
        del X_train, X_val, X_test
        gc.collect()

        model = build_model(input_shape, num_classes)

        # Stop once val_auc has not improved for 3 epochs.
        # NOTE(review): restore_best_weights is left at its default (False),
        # so the weights evaluated below are those of the last epoch run,
        # not of the best val_auc epoch — confirm this is intended.
        early_stopping = EarlyStopping(monitor='val_auc', patience=3, mode='max', verbose=1)

        model.fit(
            train_dataset,
            epochs=20,
            validation_data=val_dataset,
            callbacks=[early_stopping],
            verbose=1
        )
        gc.collect()

        # test_dataset is not shuffled, so predictions line up with y_test.
        y_pred = model.predict(test_dataset)
        # y_test is one-hot, which scikit-learn scores as a multilabel
        # problem: the result is the macro average of the per-class AUCs and
        # `multi_class` is not consulted, so the averaging is spelled out.
        # The result is stored as test_auc so the imported sklearn.metrics.auc
        # is not overwritten.
        test_auc = roc_auc_score(y_test, y_pred, average='macro')

        auc_scores.append(test_auc)
        print(f"Run {run+1}/{n_runs}, Test AUC: {test_auc:.4f}")
        gc.collect()

    # Spread and mean of the test AUC over all runs.
    auc_std_dev = np.std(auc_scores)
    aauc=np.mean(auc_scores)
    print(f"Standard Deviation of AUC over {n_runs} runs: {auc_std_dev:.4f}")
    print("aauc=",aauc)
    gc.collect()

In [None]:
from tensorflow.python.util import deprecation
deprecation._PRINT_DEPRECATION_WARNINGS = False   #ignore FutureWarning

import tensorflow as tf
tf.__version__
import random
from tqdm import tqdm
import json
import os
import gc
gc.collect()

import pandas as pd
import numpy as np
import numpy.random as nr
from numpy.random import seed
import matplotlib.pyplot as plt
from glob import glob
from pathlib import Path
import tensorflow.keras as keras
from tensorflow.keras import backend as K

from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.model_selection import train_test_split
from tensorflow.keras.mixed_precision import set_global_policy
# Use the mixed_float16 policy for all Keras layers created from here on.
set_global_policy('mixed_float16')

# Turn on memory growth for every visible GPU; without a GPU the loop is a no-op.
gpus = tf.config.list_physical_devices('GPU')
for gpu in gpus:
    try:
        tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as e:
        # Memory growth must be set before initializing GPUs
        print(f'Could not set memory growth for {gpu.name}: {e}')
    else:
        print(f'Memory growth enabled for {gpu.name}')


from tensorflow.keras.models import Sequential, Model, load_model
#from tensorflow.keras.applications.vgg16 import preprocess_input
from tensorflow.keras.applications.resnet50 import preprocess_input
#from tensorflow.keras.applications.densenet import preprocess_input
from tensorflow.keras.applications.efficientnet import preprocess_input
from tensorflow.keras.layers import Conv2D,Dense,Flatten,Dropout,MaxPooling2D, Activation, BatchNormalization, GlobalAveragePooling2D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import categorical_crossentropy
from tensorflow.keras.preprocessing import image
from tensorflow.keras.initializers import glorot_uniform
from tensorflow.keras.callbacks import Callback
from PIL import ImageFile
# Let PIL load image files whose data is truncated instead of raising an error.
ImageFile.LOAD_TRUNCATED_IMAGES = True
# Reset Keras' global state to release memory held by earlier models (guards against OOM).
K.clear_session()
#set dictionary for disease and its index
# Class name -> 1-based class index, in label-column order.
disease_class = {
    name: index
    for index, name in enumerate(
        (
            'Atelectasis',
            'Cardiomegaly',
            'Effusion',
            'Infiltration',
            'Mass',
            'Nodule',
            'Pneumonia',
            'Pneumothorax',
            'Consolidation',
            'Edema',
            'Emphysema',
            'Fibrosis',
            'Pleural_Thickening',
            'Hernia',
            'No Finding',
        ),
        start=1,
    )
}

# Reverse lookup: class index -> class name.
disease_rev = dict(zip(disease_class.values(), disease_class.keys()))
# Per-class list of image names, filled later; each class gets its own list.
disease_img = {name: [] for name in disease_class}
# Label table of the first dataset (one row per image).
data_ref = pd.read_csv("/media/ntu/volume1/home/s123md305_01/Documents/CXR8/Data_Entry_2017_v2020.csv")
pd.options.mode.chained_assignment = None        # silence pandas' SettingWithCopyWarning

# Sort the names of all single-label images (label without "|") into their class list.
first_labels = data_ref['Finding Labels']
first_names = data_ref['Image Index']
for label, img_name in zip(tqdm(first_labels), first_names):
    if "|" in label:
        continue
    disease_img[label].append(img_name)
# Lookup table: image name -> label.
simp_data_ref = data_ref[["Image Index", "Finding Labels"]].set_index("Image Index")

# Label table of the second dataset (stored under Generated/).
data_ref_2 = pd.read_csv("/media/ntu/volume1/home/s123md305_01/Documents/Generated/reconstructed_labels.csv")

# Fresh, empty per-class name lists for the second dataset.
disease_img_2 = {disease: [] for disease in disease_class}

second_labels = data_ref_2['Finding Labels']
second_names = data_ref_2['Image Index']
for label, img_name in zip(tqdm(second_labels), second_names):
    if "|" in label:
        continue
    disease_img_2[label].append(img_name)

# Lookup table for the second dataset: image name -> label.
simp_data_ref_2 = data_ref_2[["Image Index", "Finding Labels"]].set_index("Image Index")
from tensorflow.keras.preprocessing import image
from tqdm import tqdm
import numpy as np

# Assuming disease_img, disease_class, and simp_data_ref are predefined
from tensorflow.keras.preprocessing import image
from tqdm import tqdm
import numpy as np

# Assuming disease_img, disease_class, and simp_data_ref are predefined
# Build the image/label arrays for each requested real-image count. This range
# yields the single value 30000.
for number in range(30000, 31000, 10000):
    print(number)
    # Take the same leading share (number / 91324) of every class list so the
    # subset keeps the class proportions of the single-label images.
    img_names = []
    for class_names in disease_img.values():
        num = round(number / 91324 * len(class_names))
        img_names.extend(class_names[:num])

    # Fill a preallocated float32 array (Keras' default float type, which is
    # what img_to_array returns) instead of collecting a list and converting
    # it afterwards: this avoids holding every image twice and keeps the 4-D
    # shape even when no image is selected.
    X = np.empty((len(img_names), 224, 224, 3), dtype=np.float32)
    y = np.zeros(shape=(len(img_names), len(disease_class)))  # one-hot labels
    real_labels = simp_data_ref['Finding Labels']

    for i, img_name in enumerate(tqdm(img_names)):
        # target_size is (height, width); the channel count is not part of it.
        img = image.load_img('/media/ntu/volume1/home/s123md305_01/Documents/CombinedResized/Resized224/' + img_name, target_size=(224, 224))
        X[i] = image.img_to_array(img)
        # disease_class indices are 1-based, the label columns are 0-based.
        y[i, disease_class[real_labels[img_name]] - 1] = 1

    # Images of the second dataset: as many as real images in this cell.
    number2 = number
    print(number2, 'augmented')
    # NOTE(review): the share is again taken relative to 91324 — this assumes
    # the second dataset has the same size as the first; confirm.
    img_names_new = []
    for class_names in disease_img_2.values():
        num = round(number2 / 91324 * len(class_names))
        img_names_new.extend(class_names[:num])

    X_new = np.empty((len(img_names_new), 224, 224, 3), dtype=np.float32)
    y_new = np.zeros(shape=(len(img_names_new), len(disease_class)))
    generated_labels = simp_data_ref_2['Finding Labels']

    for i, img_name in enumerate(tqdm(img_names_new)):
        img_path = '/media/ntu/volume1/home/s123md305_01/Documents/Generated/ComGenerated224/' + img_name
        img = image.load_img(img_path, target_size=(224, 224))
        X_new[i] = image.img_to_array(img)
        y_new[i, disease_class[generated_labels[img_name]] - 1] = 1

    # Nominal second-dataset / first-dataset ratio, used to size the splits.
    ratio = number2 / number

    gc.collect()


    import numpy as np
    import os
    from tensorflow.keras.models import Sequential
    from tensorflow.keras.layers import GlobalAveragePooling2D, Dense
    from tensorflow.keras.optimizers import Adam
    from tensorflow.keras.applications import VGG16, ResNet50, DenseNet121
    #from tensorflow.keras.applications.vgg16 import preprocess_input
    from tensorflow.keras.applications.resnet50 import preprocess_input
    #from tensorflow.keras.applications.densenet import preprocess_input
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import roc_auc_score
    from tensorflow.keras.callbacks import EarlyStopping
    #from tensorflow.keras.mixed_precision import set_global_policy

    #set_global_policy('mixed_float16')
    # Shape of one input image and number of output classes.
    input_shape = (224, 224, 3)
    num_classes = 15

    # Distribution strategy; the model has to be created inside strategy.scope().
    strategy = tf.distribute.MirroredStrategy()
    print(f'Number of devices: {strategy.num_replicas_in_sync}')

    def build_model(input_shape, num_classes):
        """Return a compiled classifier: frozen ImageNet ResNet50 plus a new softmax head.

        input_shape -- shape of one input image, passed to ResNet50.
        num_classes -- number of units of the softmax output layer.

        The model is compiled with Adam (learning rate 1e-4), the focal loss
        defined in this cell, and an AUC metric reported as 'auc'.
        """
        backbone = ResNet50(weights='imagenet', include_top=False, input_shape=input_shape)
        # Freeze every backbone layer so only the new head is trained.
        for frozen_layer in backbone.layers:
            frozen_layer.trainable = False

        classifier = Sequential()
        classifier.add(backbone)
        classifier.add(GlobalAveragePooling2D())
        # The output layer is pinned to float32, which matters when a
        # mixed-precision policy is active.
        classifier.add(Dense(num_classes, activation='softmax', kernel_initializer='glorot_uniform', dtype='float32'))

        classifier.compile(
            optimizer=Adam(learning_rate=0.0001),
            loss=focal_loss(),
            metrics=[tf.keras.metrics.AUC(name='auc')],
        )
        return classifier
    # Function for Focal Loss
    #https://www.programmersought.com/article/60001511310/
    def focal_loss(alpha = 0.5, beta = 2.0):
        """Return a Keras-compatible focal loss function.

        The loss is computed element-wise on every (sample, class) entry as a
        binary focal loss and averaged over all entries.

        alpha -- weight of positive entries; negative entries get 1 - alpha.
        beta  -- exponent of the modulating factor (1 - p_t).
        """
        epsilon = 1.e-7

        def loss_fn2(y_true, y_pred):
            targets = tf.cast(y_true, tf.float32)
            # Keep probabilities away from 0 and 1 so the log stays finite.
            probs = tf.clip_by_value(y_pred, epsilon, 1. - epsilon)
            negatives = tf.ones_like(targets) - targets

            # p_t: probability the model assigns to the true state of each entry.
            p_t = tf.multiply(targets, probs) + tf.multiply(1 - targets, 1 - probs)
            class_weight = targets * alpha + negatives * (1 - alpha)
            cross_entropy = -tf.math.log(p_t)
            modulating = tf.pow(tf.subtract(1., p_t), beta)
            return tf.reduce_mean(tf.multiply(tf.multiply(modulating, cross_entropy), class_weight))

        return loss_fn2

    # Repeat split / train / evaluate n_runs times with a different split seed
    # each time, then report mean and standard deviation of the test AUC.
    n_runs=3
    runno=42  # base seed; run k uses random_state = runno + k
    auc_scores = []

    # The split fractions depend only on `ratio`, so compute them once.
    # firstdiv is the share of the real images held out for testing, and
    # seconddiv the share of (remaining real + generated) used for validation.
    # They are chosen so that test and validation each come to about 15% of
    # real + generated images combined, assuming len(X_new) is close to
    # ratio * len(X).
    firstdiv=(1+ratio)*(0.15)
    seconddiv=(firstdiv)/(1+ratio-firstdiv)

    for run in range(n_runs):
        tf.keras.backend.clear_session()
        print(firstdiv)
        print(seconddiv)

        # The test set is drawn from the real images only.
        X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=firstdiv, random_state=runno+run)
        X_test = preprocess_input(X_test)

        # Add the second dataset to the remaining real images, then split
        # the union into training and validation sets.
        X_train_new = np.concatenate((X_train_val, X_new), axis=0)
        y_train_new = np.concatenate((y_train_val, y_new), axis=0)
        del X_train_val  # already copied into X_train_new; free it early

        X_train, X_val, y_train, y_val = train_test_split(X_train_new, y_train_new, test_size=seconddiv, random_state=runno+run)
        del X_train_new
        X_train = preprocess_input(X_train)
        X_val = preprocess_input(X_val)

        # Build the input pipelines on the CPU.
        # NOTE(review): the training dataset is not shuffled between epochs
        # (only once, by train_test_split) — confirm this is intended.
        with tf.device("CPU"):
            train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))
            val_dataset = tf.data.Dataset.from_tensor_slices((X_val, y_val))
            test_dataset = tf.data.Dataset.from_tensor_slices((X_test, y_test))
            batch_size = 10
            train_dataset = train_dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)
            val_dataset = val_dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)
            test_dataset = test_dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)

        # The datasets hold the data now; drop the NumPy arrays (y_test is
        # still needed for scoring) to lower peak host memory during training.
        del X_train, X_val, X_test
        gc.collect()

        # Variables must be created under the distribution strategy's scope.
        with strategy.scope():
            model = build_model(input_shape, num_classes)

        # Stop once val_auc has not improved for 3 epochs.
        # NOTE(review): restore_best_weights is left at its default (False),
        # so the weights evaluated below are those of the last epoch run,
        # not of the best val_auc epoch — confirm this is intended.
        early_stopping = EarlyStopping(monitor='val_auc', patience=3, mode='max', verbose=1)

        model.fit(
            train_dataset,
            epochs=20,
            validation_data=val_dataset,
            callbacks=[early_stopping],
            verbose=1
        )
        gc.collect()

        # test_dataset is not shuffled, so predictions line up with y_test.
        y_pred = model.predict(test_dataset)
        # y_test is one-hot, which scikit-learn scores as a multilabel
        # problem: the result is the macro average of the per-class AUCs and
        # `multi_class` is not consulted, so the averaging is spelled out.
        # The result is stored as test_auc so the imported sklearn.metrics.auc
        # is not overwritten.
        test_auc = roc_auc_score(y_test, y_pred, average='macro')

        auc_scores.append(test_auc)
        print(f"Run {run+1}/{n_runs}, Test AUC: {test_auc:.4f}")
        gc.collect()

    # Spread and mean of the test AUC over all runs.
    auc_std_dev = np.std(auc_scores)
    aauc=np.mean(auc_scores)
    print(f"Standard Deviation of AUC over {n_runs} runs: {auc_std_dev:.4f}")
    print("aauc=",aauc)
    gc.collect()

In [None]:
from tensorflow.python.util import deprecation
deprecation._PRINT_DEPRECATION_WARNINGS = False   #ignore FutureWarning

import tensorflow as tf
tf.__version__
import random
from tqdm import tqdm
import json
import os
import gc
gc.collect()

import pandas as pd
import numpy as np
import numpy.random as nr
from numpy.random import seed
import matplotlib.pyplot as plt
from glob import glob
from pathlib import Path
import tensorflow.keras as keras
from tensorflow.keras import backend as K

from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.model_selection import train_test_split
#from tensorflow.keras.mixed_precision import set_global_policy

#set_global_policy('mixed_float16')

#gpus = tf.config.list_physical_devices('GPU')

# Check if there are any GPUs available
#if gpus:
    # Iterate over all available GPUs and set memory growth
#    for gpu in gpus:
#        try:
#            tf.config.experimental.set_memory_growth(gpu, True)
#            print(f'Memory growth enabled for {gpu.name}')
#        except RuntimeError as e:
            # Memory growth must be set before initializing GPUs
#            print(f'Could not set memory growth for {gpu.name}: {e}')


from tensorflow.keras.models import Sequential, Model, load_model
#from tensorflow.keras.applications.vgg16 import preprocess_input
from tensorflow.keras.applications.resnet50 import preprocess_input
#from tensorflow.keras.applications.densenet import preprocess_input
from tensorflow.keras.applications.efficientnet import preprocess_input
from tensorflow.keras.layers import Conv2D,Dense,Flatten,Dropout,MaxPooling2D, Activation, BatchNormalization, GlobalAveragePooling2D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import categorical_crossentropy
from tensorflow.keras.preprocessing import image
from tensorflow.keras.initializers import glorot_uniform
from tensorflow.keras.callbacks import Callback
from PIL import ImageFile
# Let PIL load image files whose data is truncated instead of raising an error.
ImageFile.LOAD_TRUNCATED_IMAGES = True
# Reset Keras' global state to release memory held by earlier models (guards against OOM).
K.clear_session()
#set dictionary for disease and its index
# Class name -> 1-based class index, in label-column order.
disease_class = dict(
    zip(
        [
            'Atelectasis',
            'Cardiomegaly',
            'Effusion',
            'Infiltration',
            'Mass',
            'Nodule',
            'Pneumonia',
            'Pneumothorax',
            'Consolidation',
            'Edema',
            'Emphysema',
            'Fibrosis',
            'Pleural_Thickening',
            'Hernia',
            'No Finding',
        ],
        range(1, 16),
    )
)

# Reverse lookup: class index -> class name.
disease_rev = {index: name for name, index in disease_class.items()}
# Per-class list of image names, filled later; each class gets its own list.
disease_img = {}
for name in disease_class:
    disease_img[name] = []
# Label table of the first dataset (one row per image).
data_ref = pd.read_csv("/media/ntu/volume1/home/s123md305_01/Documents/CXR8/Data_Entry_2017_v2020.csv")
pd.options.mode.chained_assignment = None        # silence pandas' SettingWithCopyWarning

# Collect, per class, the names of the images that carry exactly one label
# (multi-label rows contain "|" and are skipped).
for label, img_name in zip(tqdm(data_ref['Finding Labels']), data_ref['Image Index']):
    if "|" not in label:
        disease_img[label].append(img_name)
# Lookup table: image name -> label.
simp_data_ref = data_ref[["Image Index", "Finding Labels"]].set_index("Image Index")

# Label table of the second dataset (stored under Generated/).
data_ref_2 = pd.read_csv("/media/ntu/volume1/home/s123md305_01/Documents/Generated/reconstructed_labels.csv")

# Fresh, empty per-class name lists for the second dataset.
disease_img_2 = {disease: [] for disease in disease_class}

for label, img_name in zip(tqdm(data_ref_2['Finding Labels']), data_ref_2['Image Index']):
    if "|" not in label:
        disease_img_2[label].append(img_name)

# Lookup table for the second dataset: image name -> label.
simp_data_ref_2 = data_ref_2[["Image Index", "Finding Labels"]].set_index("Image Index")
from tensorflow.keras.preprocessing import image
from tqdm import tqdm
import numpy as np

# Assuming disease_img, disease_class, and simp_data_ref are predefined
from tensorflow.keras.preprocessing import image
from tqdm import tqdm
import numpy as np

# Assuming disease_img, disease_class, and simp_data_ref are predefined
# Build the image/label arrays for each requested real-image count:
# 10000, 20000, ..., 90000.
for number in range(10000, 92324, 10000):
    print(number)
    # Take the same leading share (number / 91324) of every class list so the
    # subset keeps the class proportions of the single-label images.
    img_names = []
    for class_names in disease_img.values():
        num = round(number / 91324 * len(class_names))
        img_names.extend(class_names[:num])

    # Fill a preallocated float32 array (Keras' default float type, which is
    # what img_to_array returns) instead of collecting a list and converting
    # it afterwards: this avoids holding every image twice and keeps the 4-D
    # shape even when no image is selected.
    X = np.empty((len(img_names), 112, 112, 3), dtype=np.float32)
    y = np.zeros(shape=(len(img_names), len(disease_class)))  # one-hot labels
    real_labels = simp_data_ref['Finding Labels']

    for i, img_name in enumerate(tqdm(img_names)):
        # target_size is (height, width); the channel count is not part of it.
        img = image.load_img('/media/ntu/volume1/home/s123md305_01/Documents/CombinedResized/Resized112/' + img_name, target_size=(112, 112))
        X[i] = image.img_to_array(img)
        # disease_class indices are 1-based, the label columns are 0-based.
        y[i, disease_class[real_labels[img_name]] - 1] = 1

    # Images of the second dataset: 1/30 of the real-image count.
    number2 = round(number / 30)
    print(number2, 'augmented')
    # NOTE(review): the share is again taken relative to 91324 — this assumes
    # the second dataset has the same size as the first; confirm.
    img_names_new = []
    for class_names in disease_img_2.values():
        num = round(number2 / 91324 * len(class_names))
        img_names_new.extend(class_names[:num])

    # With small shares a class (or every class) can round down to zero
    # images; the preallocated shape keeps the later concatenation valid.
    X_new = np.empty((len(img_names_new), 112, 112, 3), dtype=np.float32)
    y_new = np.zeros(shape=(len(img_names_new), len(disease_class)))
    generated_labels = simp_data_ref_2['Finding Labels']

    for i, img_name in enumerate(tqdm(img_names_new)):
        img_path = '/media/ntu/volume1/home/s123md305_01/Documents/Generated/ComGenerated112/' + img_name
        img = image.load_img(img_path, target_size=(112, 112))
        X_new[i] = image.img_to_array(img)
        y_new[i, disease_class[generated_labels[img_name]] - 1] = 1

    # Nominal second-dataset / first-dataset ratio, used to size the splits.
    ratio = number2 / number

    gc.collect()


    import numpy as np
    import os
    from tensorflow.keras.models import Sequential
    from tensorflow.keras.layers import GlobalAveragePooling2D, Dense
    from tensorflow.keras.optimizers import Adam
    from tensorflow.keras.applications import VGG16, ResNet50, DenseNet121
    #from tensorflow.keras.applications.vgg16 import preprocess_input
    from tensorflow.keras.applications.resnet50 import preprocess_input
    #from tensorflow.keras.applications.densenet import preprocess_input
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import roc_auc_score
    from tensorflow.keras.callbacks import EarlyStopping
    #from tensorflow.keras.mixed_precision import set_global_policy

    #set_global_policy('mixed_float16')
    # Shape of one input image and number of output classes.
    input_shape = (112, 112, 3)
    num_classes = 15

    # No tf.distribute strategy is used in this cell; the model is built
    # directly on the default device.

    def build_model(input_shape, num_classes):
        """Return a compiled classifier: frozen ImageNet ResNet50 plus a new softmax head.

        input_shape -- shape of one input image, passed to ResNet50.
        num_classes -- number of units of the softmax output layer.

        The model is compiled with Adam (learning rate 1e-4), the focal loss
        defined in this cell, and an AUC metric reported as 'auc'.
        """
        backbone = ResNet50(weights='imagenet', include_top=False, input_shape=input_shape)
        # Freeze every backbone layer so only the new head is trained.
        for frozen_layer in backbone.layers:
            frozen_layer.trainable = False

        head_pool = GlobalAveragePooling2D()
        # The output layer is pinned to float32, which matters when a
        # mixed-precision policy is active.
        head_out = Dense(num_classes, activation='softmax', kernel_initializer='glorot_uniform', dtype='float32')
        classifier = Sequential([backbone, head_pool, head_out])

        classifier.compile(
            optimizer=Adam(learning_rate=0.0001),
            loss=focal_loss(),
            metrics=[tf.keras.metrics.AUC(name='auc')],
        )
        return classifier
    # Function for Focal Loss
    #https://www.programmersought.com/article/60001511310/
    def focal_loss(alpha = 0.5, beta = 2.0):
        """Return a Keras-compatible focal loss function.

        The loss is computed element-wise on every (sample, class) entry as a
        binary focal loss and averaged over all entries.

        alpha -- weight of positive entries; negative entries get 1 - alpha.
        beta  -- exponent of the modulating factor (1 - p_t).
        """
        epsilon = 1.e-7

        def loss_fn2(y_true, y_pred):
            targets = tf.cast(y_true, tf.float32)
            # Keep probabilities away from 0 and 1 so the log stays finite.
            probs = tf.clip_by_value(y_pred, epsilon, 1. - epsilon)
            negatives = tf.ones_like(targets) - targets

            # p_t: probability the model assigns to the true state of each entry.
            p_t = tf.multiply(targets, probs) + tf.multiply(1 - targets, 1 - probs)
            class_weight = targets * alpha + negatives * (1 - alpha)
            cross_entropy = -tf.math.log(p_t)
            modulating = tf.pow(tf.subtract(1., p_t), beta)
            return tf.reduce_mean(tf.multiply(tf.multiply(modulating, cross_entropy), class_weight))

        return loss_fn2

    # Repeat split / train / evaluate n_runs times with a different split seed
    # each time, then report mean and standard deviation of the test AUC.
    n_runs=3
    runno=42  # base seed; run k uses random_state = runno + k
    auc_scores = []

    # The split fractions depend only on `ratio`, so compute them once.
    # firstdiv is the share of the real images held out for testing, and
    # seconddiv the share of (remaining real + generated) used for validation.
    # They are chosen so that test and validation each come to about 15% of
    # real + generated images combined, assuming len(X_new) is close to
    # ratio * len(X).
    firstdiv=(1+ratio)*(0.15)
    seconddiv=(firstdiv)/(1+ratio-firstdiv)

    for run in range(n_runs):
        tf.keras.backend.clear_session()
        print(firstdiv)
        print(seconddiv)

        # The test set is drawn from the real images only.
        X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=firstdiv, random_state=runno+run)
        X_test = preprocess_input(X_test)

        # Add the second dataset to the remaining real images, then split
        # the union into training and validation sets.
        X_train_new = np.concatenate((X_train_val, X_new), axis=0)
        y_train_new = np.concatenate((y_train_val, y_new), axis=0)
        del X_train_val  # already copied into X_train_new; free it early

        X_train, X_val, y_train, y_val = train_test_split(X_train_new, y_train_new, test_size=seconddiv, random_state=runno+run)
        del X_train_new
        X_train = preprocess_input(X_train)
        X_val = preprocess_input(X_val)

        # Build the input pipelines on the CPU.
        # NOTE(review): the training dataset is not shuffled between epochs
        # (only once, by train_test_split) — confirm this is intended.
        with tf.device("CPU"):
            train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))
            val_dataset = tf.data.Dataset.from_tensor_slices((X_val, y_val))
            test_dataset = tf.data.Dataset.from_tensor_slices((X_test, y_test))
            batch_size = 10
            train_dataset = train_dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)
            val_dataset = val_dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)
            test_dataset = test_dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)

        # The datasets hold the data now; drop the NumPy arrays (y_test is
        # still needed for scoring) to lower peak host memory during training.
        del X_train, X_val, X_test
        gc.collect()

        model = build_model(input_shape, num_classes)

        # Stop once val_auc has not improved for 3 epochs.
        # NOTE(review): restore_best_weights is left at its default (False),
        # so the weights evaluated below are those of the last epoch run,
        # not of the best val_auc epoch — confirm this is intended.
        early_stopping = EarlyStopping(monitor='val_auc', patience=3, mode='max', verbose=1)

        model.fit(
            train_dataset,
            epochs=20,
            validation_data=val_dataset,
            callbacks=[early_stopping],
            verbose=1
        )
        gc.collect()

        # test_dataset is not shuffled, so predictions line up with y_test.
        y_pred = model.predict(test_dataset)
        # Macro average of the per-class AUCs. The result is stored as
        # test_auc so the imported sklearn.metrics.auc is not overwritten.
        test_auc = roc_auc_score(y_test, y_pred, average = 'macro')

        auc_scores.append(test_auc)
        print(f"Run {run+1}/{n_runs}, Test AUC: {test_auc:.4f}")
        gc.collect()

    # Spread and mean of the test AUC over all runs.
    auc_std_dev = np.std(auc_scores)
    aauc=np.mean(auc_scores)
    print(f"Standard Deviation of AUC over {n_runs} runs: {auc_std_dev:.4f}")
    print("aauc=",aauc)
    gc.collect()

In [None]:
from tensorflow.python.util import deprecation
deprecation._PRINT_DEPRECATION_WARNINGS = False   #ignore FutureWarning

import tensorflow as tf
tf.__version__
import random
from tqdm import tqdm
import json
import os
import gc
gc.collect()

import pandas as pd
import numpy as np
import numpy.random as nr
from numpy.random import seed
import matplotlib.pyplot as plt
from glob import glob
from pathlib import Path
import tensorflow.keras as keras
from tensorflow.keras import backend as K

from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.model_selection import train_test_split
from tensorflow.keras.mixed_precision import set_global_policy

# Run Keras layers under the mixed_float16 precision policy
set_global_policy('mixed_float16')

gpus = tf.config.list_physical_devices('GPU')

# Turn on memory growth for every visible GPU; with no GPU the loop simply
# does not run
for gpu in gpus:
    try:
        tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as e:
        # Memory growth must be set before initializing GPUs
        print(f'Could not set memory growth for {gpu.name}: {e}')
    else:
        print(f'Memory growth enabled for {gpu.name}')


from tensorflow.keras.models import Sequential, Model, load_model
#from tensorflow.keras.applications.vgg16 import preprocess_input
from tensorflow.keras.applications.resnet50 import preprocess_input
#from tensorflow.keras.applications.densenet import preprocess_input
from tensorflow.keras.applications.efficientnet import preprocess_input
from tensorflow.keras.layers import Conv2D,Dense,Flatten,Dropout,MaxPooling2D, Activation, BatchNormalization, GlobalAveragePooling2D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import categorical_crossentropy
from tensorflow.keras.preprocessing import image
from tensorflow.keras.initializers import glorot_uniform
from tensorflow.keras.callbacks import Callback
from PIL import ImageFile
# Tell Pillow to load truncated image files rather than failing on them
ImageFile.LOAD_TRUNCATED_IMAGES = True
# Reset Keras' global state before anything is built in this cell, so memory
# from earlier sessions can be released (guards against OOM)
K.clear_session()
# Disease name -> 1-based class index, in the fixed order listed here
disease_class = {
    name: index
    for index, name in enumerate(
        (
            'Atelectasis',
            'Cardiomegaly',
            'Effusion',
            'Infiltration',
            'Mass',
            'Nodule',
            'Pneumonia',
            'Pneumothorax',
            'Consolidation',
            'Edema',
            'Emphysema',
            'Fibrosis',
            'Pleural_Thickening',
            'Hernia',
            'No Finding',
        ),
        start=1,
    )
}

# Reverse lookup: class index -> disease name
disease_rev = dict(zip(disease_class.values(), disease_class.keys()))

# One independent, initially empty list of image names per disease
disease_img = {name: [] for name in disease_class}
def _bucket_single_label_images(frame, buckets):
    """Append every single-label image name in ``frame`` to ``buckets[label]``.

    Args:
        frame: DataFrame with 'Image Index' and 'Finding Labels' columns.
        buckets: dict mapping a finding label to a list of image names;
            it is extended in place.

    Rows whose 'Finding Labels' contains "|" (more than one finding) are
    skipped. A single label that is not a key of ``buckets`` raises KeyError.
    """
    # Walk both columns once instead of doing a chained df[col][i] lookup per row
    rows = zip(frame['Image Index'], frame['Finding Labels'])
    for image_name, label in tqdm(rows, total=len(frame)):
        if "|" not in label:
            buckets[label].append(image_name)


#import labels of the images
data_ref = pd.read_csv("/media/ntu/volume1/home/s123md305_01/Documents/CXR8/Data_Entry_2017_v2020.csv")
pd.options.mode.chained_assignment = None        #ignore the SettingWithCopyWarning

_bucket_single_label_images(data_ref, disease_img)
# Image name -> finding label lookup table for the first dataset
simp_data_ref = data_ref[["Image Index", "Finding Labels"]].set_index("Image Index")

data_ref_2 = pd.read_csv("/media/ntu/volume1/home/s123md305_01/Documents/Generated/reconstructed_labels.csv")  # Update the path to the second dataset

# Same bucketing for the second dataset, into its own dictionary
disease_img_2 = {disease: [] for disease in disease_class}
_bucket_single_label_images(data_ref_2, disease_img_2)

# Image name -> finding label lookup table for the second dataset
simp_data_ref_2 = data_ref_2[["Image Index", "Finding Labels"]].set_index("Image Index")
from tensorflow.keras.preprocessing import image
from tqdm import tqdm
import numpy as np

# Assuming disease_img, disease_class, and simp_data_ref are predefined
from tensorflow.keras.preprocessing import image
from tqdm import tqdm
import numpy as np

# Assuming disease_img, disease_class, and simp_data_ref are predefined
# Denominator used to turn a requested sample size into a per-disease fraction.
# NOTE(review): presumably the total number of single-label images -- confirm.
_SAMPLE_DENOMINATOR = 91324


def _pick_image_names(groups, number):
    """Return the first ``round(number / 91324 * len(names))`` names per group.

    Args:
        groups: dict mapping a disease to its list of image names.
        number: requested overall sample size.

    Returns:
        A flat list of image names, groups visited in dict order.
    """
    picked = []
    for names in groups.values():
        count = round(number / _SAMPLE_DENOMINATOR * len(names))
        picked.extend(names[:count])
    return picked


def _load_images_and_labels(names, image_dir, label_lookup):
    """Load the named images and one-hot encode their finding label.

    Args:
        names: image file names, appended directly to ``image_dir``.
        image_dir: directory prefix, including the trailing slash.
        label_lookup: Series mapping image name -> finding label.

    Returns:
        ``(images, labels)``: ``images`` has shape ``(len(names), 224, 224, 3)``
        and ``labels`` has shape ``(len(names), len(disease_class))``. A label
        that is not a key of ``disease_class`` leaves its row all zeros.
    """
    # Fill one preallocated array instead of keeping a list of per-image
    # arrays alive next to the stacked copy; this also keeps the 4-D shape
    # when ``names`` is empty, so later concatenation still works.
    images = np.empty((len(names), 224, 224, 3), dtype=K.floatx())
    labels = np.zeros(shape=(len(names), len(disease_class)))
    for row, name in enumerate(tqdm(names)):
        # load_img takes target_size as (height, width)
        img = image.load_img(image_dir + name, target_size=(224, 224))
        images[row] = image.img_to_array(img)
        class_id = disease_class.get(label_lookup[name])
        if class_id is not None:
            labels[row, class_id - 1] = 1  # class ids are 1-based
    return images, labels


# range(60000, 70000, 10000) currently yields the single value 60000
for number in range(60000, 70000, 10000):
    print(number)
    img_names = _pick_image_names(disease_img, number)
    X, y = _load_images_and_labels(
        img_names,
        '/media/ntu/volume1/home/s123md305_01/Documents/CombinedResized/Resized224/',
        simp_data_ref['Finding Labels'],
    )

    # The second dataset is sampled at 1/30 of the first dataset's size
    number2=round(number/30)
    print(number2, 'augmented')
    img_names_new = _pick_image_names(disease_img_2, number2)
    X_new, y_new = _load_images_and_labels(
        img_names_new,
        '/media/ntu/volume1/home/s123md305_01/Documents/Generated/ComGenerated224/',
        simp_data_ref_2['Finding Labels'],
    )
    ratio=number2/number


    gc.collect()


    import numpy as np
    import os
    from tensorflow.keras.models import Sequential
    from tensorflow.keras.layers import GlobalAveragePooling2D, Dense
    from tensorflow.keras.optimizers import Adam
    from tensorflow.keras.applications import VGG16, ResNet50, DenseNet121
    #from tensorflow.keras.applications.vgg16 import preprocess_input
    from tensorflow.keras.applications.resnet50 import preprocess_input
    #from tensorflow.keras.applications.densenet import preprocess_input
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import roc_auc_score
    from tensorflow.keras.callbacks import EarlyStopping
    #from tensorflow.keras.mixed_precision import set_global_policy

    #set_global_policy('mixed_float16')
    # Matches the 224x224 size the images are loaded at earlier in this loop,
    # with three channels
    input_shape = (224, 224, 3)
    # One output per entry in disease_class (15 names, 'No Finding' included)
    num_classes = 15

    #strategy = tf.distribute.MirroredStrategy()
    #print('Number of devices: {}'.format(strategy.num_replicas_in_sync))

    # Open a strategy scope and create your model, compile it, and train it inside this scope

  # Your model creation and compilation

    # Build, compile, and train your model within the strategy scope

    # Function to define and compile the model

    def build_model(input_shape, num_classes):
        """Return a compiled classifier on top of a frozen ResNet50.

        The ImageNet-weighted ResNet50 (without its top) has every layer set
        to non-trainable; global average pooling and one softmax Dense layer
        with ``num_classes`` units follow it. The model is compiled with
        Adam (learning rate 0.0001), ``focal_loss()`` and an AUC metric
        named 'auc'.
        """
        backbone = ResNet50(weights='imagenet', include_top=False, input_shape=input_shape)
        for frozen_layer in backbone.layers:
            frozen_layer.trainable = False

        model = Sequential()
        model.add(backbone)
        model.add(GlobalAveragePooling2D())
        # The output layer is pinned to float32 explicitly
        model.add(Dense(num_classes, activation='softmax', kernel_initializer='glorot_uniform', dtype='float32'))

        model.compile(
            optimizer=Adam(learning_rate=0.0001),
            loss=focal_loss(),
            metrics=[tf.keras.metrics.AUC(name='auc')],
        )
        return model
    # Function for Focal Loss
    #https://www.programmersought.com/article/60001511310/
    def focal_loss(alpha = 0.5, beta = 2.0):
        """Build an element-wise focal loss function.

        Every entry of the prediction is scored as its own binary outcome:
        p_t is the predicted value where the target is 1 and one minus it
        where the target is 0. The cross-entropy -log(p_t) is scaled by
        (1 - p_t) ** beta, then by ``alpha`` where the target is 1 and by
        ``1 - alpha`` where it is 0.

        Returns:
            A function ``(y_true, y_pred) -> scalar`` giving the mean of the
            weighted loss over all entries.
        """
        epsilon = 1.e-7

        def loss_fn2(y_true, y_pred):
            targets = tf.cast(y_true, tf.float32)
            # Keep predictions strictly inside (0, 1) so the log stays finite
            probs = tf.clip_by_value(y_pred, epsilon, 1. - epsilon)

            p_t = targets * probs + (1 - targets) * (1 - probs)
            class_weight = targets * alpha + (tf.ones_like(targets) - targets) * (1 - alpha)
            modulator = tf.pow(1. - p_t, beta)
            cross_entropy = -tf.math.log(p_t)
            return tf.reduce_mean(modulator * cross_entropy * class_weight)

        return loss_fn2

    # Train and evaluate the model n_runs times with different split seeds,
    # then report the mean and standard deviation of the test AUC.
    n_runs=3
    runno=42  # NOTE(review): not read anywhere in this cell; kept in case other cells use it
    auc_scores = []
    batch_size = 10  # You can adjust this according to your specific requirements

    # The split fractions depend only on ratio, so compute them once.
    # firstdiv is the share of X held out for testing: 15% scaled by
    # (1 + ratio). seconddiv then takes a validation share out of what is
    # left plus X_new. NOTE(review): the resulting sizes assume X_new holds
    # about ratio * len(X) images -- confirm.
    firstdiv=(1+ratio)*(0.15)
    seconddiv=(firstdiv)/(1+ratio-firstdiv)

    for run in range(n_runs):
        tf.keras.backend.clear_session()
        print(firstdiv)
        print(seconddiv)
        split_seed = 42 + run

        # The test set is drawn from X only; X_new is added afterwards, so
        # none of its images are evaluated on
        X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=firstdiv, random_state=split_seed)
        X_test = preprocess_input(X_test)

        X_train_new = np.concatenate((X_train_val, X_new), axis=0)
        y_train_new = np.concatenate((y_train_val, y_new), axis=0)

        X_train, X_val, y_train, y_val = train_test_split(X_train_new, y_train_new, test_size=seconddiv, random_state=split_seed)
        X_train = preprocess_input(X_train)
        X_val = preprocess_input(X_val)

        # Wrap the numpy arrays in batched, prefetched tf.data pipelines
        with tf.device("CPU"):
            train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train)).batch(batch_size).prefetch(tf.data.AUTOTUNE)
            val_dataset = tf.data.Dataset.from_tensor_slices((X_val, y_val)).batch(batch_size).prefetch(tf.data.AUTOTUNE)
            test_dataset = tf.data.Dataset.from_tensor_slices((X_test, y_test)).batch(batch_size).prefetch(tf.data.AUTOTUNE)

        gc.collect()

        model = build_model(input_shape, num_classes)

        # Stop once val_auc has not improved for 3 consecutive epochs
        early_stopping = EarlyStopping(monitor='val_auc', patience=3, mode='max', verbose=1)

        model.fit(
            train_dataset,
            epochs=20,
            validation_data=val_dataset,
            callbacks=[early_stopping],
            verbose=1  # Set to 0 to reduce log messages
        )
        gc.collect()

        # test_dataset is not shuffled, so predictions line up with y_test
        y_pred = model.predict(test_dataset)
        # y_test is one-hot, which scikit-learn scores label by label and
        # averages; say so explicitly rather than passing multi_class.
        # The result is stored as test_auc so that sklearn.metrics.auc,
        # imported at the top of this cell, is not overwritten.
        test_auc = roc_auc_score(y_test, y_pred, average='macro')

        auc_scores.append(test_auc)
        print(f"Run {run+1}/{n_runs}, Test AUC: {test_auc:.4f}")
        gc.collect()

    # Spread and mean of the test AUC across runs
    auc_std_dev = np.std(auc_scores)
    aauc=np.mean(auc_scores)
    print(f"Standard Deviation of AUC over {n_runs} runs: {auc_std_dev:.4f}")
    print("aauc=",aauc)
    gc.collect()