In [1]:
# %%
# Import the mixed precision API
from tensorflow.keras import mixed_precision

# Set the policy to mixed_float16 (use less GPU memory)
mixed_precision.set_global_policy('mixed_float16')

import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.layers import (
    Dense,
    Flatten,
    Softmax
)
from tensorflow.keras.models import Model
import os
import tensorflow as tf
from IPython.display import display, Javascript

import sys
import json
import datetime

from tqdm import tqdm
import numpy as np
from tensorflow import keras
from typing import List, Dict, Any
from random import random
# import pydot
# from sklearn.model_selection import KFold, StratifiedKFold

# %%
DIR = "checkpoints"
# if not os.path.exists(DIR):
#     os.mkdir(DIR)

# %% [markdown]
# ## Build EfficientNet Model
#

# %% [markdown]
# ## Load Dataset

# %%
dataset_path = os.path.join("..", "..", "..", "..", "Volumes", "Siren", "autogleasonprostate", "train_test_split_041724.json")
with open(dataset_path, 'r') as f:
    dataset = json.load(f)

# %%
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

# %%
target_names = dataset.get('target_names')
# root_dir = dataset.get('path')
X_train = dataset.get('X_train')
X_test = dataset.get("X_test")
y_train = np.array(dataset.get("y_train"))
y_test = np.array(dataset.get("y_test"))

# correct all file paths
base_path = os.path.join("..", "..", "..", "..", "Volumes", "Siren", "autogleasonprostate", "PreprocessedData")
for i in range(len(X_train)):
    # Split the string by '/'
    parts = X_train[i].split('/')
    X_file = '/'.join(parts[-3:])
    X_train[i] = os.path.join(base_path, X_file)

for i in range(len(X_test)):
    parts = X_test[i].split('/')
    X_file = '/'.join(parts[-3:])
    X_test[i] = os.path.join(base_path, X_file)


X_train, y_train = shuffle(X_train, y_train)
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=1/3, stratify=y_test)


# %%

class DatasetGenerator(keras.utils.Sequence):
  
    def __init__(self, image_filenames: List[str], labels: np.array, batch_size: int, augment: bool = False) :
        self.image_filenames = image_filenames
        self.labels = labels
        self.batch_size = batch_size
        if augment:
            _filenames, _labels = [], []
            self.add_augmented_data()
            for i in range(len(self.image_filenames)):
                _l = self.labels[i]
                _fn = self.image_filenames[i]
                if not isinstance(_fn, tuple):
                    if _l == 0 and random() > .10:
                        continue
                    elif _l == 1 and random() > .75:
                        continue
                _filenames.append(_fn)
                _labels.append(_l)

            self.image_filenames = _filenames
            self.labels = _labels

    def add_augmented_data(self):
        self.labels = self.labels.tolist()
        print("Augmenting data!")
        for i in range(len(self.labels)):
            img, label = self.image_filenames[i], self.labels[i]
            if label != 0 and label != 1:
                for i in range(label):
                    self.labels.append(label)
                    self.image_filenames.append((img, True))

        print("Done appending to training list!")
        self.labels = np.array(self.labels)
        # undersample normal tissue

    def __len__(self):
        return (np.ceil(len(self.image_filenames) / float(self.batch_size))).astype(np.int32)

    def save(self, name: str):
        with open(name, 'w') as f:
            json.dump({"X": self.image_filenames, "y": self.labels}, f)

    def augment(self, img: np.array):
        # turn this into a sequence, with scaling flip and rotation
        data_augmentation = tf.keras.Sequential([
          tf.keras.layers.RandomFlip("horizontal_and_vertical"),
          tf.keras.layers.RandomRotation(1),
        ])
        return data_augmentation(img)


    def __getitem__(self, idx):
        batch_x = self.image_filenames[idx * self.batch_size : (idx+1) * self.batch_size]
        batch_y = self.labels[idx * self.batch_size : (idx+1) * self.batch_size]

        imgs = []
        for img in batch_x:
            if isinstance(img, tuple):
                _img = self.augment(np.load(img[0]))
            else:
                _img = np.load(img)
            imgs.append(_img)
        return np.array(imgs) / 255.0, np.array(batch_y)


# %%
class BatchLogs(tf.keras.callbacks.Callback):

    def __init__(self, name: str):
        self.name = name
        self.epoch = 0
        self.results = []

    def on_epoch_begin(self, epoch, logs=None):
        self.epoch = epoch

    def on_epoch_end(self, epoch, logs=None):
        with open(self.name, 'w') as f:
            json.dump(self.results, f)

    def on_train_batch_end(self, batch, logs=None):
        _dict = {"epoch": self.epoch}
        _dict.update(logs)
        self.results.append(_dict)


# %%
sys.argv[-1]

# %%
EPOCHS = 20
BATCH_SIZE = 16
VERBOSITY = 1
LEARNING_RATE = 0.001
MOMENTUM=.9
model_params = {
    "epochs": EPOCHS,
    "batch_size": BATCH_SIZE,
    "learning_rate": LEARNING_RATE,
    "momentum": MOMENTUM,
}

# %%
train_gen = DatasetGenerator(X_train, y_train, batch_size=BATCH_SIZE, augment=True)
val_gen = DatasetGenerator(X_val, y_val, batch_size=BATCH_SIZE)
test_gen = DatasetGenerator(X_test, y_test, batch_size=BATCH_SIZE)

# %%
# https://www.tensorflow.org/api_docs/python/tf/keras/callbacks/ModelCheckpoint
#PARAM_PATH = sys.argv[-1] + '-'.join([str(v) for v in model_params.values()])
CHECKPOINT_PATH = f"./checkpoints/checkpoint.weights.keras" #/{PARAM_PATH}-checkpoint.weights.keras"
INPUT_DIMS = [500, 500, 3]
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    CHECKPOINT_PATH,
    monitor='val_loss',
    verbose=0,
    save_best_only=True,
    save_weights_only=False,
    mode = 'auto',
    save_freq='epoch',
    initial_value_threshold=None,
)
batcher = BatchLogs(f"./{DIR}/fixed-batches.json")
csv_logger = tf.keras.callbacks.CSVLogger(f'./{DIR}/fixed-training.log')
CALLBACKS = [checkpoint_callback, csv_logger, batcher]

tf.keras.config.disable_interactive_logging()
# tf.debugging.set_log_device_placement(True)
gpus = tf.config.list_logical_devices('GPU')
strategy = tf.distribute.MirroredStrategy(gpus)

with strategy.scope():
    efnet = tf.keras.applications.EfficientNetV2L(
        include_top=False,
        weights=None,
        input_shape=INPUT_DIMS,
        pooling=None,
        include_preprocessing=False
    )
    print("Here 1")
    # https://dontrepeatyourself.org/post/transfer-learning-and-fine-tuning-with-keras-tensorflow-and-python/
    model = tf.keras.Sequential([
        tf.keras.Input(shape=INPUT_DIMS),
        efnet,
        layers.Flatten(),
        layers.Dense(len(target_names), activation='softmax'),
    ])
    model.compile(
        optimizer=tf.keras.optimizers.SGD(
            learning_rate=LEARNING_RATE,
            momentum=MOMENTUM,
            ema_momentum=0.99,
            name='SGD',
        ),
        loss=tf.keras.losses.SparseCategoricalCrossentropy(), 
        metrics=['accuracy'],
    )

print("Here 2")

# %%
from tensorflow.keras.utils import plot_model

plot_model(model, to_file="./model.png", show_shapes=True, rankdir="LR")

# import visualkeras
# visualkeras.layered_view(model, to_file='./model.png', legend=True)


# %%
model.summary(expand_nested=True, show_trainable=True)

# %%
hist = model.fit(x=train_gen, validation_data=val_gen, batch_size=BATCH_SIZE, epochs=EPOCHS, callbacks=CALLBACKS, verbose=VERBOSITY)

print("Here 3")
# https://medium.com/the-owl/k-fold-cross-validation-in-keras-3ec4a3a00538
# TODO

# %%
loss, accuracy = model.evaluate(test_gen)
with open(f'./{DIR}/evaluation.json', 'w') as f:
    print(f'loss: {loss}, accuracy {accuracy}')
    json.dump({ "loss": loss, "accuracy": accuracy, "params": model_params}, f)
train_gen.save(f"./{DIR}/training_data.json")

2024-07-01 16:01:04.714513: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-07-01 16:01:04.747968: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Augmenting data!
Done appending to training list!
INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0',)


2024-07-01 16:01:08.334100: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1928] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 43194 MB memory:  -> device: 0, name: NVIDIA L40S, pci bus id: 0000:41:00.0, compute capability: 8.9


Here 1
Here 2


2024-07-01 16:02:17.647829: E tensorflow/core/util/util.cc:131] oneDNN supports DT_HALF only on platforms with AVX-512. Falling back to the default Eigen-based implementation if present.
  self._warn_if_super_not_called()
2024-07-01 16:02:22.764501: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:465] Loaded cuDNN version 8907


KeyboardInterrupt: 

In [4]:
# Open the .log file
with open('./checkpoints/fixed-training.log', 'r') as file:
    # Read the contents of the file
    content = file.read()

# Print the content
print(content)

<class 'str'>


In [1]:
import tensorflow as tf
print(tf.__version__)
print(tf.config.list_physical_devices('GPU'))

2024-06-28 19:18:20.524351: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-06-28 19:18:20.557453: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


2.16.1
[]


2024-06-28 19:18:21.697784: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2251] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


In [2]:
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  0


In [3]:
!nvidia-smi

Fri Jun 28 19:19:36 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA L40S                    On  |   00000000:41:00.0 Off |                    0 |
| N/A   49C    P0            104W /  350W |    9170MiB /  46068MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [4]:
import tensorflow as tf
print(tf.__version__)
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

2.16.1
Num GPUs Available:  0


In [5]:
from tensorflow.python.platform import build_info as tf_build_info
print(tf_build_info.cuda_version_number)
print(tf_build_info.cudnn_version_number)

AttributeError: module 'tensorflow.python.platform.build_info' has no attribute 'cuda_version_number'

In [7]:
# Set logging level to display all messages
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '0'

import tensorflow as tf

# This will print detailed log messages, including CUDA version
print("TensorFlow version:", tf.__version__)
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

TensorFlow version: 2.16.1
Num GPUs Available:  0
