### OCI Data Science - Useful Tips
<details>
<summary><font size="2">Check for Public Internet Access</font></summary>

```python
import requests
response = requests.get("https://oracle.com")
assert response.status_code==200, "Internet connection failed"
```
</details>
<details>
<summary><font size="2">Helpful Documentation </font></summary>
<ul><li><a href="https://docs.cloud.oracle.com/en-us/iaas/data-science/using/data-science.htm">Data Science Service Documentation</a></li>
<li><a href="https://docs.cloud.oracle.com/iaas/tools/ads-sdk/latest/index.html">ADS documentation</a></li>
</ul>
</details>
<details>
<summary><font size="2">Typical Cell Imports and Settings for ADS</font></summary>

```python
%load_ext autoreload
%autoreload 2
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

import logging
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.ERROR)

import ads
from ads.dataset.factory import DatasetFactory
from ads.automl.provider import OracleAutoMLProvider
from ads.automl.driver import AutoML
from ads.evaluations.evaluator import ADSEvaluator
from ads.common.data import ADSData
from ads.explanations.explainer import ADSExplainer
from ads.explanations.mlx_global_explainer import MLXGlobalExplainer
from ads.explanations.mlx_local_explainer import MLXLocalExplainer
from ads.catalog.model import ModelCatalog
from ads.common.model_artifact import ModelArtifact
```
</details>
<details>
<summary><font size="2">Useful Environment Variables</font></summary>

```python
import os
print(os.environ["NB_SESSION_COMPARTMENT_OCID"])
print(os.environ["PROJECT_OCID"])
print(os.environ["USER_OCID"])
print(os.environ["TENANCY_OCID"])
print(os.environ["NB_REGION"])
```
</details>

In [None]:
# Poll GPUs 0 and 1 every 1000 ms and write bus id, power draw, GPU utilization
# and used memory as CSV (without units) to ./gpu_stats/tf32_2GPUs.csv.
# The command loops until interrupted; the ./gpu_stats directory must already
# exist because the shell redirection does not create it.
# NOTE(review): this is a shell command, not Python — presumably it is run from
# a terminal or with a leading `!` in the notebook; confirm.
nvidia-smi -i 0,1 --query-gpu=gpu_bus_id,power.draw,utilization.gpu,memory.used --format=csv,nounits --loop-ms=1000 > ./gpu_stats/tf32_2GPUs.csv

In [84]:
import tensorflow as tf
# Print the installed TensorFlow version so the environment is recorded
# alongside the measurements in this notebook.
print(tf.__version__)

2.8.1


Test:
- Baseline (tf-32)
- Precision
    - tf32 (default)
    - float32
    - mixed
    - bfloat
- Batch size
    - 64 (default)
    - 96 (from paper)
    - 128
    - max (power of 2)
    - max (non power of 2) 
    - Test with tf32 and fp32
- different number of GPUs
    - 1
    - 2
    - 4
    - 8
    - 16 (2 nodes)
- different GPUs (repeat tests)
    - A100
    - V100
    - A10
    - best of all

# Total model flops

In [4]:
import tensorflow as tf
import numpy as np
from networks2 import generator

def get_flops(model, model_inputs) -> float:
    """
    Estimate the inference cost of a Keras model for a single sample.

    The model is traced with batch size 1, frozen into a constant graph and
    passed to ``tf.compat.v1.profiler``, which counts float operations.

    Args:
        model: A ``tf.keras.Model`` or ``tf.keras.Sequential`` instance.
        model_inputs: Iterable of example inputs, one per model input. Only
            ``shape`` and ``dtype`` of each entry are read; the first
            dimension is dropped and replaced by a batch dimension of 1.

    Returns:
        Half of the profiler's ``total_float_ops``, as a raw count. The value
        is NOT scaled to GFLOPs; divide by 1e9 if GFLOPs are wanted.
        NOTE(review): the halving presumably turns FLOPs into
        multiply-accumulate operations — confirm this is the intended unit.

    Raises:
        ValueError: If ``model`` is neither a ``tf.keras.Model`` nor a
            ``tf.keras.Sequential``.
    """
    if not isinstance(
        model, (tf.keras.models.Sequential, tf.keras.models.Model)
    ):
        raise ValueError(
            "Calculating FLOPS is only supported for "
            "`tf.keras.Model` and `tf.keras.Sequential` instances."
        )

    # Imported locally: this is a private TensorFlow module only needed here.
    from tensorflow.python.framework.convert_to_constants import (
        convert_variables_to_constants_v2_as_graph,
    )

    # Compute FLOPs for one sample: keep every dimension after the first and
    # prepend a batch dimension of 1.
    batch_size = 1
    inputs = [
        tf.TensorSpec([batch_size] + inp.shape[1:], inp.dtype)
        for inp in model_inputs
    ]

    # Freeze the traced model so the profiler sees the operations that are
    # actually executed at inference time.
    real_model = tf.function(model).get_concrete_function(inputs)
    frozen_func, _ = convert_variables_to_constants_v2_as_graph(real_model)

    # Count float operations only, and suppress the profiler's printed report.
    run_meta = tf.compat.v1.RunMetadata()
    opts = (
        tf.compat.v1.profiler.ProfileOptionBuilder(
            tf.compat.v1.profiler.ProfileOptionBuilder.float_operation()
        )
        .with_empty_output()
        .build()
    )

    flops = tf.compat.v1.profiler.profile(
        graph=frozen_func.graph, run_meta=run_meta, cmd="scope", options=opts
    )

    # Clear the global default graph so repeated calls start from a clean one.
    tf.compat.v1.reset_default_graph()

    # Raw count halved; intentionally not divided by 1e9 (see docstring).
    return flops.total_float_ops / 2
    
def forward_backward():
    """
    Profile the float operations of a forward pass plus its gradient ops.

    A model is built inside a TF1-style session graph, gradient ops of the
    output with respect to the trainable weights are added to that graph, and
    the whole graph is handed to ``tf.compat.v1.profiler``.

    Returns:
        Tuple ``(for_flop, total_flop)``. ``for_flop`` is always 0 because a
        forward-only count is never computed here; ``total_flop`` is the
        profiler's ``total_float_ops`` for the session graph (also printed).
    """
    for_flop = 0  # never updated below; returned only to keep the interface
    total_flop = 0

    # Using the session as a context manager installs it (and its graph) as
    # the default and closes it on exit, so its resources are released even
    # if model construction or profiling raises.
    with tf.compat.v1.Session() as session:
        graph = session.graph

        #model = tf.keras.applications.ResNet50() # change your model here

        # NOTE(review): `generator_model` is not defined or imported in the
        # visible code (only `generator` is imported from networks2) —
        # calling this function as-is presumably raises NameError; confirm.
        model = generator_model(256, dformat="channels_first")

        x = tf.constant(np.random.randn(1, 256))

        output_tensor = model([x])
        # The returned gradient tensors are not needed; the call is made so
        # that the gradient ops are part of the graph that gets profiled.
        tf.gradients(output_tensor, model.trainable_weights)

        run_meta = tf.compat.v1.RunMetadata()
        opts = tf.compat.v1.profiler.ProfileOptionBuilder.float_operation()

        # We use the session graph in the call to the profiler.
        flops = tf.compat.v1.profiler.profile(graph=graph,
                                              run_meta=run_meta, cmd='op', options=opts)

        total_flop = flops.total_float_ops
        print(total_flop)

    return for_flop, total_flop
    
    
    
#Usage

# Entry point: build the generator and print its float-op count as reported
# by get_flops. The commented-out lines are alternative models/inputs kept
# from earlier experiments.
if __name__ =="__main__":
    #image_model = tf.keras.applications.EfficientNetB0(include_top=False, weights=None)
    
    #x = tf.constant(np.random.randn(1, 256))
    # Example inputs for the generator: two random tensors of shape (1, 2048)
    # and (1, 6144), plus a scalar integer constant 0 passed as `lod_in`.
    noise = tf.constant(np.random.randn(1, 2048))
    latent_noise = tf.constant(np.random.randn(1, 6144))
    lod_in = tf.constant(0)
    #y = tf.constant(np.random.randn(1, 1, 51 , 51, 25))
    
    #print(x.shape)
    
    model_g = generator(num_channels=1) #Model(inputs=[latent], outputs=[fake_image], name='Generator')
    #model_d = discriminator_model(dformat="channels_first")
    #model.summary()
    # Only the shape and dtype of the example inputs are used by get_flops.
    print('Generator FLOPS = ', get_flops(model_g,[noise, latent_noise, lod_in]))
    #print('Discriminator FLOPS = ', get_flops(model_d,[y]))
    
    #forward_backward()
    
    #print(get_flops(model, [x]))

Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_7 (InputLayer)           [(None, 2048)]       0           []                               
                                                                                                  
 input_8 (InputLayer)           [(None, 6144)]       0           []                               
                                                                                                  
 tf.concat_1 (TFOpLambda)       (None, 8192)         0           ['input_7[0][0]',                
                                                                  'input_8[0][0]']                
                                                                                                  
 pixel_norm_layer_13 (Pixel_nor  (None, 8192)        0           ['tf.concat_1[1][0]']      

2023-04-24 07:58:37.316125: I tensorflow/core/grappler/devices.cc:66] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 8
2023-04-24 07:58:37.316296: I tensorflow/core/grappler/clusters/single_machine.cc:358] Starting new session
2023-04-24 07:58:37.367878: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 38214 MB memory:  -> device: 0, name: NVIDIA A100-SXM4-40GB, pci bus id: 0000:0f:00.0, compute capability: 8.0
2023-04-24 07:58:37.369417: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 38214 MB memory:  -> device: 1, name: NVIDIA A100-SXM4-40GB, pci bus id: 0000:15:00.0, compute capability: 8.0
2023-04-24 07:58:37.370950: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:2 with 38214 MB memory:  -> device: 2, name: NVIDIA A100-SXM4-40GB, pci bus id: 0000:

Instructions for updating:
Use `tf.compat.v1.graph_util.tensor_shape_from_node_def_name`
Generator FLOPS =  20894345882.5


# Theoretical calculation of FLOPs

$\text{ConvFLOPs} = 2 \times \text{NumberKernel} \times \text{ShapeKernel} \times \text{OutputShape}$

## Run 

In [None]:
# Full progressive run on all visible GPUs: resolution grows from 32 to 256
# with profiling switched off.
import tensorflow as tf
import os
import sys
from train import train_cycle

print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))
train_cycle(
    datapath='/home/datascience/',
    outpath='./',
    init_res=32,
    max_res=256,
    change_model=True,
    minibatch_base=32,
    batch_size=32,
    profiling=False,
    lod_training_kimg=3000,
    lod_transition_kimg=2000,
    total_kimg=20000,
)

Num GPUs Available:  8
/home/datascience/floods/
/home/datascience/floods/
INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1', '/job:localhost/replica:0/task:0/device:GPU:2', '/job:localhost/replica:0/task:0/device:GPU:3', '/job:localhost/replica:0/task:0/device:GPU:4', '/job:localhost/replica:0/task:0/device:GPU:5', '/job:localhost/replica:0/task:0/device:GPU:6', '/job:localhost/replica:0/task:0/device:GPU:7')
Number of devices: 8
['/home/datascience/floods/floods-r02.tfrecords', '/home/datascience/floods/floods-r03.tfrecords', '/home/datascience/floods/floods-r04.tfrecords', '/home/datascience/floods/floods-r05.tfrecords', '/home/datascience/floods/floods-r06.tfrecords', '/home/datascience/floods/floods-r07.tfrecords', '/home/datascience/floods/floods-r08.tfrecords']
[(1, 4, 4), (1, 8, 8), (1, 16, 16), (1, 32, 32), (1, 64, 64), (1, 128, 128), (1, 256, 256)]
[6, 5, 4, 3, 2, 1, 0]
{6: 1024,

# Baseline

In [1]:
# Baseline: one GPU, fixed 256x256 resolution, batch size 32, profiling on.
# `use_tf32` is not passed, so train_cycle's own default applies.
import os
import sys

# Restrict the process to GPU 0. This is set before TensorFlow and the
# training module are imported so the device mask is already in place when
# anything first initializes CUDA.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

import tensorflow as tf
from train import train_cycle

print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))
train_cycle(
    datapath='/home/datascience/',
    outpath='./',
    init_res=256,
    max_res=256,
    change_model=True,
    minibatch_base=32,
    batch_size=32,
    profiling=True,
)

Num GPUs Available:  1
/home/datascience/floods/
/home/datascience/floods/
True
32
INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0',)
Number of devices: 1


2023-05-03 10:20:28.202451: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-05-03 10:20:28.741246: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 38214 MB memory:  -> device: 0, name: NVIDIA A100-SXM4-40GB, pci bus id: 0000:0f:00.0, compute capability: 8.0


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Redu

2023-05-03 10:21:01.177067: I tensorflow/stream_executor/cuda/cuda_dnn.cc:368] Loaded cuDNN version 8401
2023-05-03 10:21:02.641947: W tensorflow/stream_executor/gpu/asm_compiler.cc:80] Couldn't get ptxas version string: INTERNAL: Running ptxas --version returned 32512
2023-05-03 10:21:02.906587: W tensorflow/stream_executor/gpu/redzone_allocator.cc:314] INTERNAL: ptxas exited with non-zero error code 32512, output: 
Relying on driver to perform ptx compilation. 
Modify $PATH to customize ptxas location.
This message will be only logged once.
2023-05-03 10:21:05.359477: I tensorflow/stream_executor/cuda/cuda_blas.cc:1786] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.


Instructions for updating:
Use `tf.compat.v1.graph_util.tensor_shape_from_node_def_name`
FLOP =  27348714298436
Average per batch was:  5.498366117477417


# Precision

## Float32

In [1]:
# Precision test: one GPU, 256x256, batch size 32, with use_tf32=False.
import os
import sys

# Restrict the process to GPU 0. This is set before TensorFlow and the
# training module are imported so the device mask is already in place when
# anything first initializes CUDA.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

import tensorflow as tf
from train import train_cycle

print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))
train_cycle(
    datapath='/home/datascience/',
    outpath='./',
    init_res=256,
    max_res=256,
    change_model=True,
    minibatch_base=32,
    batch_size=32,
    profiling=True,
    use_tf32=False,
)

Num GPUs Available:  1
/home/datascience/floods/
/home/datascience/floods/
False
32
INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0',)
Number of devices: 1


2023-04-25 22:11:53.754614: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-04-25 22:11:54.419532: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 38214 MB memory:  -> device: 0, name: NVIDIA A100-SXM4-40GB, pci bus id: 0000:0f:00.0, compute capability: 8.0


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Redu

2023-04-25 22:12:30.598250: I tensorflow/stream_executor/cuda/cuda_dnn.cc:368] Loaded cuDNN version 8401
2023-04-25 22:12:32.055810: W tensorflow/stream_executor/gpu/asm_compiler.cc:80] Couldn't get ptxas version string: INTERNAL: Running ptxas --version returned 32512
2023-04-25 22:12:32.328435: W tensorflow/stream_executor/gpu/redzone_allocator.cc:314] INTERNAL: ptxas exited with non-zero error code 32512, output: 
Relying on driver to perform ptx compilation. 
Modify $PATH to customize ptxas location.
This message will be only logged once.


Instructions for updating:
Use `tf.compat.v1.graph_util.tensor_shape_from_node_def_name`
FLOP =  27348714298436
Average per batch was:  8.767802190780639


## Mixed Float16

In [2]:
# Precision test: one GPU, 256x256, batch size 8, use_tf32=False and the
# 'mixed_float16' precision setting.
import os
import sys

# Restrict the process to GPU 0. This is set before TensorFlow and the
# training module are imported so the device mask is already in place when
# anything first initializes CUDA.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

import tensorflow as tf
from train import train_cycle

print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))
train_cycle(
    datapath='/home/datascience/',
    outpath='./',
    init_res=256,
    max_res=256,
    change_model=True,
    minibatch_base=32,
    batch_size=8,
    profiling=True,
    use_tf32=False,
    use_precision='mixed_float16',
)

Num GPUs Available:  1
/home/datascience/floods/
/home/datascience/floods/
INFO:tensorflow:Mixed precision compatibility check (mixed_float16): OK
Your GPU will likely run quickly with dtype policy mixed_float16 as it has compute capability of at least 7.0. Your GPU: NVIDIA A100-SXM4-40GB, compute capability 8.0
False
8
INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0',)
Number of devices: 1
['/home/datascience/floods/floods-r02.tfrecords', '/home/datascience/floods/floods-r03.tfrecords', '/home/datascience/floods/floods-r04.tfrecords', '/home/datascience/floods/floods-r05.tfrecords', '/home/datascience/floods/floods-r06.tfrecords', '/home/datascience/floods/floods-r07.tfrecords', '/home/datascience/floods/floods-r08.tfrecords']
[(1, 4, 4), (1, 8, 8), (1, 16, 16), (1, 32, 32), (1, 64, 64), (1, 128, 128), (1, 256, 256)]
[6, 5, 4, 3, 2, 1, 0]
{6: 1024, 5: 512, 4: 256, 3: 64, 2: 32, 1: 16, 0: 8}
Dataset Read
['/home/datascience/floods/floo

## Mixed BFloat16

In [4]:
# Precision test: one GPU, 256x256, batch size 32, use_tf32=False and the
# 'mixed_bfloat16' precision setting.
import os
import sys

# Restrict the process to GPU 0. This is set before TensorFlow and the
# training module are imported so the device mask is already in place when
# anything first initializes CUDA.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

import tensorflow as tf
from train import train_cycle

print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))
train_cycle(
    datapath='/home/datascience/',
    outpath='./',
    init_res=256,
    max_res=256,
    change_model=True,
    minibatch_base=32,
    batch_size=32,
    profiling=True,
    use_tf32=False,
    use_precision='mixed_bfloat16',
)

Num GPUs Available:  1
2023-04-20 13:46:39.379308: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-04-20 13:46:40.228279: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 38214 MB memory:  -> device: 0, name: NVIDIA A100-SXM4-40GB, pci bus id: 0000:0f:00.0, compute capability: 8.0
2023-04-20 13:46:40.237752: I tensorflow/core/common_runtime/direct_session.cc:370] Device mapping:
/job:localhost/replica:0/task:0/device:GPU:0 -> device: 0, name: NVIDIA A100-SXM4-40GB, pci bus id: 0000:0f:00.0, compute capability: 8.0

True
64
2023-04-20 13:46:40.252888: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:loca

## Results Mixed BFloat16

# Batch Size tf32

## 8

In [1]:
# Batch-size test with use_tf32=True: one GPU, 256x256, batch size 8.
import os
import sys

# Both variables are set before TensorFlow is imported: the GPU mask must be
# in place before CUDA is initialized, and the native (C++) log level should
# be visible to TensorFlow from the moment the library loads.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import tensorflow as tf
from train import train_cycle

print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))
train_cycle(
    datapath='/home/datascience/',
    outpath='./',
    init_res=256,
    max_res=256,
    change_model=True,
    minibatch_base=32,
    batch_size=8,
    profiling=True,
    use_tf32=True,
)

Num GPUs Available:  1
/home/datascience/floods/
/home/datascience/floods/
True
8
INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0',)
Number of devices: 1
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job

17921

## 16

In [1]:
# Batch-size test with use_tf32=True: one GPU, 256x256, batch size 16.
import os
import sys

# Both variables are set before TensorFlow is imported: the GPU mask must be
# in place before CUDA is initialized, and the native (C++) log level should
# be visible to TensorFlow from the moment the library loads.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import tensorflow as tf
from train import train_cycle

print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))
train_cycle(
    datapath='/home/datascience/',
    outpath='./',
    init_res=256,
    max_res=256,
    change_model=True,
    minibatch_base=32,
    batch_size=16,
    profiling=True,
    use_tf32=True,
)

Num GPUs Available:  1
/home/datascience/floods/
/home/datascience/floods/
True
16
INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0',)
Number of devices: 1
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/jo

17921 / 34305

## 64

In [1]:
# Batch-size test with use_tf32=True: one GPU, 256x256, batch size 64
# (minibatch_base is also raised to 64 for this run).
import os
import sys

# Both variables are set before TensorFlow is imported: the GPU mask must be
# in place before CUDA is initialized, and the native (C++) log level should
# be visible to TensorFlow from the moment the library loads.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import tensorflow as tf
from train import train_cycle

print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))
train_cycle(
    datapath='/home/datascience/',
    outpath='./',
    init_res=256,
    max_res=256,
    change_model=True,
    minibatch_base=64,
    batch_size=64,
    profiling=True,
    use_tf32=True,
)

Num GPUs Available:  1
/home/datascience/floods/
/home/datascience/floods/
True
64
INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0',)
Number of devices: 1
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/jo

UnknownError: Graph execution error:

Detected at node 'model_5/256x256/Conv1_down/Conv2D_3' defined at (most recent call last):
    File "/home/datascience/conda/tensorflow28_p38_gpu_v1/lib/python3.8/threading.py", line 890, in _bootstrap
      self._bootstrap_inner()
    File "/home/datascience/conda/tensorflow28_p38_gpu_v1/lib/python3.8/threading.py", line 932, in _bootstrap_inner
      self.run()
    File "/home/datascience/myCProGan/train.py", line 352, in training_step
      disc_loss, global_loss, local_loss = loss.combined_Discriminator_loss(gen, disc, cvae, train_batch, batch_size, D_optimizer, lod_in=lod_in, training_set=None, labels=None, wgan_lambda = 10.0, wgan_epsilon = 0.001, wgan_target = 1.0, cond_weight = 1.0,  network_size=net_size, global_batch_size = global_batch_size)
    File "/home/datascience/myCProGan/loss.py", line 570, in combined_Discriminator_loss
      global_mixed_scores_out = fp32(combined_D.use_global_discriminator(global_mixed_images_out, lod_in))
    File "/home/datascience/myCProGan/networks2.py", line 130, in use_global_discriminator
      scores = self.global_discriminator([image, lod], training=training_flag)
    File "/home/datascience/conda/tensorflow28_p38_gpu_v1/lib/python3.8/site-packages/keras/utils/traceback_utils.py", line 64, in error_handler
      return fn(*args, **kwargs)
    File "/home/datascience/conda/tensorflow28_p38_gpu_v1/lib/python3.8/site-packages/keras/engine/base_layer.py", line 1096, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "/home/datascience/conda/tensorflow28_p38_gpu_v1/lib/python3.8/site-packages/keras/utils/traceback_utils.py", line 92, in error_handler
      return fn(*args, **kwargs)
    File "/home/datascience/conda/tensorflow28_p38_gpu_v1/lib/python3.8/site-packages/keras/engine/functional.py", line 451, in call
      return self._run_internal_graph(
    File "/home/datascience/conda/tensorflow28_p38_gpu_v1/lib/python3.8/site-packages/keras/engine/functional.py", line 589, in _run_internal_graph
      outputs = node.layer(*args, **kwargs)
    File "/home/datascience/conda/tensorflow28_p38_gpu_v1/lib/python3.8/site-packages/keras/utils/traceback_utils.py", line 64, in error_handler
      return fn(*args, **kwargs)
    File "/home/datascience/conda/tensorflow28_p38_gpu_v1/lib/python3.8/site-packages/keras/engine/base_layer.py", line 1096, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "/home/datascience/conda/tensorflow28_p38_gpu_v1/lib/python3.8/site-packages/keras/utils/traceback_utils.py", line 92, in error_handler
      return fn(*args, **kwargs)
    File "/home/datascience/conda/tensorflow28_p38_gpu_v1/lib/python3.8/site-packages/keras/layers/convolutional.py", line 248, in call
      outputs = self.convolution_op(inputs, self.kernel)
    File "/home/datascience/conda/tensorflow28_p38_gpu_v1/lib/python3.8/site-packages/keras/layers/convolutional.py", line 233, in convolution_op
      return tf.nn.convolution(
Node: 'model_5/256x256/Conv1_down/Conv2D_3'
Detected at node 'model_5/256x256/Conv1_down/Conv2D_3' defined at (most recent call last):
    File "/home/datascience/conda/tensorflow28_p38_gpu_v1/lib/python3.8/threading.py", line 890, in _bootstrap
      self._bootstrap_inner()
    File "/home/datascience/conda/tensorflow28_p38_gpu_v1/lib/python3.8/threading.py", line 932, in _bootstrap_inner
      self.run()
    File "/home/datascience/myCProGan/train.py", line 352, in training_step
      disc_loss, global_loss, local_loss = loss.combined_Discriminator_loss(gen, disc, cvae, train_batch, batch_size, D_optimizer, lod_in=lod_in, training_set=None, labels=None, wgan_lambda = 10.0, wgan_epsilon = 0.001, wgan_target = 1.0, cond_weight = 1.0,  network_size=net_size, global_batch_size = global_batch_size)
    File "/home/datascience/myCProGan/loss.py", line 570, in combined_Discriminator_loss
      global_mixed_scores_out = fp32(combined_D.use_global_discriminator(global_mixed_images_out, lod_in))
    File "/home/datascience/myCProGan/networks2.py", line 130, in use_global_discriminator
      scores = self.global_discriminator([image, lod], training=training_flag)
    File "/home/datascience/conda/tensorflow28_p38_gpu_v1/lib/python3.8/site-packages/keras/utils/traceback_utils.py", line 64, in error_handler
      return fn(*args, **kwargs)
    File "/home/datascience/conda/tensorflow28_p38_gpu_v1/lib/python3.8/site-packages/keras/engine/base_layer.py", line 1096, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "/home/datascience/conda/tensorflow28_p38_gpu_v1/lib/python3.8/site-packages/keras/utils/traceback_utils.py", line 92, in error_handler
      return fn(*args, **kwargs)
    File "/home/datascience/conda/tensorflow28_p38_gpu_v1/lib/python3.8/site-packages/keras/engine/functional.py", line 451, in call
      return self._run_internal_graph(
    File "/home/datascience/conda/tensorflow28_p38_gpu_v1/lib/python3.8/site-packages/keras/engine/functional.py", line 589, in _run_internal_graph
      outputs = node.layer(*args, **kwargs)
    File "/home/datascience/conda/tensorflow28_p38_gpu_v1/lib/python3.8/site-packages/keras/utils/traceback_utils.py", line 64, in error_handler
      return fn(*args, **kwargs)
    File "/home/datascience/conda/tensorflow28_p38_gpu_v1/lib/python3.8/site-packages/keras/engine/base_layer.py", line 1096, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "/home/datascience/conda/tensorflow28_p38_gpu_v1/lib/python3.8/site-packages/keras/utils/traceback_utils.py", line 92, in error_handler
      return fn(*args, **kwargs)
    File "/home/datascience/conda/tensorflow28_p38_gpu_v1/lib/python3.8/site-packages/keras/layers/convolutional.py", line 248, in call
      outputs = self.convolution_op(inputs, self.kernel)
    File "/home/datascience/conda/tensorflow28_p38_gpu_v1/lib/python3.8/site-packages/keras/layers/convolutional.py", line 233, in convolution_op
      return tf.nn.convolution(
Node: 'model_5/256x256/Conv1_down/Conv2D_3'
2 root error(s) found.
  (0) UNKNOWN:  CUDNN failed to allocate the scratch space for the runner or to find a working no-scratch runner.
	 [[{{node model_5/256x256/Conv1_down/Conv2D_3}}]]
	 [[Identity_11/_96]]
  (1) UNKNOWN:  CUDNN failed to allocate the scratch space for the runner or to find a working no-scratch runner.
	 [[{{node model_5/256x256/Conv1_down/Conv2D_3}}]]
0 successful operations.
0 derived errors ignored. [Op:__inference_training_step_tf_fuction_256_41858]

34305

# Batch Size float32

## 8

In [1]:
# Batch-size test with use_tf32=False: one GPU, 256x256, batch size 8.
import os
import sys

# Both variables are set before TensorFlow is imported: the GPU mask must be
# in place before CUDA is initialized, and the native (C++) log level should
# be visible to TensorFlow from the moment the library loads.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import tensorflow as tf
from train import train_cycle

print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))
train_cycle(
    datapath='/home/datascience/',
    outpath='./',
    init_res=256,
    max_res=256,
    change_model=True,
    minibatch_base=64,
    batch_size=8,
    profiling=True,
    use_tf32=False,
)

Num GPUs Available:  1
/home/datascience/floods/
/home/datascience/floods/
False
8
INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0',)
Number of devices: 1
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/jo

17921

## 16

In [1]:
# Batch-size test with use_tf32=False: one GPU, 256x256, batch size 16.
import os
import sys

# Both variables are set before TensorFlow is imported: the GPU mask must be
# in place before CUDA is initialized, and the native (C++) log level should
# be visible to TensorFlow from the moment the library loads.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import tensorflow as tf
from train import train_cycle

print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))
train_cycle(
    datapath='/home/datascience/',
    outpath='./',
    init_res=256,
    max_res=256,
    change_model=True,
    minibatch_base=64,
    batch_size=16,
    profiling=True,
    use_tf32=False,
)

Num GPUs Available:  1
/home/datascience/floods/
/home/datascience/floods/
False
16
INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0',)
Number of devices: 1
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/j

17921

## 64

In [66]:
# Benchmark run: one visible GPU, batch_size=64, TF32 disabled.
import os
import sys

# Configure the process environment BEFORE TensorFlow (or any module that may
# import it, such as `train`) is loaded. Both variables are consumed by the
# native runtime and can be ignored if it has already initialised.
# NOTE(review): if TensorFlow was already imported earlier in this kernel
# session, restart the kernel before running this cell so the settings apply.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # expose only GPU 0 to this process
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # quieten TensorFlow's native (C++) logging

import tensorflow as tf
from train import train_cycle

print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))
train_cycle(
    datapath='/home/datascience/',
    outpath='./',
    init_res=256,
    max_res=256,
    change_model=True,
    minibatch_base=64,
    batch_size=64,
    profiling=True,
    use_tf32=False,
)

Num GPUs Available:  1
False
256
Number of devices: 1
Searching in : /home/datascience/tfrecordsprepro/*.tfrecords
Found 28 files. 
Initialization time is 0.0003867149353027344 seconds
Loading Data
Epoch 1 of 60
Number of Batches:  488
Instructions for updating:
Use `tf.compat.v1.graph_util.tensor_shape_from_node_def_name`
FLOP =  8606716502120
Average per batch was:  5.5624189376831055
Time taken by batch 6  was 5.854866981506348 seconds.
Time taken by epoch0 was 93.62191557884216 seconds.

Testing for epoch 0:
(256, 256)
FLOP =  6049885346822
Average per batch was:  1.0930707454681396


34305

# Number of GPUs

## 2

In [1]:
# Scaling run: two visible GPUs, batch_size=16, TF32 enabled.
import os
import sys

# Configure the process environment BEFORE TensorFlow (or any module that may
# import it, such as `train`) is loaded. Both variables are consumed by the
# native runtime and can be ignored if it has already initialised; in a
# notebook, restart the kernel if TensorFlow was imported earlier.
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"  # expose GPUs 0 and 1 to this process
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # quieten TensorFlow's native (C++) logging

import tensorflow as tf
from train import train_cycle

train_cycle(
    datapath='/home/datascience/',
    outpath='./',
    init_res=256,
    max_res=256,
    change_model=True,
    minibatch_base=64,
    batch_size=16,
    profiling=True,
    use_tf32=True,
)

/home/datascience/floods/
/home/datascience/floods/
True
16
INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1')
Number of devices: 2
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0

## 4

In [1]:
# Scaling run: four visible GPUs, batch_size=32, TF32 enabled.
import os
import sys

# Configure the process environment BEFORE TensorFlow (or any module that may
# import it, such as `train`) is loaded. Both variables are consumed by the
# native runtime and can be ignored if it has already initialised; in a
# notebook, restart the kernel if TensorFlow was imported earlier.
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"  # expose GPUs 0-3 to this process
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # quieten TensorFlow's native (C++) logging

import tensorflow as tf
from train import train_cycle

train_cycle(
    datapath='/home/datascience/',
    outpath='./',
    init_res=256,
    max_res=256,
    change_model=True,
    minibatch_base=64,
    batch_size=32,
    profiling=True,
    use_tf32=True,
)

/home/datascience/floods/
/home/datascience/floods/
True
32
INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1', '/job:localhost/replica:0/task:0/device:GPU:2', '/job:localhost/replica:0/task:0/device:GPU:3')
Number of devices: 4
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:

## 8

In [1]:
# Scaling run: eight visible GPUs, batch_size=64, TF32 enabled.
import os
import sys

# Configure the process environment BEFORE TensorFlow (or any module that may
# import it, such as `train`) is loaded. Both variables are consumed by the
# native runtime and can be ignored if it has already initialised; in a
# notebook, restart the kernel if TensorFlow was imported earlier.
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3,4,5,6,7"  # expose GPUs 0-7 to this process
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # quieten TensorFlow's native (C++) logging

import tensorflow as tf
from train import train_cycle

train_cycle(
    datapath='/home/datascience/',
    outpath='./',
    init_res=256,
    max_res=256,
    change_model=True,
    minibatch_base=64,
    batch_size=64,
    profiling=True,
    use_tf32=True,
)

/home/datascience/floods/
/home/datascience/floods/
True
64
INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1', '/job:localhost/replica:0/task:0/device:GPU:2', '/job:localhost/replica:0/task:0/device:GPU:3', '/job:localhost/replica:0/task:0/device:GPU:4', '/job:localhost/replica:0/task:0/device:GPU:5', '/job:localhost/replica:0/task:0/device:GPU:6', '/job:localhost/replica:0/task:0/device:GPU:7')
Number of devices: 8
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/devi

In [14]:
# Run gan_main.py with -h in a subshell to print its command-line usage.
!python gan_main.py -h

usage: gan_main.py [-h] [--multi_node MULTI_NODE]
                   [--workers WORKERS [WORKERS ...]] [--index INDEX]
                   [--use_gs USE_GS] [--datapath DATAPATH] [--outpath OUTPATH]
                   [--nbepochs NBEPOCHS] [--batchsize BATCHSIZE]
                   [--use_gpus USE_GPUS]
                   [--GLOBAL_BATCH_SIZE GLOBAL_BATCH_SIZE]
                   [--nb_epochs NB_EPOCHS] [--batch_size BATCH_SIZE]
                   [--latent_size LATENT_SIZE] [--verbose VERBOSE]
                   [--nEvents NEVENTS] [--ascale ASCALE] [--yscale YSCALE]
                   [--xscale XSCALE] [--xpower XPOWER] [--angscale ANGSCALE]
                   [--analyse ANALYSE] [--dformat DFORMAT] [--thresh THRESH]
                   [--angtype ANGTYPE] [--particle PARTICLE] [--warm WARM]
                   [--lr LR] [--events_per_file EVENTS_PER_FILE] [--name NAME]
                   [--g_weights G_WEIGHTS] [--d_weights D_WEIGHTS]
                   [--tlab TLAB] [--profiling] [--u

In [57]:
# Sanity check of the precision settings: turn TF32 off, select the
# 'mixed_bfloat16' Keras policy, then inspect what a freshly built layer reports.
import tensorflow as tf
# Disable TensorFloat-32 execution.
tf.config.experimental.enable_tensor_float_32_execution(False)
# Set the global Keras policy before creating any layer so the layer below picks it up.
tf.keras.mixed_precision.set_global_policy('mixed_bfloat16')

# Throwaway layer used only to read back the dtypes chosen by the policy.
layer = tf.keras.layers.Conv2D(filters=4, kernel_size=2)
print(layer.compute_dtype)
print(layer.variable_dtype)

# Confirm that TF32 execution is reported as disabled.
print(tf.config.experimental.tensor_float_32_execution_enabled())


bfloat16
float32
False


In [None]:
class Args:
    """Plain settings holder whose values are exposed as class attributes."""

    # Location of the dataset (presumably the Penn Treebank corpus; verify).
    data = './data/penn'
    # Identifier of the model type to use.
    model = 'LSTM'
    # NOTE(review): presumably the embedding size; confirm against the consumer.
    emsize = 200
    # NOTE(review): presumably the number of hidden units; confirm against the consumer.
    nhid = 200


# Instance used in place of parsed command-line arguments.
args = Args()

In [63]:
import pandas as pd

# Load the recorded GPU statistics without treating any line as a header,
# so every line of the file becomes a data row.
df = pd.read_csv('/home/datascience/gpu_stats/tf32_bs96.csv', header=None)

#print(df)
# Column 0: power.draw [W]    Column 1: utilization.gpu [%]

# Collect the power reading of every sample whose utilisation is at least 90.
# Row 0 is skipped (presumably it holds the column titles), and the last two
# characters of each field are dropped (presumably the unit suffix) before
# the numeric conversion.
power_values = [
    float(sample[0][:-2])
    for position, sample in df.iterrows()
    if position > 0 and int(sample[1][:-2]) >= 90
]

# Report the maximum, minimum and mean of the retained power readings.
print(max(power_values))
print(min(power_values))
print(sum(power_values)/len(power_values))
            

304.44
75.02
224.8809090909091
