In [None]:
import tensorflow as tf

In [None]:
# Show the contents of the ../input directory (shell escape, output only).
!ls ../input

In [None]:
# Detect the available accelerator and select a matching tf.distribute strategy.
# Defines three notebook globals: TPU (cluster resolver or None), strategy and N_REPLICAS.
try:
    # TPU detection. No parameters are necessary when the TPU_NAME environment
    # variable is set. On Kaggle this is always the case.
    TPU = tf.distribute.cluster_resolver.TPUClusterResolver()
except ValueError:
    # The resolver could not find a TPU: fall through to the CPU/GPU branch.
    TPU = None

if TPU:
    print(f"\n... RUNNING ON TPU - {TPU.master()}...")
    tf.config.experimental_connect_to_cluster(TPU)
    tf.tpu.experimental.initialize_tpu_system(TPU)
    # Prefer the stable tf.distribute.TPUStrategy; the experimental alias is
    # deprecated and is only used when the stable name is missing (older TF).
    _tpu_strategy_cls = (
        getattr(tf.distribute, "TPUStrategy", None)
        or tf.distribute.experimental.TPUStrategy
    )
    strategy = _tpu_strategy_cls(TPU)
else:
    print("\n... RUNNING ON CPU/GPU ...")
    # Yield the default distribution strategy in TensorFlow
    #   --> Works on CPU and single GPU.
    strategy = tf.distribute.get_strategy()

# What is a replica?
#    --> A single Cloud TPU device consists of FOUR chips, each of which has TWO TPU cores.
#    --> Therefore, for efficient utilization of Cloud TPU, a program should make use of
#        each of the EIGHT (4x2) cores.
#    --> Each replica is essentially a copy of the training graph that is run on each core
#        and trains a mini-batch containing 1/8th of the overall batch size.
N_REPLICAS = strategy.num_replicas_in_sync

print(f"... # OF REPLICAS: {N_REPLICAS} ...\n")

print("\n... ACCELERATOR SETUP COMPLETED ...\n")

In [None]:
# Turn on XLA just-in-time compilation for TensorFlow, logging progress around it.
print("\n... XLA OPTIMIZATIONS STARTING ...\n")
print("\n... CONFIGURE JIT (JUST IN TIME) COMPILATION ...\n")

# Enable XLA optimizations; the original author reports roughly a 10% speedup
# for code wrapped in @tf.function.
tf.config.optimizer.set_jit(True)

print("\n... XLA OPTIMIZATIONS COMPLETED ...\n")

In [None]:
# Step 1: Obtain the gcloud credential through Kaggle's UserSecretsClient.
# NOTE(review): presumably requires a Google Cloud account linked to this
# notebook session — confirm.
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
user_credential = user_secrets.get_gcloud_credential()

# Step 2: Set the credentials
# Hands the credential from Step 1 to TensorFlow (presumably so that gs://
# paths can be read and written — confirm).
user_secrets.set_tensorflow_credential(user_credential)

In [None]:
# Look up the GCS path of the "siim-cocolike-tfrecords" Kaggle dataset.
# NOTE(review): DATA_DIR is not referenced by the other visible cells;
# my_retinanet.yaml uses hard-coded gs:// buckets instead — confirm they agree.
from kaggle_datasets import KaggleDatasets
DATA_DIR = KaggleDatasets().get_gcs_path("siim-cocolike-tfrecords")
# Bare expression: displays the resolved path as the cell output.
DATA_DIR

In [None]:
# Look up the GCS path of the "retinanet-weights" Kaggle dataset.
# NOTE(review): MODEL_DIR is not referenced by the other visible cells;
# the checkpoint path in my_retinanet.yaml is hard-coded — confirm they agree.
MODEL_DIR = KaggleDatasets().get_gcs_path("retinanet-weights")
# Bare expression: displays the resolved path as the cell output.
MODEL_DIR

In [None]:
# Clone the tensorflow/tpu repository into ./tpu; later cells install from
# ./tpu/models/ and run tpu/models/official/detection/main.py.
# NOTE(review): no branch/commit is pinned, so the checked-out code may change between runs.
!git clone https://github.com/tensorflow/tpu/

In [None]:
# Install the package located in the cloned repository's models/ directory.
!pip install ./tpu/models/

In [None]:
# Install pycocotools from the PythonAPI subdirectory of the cocodataset/cocoapi repository.
# NOTE(review): no commit/tag is pinned, so the installed version may change between runs.
!pip install --user 'git+https://github.com/cocodataset/cocoapi#egg=pycocotools&subdirectory=PythonAPI'

In [None]:
%%writefile my_retinanet.yaml
# RetinaNet (SpineNet-96 backbone) configuration, passed to the training
# command through --config_file.
type: 'retinanet'
architecture:
    backbone: 'spinenet'
    multilevel_features: 'identity'
    num_classes: 2
train:
    # Must be a bare integer: with a trailing comma YAML loads the string
    # '12000,' instead of the number 12000.
    total_steps: 12000
    train_batch_size: 64
    # NOTE(review): the gs://kds-* bucket names in this file are hard-coded, and
    # the train and eval patterns name different buckets — confirm they match
    # the paths currently returned by KaggleDatasets().get_gcs_path(...).
    train_file_pattern: gs://kds-0714d52387535e14062e376fa98f7f0707d4e2b6d65d7b6569482c68/fold_0/train*.tfrecord
    checkpoint:
        path: gs://kds-f15289519b214c67c5f884a932fc7b647ed0c475d14f3d2a6b67c717/detection_retinanet_spinenet-96-best/model.ckpt
        prefix: spinenet96/
eval:
    eval_file_pattern: gs://kds-ebf9447eebe1e7ebd75c826c574b61e41e4b1e7fad139ffad4b2cb72/fold_0/eval*.tfrecord
    num_steps_per_eval: 250
    use_json_file: False
batch_norm_activation:
    use_sync_bn: true
    activation: 'swish'
spinenet:
    model_id: '96'
    init_drop_connect_rate: 0.2
anchor:
    anchor_size: 3.0
retinanet_parser:
    output_size: [1024, 1024]
    aug_scale_min: 0.5
    aug_scale_max: 2.0

In [None]:
!python /kaggle/working/tpu/models/official/detection/main.py \
  --model="retinanet" \
  --model_dir="gs://effdet_siim_output/retinanet/" \
  --use_tpu=True \
  --tpu="grpc://10.0.0.2:8470" \
  --num_cores=8 \
  --mode=train \
  --config_file="my_retinanet.yaml" \
  --params_override=""