# Model Gym for Sky Hackathon

This is a gym for training models: different models are generated here so that their performance (with a focus on accuracy) can be tested and compared against each other.

## 0. Setup

In [72]:
!docker login nvcr.io

Authenticating with existing credentials...
https://docs.docker.com/engine/reference/commandline/login/#credentials-store

Login Succeeded


In [73]:
# View the versions of the TAO launcher
!tao info

Configuration of the TAO Toolkit Instance
dockers: ['nvidia/tao/tao-toolkit']
format_version: 2.0
toolkit_version: 4.0.1
published_date: 03/06/2023


In [74]:
import getpass
import json
import os
import subprocess

In [75]:
from utils import generate_val_dataset

In [76]:
# ngc key
%env KEY=OW1paDZ2Zm1zaHNlM2ljbmZjdml0MDh2OHY6YzAyNGY2ZGMtNGQ3OS00NmI4LTg4YTItY2ViODM5N2EwMDIw

# workspace
%env LOCAL_PROJECT_DIR=/root/sky/BoxDetector/model_gym

env: KEY=<redacted>
env: LOCAL_PROJECT_DIR=/root/sky/BoxDetector/model_gym


In [12]:
# Installing NGC CLI on the local machine.
## Download and install
%env CLI=ngccli_cat_linux.zip
!mkdir -p $LOCAL_PROJECT_DIR/ngccli

# Remove any previously existing CLI installations
!rm -rf $LOCAL_PROJECT_DIR/ngccli/*
!wget "https://ngc.nvidia.com/downloads/$CLI" -P $LOCAL_PROJECT_DIR/ngccli --no-check-certificate
!unzip -u "$LOCAL_PROJECT_DIR/ngccli/$CLI" -d $LOCAL_PROJECT_DIR/ngccli/
!rm $LOCAL_PROJECT_DIR/ngccli/*.zip 
os.environ["PATH"]="{}/ngccli/ngc-cli:{}".format(os.getenv("LOCAL_PROJECT_DIR", ""), os.getenv("PATH", ""))

env: CLI=ngccli_cat_linux.zip
--2023-06-02 16:54:14--  https://ngc.nvidia.com/downloads/ngccli_cat_linux.zip
Connecting to 127.0.0.1:7890... connected.
Unable to establish SSL connection.
unzip:  cannot find or open /root/sky/BoxDetector/model_gym/ngccli/ngccli_cat_linux.zip, /root/sky/BoxDetector/model_gym/ngccli/ngccli_cat_linux.zip.zip or /root/sky/BoxDetector/model_gym/ngccli/ngccli_cat_linux.zip.ZIP.
rm: cannot remove ‘/root/sky/BoxDetector/model_gym/ngccli/*.zip’: No such file or directory


In [13]:
!ngc registry model list nvidia/tao/pretrained_object_detection:*

Connection failed; retrying... (Retries left: 5)
^C


In [77]:
def config_workspace(data_version, model_version, spec_version):
    """Point the notebook (and the TAO launcher) at one dataset/model/spec combination.

    Exports the host-side and container-side directory variables used by the
    shell cells of this notebook, then rewrites ``~/.tao_mounts.json`` so the
    TAO launcher maps the project directory into the container.

    Environment variables written:
        LOCAL_DATA_DIR, LOCAL_EXPERIMENT_DIR, LOCAL_SPECS_DIR -- host paths under
            the project directory.
        DATA_DOWNLOAD_DIR, USER_EXPERIMENT_DIR, SPECS_DIR, PRETRAINED_MODEL_DIR --
            the matching paths under ``/workspace/tao-experiments`` in the container.

    Args:
        data_version: sub-directory name under ``datasets/``.
        model_version: sub-directory name under ``models/``.
        spec_version: sub-directory name under ``specs/``.

    Side effects:
        Mutates ``os.environ`` and overwrites ``~/.tao_mounts.json``.
    """
    # Resolve the project root exactly once. Previously the env-var paths fell
    # back to the current directory while the mounts file indexed
    # os.environ["LOCAL_PROJECT_DIR"] directly, so an unset variable raised
    # KeyError after half of the environment had already been modified.
    project_dir = os.getenv("LOCAL_PROJECT_DIR", os.getcwd())
    container_root = "/workspace/tao-experiments"

    # host-side directories
    os.environ["LOCAL_DATA_DIR"] = os.path.join(project_dir, "datasets", data_version)
    os.environ["LOCAL_EXPERIMENT_DIR"] = os.path.join(project_dir, "models", model_version)
    os.environ["LOCAL_SPECS_DIR"] = os.path.join(project_dir, "specs", spec_version)

    # docker volume mapping (paths as seen from inside the container)
    os.environ["USER_EXPERIMENT_DIR"] = os.path.join(container_root, "models", model_version)
    os.environ["DATA_DOWNLOAD_DIR"] = os.path.join(container_root, "datasets", data_version)
    os.environ["SPECS_DIR"] = os.path.join(container_root, "specs", spec_version)
    os.environ["PRETRAINED_MODEL_DIR"] = os.path.join(container_root, "pretrained_models")

    # Mapping up the local directories to the TAO docker.
    mounts_file = os.path.expanduser("~/.tao_mounts.json")

    # Define the dictionary with the mapped drives
    drive_map = {
        "Mounts": [
            # Mapping the whole project directory
            {
                "source": project_dir,
                "destination": container_root,
            },
            # Mapping the specs directory.
            {
                "source": os.environ["LOCAL_SPECS_DIR"],
                "destination": os.environ["SPECS_DIR"],
            },
        ],
        # Run the container as the invoking user so files it creates on the
        # mounted volumes are not owned by root.
        "DockerOptions": {
            "user": "{}:{}".format(os.getuid(), os.getgid())
        },
    }

    # Writing the mounts file.
    with open(mounts_file, "w") as mfile:
        json.dump(drive_map, mfile, indent=4)

In [78]:
!echo $LOCAL_DATA_DIR

/root/sky/BoxDetector/model_gym/datasets/colorful


### CONFIG THE FOLLOWING VARIABLES to CONTROL Training

In [91]:
data_version='complex'
model_version='ssd-mobilenet-complex'
spec_version='mobilenet-complex'

In [92]:
config_workspace(data_version=data_version,
                 model_version=model_version,
                 spec_version=spec_version)

## 1. Prepare Data

Data is an important factor influencing the performance of the trained model.

In this section, we can modify the data source and the data augmentation method to generate different training datasets.

In [101]:
!tao ssd dataset_convert \
                    -d $SPECS_DIR/ssd_tfrecords_kitti_train.txt \
                    -o $DATA_DOWNLOAD_DIR/tfrecords/

2023-06-04 13:39:06,479 [INFO] root: Registry: ['nvcr.io']
2023-06-04 13:39:06,631 [INFO] tlt.components.instance_handler.local_instance: Running command in container: nvcr.io/nvidia/tao/tao-toolkit:4.0.0-tf1.15.5
Using TensorFlow backend.
2023-06-04 05:39:09.209538: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
Using TensorFlow backend.
2023-06-04 05:39:17,470 [INFO] iva.detectnet_v2.dataio.build_converter: Instantiating a kitti converter
2023-06-04 05:39:17,470 [INFO] iva.detectnet_v2.dataio.dataset_converter_lib: Creating output directory /workspace/tao-experiments/datasets/complex/tfrecords
2023-06-04 05:39:17,499 [INFO] iva.detectnet_v2.dataio.kitti_converter_lib: Num images in
Train: 9998	Val: 0
2023-06-04 05:39:17,499 [INFO] iva.detectnet_v2.dataio.kitti_converter_lib: Skipped validation data...
2023-06-04 05:39:17,506 [INFO] iva.detectnet_v2.dataio.dataset_converter_lib: Writing partition 0, shard 0
2023-06

In [96]:
def prepare_dataset(val_ratio=0.1):
    """Unpack ``box.zip`` into the dataset directory and create a validation split.

    Reads ``LOCAL_DATA_DIR`` and ``LOCAL_EXPERIMENT_DIR`` from the environment
    (both are exported by ``config_workspace``), creates the directories if
    needed, extracts ``$LOCAL_DATA_DIR/box.zip`` into ``$LOCAL_DATA_DIR`` and
    then calls ``generate_val_dataset`` with ``box/rgb`` as the image directory,
    ``box/object_detection`` as the label directory and ``box/val`` as the
    validation directory.

    Args:
        val_ratio: forwarded unchanged to ``generate_val_dataset``.

    Raises:
        RuntimeError: if ``LOCAL_DATA_DIR`` or ``LOCAL_EXPERIMENT_DIR`` is unset
            or empty.
        FileNotFoundError: if the ``unzip`` executable is not available.
        subprocess.CalledProcessError: if ``unzip`` reports a real error
            (exit status greater than 1), e.g. a missing or corrupt archive.
    """
    data_dir = os.getenv("LOCAL_DATA_DIR")
    experiment_dir = os.getenv("LOCAL_EXPERIMENT_DIR")
    if not data_dir or not experiment_dir:
        # Previously this surfaced as an opaque TypeError from os.makedirs(None).
        raise RuntimeError(
            "LOCAL_DATA_DIR and LOCAL_EXPERIMENT_DIR must be set; "
            "call config_workspace() first."
        )

    box_dir = os.path.join(data_dir, "box")
    os.makedirs(data_dir, exist_ok=True)
    os.makedirs(experiment_dir, exist_ok=True)
    os.makedirs(box_dir, exist_ok=True)

    # Argument list instead of a concatenated shell string: paths containing
    # spaces or shell metacharacters are passed through verbatim.
    # -n (never overwrite) keeps unzip non-interactive. Without it unzip prompts
    # on the first existing file, hits EOF in the notebook and falls back to
    # "[N]one", so -n makes that de-facto behaviour explicit.
    unzip_cmd = ["unzip", "-n", os.path.join(data_dir, "box.zip"), "-d", data_dir]
    result = subprocess.run(unzip_cmd)
    # unzip documents exit status 1 as "warnings only, processing completed";
    # anything above that is a failure that must not be silently ignored.
    if result.returncode > 1:
        raise subprocess.CalledProcessError(result.returncode, unzip_cmd)

    input_image_dir = os.path.join(box_dir, "rgb")
    input_label_dir = os.path.join(box_dir, "object_detection")
    val_dir = os.path.join(box_dir, "val")
    generate_val_dataset(input_image_dir,
                         input_label_dir,
                         val_dir,
                         val_ratio=val_ratio)

In [97]:
prepare_dataset()

Archive:  /root/sky/BoxDetector/model_gym/datasets/complex/box.zip
  inflating: /root/sky/BoxDetector/model_gym/datasets/complex/box/object_detection/0.txt  
  inflating: /root/sky/BoxDetector/model_gym/datasets/complex/box/object_detection/1.txt  
  inflating: /root/sky/BoxDetector/model_gym/datasets/complex/box/object_detection/10.txt  
  inflating: /root/sky/BoxDetector/model_gym/datasets/complex/box/object_detection/100.txt  
  inflating: /root/sky/BoxDetector/model_gym/datasets/complex/box/object_detection/1000.txt  
  inflating: /root/sky/BoxDetector/model_gym/datasets/complex/box/object_detection/1001.txt  
  inflating: /root/sky/BoxDetector/model_gym/datasets/complex/box/object_detection/1002.txt  
  inflating: /root/sky/BoxDetector/model_gym/datasets/complex/box/object_detection/1003.txt  
  inflating: /root/sky/BoxDetector/model_gym/datasets/complex/box/object_detection/1004.txt  
  inflating: /root/sky/BoxDetector/model_gym/datasets/complex/box/object_detection/1005.txt  
  

replace /root/sky/BoxDetector/model_gym/datasets/complex/box/object_detection/1898.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename:  NULL
(EOF or read error, treating as "[N]one" ...)


  inflating: /root/sky/BoxDetector/model_gym/datasets/complex/box/rgb/0.png  
  inflating: /root/sky/BoxDetector/model_gym/datasets/complex/box/rgb/1.png  
  inflating: /root/sky/BoxDetector/model_gym/datasets/complex/box/rgb/10.png  
  inflating: /root/sky/BoxDetector/model_gym/datasets/complex/box/rgb/100.png  
  inflating: /root/sky/BoxDetector/model_gym/datasets/complex/box/rgb/1000.png  
  inflating: /root/sky/BoxDetector/model_gym/datasets/complex/box/rgb/1001.png  
  inflating: /root/sky/BoxDetector/model_gym/datasets/complex/box/rgb/1002.png  
  inflating: /root/sky/BoxDetector/model_gym/datasets/complex/box/rgb/1003.png  
  inflating: /root/sky/BoxDetector/model_gym/datasets/complex/box/rgb/1004.png  
  inflating: /root/sky/BoxDetector/model_gym/datasets/complex/box/rgb/1005.png  
  inflating: /root/sky/BoxDetector/model_gym/datasets/complex/box/rgb/1006.png  
  inflating: /root/sky/BoxDetector/model_gym/datasets/complex/box/rgb/1007.png  
  inflating: /root/sky/BoxDetector/mo

In [99]:
print("TFRecords conversion spec file:")
!cat $LOCAL_SPECS_DIR/yolov3_tfrecords_kitti.txt

TFRecords conversion spec file:
kitti_config {
  root_directory_path: "/workspace/tao-experiments/datasets/complex/box"
  image_dir_name: "rgb"
  label_dir_name: "object_detection"
  image_extension: ".png"
  partition_mode: "random"
  num_partitions: 2
  val_split: 0
  num_shards: 10
}
image_directory_path: "/workspace/tao-experiments/datasets/complex/box"
target_class_mapping {
    key: "box"
    value: "box"
}


In [143]:
print("Converting the training set to TFRecords.")
!mkdir -p $LOCAL_DATA_DIR/tfrecords && rm -rf $LOCAL_DATA_DIR/tfrecords/*
!tao ssd dataset_convert \
         -d $SPECS_DIR/yolov3_tfrecords_kitti.txt \
         -o $DATA_DOWNLOAD_DIR/tfrecords/kitti_train

Converting the training set to TFRecords.
2023-05-31 16:06:34,471 [INFO] root: Registry: ['nvcr.io']
2023-05-31 16:06:34,632 [INFO] tlt.components.instance_handler.local_instance: Running command in container: nvcr.io/nvidia/tao/tao-toolkit:4.0.0-tf1.15.5
Using TensorFlow backend.
2023-05-31 08:06:37.127575: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
Using TensorFlow backend.
2023-05-31 08:06:45,529 [INFO] iva.detectnet_v2.dataio.build_converter: Instantiating a kitti converter
2023-05-31 08:06:45,557 [INFO] iva.detectnet_v2.dataio.kitti_converter_lib: Num images in
Train: 9998	Val: 0
2023-05-31 08:06:45,557 [INFO] iva.detectnet_v2.dataio.kitti_converter_lib: Skipped validation data...
2023-05-31 08:06:45,565 [INFO] iva.detectnet_v2.dataio.dataset_converter_lib: Writing partition 0, shard 0
2023-05-31 08:06:46,055 [INFO] iva.detectnet_v2.dataio.dataset_converter_lib: Writing partition 0, shard 1
2023-05-31 08:06

## 2. Prepare Model

In this section, we can choose the backbone and head of the model and modify their configs to generate different models.

### Fetch pretrained model

In [15]:
!mkdir -p $LOCAL_PROJECT_DIR/pretrained_models/resnet18/

In [37]:
!mkdir -p $LOCAL_PROJECT_DIR/pretrained_models/resnet34/

In [None]:
!mkdir -p $LOCAL_PROJECT_DIR/pretrained_models/mobilenetv2/

In [16]:
!ngc registry model download-version nvidia/tao/pretrained_object_detection:resnet18 --dest $LOCAL_PROJECT_DIR/pretrained_models/resnet18

Downloaded 82.38 MB in 12s, Download speed: 6.85 MB/s               
--------------------------------------------------------------------------------
   Transfer id: pretrained_object_detection_vresnet18
   Download status: Completed
   Downloaded local path: /root/sky/BoxDetector/model_gym/pretrained_models/resnet18/pretrained_object_detection_vresnet18-1
   Total files downloaded: 1
   Total downloaded size: 82.38 MB
   Started at: 2023-05-27 21:28:37.855064
   Completed at: 2023-05-27 21:28:49.880051
   Duration taken: 12s
--------------------------------------------------------------------------------


In [38]:
!ngc registry model download-version nvidia/tao/pretrained_object_detection:resnet34 --dest $LOCAL_PROJECT_DIR/pretrained_models/resnet34

Downloaded 158.03 MB in 21s, Download speed: 7.51 MB/s               
--------------------------------------------------------------------------------
   Transfer id: pretrained_object_detection_vresnet34
   Download status: Completed
   Downloaded local path: /root/sky/BoxDetector/model_gym/pretrained_models/resnet34/pretrained_object_detection_vresnet34
   Total files downloaded: 1
   Total downloaded size: 158.03 MB
   Started at: 2023-05-28 10:04:05.346518
   Completed at: 2023-05-28 10:04:26.391269
   Duration taken: 21s
--------------------------------------------------------------------------------


In [190]:
!ngc registry model download-version nvidia/tao_pretrained_object_detection:mobilenetv2 --dest $LOCAL_PROJECT_DIR/pretrained_models/mobilenetv2

Connection failed; retrying... (Retries left: 5)
Transient error StatusCode.UNAVAILABLE encountered while exporting traces, retrying in 1s.
Connection failed; retrying... (Retries left: 4)
Transient error StatusCode.UNAVAILABLE encountered while exporting traces, retrying in 2s.
Connection failed; retrying... (Retries left: 3)
Transient error StatusCode.UNAVAILABLE encountered while exporting traces, retrying in 4s.
Transient error StatusCode.UNAVAILABLE encountered while exporting traces, retrying in 8s.
Connection failed; retrying... (Retries left: 2)
Transient error StatusCode.UNAVAILABLE encountered while exporting traces, retrying in 16s.
Connection failed; retrying... (Retries left: 1)
Error: client is unable to make a connection.
Transient error StatusCode.UNAVAILABLE encountered while exporting traces, retrying in 32s.
Transient error StatusCode.UNAVAILABLE encountered while exporting traces, retrying in 1s.
Transient error StatusCode.UNAVAILABLE encountered while exporting tra

## 3. Train Model

In this section, we can modify the training strategy to train the selected model.

In [93]:
!cat $LOCAL_SPECS_DIR/ssd_train_mobilenetv2_kitti.txt

random_seed: 42
ssd_config {
  aspect_ratios_global: "[1.0, 2.0, 0.5, 3.0, 1.0/3.0]"
  scales: "[0.05, 0.1, 0.25, 0.4, 0.55, 0.7, 0.85]"
  two_boxes_for_ar1: true
  clip_boxes: false
  variances: "[0.1, 0.1, 0.2, 0.2]"
  arch: "resnet"
  nlayers: 18
  freeze_bn: false
  freeze_blocks: 0
}
training_config {
  batch_size_per_gpu: 16
  num_epochs: 80
  enable_qat: false
  learning_rate {
  soft_start_annealing_schedule {
    min_learning_rate: 5e-5
    max_learning_rate: 2e-2
    soft_start: 0.15
    annealing: 0.8
    }
  }
  regularizer {
    type: L1
    weight: 3e-5
  }
}
eval_config {
  validation_period_during_training: 10
  average_precision_mode: SAMPLE
  batch_size: 16
  matching_iou_threshold: 0.5
}
nms_config {
  confidence_threshold: 0.01
  clustering_iou_threshold: 0.6
  top_k: 200
}
augmentation_config {
    output_width: 300
    output_height: 300
    output_channel: 3
}
dataset_config {
  data_sources: {
    tfrecords_path: "/workspace/tao-experiments/datasets/complex/tfre

In [94]:
!mkdir -p $LOCAL_EXPERIMENT_DIR/experiment_dir_unpruned

### Finetune model

In [12]:
!tao yolo_v3 kmeans -l /workspace/tao-experiments/datasets/complex/box/object_detection \
                        -i /workspace/tao-experiments/datasets/complex/box/rgb \
                        -x 320 \
                        -y 320

2023-06-02 16:58:35,997 [INFO] root: Registry: ['nvcr.io']
2023-06-02 16:58:36,106 [INFO] tlt.components.instance_handler.local_instance: Running command in container: nvcr.io/nvidia/tao/tao-toolkit:4.0.0-tf1.15.5
Using TensorFlow backend.
2023-06-02 08:58:38.695352: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
Using TensorFlow backend.
Start optimization iteration: 1
Start optimization iteration: 11
Start optimization iteration: 21
Start optimization iteration: 31
Please use following anchor sizes in YOLO config:
(5.62, 26.25)
(16.25, 18.75)
(24.38, 21.88)
(32.50, 21.25)
(21.25, 40.00)
(35.62, 31.25)
(48.12, 45.62)
(65.00, 66.25)
(96.25, 105.62)
Telemetry data couldn't be sent, but the command ran successfully.
Execution status: PASS
2023-06-02 16:58:58,986 [INFO] tlt.components.docker_handler.docker_handler: Stopping container.


In [98]:
!echo $USER_EXPERIMENT_DIR

/workspace/tao-experiments/models/ssd-mobilenet-complex


In [99]:
!echo $SPECS_DIR

/workspace/tao-experiments/specs/mobilenet-complex


In [None]:
print("To run with multigpu, please change --gpus based on the number of available GPUs in your machine.")
!tao ssd train --gpus 2 \
               -e $SPECS_DIR/ssd_train_mobilenetv2_kitti.txt \
               -r $USER_EXPERIMENT_DIR/experiment_dir_unpruned \
               -k $KEY \
               

To run with multigpu, please change --gpus based on the number of available GPUs in your machine.
2023-06-04 15:03:05,117 [INFO] root: Registry: ['nvcr.io']
2023-06-04 15:03:05,269 [INFO] tlt.components.instance_handler.local_instance: Running command in container: nvcr.io/nvidia/tao/tao-toolkit:4.0.0-tf1.15.5
Using TensorFlow backend.
2023-06-04 07:03:07.796586: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
Using TensorFlow backend.
Using TensorFlow backend.
2023-06-04 07:03:18,791 [INFO] iva.common.logging.logging: Log file already exists at /workspace/tao-experiments/models/ssd-mobilenet-complex/experiment_dir_unpruned/status.json
2023-06-04 07:03:18,791 [INFO] iva.ssd.utils.spec_loader: Merging specification from /workspace/tao-experiments/specs/mobilenet-complex/ssd_train_mobilenetv2_kitti.txt






2023-06-04 07:03:18,821 [INFO] iva.common.logging.logging: Log file already exists at /workspace/tao-experiments

In [107]:
!cat $LOCAL_EXPERIMENT_DIR/experiment_dir_unpruned/ssd_training_log_mobilenet_v2.csv
%set_env EPOCH=040

epoch,AP_box,loss,lr,mAP,validation_loss
1,nan,11.986258,0.00027144174,nan,nan
2,nan,7.218335,0.00073680625,nan,nan
3,nan,6.1828117,0.0019999996,nan,nan
4,nan,5.1083107,0.0054288344,nan,nan
5,nan,4.5862713,0.014736122,nan,nan
env: EPOCH=040


In [155]:
!tao yolo_v3 evaluate --gpu_index=0 \
                  -e $SPECS_DIR/yolov3_train_mobilenetv2.txt \
                  -m $USER_EXPERIMENT_DIR/experiment_dir_unpruned/weights/yolov3_mobilenetv2_epoch_$EPOCH.tlt \
                  -k $KEY

2023-05-31 20:19:00,102 [INFO] root: Registry: ['nvcr.io']
2023-05-31 20:19:00,255 [INFO] tlt.components.instance_handler.local_instance: Running command in container: nvcr.io/nvidia/tao/tao-toolkit:4.0.0-tf1.15.5
Using TensorFlow backend.
2023-05-31 12:19:02.857113: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
Using TensorFlow backend.
































__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Input (InputLayer)              (None, 3, 384, 1248) 0                                            
__________________________________________________________________________________________________
conv1 (Conv2D)                  (None, 64, 192, 624) 9408        Input[0][0]                      
_____________________________________________________________

## 4. Compress Model

The deployment target is an edge device, so the model should be compressed before deployment to achieve a better accuracy-latency trade-off.

In this section, we use different pruning configs to generate different versions of compressed models.

In [19]:
!mkdir -p $LOCAL_EXPERIMENT_DIR/experiment_dir_pruned

In [20]:
!tao ssd prune --gpu_index=0 \
               -e $SPECS_DIR/ssd_train_mobilnetv2_kitti.txt \
               -m $USER_EXPERIMENT_DIR/experiment_dir_unpruned/weights/ssd_mobilenet_v2_epoch_$EPOCH.tlt \
               -o $USER_EXPERIMENT_DIR/experiment_dir_pruned/ssd_mobilenet_v2_pruned_point3.tlt \
               -eq intersection \
               -pth 0.3 \
               -k $KEY

2023-06-02 18:48:03,382 [INFO] root: Registry: ['nvcr.io']
2023-06-02 18:48:03,533 [INFO] tlt.components.instance_handler.local_instance: Running command in container: nvcr.io/nvidia/tao/tao-toolkit:4.0.0-tf1.15.5
Using TensorFlow backend.
2023-06-02 10:48:06.051598: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
Using TensorFlow backend.






























2023-06-02 10:48:31,080 [INFO] modulus.pruning.pruning: Exploring graph for retainable indices
2023-06-02 10:48:43,522 [INFO] modulus.pruning.pruning: Pruning model and appending pruned nodes to new graph
2023-06-02 10:53:46,281 [INFO] __main__: Pruning ratio (pruned model / original model): 0.06765043331990556
Telemetry data couldn't be sent, but the command ran successfully.
Execution status: PASS
2023-06-02 18:53:59,550 [INFO] tlt.components.docker_handler.docker_handler: Stopping container.


In [165]:
!echo $USER_EXPERIMENT_DIR

/workspace/tao-experiments/models/yolov3-resnet18


In [21]:
# Prune the unpruned yolov3_resnet18 checkpoint from epoch $EPOCH with -pth 0.4;
# output: experiment_dir_pruned/yolov3_resnet18_pruned_point4.tlt.
# NOTE(review): expects SPECS_DIR / USER_EXPERIMENT_DIR to point at a yolov3-resnet18
# workspace -- confirm config_workspace() was called with matching versions.
# NOTE(review): identical to the other yolo_v3 prune cells except for -pth and the
# output name; a single loop over thresholds would remove the duplication.
!tao yolo_v3 prune --gpu_index=0 \
               -e $SPECS_DIR/yolov3_train_resnet18.txt \
               -m $USER_EXPERIMENT_DIR/experiment_dir_unpruned/weights/yolov3_resnet18_epoch_$EPOCH.tlt \
               -o $USER_EXPERIMENT_DIR/experiment_dir_pruned/yolov3_resnet18_pruned_point4.tlt \
               -eq intersection \
               -pth 0.4 \
               -k $KEY

2023-06-02 18:54:00,620 [INFO] root: Registry: ['nvcr.io']
2023-06-02 18:54:00,776 [INFO] tlt.components.instance_handler.local_instance: Running command in container: nvcr.io/nvidia/tao/tao-toolkit:4.0.0-tf1.15.5
Using TensorFlow backend.
2023-06-02 10:54:03.288732: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
Using TensorFlow backend.






























2023-06-02 10:54:28,682 [INFO] modulus.pruning.pruning: Exploring graph for retainable indices
2023-06-02 10:54:42,344 [INFO] modulus.pruning.pruning: Pruning model and appending pruned nodes to new graph
2023-06-02 10:59:42,958 [INFO] __main__: Pruning ratio (pruned model / original model): 0.03593943949685808
Telemetry data couldn't be sent, but the command ran successfully.
Execution status: PASS
2023-06-02 18:59:56,031 [INFO] tlt.components.docker_handler.docker_handler: Stopping container.


In [22]:
# Prune the unpruned yolov3_resnet18 checkpoint from epoch $EPOCH with -pth 0.5;
# output: experiment_dir_pruned/yolov3_resnet18_pruned_point5.tlt.
# NOTE(review): expects SPECS_DIR / USER_EXPERIMENT_DIR to point at a yolov3-resnet18
# workspace -- confirm config_workspace() was called with matching versions.
!tao yolo_v3 prune --gpu_index=0 \
               -e $SPECS_DIR/yolov3_train_resnet18.txt \
               -m $USER_EXPERIMENT_DIR/experiment_dir_unpruned/weights/yolov3_resnet18_epoch_$EPOCH.tlt \
               -o $USER_EXPERIMENT_DIR/experiment_dir_pruned/yolov3_resnet18_pruned_point5.tlt \
               -eq intersection \
               -pth 0.5 \
               -k $KEY

2023-06-02 18:59:57,054 [INFO] root: Registry: ['nvcr.io']
2023-06-02 18:59:57,206 [INFO] tlt.components.instance_handler.local_instance: Running command in container: nvcr.io/nvidia/tao/tao-toolkit:4.0.0-tf1.15.5
Using TensorFlow backend.
2023-06-02 10:59:59.558016: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
Using TensorFlow backend.






























2023-06-02 11:00:24,588 [INFO] modulus.pruning.pruning: Exploring graph for retainable indices
2023-06-02 11:00:38,157 [INFO] modulus.pruning.pruning: Pruning model and appending pruned nodes to new graph
2023-06-02 11:05:45,152 [INFO] __main__: Pruning ratio (pruned model / original model): 0.018546306997035276
Telemetry data couldn't be sent, but the command ran successfully.
Execution status: PASS
2023-06-02 19:05:58,638 [INFO] tlt.components.docker_handler.docker_handler: Stopping container.


In [23]:
# Prune the unpruned yolov3_resnet18 checkpoint from epoch $EPOCH with -pth 0.6;
# output: experiment_dir_pruned/yolov3_resnet18_pruned_point6.tlt.
# NOTE(review): expects SPECS_DIR / USER_EXPERIMENT_DIR to point at a yolov3-resnet18
# workspace -- confirm config_workspace() was called with matching versions.
!tao yolo_v3 prune --gpu_index=0 \
               -e $SPECS_DIR/yolov3_train_resnet18.txt \
               -m $USER_EXPERIMENT_DIR/experiment_dir_unpruned/weights/yolov3_resnet18_epoch_$EPOCH.tlt \
               -o $USER_EXPERIMENT_DIR/experiment_dir_pruned/yolov3_resnet18_pruned_point6.tlt \
               -eq intersection \
               -pth 0.6 \
               -k $KEY

2023-06-02 19:05:59,699 [INFO] root: Registry: ['nvcr.io']
2023-06-02 19:05:59,854 [INFO] tlt.components.instance_handler.local_instance: Running command in container: nvcr.io/nvidia/tao/tao-toolkit:4.0.0-tf1.15.5
Using TensorFlow backend.
2023-06-02 11:06:02.258474: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
Using TensorFlow backend.






























2023-06-02 11:06:27,066 [INFO] modulus.pruning.pruning: Exploring graph for retainable indices
2023-06-02 11:06:39,355 [INFO] modulus.pruning.pruning: Pruning model and appending pruned nodes to new graph
2023-06-02 11:11:46,486 [INFO] __main__: Pruning ratio (pruned model / original model): 0.009228214454338225
Telemetry data couldn't be sent, but the command ran successfully.
Execution status: PASS
2023-06-02 19:11:59,674 [INFO] tlt.components.docker_handler.docker_handler: Stopping container.


In [24]:
# Prune the unpruned yolov3_resnet18 checkpoint from epoch $EPOCH with -pth 0.7;
# output: experiment_dir_pruned/yolov3_resnet18_pruned_point7.tlt.
# NOTE(review): expects SPECS_DIR / USER_EXPERIMENT_DIR to point at a yolov3-resnet18
# workspace -- confirm config_workspace() was called with matching versions.
!tao yolo_v3 prune --gpu_index=0 \
               -e $SPECS_DIR/yolov3_train_resnet18.txt \
               -m $USER_EXPERIMENT_DIR/experiment_dir_unpruned/weights/yolov3_resnet18_epoch_$EPOCH.tlt \
               -o $USER_EXPERIMENT_DIR/experiment_dir_pruned/yolov3_resnet18_pruned_point7.tlt \
               -eq intersection \
               -pth 0.7 \
               -k $KEY

2023-06-02 19:12:00,776 [INFO] root: Registry: ['nvcr.io']
2023-06-02 19:12:00,933 [INFO] tlt.components.instance_handler.local_instance: Running command in container: nvcr.io/nvidia/tao/tao-toolkit:4.0.0-tf1.15.5
Using TensorFlow backend.
2023-06-02 11:12:03.371864: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
Using TensorFlow backend.






























2023-06-02 11:12:28,664 [INFO] modulus.pruning.pruning: Exploring graph for retainable indices
2023-06-02 11:12:41,250 [INFO] modulus.pruning.pruning: Pruning model and appending pruned nodes to new graph
2023-06-02 11:17:41,076 [INFO] __main__: Pruning ratio (pruned model / original model): 0.00553705393823131
Telemetry data couldn't be sent, but the command ran successfully.
Execution status: PASS
2023-06-02 19:17:54,320 [INFO] tlt.components.docker_handler.docker_handler: Stopping container.


In [25]:
!cat $LOCAL_SPECS_DIR/ssd_retrain_resnet18_kitti.txt

random_seed: 42
ssd_config {
  aspect_ratios_global: "[1.0, 2.0, 0.5, 3.0, 1.0/3.0]"
  scales: "[0.05, 0.1, 0.25, 0.4, 0.55, 0.7, 0.85]"
  two_boxes_for_ar1: true
  clip_boxes: false
  variances: "[0.1, 0.1, 0.2, 0.2]"
  arch: "resnet"
  nlayers: 18
  freeze_bn: false
}
training_config {
  batch_size_per_gpu: 32
  num_epochs: 80
  enable_qat: false
  learning_rate {
  soft_start_annealing_schedule {
    min_learning_rate: 5e-5
    max_learning_rate: 2e-2
    soft_start: 0.1
    annealing: 0.6
    }
  }
  regularizer {
    type: NO_REG
    weight: 3e-9
  }
}
eval_config {
  validation_period_during_training: 10
  average_precision_mode: SAMPLE
  batch_size: 32
  matching_iou_threshold: 0.5
}
nms_config {
  confidence_threshold: 0.01
  clustering_iou_threshold: 0.6
  top_k: 200
}
augmentation_config {
    output_width: 300
    output_height: 300
    output_channel: 3
}
dataset_config {
  data_sources: {
    tfrecords_path: "/workspace/tao-experiments/datasets/baseline/tfrecords/kitti_tra

### Retrain the pruned model

In [30]:
# Retrain on 2 GPUs with the point3 retrain spec; results go to experiment_dir_retrain_point3.
# Presumably the spec references the pruned model as pretrained weights -- verify in
# yolov3_retrain_resnet18_point3.txt.
# NOTE(review): no yolov3_resnet18_pruned_point3.tlt is produced by the prune cells
# visible in this notebook (the point3 prune writes an ssd_mobilenet_v2 file) -- confirm
# which pruned model this spec loads.
!tao yolo_v3 train --gpus 2 \
               -e $SPECS_DIR/yolov3_retrain_resnet18_point3.txt \
               -r $USER_EXPERIMENT_DIR/experiment_dir_retrain_point3 \
               -k $KEY

2023-06-02 19:19:42,994 [INFO] root: Registry: ['nvcr.io']
2023-06-02 19:19:43,152 [INFO] tlt.components.instance_handler.local_instance: Running command in container: nvcr.io/nvidia/tao/tao-toolkit:4.0.0-tf1.15.5
Using TensorFlow backend.
2023-06-02 11:19:45.684563: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
Using TensorFlow backend.
Using TensorFlow backend.


INFO: Log file already exists at /workspace/tao-experiments/models/yolov3-resnet18/experiment_dir_retrain_point3/status.json
INFO: Starting Yolo_V3 Training job






INFO: Log file already exists at /workspace/tao-experiments/models/yolov3-resnet18/experiment_dir_retrain_point3/status.json
INFO: Starting Yolo_V3 Training job






























INFO: Log file already exists at /workspace/tao-experiments/models/yolov3-resnet18/experiment_dir_retrain_point3/status.json


___________________________________________________________________________

In [None]:
# Retrain on 2 GPUs with the point4 retrain spec; results go to experiment_dir_retrain_point4.
# Presumably the spec references the -pth 0.4 pruned model as pretrained weights --
# verify in yolov3_retrain_resnet18_point4.txt.
!tao yolo_v3 train --gpus 2 \
               -e $SPECS_DIR/yolov3_retrain_resnet18_point4.txt \
               -r $USER_EXPERIMENT_DIR/experiment_dir_retrain_point4 \
               -k $KEY

2023-06-02 19:46:36,394 [INFO] root: Registry: ['nvcr.io']
2023-06-02 19:46:36,560 [INFO] tlt.components.instance_handler.local_instance: Running command in container: nvcr.io/nvidia/tao/tao-toolkit:4.0.0-tf1.15.5
Using TensorFlow backend.
2023-06-02 11:46:39.130498: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
Using TensorFlow backend.
Using TensorFlow backend.


INFO: Log file already exists at /workspace/tao-experiments/models/yolov3-resnet18/experiment_dir_retrain_point4/status.json
INFO: Starting Yolo_V3 Training job






INFO: Log file already exists at /workspace/tao-experiments/models/yolov3-resnet18/experiment_dir_retrain_point4/status.json
INFO: Starting Yolo_V3 Training job






























INFO: Log file already exists at /workspace/tao-experiments/models/yolov3-resnet18/experiment_dir_retrain_point4/status.json


___________________________________________________________________________

In [None]:
# Retrain on 2 GPUs with the point5 retrain spec; results go to experiment_dir_retrain_point5.
# Presumably the spec references the -pth 0.5 pruned model as pretrained weights --
# verify in yolov3_retrain_resnet18_point5.txt.
!tao yolo_v3 train --gpus 2 \
               -e $SPECS_DIR/yolov3_retrain_resnet18_point5.txt \
               -r $USER_EXPERIMENT_DIR/experiment_dir_retrain_point5 \
               -k $KEY

In [None]:
# Retrain on 2 GPUs with the point6 retrain spec; results go to experiment_dir_retrain_point6.
# Presumably the spec references the -pth 0.6 pruned model as pretrained weights --
# verify in yolov3_retrain_resnet18_point6.txt.
!tao yolo_v3 train --gpus 2 \
               -e $SPECS_DIR/yolov3_retrain_resnet18_point6.txt \
               -r $USER_EXPERIMENT_DIR/experiment_dir_retrain_point6 \
               -k $KEY

2023-06-02 23:13:21,997 [INFO] root: Registry: ['nvcr.io']
2023-06-02 23:13:22,146 [INFO] tlt.components.instance_handler.local_instance: Running command in container: nvcr.io/nvidia/tao/tao-toolkit:4.0.0-tf1.15.5
Using TensorFlow backend.
2023-06-02 15:13:24.802641: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
Using TensorFlow backend.
Using TensorFlow backend.


INFO: Starting Yolo_V3 Training job


INFO: Log file already exists at /workspace/tao-experiments/models/yolov3-resnet18/experiment_dir_retrain_point6/status.json
INFO: Starting Yolo_V3 Training job


































INFO: Log file already exists at /workspace/tao-experiments/models/yolov3-resnet18/experiment_dir_retrain_point6/status.json


__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
In

In [29]:
# Retrain on 2 GPUs with the point7 retrain spec; results go to experiment_dir_retrain_point7.
# Presumably the spec references the -pth 0.7 pruned model as pretrained weights --
# verify in yolov3_retrain_resnet18_point7.txt.
!tao yolo_v3 train --gpus 2 \
               -e $SPECS_DIR/yolov3_retrain_resnet18_point7.txt \
               -r $USER_EXPERIMENT_DIR/experiment_dir_retrain_point7 \
               -k $KEY

2023-06-02 19:18:48,888 [INFO] root: Registry: ['nvcr.io']
2023-06-02 19:18:49,036 [INFO] tlt.components.instance_handler.local_instance: Running command in container: nvcr.io/nvidia/tao/tao-toolkit:4.0.0-tf1.15.5
Using TensorFlow backend.
2023-06-02 11:18:51.503799: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
Using TensorFlow backend.
Using TensorFlow backend.


INFO: Starting Yolo_V3 Training job
INFO: [Errno 2] No such file or directory: '/workspace/tao-experiments/specs/yolov3_resnet18/yolov3_retrain_mobilenetv2_point7.txt'


INFO: Starting Yolo_V3 Training job
INFO: [Errno 2] No such file or directory: '/workspace/tao-experiments/specs/yolov3_resnet18/yolov3_retrain_mobilenetv2_point7.txt'
Traceback (most recent call last):
  File "</usr/local/lib/python3.6/dist-packages/iva/yolo_v3/scripts/train.py>", line 3, in <module>
  File "<frozen iva.yolo_v3.scripts.train>", line 151, in <module>
  File "<frozen iva.

In [45]:
# Now check the evaluation stats in the csv file and pick the model with highest eval accuracy.
!cat $LOCAL_EXPERIMENT_DIR/experiment_dir_retrain_point7/ssd_training_log_resnet18.csv
%set_env EPOCH=080

epoch,AP_box,loss,lr,mAP,validation_loss
1,nan,9.785362,0.00021147424,nan,nan
2,nan,7.2166786,0.00044721356,nan,nan
3,nan,6.0870733,0.00094574154,nan,nan
4,nan,5.5413914,0.0019999999,nan,nan
5,nan,5.041072,0.0042294846,nan,nan
6,nan,4.195364,0.008944271,nan,nan
7,nan,3.4341812,0.01891483,nan,nan
8,nan,3.2259798,0.039999995,nan,nan
9,nan,3.1089578,0.04,nan,nan
10,0.17095562553217236,2.7456648,0.04,0.17095562553217236,243.37866904518822
11,nan,2.5834005,0.04,nan,nan
12,nan,2.4956985,0.04,nan,nan
13,nan,2.291366,0.04,nan,nan
14,nan,2.1711721,0.04,nan,nan
15,nan,2.175612,0.04,nan,nan
16,nan,2.154991,0.04,nan,nan
17,nan,2.1083677,0.04,nan,nan
18,nan,2.0530705,0.04,nan,nan
19,nan,2.0434904,0.04,nan,nan
20,0.8958708984862853,2.033781,0.04,0.8958708984862853,51.175397005948156
21,nan,2.0929792,0.04,nan,nan
22,nan,1.9624553,0.04,nan,nan
23,nan,1.9911603,0.04,nan,nan
24,nan,1.9279572,0.04,nan,nan
25,nan,1.9502152,0.04,nan,nan
26,nan,1.8996353,0.04,nan,nan
27,nan,1.9020282,0.04,nan,nan
28,nan,1.9

### Evaluate models

In [79]:
!tao ssd evaluate --gpu_index=0 \
                  -e $SPECS_DIR/ssd_retrain_resnet18_kitti.txt \
                  -m $USER_EXPERIMENT_DIR/experiment_dir_retrain_point3/weights/ssd_resnet18_epoch_$EPOCH.tlt \
                  -k $KEY

2023-05-30 15:38:54,270 [INFO] root: Registry: ['nvcr.io']
2023-05-30 15:38:54,404 [INFO] tlt.components.instance_handler.local_instance: Running command in container: nvcr.io/nvidia/tao/tao-toolkit:4.0.0-tf1.15.5
Using TensorFlow backend.
2023-05-30 07:38:56.938270: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
Using TensorFlow backend.
2023-05-30 07:39:05,120 [INFO] iva.ssd.utils.spec_loader: Merging specification from /workspace/tao-experiments/specs/complex/ssd_retrain_resnet18_kitti.txt






2023-05-30 07:39:05,127 [INFO] root: Starting SSD evaluation.


























Using TLT model for inference, setting batch size to the one in eval_config: 32
Producing predictions: 100%|████████████████████| 32/32 [00:12<00:00,  2.51it/s]
Start to calculate AP for each class
*******************************
box           AP    0.909
              mAP   0.909
*******************************
2023-05-30 07:42:42,733

In [80]:
!tao ssd evaluate --gpu_index=0 \
                  -e $SPECS_DIR/ssd_retrain_resnet18_kitti.txt \
                  -m $USER_EXPERIMENT_DIR/experiment_dir_retrain_point4/weights/ssd_resnet18_epoch_$EPOCH.tlt \
                  -k $KEY

2023-05-30 15:42:56,837 [INFO] root: Registry: ['nvcr.io']
2023-05-30 15:42:56,980 [INFO] tlt.components.instance_handler.local_instance: Running command in container: nvcr.io/nvidia/tao/tao-toolkit:4.0.0-tf1.15.5
Using TensorFlow backend.
2023-05-30 07:42:59.385640: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
Using TensorFlow backend.
2023-05-30 07:43:07,858 [INFO] iva.ssd.utils.spec_loader: Merging specification from /workspace/tao-experiments/specs/complex/ssd_retrain_resnet18_kitti.txt






2023-05-30 07:43:07,865 [INFO] root: Starting SSD evaluation.


























Using TLT model for inference, setting batch size to the one in eval_config: 32
Producing predictions: 100%|████████████████████| 32/32 [00:12<00:00,  2.56it/s]
Start to calculate AP for each class
*******************************
box           AP    0.909
              mAP   0.909
*******************************
2023-05-30 07:46:24,749

In [81]:
!tao ssd evaluate --gpu_index=0 \
                  -e $SPECS_DIR/ssd_retrain_resnet18_kitti.txt \
                  -m $USER_EXPERIMENT_DIR/experiment_dir_retrain_point5/weights/ssd_resnet18_epoch_$EPOCH.tlt \
                  -k $KEY

2023-05-30 15:46:39,156 [INFO] root: Registry: ['nvcr.io']
2023-05-30 15:46:39,304 [INFO] tlt.components.instance_handler.local_instance: Running command in container: nvcr.io/nvidia/tao/tao-toolkit:4.0.0-tf1.15.5
Using TensorFlow backend.
2023-05-30 07:46:41.757424: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
Using TensorFlow backend.
2023-05-30 07:46:49,946 [INFO] iva.ssd.utils.spec_loader: Merging specification from /workspace/tao-experiments/specs/complex/ssd_retrain_resnet18_kitti.txt






2023-05-30 07:46:49,952 [INFO] root: Starting SSD evaluation.


























Using TLT model for inference, setting batch size to the one in eval_config: 32
Producing predictions: 100%|████████████████████| 32/32 [00:12<00:00,  2.59it/s]
Start to calculate AP for each class
*******************************
box           AP    0.908
              mAP   0.908
*******************************
2023-05-30 07:50:19,631

In [82]:
!tao ssd evaluate --gpu_index=0 \
                  -e $SPECS_DIR/ssd_retrain_resnet18_kitti.txt \
                  -m $USER_EXPERIMENT_DIR/experiment_dir_retrain_point6/weights/ssd_resnet18_epoch_$EPOCH.tlt \
                  -k $KEY

2023-05-30 15:50:34,403 [INFO] root: Registry: ['nvcr.io']
2023-05-30 15:50:34,559 [INFO] tlt.components.instance_handler.local_instance: Running command in container: nvcr.io/nvidia/tao/tao-toolkit:4.0.0-tf1.15.5
Using TensorFlow backend.
2023-05-30 07:50:37.092285: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
Using TensorFlow backend.
2023-05-30 07:50:45,454 [INFO] iva.ssd.utils.spec_loader: Merging specification from /workspace/tao-experiments/specs/complex/ssd_retrain_resnet18_kitti.txt






2023-05-30 07:50:45,460 [INFO] root: Starting SSD evaluation.


























Using TLT model for inference, setting batch size to the one in eval_config: 32
Producing predictions: 100%|████████████████████| 32/32 [00:12<00:00,  2.66it/s]
Start to calculate AP for each class
*******************************
box           AP    0.907
              mAP   0.907
*******************************
2023-05-30 07:54:12,732

In [83]:
!tao ssd evaluate --gpu_index=0 \
                  -e $SPECS_DIR/ssd_retrain_resnet18_kitti.txt \
                  -m $USER_EXPERIMENT_DIR/experiment_dir_retrain_point7/weights/ssd_resnet18_epoch_$EPOCH.tlt \
                  -k $KEY

2023-05-30 15:54:27,962 [INFO] root: Registry: ['nvcr.io']
2023-05-30 15:54:28,118 [INFO] tlt.components.instance_handler.local_instance: Running command in container: nvcr.io/nvidia/tao/tao-toolkit:4.0.0-tf1.15.5
Using TensorFlow backend.
2023-05-30 07:54:30.543208: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
Using TensorFlow backend.
2023-05-30 07:54:38,781 [INFO] iva.ssd.utils.spec_loader: Merging specification from /workspace/tao-experiments/specs/complex/ssd_retrain_resnet18_kitti.txt






2023-05-30 07:54:38,788 [INFO] root: Starting SSD evaluation.


























Using TLT model for inference, setting batch size to the one in eval_config: 32
Producing predictions: 100%|████████████████████| 32/32 [00:11<00:00,  2.74it/s]
Start to calculate AP for each class
*******************************
box           AP    0.906
              mAP   0.906
*******************************
2023-05-30 07:58:22,080

## 5. Export Model

This is the last section of this gym. The trained models are exported here for later deployment.

In [89]:
# Sanity check: print the experiment directory that the export cells below
# use for their -m (checkpoint) and -o (output) paths.
!echo $USER_EXPERIMENT_DIR

/workspace/tao-experiments/models/ssd-complex


In [90]:
# # tao <task> export will fail if .etlt already exists. So we clear the export folder before tao <task> export
# !rm -rf $LOCAL_EXPERIMENT_DIR/export
# Generate .etlt file using tao container
!mkdir -p $LOCAL_EXPERIMENT_DIR/export

# Export in FP32 mode. Change --data_type to fp16 for FP16 mode
!tao ssd export --gpu_index=0 \
                -m $USER_EXPERIMENT_DIR/experiment_dir_unpruned/weights/ssd_resnet18_epoch_080.tlt \
                -k $KEY \
                -o $USER_EXPERIMENT_DIR/export/ssd_resnet18.etlt \
                -e $SPECS_DIR/ssd_train_resnet18_kitti.txt \
                --gen_ds_config

2023-06-04 10:54:36,999 [INFO] root: Registry: ['nvcr.io']
2023-06-04 10:54:37,150 [INFO] tlt.components.instance_handler.local_instance: Running command in container: nvcr.io/nvidia/tao/tao-toolkit:4.0.0-tf1.15.5
Using TensorFlow backend.
2023-06-04 02:54:39.774848: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
Using TensorFlow backend.
2023-06-04 02:54:52,108 [INFO] iva.common.export.keras_exporter: Using input nodes: ['Input']
2023-06-04 02:54:52,108 [INFO] iva.common.export.keras_exporter: Using output nodes: ['NMS']
2023-06-04 02:54:52,108 [INFO] iva.ssd.utils.spec_loader: Merging specification from /workspace/tao-experiments/specs/complex/ssd_train_resnet18_kitti.txt
NOTE: UFF has been tested with TensorFlow 1.14.0.
Converting NMS as custom op: NMS_TRT
Converting FirstDimTile_5 as custom op: BatchTilePlugin_TRT
Converting FirstDimTile_4 as custom op: BatchTilePlugin_TRT
Converting FirstDimTile_3 as custom op:

In [40]:
# # tao <task> export will fail if .etlt already exists. So we clear the export folder before tao <task> export
# !rm -rf $LOCAL_EXPERIMENT_DIR/export
# Generate .etlt file using tao container
!mkdir -p $LOCAL_EXPERIMENT_DIR/export

# Export in FP32 mode. Change --data_type to fp16 for FP16 mode
!tao yolo_v3 export --gpu_index=0 \
                -m $USER_EXPERIMENT_DIR/experiment_dir_unpruned/weights/yolov3_resnet18_epoch_080.tlt \
                -k $KEY \
                -o $USER_EXPERIMENT_DIR/export/yolov3_resnet18_point3.etlt \
                -e $SPECS_DIR/ssd_train_resnet18_kitti.txt \
                --gen_ds_config

2023-06-03 01:16:30,268 [INFO] root: Registry: ['nvcr.io']
2023-06-03 01:16:30,420 [INFO] tlt.components.instance_handler.local_instance: Running command in container: nvcr.io/nvidia/tao/tao-toolkit:4.0.0-tf1.15.5
Using TensorFlow backend.
2023-06-02 17:16:33.083707: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
Using TensorFlow backend.
2023-06-02 17:16:45,465 [INFO] iva.common.export.keras_exporter: Using input nodes: ['Input']
2023-06-02 17:16:45,465 [INFO] iva.common.export.keras_exporter: Using output nodes: ['BatchedNMS']
The ONNX operator number change on the optimization: 379 -> 173
2023-06-02 17:17:32,042 [INFO] keras2onnx: The ONNX operator number change on the optimization: 379 -> 173
Telemetry data couldn't be sent, but the command ran successfully.
Execution status: PASS
2023-06-03 01:20:41,380 [INFO] tlt.components.docker_handler.docker_handler: Stopping container.


In [41]:
# # tao <task> export will fail if .etlt already exists. So we clear the export folder before tao <task> export
# !rm -rf $LOCAL_EXPERIMENT_DIR/export
# Generate .etlt file using tao container
!mkdir -p $LOCAL_EXPERIMENT_DIR/export

# Export in FP32 mode. Change --data_type to fp16 for FP16 mode
!tao yolo_v3 export --gpu_index=0 \
                -m $USER_EXPERIMENT_DIR/experiment_dir_retrain_point4/weights/yolov3_resnet18_epoch_080.tlt \
                -k $KEY \
                -o $USER_EXPERIMENT_DIR/export/yolov3_resnet18_point4.etlt \
                -e $SPECS_DIR/yolov3_retrain_resnet18_point4.txt \
                --gen_ds_config

2023-06-03 01:24:51,124 [INFO] root: Registry: ['nvcr.io']
2023-06-03 01:24:51,282 [INFO] tlt.components.instance_handler.local_instance: Running command in container: nvcr.io/nvidia/tao/tao-toolkit:4.0.0-tf1.15.5
Using TensorFlow backend.
2023-06-02 17:24:53.960178: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
Using TensorFlow backend.
2023-06-02 17:25:06,660 [INFO] iva.common.export.keras_exporter: Using input nodes: ['Input']
2023-06-02 17:25:06,660 [INFO] iva.common.export.keras_exporter: Using output nodes: ['BatchedNMS']
The ONNX operator number change on the optimization: 379 -> 173
2023-06-02 17:25:50,901 [INFO] keras2onnx: The ONNX operator number change on the optimization: 379 -> 173
^C


In [43]:
# # tao <task> export will fail if .etlt already exists. So we clear the export folder before tao <task> export
# !rm -rf $LOCAL_EXPERIMENT_DIR/export
# Generate .etlt file using tao container
!mkdir -p $LOCAL_EXPERIMENT_DIR/export

# Export in FP32 mode. Change --data_type to fp16 for FP16 mode
!tao yolo_v3 export --gpu_index=0 \
                -m $USER_EXPERIMENT_DIR/experiment_dir_retrain_point5/weights/yolov3_resnet18_epoch_080.tlt \
                -k $KEY \
                -o $USER_EXPERIMENT_DIR/export/yolov3_resnet18_point5.etlt \
                -e $SPECS_DIR/yolov3_retrain_resnet18_point5.txt \
                --gen_ds_config

2023-06-03 01:30:30,465 [INFO] root: Registry: ['nvcr.io']
2023-06-03 01:30:30,617 [INFO] tlt.components.instance_handler.local_instance: Running command in container: nvcr.io/nvidia/tao/tao-toolkit:4.0.0-tf1.15.5
Using TensorFlow backend.
2023-06-02 17:30:33.140064: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
Using TensorFlow backend.
Traceback (most recent call last):
  File "</usr/local/lib/python3.6/dist-packages/iva/yolo_v3/scripts/export.py>", line 3, in <module>
  File "<frozen iva.yolo_v3.scripts.export>", line 12, in <module>
  File "<frozen iva.common.export.app>", line 302, in launch_export
  File "<frozen iva.common.export.app>", line 247, in run_export
AssertionError: Default output file /workspace/tao-experiments/models/yolov3-resnet18/export/yolov3_resnet18_point5.etlt already exists
Telemetry data couldn't be sent, but the command ran successfully.
Execution status: FAIL
2023-06-03 01:30:53,163 [I

In [44]:
# # tao <task> export will fail if .etlt already exists. So we clear the export folder before tao <task> export
# !rm -rf $LOCAL_EXPERIMENT_DIR/export
# Generate .etlt file using tao container
!mkdir -p $LOCAL_EXPERIMENT_DIR/export

# Export in FP32 mode. Change --data_type to fp16 for FP16 mode
!tao yolo_v3 export --gpu_index=0 \
                -m $USER_EXPERIMENT_DIR/experiment_dir_retrain_point6/weights/yolov3_resnet18_epoch_080.tlt \
                -k $KEY \
                -o $USER_EXPERIMENT_DIR/export/yolov3_resnet18_point6.etlt \
                -e $SPECS_DIR/yolov3_retrain_resnet18_point6.txt \
                --gen_ds_config

2023-06-03 01:30:54,415 [INFO] root: Registry: ['nvcr.io']
2023-06-03 01:30:54,569 [INFO] tlt.components.instance_handler.local_instance: Running command in container: nvcr.io/nvidia/tao/tao-toolkit:4.0.0-tf1.15.5
Using TensorFlow backend.
2023-06-02 17:30:57.149769: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
Using TensorFlow backend.
2023-06-02 17:31:09,341 [INFO] iva.common.export.keras_exporter: Using input nodes: ['Input']
2023-06-02 17:31:09,342 [INFO] iva.common.export.keras_exporter: Using output nodes: ['BatchedNMS']
The ONNX operator number change on the optimization: 379 -> 173
2023-06-02 17:31:50,026 [INFO] keras2onnx: The ONNX operator number change on the optimization: 379 -> 173
Telemetry data couldn't be sent, but the command ran successfully.
Execution status: PASS
2023-06-03 01:35:00,122 [INFO] tlt.components.docker_handler.docker_handler: Stopping container.


In [184]:
# # tao <task> export will fail if .etlt already exists. So we clear the export folder before tao <task> export
# !rm -rf $LOCAL_EXPERIMENT_DIR/export
# Generate .etlt file using tao container
!mkdir -p $LOCAL_EXPERIMENT_DIR/export

# Export in FP32 mode. Change --data_type to fp16 for FP16 mode
!tao yolo_v3 export --gpu_index=0 \
                -m $USER_EXPERIMENT_DIR/experiment_dir_retrain_point7/weights/yolov3_mobilenetv2_epoch_080.tlt \
                -k $KEY \
                -o $USER_EXPERIMENT_DIR/export/yolov3_mobilenetv2_point7.etlt \
                -e $SPECS_DIR/yolov3_retrain_mobilenetv2_point7.txt \
                --gen_ds_config

2023-06-02 13:41:50,504 [INFO] root: Registry: ['nvcr.io']
2023-06-02 13:41:50,660 [INFO] tlt.components.instance_handler.local_instance: Running command in container: nvcr.io/nvidia/tao/tao-toolkit:4.0.0-tf1.15.5
Using TensorFlow backend.
2023-06-02 05:41:53.224420: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
Using TensorFlow backend.
2023-06-02 05:42:05,299 [INFO] iva.common.export.keras_exporter: Using input nodes: ['Input']
2023-06-02 05:42:05,300 [INFO] iva.common.export.keras_exporter: Using output nodes: ['BatchedNMS']
The ONNX operator number change on the optimization: 379 -> 173
2023-06-02 05:42:45,866 [INFO] keras2onnx: The ONNX operator number change on the optimization: 379 -> 173
Telemetry data couldn't be sent, but the command ran successfully.
Execution status: PASS
2023-06-02 13:45:47,904 [INFO] tlt.components.docker_handler.docker_handler: Stopping container.
