# Mount Google Drive (holds the source code)

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Check GPU specs

In [None]:
# Show the GPU model, driver/CUDA versions and current memory usage of this runtime.
!nvidia-smi

Wed Apr 10 20:49:01 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off | 00000000:00:04.0 Off |                    0 |
| N/A   33C    P0              46W / 400W |      2MiB / 40960MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                                    

# Path and script definitions

In [None]:
# Root folder of the experiment on the mounted Google Drive.
# Change this according to the Google Drive folder name.
resnet_experiment_root_path = "/content/drive/MyDrive/CS5260_A6"

# Files and folders located directly under the experiment root.
resnet_experiment_requirements_path = resnet_experiment_root_path + "/requirements.txt"
resnet_experiment_model_path = resnet_experiment_root_path + "/model"
resnet_experiment_train_script_path = resnet_experiment_root_path + "/train.py"
resnet_experiment_eval_script_path = resnet_experiment_root_path + "/eval.py"

# Launcher invocation that the training commands are prefixed with.
colossal_ai_run = "/usr/local/bin/colossalai run"

# Install dependencies

In [None]:
pip_install_command = f"pip install -r {resnet_experiment_requirements_path}"
!{pip_install_command}

Collecting colossalai (from -r /content/drive/MyDrive/CS5260_A6/requirements.txt (line 1))
  Downloading colossalai-0.3.6.tar.gz (1.1 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.1 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━[0m [32m0.8/1.1 MB[0m [31m23.8 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m22.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pre-commit (from colossalai->-r /content/drive/MyDrive/CS5260_A6/requirements.txt (line 1))
  Downloading pre_commit-3.7.0-py2.py3-none-any.whl (204 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m204.2/204.2 kB[0m [31m27.2 MB/s[0m eta [36m0:00:00[0m
Collecting fabric (from colossalai->-r /content/drive/MyDrive/CS5260_A6/requirements.txt (line 1))
  Downloading fabric-3.2.2-py3-none-any.whl (59 

# Verify that Colossal-AI was installed successfully

In [None]:
# Print Colossal-AI's installation report (library/CUDA versions and CUDA-extension build status).
!/usr/local/bin/colossalai check -i

#### Installation Report ####

------------ Environment ------------
Colossal-AI version: 0.3.6
PyTorch version: 2.2.1
System CUDA version: 12.2
CUDA version required by PyTorch: 12.1

Note:
1. The table above checks the versions of the libraries/tools in the current environment
2. If the System CUDA version is N/A, you can set the CUDA_HOME environment variable to locate it
3. If the CUDA version required by PyTorch is N/A, you probably did not install a CUDA-compatible PyTorch. This value is give by torch.version.cuda and you can go to https://pytorch.org/get-started/locally/ to download the correct version.

------------ CUDA Extensions AOT Compilation ------------
Found AOT CUDA Extension: ✓
PyTorch version used for AOT compilation: N/A
CUDA version used for AOT compilation: N/A

Note:
1. AOT (ahead-of-time) compilation of the CUDA kernels occurs during installation when the environment variable BUILD_EXT=1 is set
2. If AOT compilation is not enabled, stay calm as the CUDA kernels 

# Training

In [None]:
# Number of GPUs available. Since we are in Google Colab which has only 1 GPU, set it to 1.
# Change this accordingly if running in other environment.
num_gpu = 1

# train with torch DDP with fp32
train_with_DDP_fp32_command = f"{colossal_ai_run} --nproc_per_node {num_gpu} {resnet_experiment_train_script_path} -c {resnet_experiment_model_path}/ckpt-fp32"
!{train_with_DDP_fp32_command}

# train with torch DDP with mixed precision training
train_with_DDP_fp16_command = f"{colossal_ai_run} --nproc_per_node {num_gpu} {resnet_experiment_train_script_path} -c {resnet_experiment_model_path}/ckpt-fp16 -p torch_ddp_fp16"
!{train_with_DDP_fp16_command}

# train with low level zero
train_with_low_level_zero_command = f"{colossal_ai_run} --nproc_per_node {num_gpu} {resnet_experiment_train_script_path} -c {resnet_experiment_model_path}/ckpt-low_level_zero -p low_level_zero"
!{train_with_low_level_zero_command}

  _register_pytree_node(OrderedDict, _odict_flatten, _odict_unflatten)
[04/10/24 20:50:42] INFO     colossalai - colossalai - INFO:                                        
                             /usr/local/lib/python3.10/dist-packages/colossalai/initialize.py:67    
                             launch                                                                 
                    INFO     colossalai - colossalai - INFO: Distributed environment is initialized,
                             world size: 1                                                          
Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz
100%|██████████| 170498071/170498071 [00:12<00:00, 13200020.40it/s]
Extracting ./data/cifar-10-python.tar.gz to ./data
Files already downloaded and verified
[extension] Compiling the JIT cpu_adam_x86 kernel during runtime now
[extension] Time taken to compile cpu_adam_x86 op: 36.72586536407471 seconds
[extension] Compilin

# Evaluate and reproduce the performance table

In [None]:
import re
import pandas as pd

ddp_fp32_result_path = './ddp_fp32.txt'
ddp_fp16_result_path = './ddp_fp16.txt'
low_level_zero_result_path = './low_level_zero.txt'

eval_with_DDP_fp32_command = f"python {resnet_experiment_eval_script_path} -c {resnet_experiment_model_path}/ckpt-fp32 -e 80 > {ddp_fp32_result_path}"
!{eval_with_DDP_fp32_command}
eval_with_DDP_fp16_command = f"python {resnet_experiment_eval_script_path} -c {resnet_experiment_model_path}/ckpt-fp16 -e 80 > {ddp_fp16_result_path}"
!{eval_with_DDP_fp16_command}
eval_with_low_level_zero_command = f"python {resnet_experiment_eval_script_path} -c {resnet_experiment_model_path}/ckpt-low_level_zero -e 80 > {low_level_zero_result_path}"
!{eval_with_low_level_zero_command}

with open(ddp_fp32_result_path, 'r') as file:
    ddp_fp32_result_txt = file.read()

with open(ddp_fp16_result_path, 'r') as file:
    ddp_fp16_result_txt = file.read()

with open(low_level_zero_result_path, 'r') as file:
    low_level_zero_result_txt = file.read()

accuracy_pattern = r'[0-9]+\.[0-9]+'
match_ddp_fp32 = re.search(accuracy_pattern, ddp_fp32_result_txt).group()
match_ddp_fp16 = re.search(accuracy_pattern, ddp_fp16_result_txt).group()
match_low_level_zero = re.search(accuracy_pattern, low_level_zero_result_txt).group()

headers = ['Model', 'Booster DDP with FP32', 'Booster DDP with FP16', 'Booster Low Level Zero']
performances = ['ResNet-18', f'{match_ddp_fp32}%', f'{match_ddp_fp16}%', f'{match_low_level_zero}%']
reproduced_result = pd.DataFrame([performances], columns=headers).reset_index(drop=True)
reproduced_result.head()

Unnamed: 0,Model,Booster DDP with FP32,Booster DDP with FP16,Booster Low Level Zero
0,ResNet-18,84.54%,84.77%,84.78%
