In [2]:
import pandas as pd
import numpy as np

# 1 Assignment Overview
## 1.1 Profiling and Benchmarking
### 1.1.3 End-to-End Benchmarking

We start by timing a full forward and backward pass, using 5 warmup steps followed by 10 benchmark steps. The variance across measurements is low: the standard deviation is under 1% of the average for every model size.

In [None]:
# context_length=256
results_forwardandbackward_w5_n10 = {'small': {'forward_only': False, 'warmup_steps': 5, 'benchmark_steps': 10, 'avg': np.float64(0.038934001384768636), 'std': np.float64(0.00019227064981187754)}, 'medium': {'forward_only': False, 'warmup_steps': 5, 'benchmark_steps': 10, 'avg': np.float64(0.09081623799866065), 'std': np.float64(0.0005792887672763797)}, 'large': {'forward_only': False, 'warmup_steps': 5, 'benchmark_steps': 10, 'avg': np.float64(0.2036572418292053), 'std': np.float64(0.000794752277815363)}, 'xl': {'forward_only': False, 'warmup_steps': 5, 'benchmark_steps': 10, 'avg': np.float64(0.3937284264015034), 'std': np.float64(0.0009801636726452544)}, '2.7B': {'forward_only': False, 'warmup_steps': 5, 'benchmark_steps': 10, 'avg': np.float64(0.5568876736913808), 'std': np.float64(0.0001973441361972124)}}
df = pd.DataFrame(results_forwardandbackward_w5_n10).T
print(df.to_markdown())

|        | forward_only   |   warmup_steps |   benchmark_steps |       avg |         std |
|:-------|:---------------|---------------:|------------------:|----------:|------------:|
| small  | False          |              5 |                10 | 0.038934  | 0.000192271 |
| medium | False          |              5 |                10 | 0.0908162 | 0.000579289 |
| large  | False          |              5 |                10 | 0.203657  | 0.000794752 |
| xl     | False          |              5 |                10 | 0.393728  | 0.000980164 |
| 2.7B   | False          |              5 |                10 | 0.556888  | 0.000197344 |


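Now we look at the forward pass only. The times are roughly 1/3 of the forward-and-backward times above (about 0.31–0.41×, with the small model at the high end), which seems plausible since the backward pass takes around double the FLOPs of the forward pass.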

In [None]:
results_forwardonly_w5_n10 = {'small': {'forward_only': True, 'warmup_steps': 5, 'benchmark_steps': 10, 'avg': np.float64(0.01579690339276567), 'std': np.float64(0.00012533750080744773)}, 'medium': {'forward_only': True, 'warmup_steps': 5, 'benchmark_steps': 10, 'avg': np.float64(0.030950926861260088), 'std': np.float64(0.0006028892862904648)}, 'large': {'forward_only': True, 'warmup_steps': 5, 'benchmark_steps': 10, 'avg': np.float64(0.06545800276799127), 'std': np.float64(7.117279307500996e-05)}, 'xl': {'forward_only': True, 'warmup_steps': 5, 'benchmark_steps': 10, 'avg': np.float64(0.12748205178650096), 'std': np.float64(0.00012059490126578412)}, '2.7B': {'forward_only': True, 'warmup_steps': 5, 'benchmark_steps': 10, 'avg': np.float64(0.17424825453199447), 'std': np.float64(6.339024647792802e-05)}}
df = pd.DataFrame(results_forwardonly_w5_n10).T
print(df.to_markdown())

|        | forward_only   |   warmup_steps |   benchmark_steps |       avg |         std |
|:-------|:---------------|---------------:|------------------:|----------:|------------:|
| small  | True           |              5 |                10 | 0.0157969 | 0.000125338 |
| medium | True           |              5 |                10 | 0.0309509 | 0.000602889 |
| large  | True           |              5 |                10 | 0.065458  | 7.11728e-05 |
| xl     | True           |              5 |                10 | 0.127482  | 0.000120595 |
| 2.7B   | True           |              5 |                10 | 0.174248  | 6.33902e-05 |


Without warmup, we see a much higher standard deviation (for the small model it even exceeds the average) and inflated averages, with the largest impact on the smaller (earlier-run) models. This could be because the first couple of runs incur one-time overhead that does not matter in the long run, and long-run performance is what we care about.

In [5]:
results_forwardandbackward_w0_n10 = {'small': {'forward_only': False, 'warmup_steps': 0, 'benchmark_steps': 10, 'avg': np.float64(0.0712938507203944), 'std': np.float64(0.09671228201616748)}, 'medium': {'forward_only': False, 'warmup_steps': 0, 'benchmark_steps': 10, 'avg': np.float64(0.10467507569119334), 'std': np.float64(0.0287743982342857)}, 'large': {'forward_only': False, 'warmup_steps': 0, 'benchmark_steps': 10, 'avg': np.float64(0.2092732895980589), 'std': np.float64(0.011061761055204388)}, 'xl': {'forward_only': False, 'warmup_steps': 0, 'benchmark_steps': 10, 'avg': np.float64(0.3976040959940292), 'std': np.float64(0.007775187865667317)}, '2.7B': {'forward_only': False, 'warmup_steps': 0, 'benchmark_steps': 10, 'avg': np.float64(0.5620815886883065), 'std': np.float64(0.014736830905364812)}}
df = pd.DataFrame(results_forwardandbackward_w0_n10).T
print(df.to_markdown())

|        | forward_only   |   warmup_steps |   benchmark_steps |       avg |        std |
|:-------|:---------------|---------------:|------------------:|----------:|-----------:|
| small  | False          |              0 |                10 | 0.0712939 | 0.0967123  |
| medium | False          |              0 |                10 | 0.104675  | 0.0287744  |
| large  | False          |              0 |                10 | 0.209273  | 0.0110618  |
| xl     | False          |              0 |                10 | 0.397604  | 0.00777519 |
| 2.7B   | False          |              0 |                10 | 0.562082  | 0.0147368  |


In [6]:
results_forwardonly_w0_n10 = {'small': {'forward_only': True, 'warmup_steps': 0, 'benchmark_steps': 10, 'avg': np.float64(0.04331547362962738), 'std': np.float64(0.07872015609390083)}, 'medium': {'forward_only': True, 'warmup_steps': 0, 'benchmark_steps': 10, 'avg': np.float64(0.04426585491746664), 'std': np.float64(0.03240265923601385)}, 'large': {'forward_only': True, 'warmup_steps': 0, 'benchmark_steps': 10, 'avg': np.float64(0.07094462101813406), 'std': np.float64(0.011289025703075812)}, 'xl': {'forward_only': True, 'warmup_steps': 0, 'benchmark_steps': 10, 'avg': np.float64(0.13053130987100303), 'std': np.float64(0.007699488997647922)}, '2.7B': {'forward_only': True, 'warmup_steps': 0, 'benchmark_steps': 10, 'avg': np.float64(0.17898933509131892), 'std': np.float64(0.013183273384352073)}}
df = pd.DataFrame(results_forwardonly_w0_n10).T
print(df.to_markdown())

|        | forward_only   |   warmup_steps |   benchmark_steps |       avg |        std |
|:-------|:---------------|---------------:|------------------:|----------:|-----------:|
| small  | True           |              0 |                10 | 0.0433155 | 0.0787202  |
| medium | True           |              0 |                10 | 0.0442659 | 0.0324027  |
| large  | True           |              0 |                10 | 0.0709446 | 0.011289   |
| xl     | True           |              0 |                10 | 0.130531  | 0.00769949 |
| 2.7B   | True           |              0 |                10 | 0.178989  | 0.0131833  |


We see that even a couple of warmup steps greatly reduce the standard deviation and bring the averages back in line with the 5-warmup results.

In [7]:
results_forwardandbackward_w2_n10 = {'small': {'forward_only': False, 'warmup_steps': 2, 'benchmark_steps': 10, 'avg': np.float64(0.039019133895635605), 'std': np.float64(0.0001920012991025753)}, 'medium': {'forward_only': False, 'warmup_steps': 2, 'benchmark_steps': 10, 'avg': np.float64(0.0909191724145785), 'std': np.float64(0.0009752176591215706)}, 'large': {'forward_only': False, 'warmup_steps': 2, 'benchmark_steps': 10, 'avg': np.float64(0.20466322761494665), 'std': np.float64(0.0004757270254612791)}, 'xl': {'forward_only': False, 'warmup_steps': 2, 'benchmark_steps': 10, 'avg': np.float64(0.3947079855017364), 'std': np.float64(0.00036701886755863614)}, '2.7B': {'forward_only': False, 'warmup_steps': 2, 'benchmark_steps': 10, 'avg': np.float64(0.5580237713409588), 'std': np.float64(0.0002497751298431857)}}
df = pd.DataFrame(results_forwardandbackward_w2_n10).T
print(df.to_markdown())

|        | forward_only   |   warmup_steps |   benchmark_steps |       avg |         std |
|:-------|:---------------|---------------:|------------------:|----------:|------------:|
| small  | False          |              2 |                10 | 0.0390191 | 0.000192001 |
| medium | False          |              2 |                10 | 0.0909192 | 0.000975218 |
| large  | False          |              2 |                10 | 0.204663  | 0.000475727 |
| xl     | False          |              2 |                10 | 0.394708  | 0.000367019 |
| 2.7B   | False          |              2 |                10 | 0.558024  | 0.000249775 |


In [8]:
results_forwardonly_w2_n10 = {'small': {'forward_only': True, 'warmup_steps': 2, 'benchmark_steps': 10, 'avg': np.float64(0.01597115001641214), 'std': np.float64(0.00028466750118917957)}, 'medium': {'forward_only': True, 'warmup_steps': 2, 'benchmark_steps': 10, 'avg': np.float64(0.03091297169448808), 'std': np.float64(0.0006388669542270524)}, 'large': {'forward_only': True, 'warmup_steps': 2, 'benchmark_steps': 10, 'avg': np.float64(0.06533648789627478), 'std': np.float64(0.0001878294531717484)}, 'xl': {'forward_only': True, 'warmup_steps': 2, 'benchmark_steps': 10, 'avg': np.float64(0.1273713317932561), 'std': np.float64(0.000170664876891159)}, '2.7B': {'forward_only': True, 'warmup_steps': 2, 'benchmark_steps': 10, 'avg': np.float64(0.17423666048562153), 'std': np.float64(6.371911033421974e-05)}}
df = pd.DataFrame(results_forwardonly_w2_n10).T
print(df.to_markdown())

|        | forward_only   |   warmup_steps |   benchmark_steps |       avg |         std |
|:-------|:---------------|---------------:|------------------:|----------:|------------:|
| small  | True           |              2 |                10 | 0.0159712 | 0.000284668 |
| medium | True           |              2 |                10 | 0.030913  | 0.000638867 |
| large  | True           |              2 |                10 | 0.0653365 | 0.000187829 |
| xl     | True           |              2 |                10 | 0.127371  | 0.000170665 |
| 2.7B   | True           |              2 |                10 | 0.174237  | 6.37191e-05 |
