|<h2>Course:</h2>|<h1><a href="https://udemy.com/course/dullms_x/?couponCode=202508" target="_blank">A deep understanding of AI language model mechanisms</a></h1>|
|-|:-:|
|<h2>Part 2:</h2>|<h1>Large language models</h1>|
|<h2>Section:</h2>|<h1>Build a GPT</h1>|
|<h2>Lecture:</h2>|<h1><b>CodeChallenge: GELU vs. ReLU</b></h1>|

<br>

<h5><b>Teacher:</b> Mike X Cohen, <a href="https://sincxpress.com" target="_blank">sincxpress.com</a></h5>
<h5><b>Course URL:</b> <a href="https://udemy.com/course/dullms_x/?couponCode=202508" target="_blank">udemy.com/course/dullms_x/?couponCode=202508</a></h5>
<i>Using the code without the course may lead to confusion or errors.</i>

In [None]:
import numpy as np
from scipy.special import erf
import torch
import torch.nn as nn
import torch.nn.functional as F
import time

import matplotlib.pyplot as plt
# vector plots
import matplotlib_inline.backend_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

# Exercise 1: Implement ReLU and GELU from the math

In [None]:
def relu(x):
  """Rectified linear unit, max(0, x), applied element-wise.

  Parameters
  ----------
  x : scalar or array-like
      Input value(s).

  Returns
  -------
  NumPy scalar or ndarray with the broadcast shape of `x`: non-negative
  entries are passed through, negative entries become 0.
  """
  # np.maximum instead of the mask product x*(x>=0): the product evaluates
  # -inf*0 = nan for x=-inf and yields -0.0 for every negative finite input
  y = np.maximum(x, 0)
  return y

def gelu_exact(x):
  """Exact GELU, x/2 * (1 + erf(x/sqrt(2))), evaluated element-wise on `x`."""
  half_x = x / 2
  # erf(x/sqrt(2)) is the error-function term of the exact formula
  erf_term = erf(x / np.sqrt(2))
  return half_x * (1 + erf_term)

def gelu_approx(x):
  """Tanh-based approximation of GELU, evaluated element-wise on `x`.

  Computes x/2 * (1 + tanh( sqrt(2/pi) * (x + 0.044715*x^3) )).
  """
  cubic = x + .044715*x**3
  tanh_term = np.tanh(np.sqrt(2/np.pi) * cubic)
  return x/2 * (1 + tanh_term)

In [None]:
# x-values to calculate
x = np.linspace(-3,3,101)

# get the data
relu_y = relu(x)
geluE_y = gelu_exact(x)
geluA_y = gelu_approx(x)

# plot
plt.figure(figsize=(10,4))
plt.plot(x,relu_y,linewidth=2,label='ReLU')
# the approximation is drawn as markers at every 3rd point so it can be compared
# against the exact curve plotted as a line through the same region
plt.plot(x[::3],geluA_y[::3],'o',linewidth=2,label='GELU approx')
plt.plot(x,geluE_y,linewidth=2,label='GELU exact')

plt.legend()
# the title reports the correlation between the exact and approximate outputs
plt.gca().set(xlabel='Raw input',ylabel='Transformed output',xlim=x[[0,-1]],
              title=f'Exact and approximate GELU correlate at {np.corrcoef(geluE_y,geluA_y)[0,1]:.3f}')

plt.show()

# Exercise 2: Using PyTorch functions (not classes)

In [None]:
# 333 evenly spaced inputs on [-3,3] and the spacing between neighbouring points
input = torch.linspace(start=-3, end=3, steps=333)
dx = input[1] - input[0]

# apply both activations through the functional API
relu_out, gelu_out = F.relu(input), F.gelu(input)

# finite-difference estimates of the derivatives: successive differences over dx
# (each result has one element fewer than `input`)
d_relu, d_gelu = (torch.diff(out) / dx for out in (relu_out, gelu_out))

In [None]:
# two stacked panels: the activations on top, their empirical derivatives below
_,axs = plt.subplots(nrows=2,ncols=1,figsize=(10,5))

# top panel: activation outputs over the full input range
axs[0].plot(input,relu_out,linewidth=2,label='ReLU')
axs[0].plot(input,gelu_out,linewidth=2,label='GELU')
axs[0].legend()
axs[0].set(ylabel='Output value',xlim=input[[0,-1]])

# bottom panel: the derivatives have one sample fewer than the input,
# so they are plotted against input[:-1]
axs[1].plot(input[:-1],d_relu,linewidth=2,label='d/df(ReLU)')
axs[1].plot(input[:-1],d_gelu,linewidth=2,label='d/df(GELU)')
axs[1].legend()
axs[1].set(xlabel='Input value',ylabel='Derivative',xlim=input[[0,-1]])

plt.tight_layout()
plt.show()

# Exercise 3: Using PyTorch classes

In [None]:
# instantiate the module (class) versions of both activations
relu_class, gelu_class = nn.ReLU(), nn.GELU()

In [None]:
# calling the nn.ReLU instance applies it to the same `input` tensor passed to F.relu earlier
relu_out_c = relu_class(input)
# last expression of the cell: shows the type of the object the module returned
type(relu_out_c)

In [None]:
# element-wise difference between the functional (F.relu) and class-based (nn.ReLU) outputs;
# expected to be all zeros if the two implementations agree
relu_out - relu_out_c

# Exercise 4: Time the implementations on the CPU

In [None]:
numReps = 10_000
x = np.linspace(-3,3,10001)

# Time numReps calls of each implementation on the same input array.
# time.perf_counter() is used instead of time.time(): it is monotonic and has the
# highest available resolution, whereas time.time() is the wall clock and can be
# coarse or jump when the system clock is adjusted.
for label,fun in [('ReLU',relu), ('exact GELU',gelu_exact), ('approximate GELU',gelu_approx)]:
  start_time = time.perf_counter()
  for _ in range(numReps):
    fun(x)
  print(f'--- {time.perf_counter()-start_time:6.3f} sec. for {label}')

# Exercise 5: Time the implementations on the GPU

In [None]:
# re-import libraries after restarting the session
import torch
import time
import math
import torch.nn.functional as F

# Recreate the functions

In [None]:
def relu(x):
  """Rectified linear unit, max(0, x), applied element-wise to a tensor.

  Parameters
  ----------
  x : torch.Tensor
      Input tensor (any shape, any device).

  Returns
  -------
  torch.Tensor of the same shape: non-negative entries are passed through,
  negative entries become 0.
  """
  # clamp instead of the mask product x*(x>=0): the product evaluates
  # -inf*0 = nan for x=-inf and yields -0.0 for every negative finite input
  return torch.clamp(x, min=0)

def gelu_exact(x):
  """Exact GELU, x * 0.5 * (1 + erf(x/sqrt(2))), element-wise on a tensor `x`."""
  erf_term = torch.erf(x / math.sqrt(2))
  half_x = x * .5
  return half_x * (1 + erf_term)

def gelu_approx(x):
  """Tanh-based approximation of GELU, element-wise on a tensor `x`.

  Computes x/2 * (1 + tanh( sqrt(2/pi) * (x + 0.044715*x^3) )).
  """
  cubic = x + .044715*x**3
  tanh_term = torch.tanh(math.sqrt(2/math.pi) * cubic)
  return x/2 * (1 + tanh_term)

# Test on the CPU

In [None]:
# testing random numbers
x = torch.randn(1_000_000, device='cpu')
numReps = 100

print('** --------- ON THE CPU --------- **')

# Time numReps calls of each implementation on the same tensor.
# time.perf_counter() is used instead of time.time(): it is monotonic and has the
# highest available resolution, which matters for these sub-second measurements.
for label,fun in [('ReLU',relu), ('exact GELU',gelu_exact), ('approximate GELU',gelu_approx), ('F.gelu',F.gelu)]:
  start_time = time.perf_counter()
  for _ in range(numReps): fun(x)
  print(f'--- {time.perf_counter()-start_time:6.4f} sec. for {label}')

# Test on the GPU

In [None]:
# testing random numbers
x = torch.randn(1_000_000, device='cuda')
numReps = 100

print('** --------- ON THE GPU --------- **')

# Time numReps calls of each implementation on the same GPU tensor.
# CUDA kernels are launched asynchronously: the Python call returns as soon as the
# work is queued. The clock must therefore only be read after torch.cuda.synchronize(),
# otherwise the measurement covers little more than the kernel-launch overhead.
for label,fun in [('ReLU',relu), ('exact GELU',gelu_exact), ('approximate GELU',gelu_approx), ('F.gelu',F.gelu)]:

  # one untimed call so one-off start-up costs are not attributed to this function
  fun(x)

  # make sure nothing is still running before the clock starts
  torch.cuda.synchronize()
  start_time = time.perf_counter()
  for _ in range(numReps): fun(x)

  # wait for all queued kernels to finish before stopping the clock
  torch.cuda.synchronize()
  print(f'--- {time.perf_counter()-start_time:6.4f} sec. for {label}')