In [1]:
# This notebook explains gradient clipping and learning rate schedulers

In [4]:
import numpy as np
import matplotlib.pyplot as plt


import torch
import torch.nn as nn
import torch.nn.functional as F

from transformers import get_cosine_schedule_with_warmup, get_linear_schedule_with_warmup


Simple demo of grad clipping

In [19]:
# tensor with gradients (like a weight matrix)
w = torch.tensor([[-1,3.3,2,-5,3,-2,-4,-5,1.5]], requires_grad = True) # consider this is weights in a model

# loss is sum of squares (L2), a dummy loss fun
loss = (w**2).sum()

# backprop
loss.backward()

#print grad and their norm before clipping
print('BEFORE CLIPPING')
print(f'Gradient vals: {w.grad[0].tolist()}')
print(f'Gradeient norm: {torch.norm(w.grad):.3f}')

#apply grad clipping
preClipVals = w.grad[0].detach() + 0
nn.utils.clip_grad_norm_([w],max_norm=1) #this is an inplace fun

print('AFTER CLIPPING')
print(f'Gradient vals: {w.grad[0].tolist()}')
print(f'Gradeient norm: {torch.norm(w.grad):.3f}')


BEFORE CLIPPING
Gradient vals: [-2.0, 6.599999904632568, 4.0, -10.0, 6.0, -4.0, -8.0, -10.0, 3.0]
Gradeient norm: 19.712
AFTER CLIPPING
Gradient vals: [-0.10146141052246094, 0.3348226547241211, 0.20292282104492188, -0.5073070526123047, 0.3043842315673828, -0.20292282104492188, -0.40584564208984375, -0.5073070526123047, 0.1521921157836914]
Gradeient norm: 1.000


In [20]:
# Display the single gradient row of w as a tensor; at this point w.grad has
# already been rescaled in place by clip_grad_norm_ in the previous cell.
w.grad[0]

tensor([-0.1015,  0.3348,  0.2029, -0.5073,  0.3044, -0.2029, -0.4058, -0.5073,
         0.1522])

![title](../images/PrePostClip.png)

In [21]:
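# The correlation between the pre- and post-clipping gradient values is 1.
# That means the relative proportions of the individual gradient values don't change:
# every gradient value is scaled down by the same factor (the weights themselves are untouched),
# such that the norm of the gradient becomes 1.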