<a href="https://colab.research.google.com/github/soumyadip1995/BabyGPT/blob/main/Notebook/mixed_precision.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

A very preliminary auto mixed precision has been added. It can be achieved with a cuda enabled gpu.  A combination of pytorch's autocast and gradscaler is used for mixed precision. See more in the pytorch tutorial. Unfortunately the gpu blew up during training and cpu for now only supports bfloat16. Takes a hell of a long time to train. If anyone can improve upon it that would be awesome.

In [None]:
### enable cuda

In [2]:
words = open('/content/ALL_eminem.txt', 'r').read().split()
import torch
from torch import nn
import math
from math import sqrt
from torch.nn  import functional as F

chars = sorted(list(set(words)))
string2integer = {ch: i for i, ch in enumerate(chars)}
# print(string2integer)

integer2string = {i:ch for ch,i in string2integer.items()}
encode = lambda s: [string2integer[c] for c in s]
# print(encode)

decode = lambda l: ''.join([integer2string[i] for i in l])
# print(decode)

data = torch.tensor(encode(words), dtype = torch.long)
# print(data)
# data.size()

vocab_size = len(chars)

In [3]:

class NewGELU(nn.Module):
    """
    Implementation of the GELU activation function currently in Google BERT repo (identical to OpenAI GPT).
    Reference: Gaussian Error Linear Units (GELU) paper: https://arxiv.org/abs/1606.08415
    """
    def forward(self, x):
        return 0.5 * x * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0))))

In [4]:



class Attention(nn.Module):
  def __init__(self, embedded_dim, num_heads):
    super(Attention, self).__init__()
    self.atten = nn.Linear(embedded_dim, 3 * embedded_dim)
    self.projection = nn.Linear(embedded_dim, embedded_dim)
    self.num_heads = num_heads
    self.embedded_dim = embedded_dim
    self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

  def forward(self, x):
    B,T,C = x.size()
    q, k ,v  = self.atten(x).split(self.embedded_dim, dim=2)
    q = q.view(B, T, self.num_heads, C // self.num_heads).transpose(1, 2)
    k = k.view(B, T, self.num_heads, C // self.num_heads).transpose(1, 2)
    v = v.view(B, T, self.num_heads, C // self.num_heads).transpose(1, 2)


    # manual implementation of attention
    # from karpathy
    att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
    att = att.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
    att = F.softmax(att, dim=-1)
    y = att @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
    y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side

    # output projection
    y = self.projection(y)
    return y

dropout = 0.2
class FeedForward(nn.Module):
  def __init__(self, embedded_dim):
    super(FeedForward, self).__init__()
    self.net = nn.Sequential(nn.Linear(embedded_dim, 4 * embedded_dim),
    nn.Linear(4 * embedded_dim, embedded_dim),
    NewGELU(),
    nn.Dropout(dropout))

  def forward(self, x):
    return self.net(x)

### A simple Transformer Block
class Transformer(nn.Module):
  def __init__(self, embedded_dim, num_heads):
    super(Transformer, self).__init__()
    self.attention = Attention(embedded_dim,  num_heads)
    self.feed_forward = FeedForward(embedded_dim)
    self.layer_norm_1 = nn.LayerNorm(embedded_dim)
    self.layer_norm_2 = nn.LayerNorm(embedded_dim)

  def forward(self, x):

    x = x + self.attention(self.layer_norm_1(x))
    x = x + self.feed_forward(self.layer_norm_2(x))
    return x


class BabyGPTmodel(nn.Module):
  def __init__(self, vocab_size, block_size, num_layers, embedded_dim, num_heads):
    super(BabyGPTmodel, self).__init__()
    self.token = nn.Embedding(vocab_size, embedded_dim)
    self.positional_embeddings = nn.Embedding(block_size, embedded_dim)
    self.layers1 = nn.ModuleList([Transformer(embedded_dim, num_heads) for _ in range(num_heads)])
    self.ln_f = nn.LayerNorm(embedded_dim, eps = 1e-12) # final layer
    self.ln_head = nn.Linear(embedded_dim, vocab_size)


    # init all weights
    ## from karpathy
    self.apply(self._init_weights)
    # apply special scaled init to the residual projections, per GPT-2 paper
    for pn, p in self.named_parameters():
      if pn.endswith('projection.weight'):
        torch.nn.init.normal_(p, mean=0.0, std=0.02/math.sqrt(2 * num_layers))

        # report number of parameters
        print("number of parameters: %d" % (sum(p.nelement() for p in self.parameters()),))

  def _init_weights(self, module):
      if isinstance(module, nn.Linear):
          torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
          if module.bias is not None:
              torch.nn.init.zeros_(module.bias)
      elif isinstance(module, nn.Embedding):
          torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

  def forward(self, idx):
    device = idx.device
    b, t = idx.size()
    tok_emb = self.token(idx)
    position_ids = torch.arange(0, t, dtype = torch.long).unsqueeze(0)
    pos_emb = self.positional_embeddings(position_ids)
    x = tok_emb + pos_emb
    for layers1 in self.layers1:
      x = layers1(x)
    x = self.ln_f(x)
    logits = self.ln_head(x[:, -1, :])
    return logits






vocab_size =   len(chars)
block_size = 64
embedded_dim = 256
num_heads = 4
num_layers = 4

gpt = BabyGPTmodel(vocab_size, block_size, num_layers, embedded_dim, num_heads)

optimizer = torch.optim.AdamW(gpt.parameters(), lr=1e-3, weight_decay=1e-1)

number of parameters: 14527087
number of parameters: 14527087
number of parameters: 14527087
number of parameters: 14527087


In [5]:

#generate a small batch of data of inputs x and targets y
batch_size = 64


ix = torch.randint(len(data) - block_size, (batch_size,))
x = torch.stack([data[i:i+block_size] for i in ix])
y = torch.stack([data[i+block_size] for i in ix])
# print((x, y))

In [15]:
## Training
for i in range(50):
    logits = gpt(x)
    loss = F.cross_entropy(logits, y)
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
    print(i, loss.item())

0 10.018148422241211
1 9.170607566833496
2 8.259078979492188
3 8.082959175109863
4 7.2497758865356445
5 6.725225448608398
6 6.417089939117432
7 5.987790584564209
8 5.63279914855957
9 5.302235126495361
10 4.982885360717773
11 4.6512651443481445
12 4.3140387535095215
13 4.021770477294922
14 3.695655345916748
15 3.3894197940826416
16 3.058220863342285
17 2.7483205795288086
18 2.4375452995300293
19 2.1218338012695312
20 1.841033697128296
21 1.602030873298645
22 1.3430485725402832
23 1.1174341440200806
24 0.9393818378448486
25 0.7844941020011902
26 0.6616969108581543
27 0.5513318181037903
28 0.4566781520843506
29 0.3715827167034149
30 0.31922006607055664
31 0.27264896035194397
32 0.2315821349620819
33 0.19958558678627014
34 0.17476673424243927
35 0.15039648115634918
36 0.13078688085079193
37 0.1163887307047844
38 0.10363826155662537
39 0.0904260203242302
40 0.0806245356798172
41 0.07658655196428299
42 0.06799966096878052
43 0.06264723837375641
44 0.05733387544751167
45 0.0550566241145134
46

In [None]:
import torch, time, gc

# Timing utilities
start_time = None

def start_timer():
    global start_time
    gc.collect()
    torch.cuda.empty_cache()
    torch.cuda.reset_max_memory_allocated()
    torch.cuda.synchronize()
    start_time = time.time()

def end_timer_and_print(local_msg):
    torch.cuda.synchronize()
    end_time = time.time()
    print("\n" + local_msg)
    print("Total execution time = {:.3f} sec".format(end_time - start_time))
    print("Max memory used by tensors = {:.4f} bytes".format(torch.cuda.max_memory_allocated()))




    ### default precision according to pytorch tutorials


gpt = BabyGPTmodel(vocab_size, block_size, num_layers, embedded_dim, num_heads)
optimizer = torch.optim.AdamW(gpt.parameters(), lr=1e-3, weight_decay=1e-1)



start_timer()
## Training
for i in range(50):
    logits = gpt(x)
    loss = F.cross_entropy(logits, y)
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
    print(i, loss.item())
end_timer_and_print("Default precision:")

In [8]:
### adding torch.autocast
device = 'cuda' if torch.cuda.is_available() else 'cpu'
for i in range(50):
        # Runs the forward pass under ``autocast``.
        with torch.autocast(device_type= 'cpu', dtype=torch.bfloat16):
            logits = gpt(x)
            # output is float16 because linear layers ``autocast`` to float16.
            assert logits.dtype is torch.bfloat16

            loss = F.cross_entropy(logits, y)
            # loss is float32 because ``mse_loss`` layers ``autocast`` to float32.
            assert loss.dtype is torch.float32

        # Exits ``autocast`` before backward().
        # Backward passes under ``autocast`` are not recommended.
        # Backward ops run in the same ``dtype`` ``autocast`` chose for corresponding forward ops.
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        print(i, loss.item())                   # set_to_none=True here can modestly improve performance

0 10.077563285827637
1 9.06834602355957
2 8.122036933898926


KeyboardInterrupt: ignored

Takes a hell of along time to train on CPU. cpu only supports bfloat16.

In [9]:
#### training combining scalar as well as autocast


use_amp = True
scaler = torch.cuda.amp.GradScaler(enabled=use_amp)


scaler = torch.cuda.amp.GradScaler()
device = 'cuda' if torch.cuda.is_available() else 'cpu'


## Training
for i in range(50):
    with torch.autocast(device_type= device, dtype= torch.bfloat16, enabled=use_amp):
       logits = gpt(x)
       loss = F.cross_entropy(logits, y)
       scaler.scale(loss).backward()
       scaler.step(optimizer)
       scaler.update()
       optimizer.zero_grad()
       print(i, loss.item())

end_timer_and_print("Mixed precision:")



0 7.657021999359131
1 6.991031169891357
2 6.679247856140137
3 6.33894681930542
4 5.901840686798096


KeyboardInterrupt: ignored