In [14]:
import os, sys
import ipdb  # for debugging
from tqdm import tqdm
from datetime import datetime
import platform, shutil  # detect platform type
import requests, zipfile, io
import math

# Pytorch
import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.nn import DataParallel

import sentencepiece as spm  # For the tokenizer

# Allow TF32 for matmuls and cuDNN ops. On Ampere-class GPUs (e.g. A100s) this
# trades a little float32 precision for speed.
torch.backends.cuda.matmul.allow_tf32 = True  # allow tf32 on matmul
torch.backends.cudnn.allow_tf32 = True  # allow tf32 on cudnn
# Release cached GPU memory held by PyTorch's allocator (useful when re-running the notebook)
torch.cuda.empty_cache()

### 1-) Set Parameters

In [25]:
# Main configuration for the notebook. Later cells read these names as globals,
# so re-run this cell after changing any value.

# ARCHITECTURE PARAMETERS
batch_size = 8  # Number of sequences sampled per batch (set as needed, typical range 8 to 128)
# 8 is good for a GPU with 4GB of memory, 128 is good for a GPU with 24GB of memory
context = 512  # Sequence length (in tokens) of each training sample; 512 is a good compromise for our level of resources
embed_size = 384  # Embedding size (width of the token vectors)
n_layers = 8  # Number of transformer layers
n_heads = 8  # Number of attention heads within each layer
BIAS = True  # Passed as bias= to the model's nn.Linear layers

# HYPERPARAMETERS
lr = 3e-4  # Initial learning rate
dropout = 0.05  # Dropout probability passed to nn.Dropout (0.05 = 5%)
weight_decay = 0.01  # Weight decay regularizer
grad_clip = 1.0  # Gradient clipping to prevent gradient explosion

# TRAINING parameters
train_iters = 100000  # Maximum number of training iterations
eval_interval = 50  # How often (in iterations) do we evaluate the performance?
eval_iters = 3  # Number of iterations while we evaluate performance
# NOTE: this name shadows Python's built-in compile() for the rest of the notebook.
compile = False  # Compile will accelerate performance in compatible systems
load_pretrained = False  # Do we want to load a pretrained model to continue training?

checkpoint_dir = "models/"  # Directory where checkpoints are stored

checkpoint_fn = "latest.pt"
# Name of checkpoint file to be saved during training

checkpoint_load_fn = "latest.pt"
# Name of checkpoint file to be loaded when load_pretrained is True
# You can load llm2.pt to experiment with a checkpoint that already reached a loss of 2.31

dtype = torch.bfloat16  # our target internal data type (its use is not shown in this section)

# MODE
# Do we want to run the model in inference mode?
inference = False

# DEVICE - Use the GPU when CUDA is available, otherwise fall back to the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"
print("device: You will be using: ", device)


device: You will be using:  cuda


### 2-) Load Dataset and tokenize the text

In [16]:
# Read the whole training corpus into memory as a single string.
with open("data/wiki.txt", "r", encoding="utf-8") as f:
    text = f.read()
print(f"Dataset size: {len(text)} characters")
# Show a short excerpt (characters 30000-30299) as a sanity check of the content.
print(text[30000:30300])

Dataset size: 178255102 characters
terms.
For example, there are objects in two groups (as shown on the right). The objects are various shapes, where one group has 3 of them while the other has 2. When the two groups combine into one, the overall amount (sum) of the shapes become 5.

Vertical Addition

The animation above demonstrate


In [17]:
# Load the SentencePiece tokenizer model stored next to the dataset.
sp = spm.SentencePieceProcessor(
    model_file="data/wiki_tokenizer.model"
)  # `sp` is used by encode()/decode() below
# Number of pieces in the tokenizer; the model uses it as its vocabulary size.
vocab_size = sp.GetPieceSize()
print(vocab_size)

4096


In [18]:
def encode(s):
    """Tokenise ``s`` with the notebook-level SentencePiece processor ``sp``.

    Returns exactly what ``sp.Encode(s)`` returns (for a single string, a list
    of integer token ids).
    """
    token_ids = sp.Encode(s)
    return token_ids


def decode(s):
    """Turn token ids back into text with the notebook-level processor ``sp``.

    ``s`` is passed straight to ``sp.Decode`` and its result is returned
    unchanged; this is the inverse of ``encode``.
    """
    decoded_text = sp.Decode(s)
    return decoded_text

In [19]:
# Round-trip sanity check: print the token ids of a sample sentence, then
# decode those ids again to confirm the original text comes back.
print(encode("The quick brown fox jumps over the lazy dog."))
print(decode(encode("The quick brown fox jumps over the lazy dog.")))

[310, 4031, 116, 2895, 1090, 570, 285, 1172, 599, 1853, 4039, 751, 264, 314, 817, 4049, 3429, 4051]
The quick brown fox jumps over the lazy dog.


In [20]:
# Tokenising the full corpus is slow, so the resulting token ids are cached on
# disk. Either way `data` ends up holding the encoded corpus: previously the
# else-branch only created `encoded_data`, so the first run on a machine
# without the cache failed later with a NameError on `data`.
encoded_data_path = "data/encoded_data.pt"
if os.path.exists(encoded_data_path):
    data = torch.load(encoded_data_path)
else:
    # Build the cache: one long 1-D tensor of token ids for the whole text.
    data = torch.tensor(encode(text), dtype=torch.long)
    torch.save(data, encoded_data_path)


In [21]:
len(data)  # total number of token ids in the encoded corpus

59211077

### 3-) Split the Data and Sample Batches

In [22]:
# Split the token stream in order (no shuffling): the first 90% is used for
# training and the remaining tail for validation.
data_size = len(data)
spl = int(0.9 * data_size)  # index of the first validation token
train_data = data[:spl]  # 90% of the data for training
val_data = data[spl:]  # 10% of the data for validation

# Report the sizes in millions of tokens.
print(
    f"Total data size: {data_size/1e6:.2f} Millions | Train data size: {len(train_data)/1e6:.2f} Millions | Validation data size: {len(val_data)/1e6:.2f}"
)

Total data size: 59.21 Millions | Train data size: 53.29 Millions | Validation data size: 5.92


In [23]:
# we will get 8 times of 512 tokens in each batch (8,512)
def get_batch(split):
    """Sample a random batch of (input, target) token windows.

    Args:
        split: ``"train"`` selects ``train_data``; any other value selects
            ``val_data``.

    Returns:
        ``(X, y)`` moved to ``device``. Both have ``batch_size`` rows of
        ``context`` tokens, and ``y`` is ``X`` shifted one token ahead so that
        every position has its next token as target.
    """
    source = train_data if split == "train" else val_data

    # One random start offset per sample. The exclusive upper bound of
    # len(source) - context keeps the target window (start + 1 ... start + context)
    # inside the data.
    starts = torch.randint(len(source) - context, (batch_size,))

    input_windows = []
    target_windows = []
    for start in starts:
        stop = start + context
        input_windows.append(source[start:stop])
        # e.g. input 1000:1512 -> target 1001:1513 (window moved one token forward)
        target_windows.append(source[start + 1 : stop + 1])

    X = torch.stack(input_windows)
    y = torch.stack(target_windows)
    return X.to(device), y.to(device)

In [24]:
# Smoke test: draw one training batch and inspect its shapes and first tokens.
xb,yb = get_batch("train")
print(xb.shape, yb.shape)
# yb is xb shifted by one token, so yb[0][0:9] should match xb[0][1:10].
print(xb[0][0:10], yb[0][0:10])

torch.Size([8, 512]) torch.Size([8, 512])
tensor([1178, 1347,   61,   13,   13, 4060, 4069,  396,  405,  562],
       device='cuda:0') tensor([1347,   61,   13,   13, 4060, 4069,  396,  405,  562, 4035],
       device='cuda:0')


### 4-) The Transformer Architecture with classes

In [32]:
#batch_size = 8 | context = 512  | embed_size = 384  | n_layers = 8  | n_heads = 8  

class GPT(nn.Module):
    """Language model: token embeddings plus sinusoidal positions, a stack of
    ``TransformerBlock`` layers, a final LayerNorm and a linear output head.

    All sizes are read from the notebook-level configuration (``vocab_size``,
    ``embed_size``, ``context``, ``n_layers``, ``n_heads``, ``BIAS``) at
    construction time.
    """

    def __init__(self):
        super().__init__()
        # Remember the context length this model was built with, so generate()
        # does not depend on the global being unchanged later on.
        self.context = context
        self.embeddings = nn.Embedding(vocab_size, embed_size)
        self.positions = PositionalEncoding(context, embed_size)
        # n_layers blocks, each one configured with n_heads attention heads.
        self.blocks = nn.Sequential(*[TransformerBlock(n_heads) for _ in range(n_layers)])
        self.ln = nn.LayerNorm(embed_size)
        # Projects every position to one score per vocabulary entry.
        self.final_linear = nn.Linear(embed_size, vocab_size, bias=BIAS)

        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Initialise Linear/Embedding weights from N(0, 0.02) and Linear biases to zero."""
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, std=0.02, mean=0)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, std=0.02, mean=0.0)

    def forward(self, input, targets=None):
        """Compute next-token logits and, when ``targets`` is given, the loss.

        Args:
            input: integer token ids of shape (BS, SL), with SL <= context.
            targets: optional token ids with the same shape as ``input``.

        Returns:
            ``(logits, loss)``. Without targets, ``logits`` has shape
            (BS, SL, vocab_size) and ``loss`` is ``None``. With targets,
            ``logits`` is flattened to (BS * SL, vocab_size) and ``loss`` is
            the mean cross-entropy over all positions.
        """
        token_emb = self.embeddings(input)  # (BS, SL, embed_size)
        # PositionalEncoding adds the position signal to the embeddings itself
        # (and applies dropout); it takes the embedded sequence, not indices.
        x = self.positions(token_emb)

        x = self.blocks(x)  # (BS, SL, embed_size)
        x = self.ln(x)

        logits = self.final_linear(x)  # (BS, SL, vocab_size)

        loss = None
        if targets is not None:
            BS, SL, VS = logits.shape
            # cross_entropy expects (N, classes) scores and (N,) class ids.
            logits = logits.view(BS * SL, VS)
            loss = F.cross_entropy(logits, targets.reshape(BS * SL))

        return logits, loss

    @torch.no_grad()
    def generate(self, input, max_length=500):
        """Sample ``max_length`` new tokens and append them to ``input``.

        Args:
            input: integer token ids of shape (BS, SL) used as the prompt.
            max_length: number of tokens to generate.

        Returns:
            Tensor of shape (BS, SL + max_length): the prompt followed by the
            sampled tokens. Only the last ``context`` tokens are fed to the
            model at each step; the full sequence is kept in the result.
        """
        for _ in range(max_length):
            window = input[:, -self.context:]  # the model sees at most `context` tokens
            logits, _ = self(window)  # (BS, window length, vocab_size)
            probs = F.softmax(logits[:, -1, :], dim=-1)  # distribution for the next token
            # Sample (not argmax) one token per row; keep the (BS, 1) shape so
            # it can be concatenated along the sequence dimension.
            next_token = torch.multinomial(probs, 1)
            input = torch.cat((input, next_token), dim=1)

        return input


class PositionalEncoding(nn.Module):
    """Fixed sinusoidal positional encoding that is added to token embeddings.

    Args:
        context: maximum sequence length supported.
        embed_size: embedding width (odd sizes are supported).
    """

    def __init__(self, context, embed_size):
        super().__init__()
        self.dropout = nn.Dropout(dropout)
        pe = torch.zeros(context, embed_size)
        position = torch.arange(0, context).unsqueeze(1)  # (context, 1)
        div_term = torch.exp(
            torch.arange(0, embed_size, 2) * -(torch.log(torch.tensor(10000.0)) / embed_size)
        )
        angles = position * div_term  # (context, ceil(embed_size / 2))
        pe[:, 0::2] = torch.sin(angles)
        # For an odd embed_size there is one cosine column less than sine columns.
        pe[:, 1::2] = torch.cos(angles[:, : embed_size // 2])
        # A buffer follows the module across devices but is not a trainable
        # parameter, so no gradient handling is needed in forward().
        self.register_buffer("pe", pe.unsqueeze(0))  # (1, context, embed_size)

    def forward(self, x):
        """Add the positional signal to ``x`` and apply dropout.

        Args:
            x: embedded sequence of shape (batch_size, seq_len, embed_size).

        Raises:
            ValueError: if ``seq_len`` exceeds the supported context length.
        """
        seq_len = x.shape[1]
        max_len = self.pe.shape[1]
        if seq_len > max_len:
            raise ValueError(
                f"sequence length {seq_len} exceeds the maximum context length {max_len}"
            )
        return self.dropout(x + self.pe[:, :seq_len, :])
    
class ForwardLayer(nn.Module):
    """Position-wise feed-forward network: expand, GELU, project back, dropout.

    Args:
        embedded_size: width of the input and output vectors; the hidden layer
            is six times as wide.
    """

    def __init__(self, embedded_size):
        super().__init__()
        hidden_size = 6 * embedded_size
        # Layers are created in the order they are applied.
        expand = nn.Linear(embedded_size, hidden_size, bias=BIAS)
        activation = nn.GELU()
        project = nn.Linear(hidden_size, embedded_size, bias=BIAS)
        regularize = nn.Dropout(dropout)
        self.network = nn.Sequential(expand, activation, project, regularize)

    def forward(self, x):
        """Apply the network to ``x``; the last dimension keeps its size."""
        output = self.network(x)
        return output

class TransformerBlock(nn.Module):
    """Placeholder for one transformer layer (not implemented yet).

    TODO: the block defines no sub-layers and no ``forward`` method, so calling
    an instance falls through to ``nn.Module``'s default ``forward``, which
    raises ``NotImplementedError``.

    Args:
        n_heads: number of attention heads; currently accepted but unused.
    """

    def __init__(self, n_heads):
        super(TransformerBlock, self).__init__()

In [33]:
# Smoke test: draw one training batch, build the model, move it to the selected
# device and run a single forward pass with targets so that a loss is returned.
x,y = get_batch("train")

model = GPT()
model = model.to(device)

logits, loss = model(x, y)

# Loss of the freshly initialised (untrained) model on this batch.
print("Loss: ", loss)

Loss:  tensor(8.3960, device='cuda:0', grad_fn=<NllLossBackward0>)
