In [2]:
import os, sys
import ipdb  # for debugging
from tqdm import tqdm
from datetime import datetime
import platform, shutil  # detect platform type
import requests, zipfile, io

# Pytorch
import torch
import torch.nn as nn
from torch.nn import functional as F

import sentencepiece as spm  # For the tokenizer

# Enable TF32 math for matmuls and cuDNN convolutions. TF32 trades a little float32
# precision for speed on GPUs that support it (Ampere, e.g. A100, and newer).
torch.backends.cuda.matmul.allow_tf32 = True  # allow tf32 on matmul
torch.backends.cudnn.allow_tf32 = True  # allow tf32 on cudnn
# Release cached GPU memory that PyTorch's allocator is holding but not currently using
torch.cuda.empty_cache()

### 1-) Requirements installed

In [3]:
#!pip install wandb
# (credential-like string removed: never store API keys in the notebook; supply the W&B key via the WANDB_API_KEY environment variable or `wandb login`)
#!pip install jupyter notebook

In [4]:
#!nvidia-smi

In [5]:
# Prefer the CUDA GPU when one is usable; otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Report which device was selected
print(f"Using device: {device}")


Using device: cuda


### 2-) Parameters

- Architecture Parameters

In [6]:
# Embedded transformer inputs will have shape (batch_size, context_size, embedded_size) = (8, 512, 384):
# 8 sequences per batch, 512 tokens per sequence, one 384-dimensional vector per token.
batch_size = 8 # number of sequences (samples) in each iteration
context_size = 512  # number of tokens in each sequence (the context window)
embedded_size = 384  # size of the embedding vector for each token
n_layers = 7 # number of stacked transformer layers
# NOTE(review): embedded_size (384) is not evenly divisible by n_heads (7); confirm the
# attention module (not shown in this section) handles the per-head size as intended.
n_heads = 7 # number of heads in the multi-head attention mechanism
BIAS = True # if True, add bias to the output of the linear layer

- HyperParameters 

In [7]:
lr = 3e-4 # learning rate (optimizer step size)
dropout = 0.05 # randomly zero out some input units with probability dropout. This helps reduce overfitting.
weight_decay = 0.01 # weight-decay (regularization) coefficient. LARGER values give stronger regularization; similar in spirit to an L2 penalty.
grad_clip = 1.0 # gradient clipping threshold, to avoid exploding gradients.

- Training Parameters

In [27]:
train_iterations = 100000 # total number of training iterations
eval_interval = 50 # each 50 iterations, the model will be evaluated on the validation set
eval_iters = 10 # presumably the number of batches averaged per evaluation; confirm against the evaluation loop
# NOTE(review): this name shadows Python's built-in compile() for the rest of the notebook
compile = False # if True, the model will be compiled before training
checkpoint_path = 'data/models/' # directory for model checkpoints
# NOTE(review): the two filenames below already include 'data/' and do not sit under checkpoint_path; confirm how they are combined
checkpoint_fn = 'data/my_latest.pt' # presumably the file the checkpoint is saved to
checkpoint_load_fn = 'data/my_latest.pt' # presumably the file the checkpoint is loaded from
dtype = torch.bfloat16 # data type for the model

- Mode

In [9]:
# Run mode: True means the model is used for inference only
inference = False
# Note: this rebinds `device` to a plain string (earlier it was a torch.device object)
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

Using device: cuda


### 3-) wandb Logging

In [16]:
#import wandb
#wandb.login()

wandb: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
wandb: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
wandb: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:wandb: Appending key for api.wandb.ai to your netrc file: C:\Users\User\_netrc


True

In [17]:
# Weights & Biases logging switch and run identity
wandb_log = True
wandb_project = "llm1"
# Run name is the project prefix followed by a start timestamp, e.g. llm120240101-120000
wandb_run_name = f"llm1{datetime.now().strftime('%Y%m%d-%H%M%S')}"

if wandb_log:
    # Imported lazily so the notebook still runs without wandb when logging is off
    import wandb
    wandb.init(name=wandb_run_name, project=wandb_project)
    # The string literal below is a disabled config upload; as written it has no effect.
    """ wandb.config.update({
        "lr": lr,
        "dropout": dropout,
        "weight_decay": weight_decay,
        "grad_clip": grad_clip,
        "train_iterations": train_iterations,
        "eval_interval": eval_interval,
        "eval_iters": eval_iters,
        "compile": compile,
        "checkpoint_path": checkpoint_path,
        "checkpoint_fn": checkpoint_fn,
        "checkpoint_load_fn": checkpoint_load_fn,
        "dtype": dtype,
        "inference": inference,
        "device": device
    }) """


wandb: Currently logged in as: ahmet-erdonmez77 (ahmet-erdonmez77-dci). Use `wandb login --relogin` to force relogin


### 4-) Load Dataset

In [19]:
# Read the entire raw corpus into memory as a single string
with open('data/wiki.txt', mode='r', encoding="utf-8") as corpus_file:
    text = corpus_file.read()

# Report the corpus size and peek at a 300-character excerpt
print(f"Dataset size: {len(text)} characters")
print(text[30000:30300])

Dataset size: 178255102 characters
terms.
For example, there are objects in two groups (as shown on the right). The objects are various shapes, where one group has 3 of them while the other has 2. When the two groups combine into one, the overall amount (sum) of the shapes become 5.

Vertical Addition

The animation above demonstrate


### 5-) Tokenize the dataset

In [20]:
# Load the pre-trained SentencePiece model; SentencePiece is a subword tokenizer (unlike word-level tokenizers such as NLTK's word_tokenize)
sp = spm.SentencePieceProcessor(model_file="data/wiki_tokenizer.model")
# Number of distinct tokens in the tokenizer's vocabulary
vocab_size = sp.vocab_size()
print(vocab_size)

4096


In [21]:
def encode(s):
    """Tokenize `s` with the notebook-level SentencePiece processor `sp`.

    Returns the result of `sp.Encode(s)` unchanged.
    """
    token_ids = sp.Encode(s)
    return token_ids
def decode(s):
    """Convert token ids `s` back to text with the notebook-level SentencePiece processor `sp`.

    Returns the result of `sp.Decode(s)` unchanged.
    """
    decoded_text = sp.Decode(s)
    return decoded_text

In [22]:
# Round-trip sanity check: encode a sentence to ids, then decode the ids back to text
sample_sentence = "The quick brown fox jumps over the lazy dog."
sample_ids = encode(sample_sentence)
print(sample_ids)
print(decode(sample_ids))

[310, 4031, 116, 2895, 1090, 570, 285, 1172, 599, 1853, 4039, 751, 264, 314, 817, 4049, 3429, 4051]
The quick brown fox jumps over the lazy dog.


- The tutor provided a pre-encoded `encoded_data.pt` so the full corpus does not have to be tokenized again (saves time).

In [23]:
# Load the cached token ids if they exist; otherwise tokenize the corpus once and cache it.
# Either way `data` ends up bound to the 1-D tensor of token ids. (Previously the else branch
# only created and saved `encoded_data`, leaving `data` undefined on a first run.)
if os.path.exists('data/encoded_data.pt'):
    # NOTE: torch.load unpickles the file; only load caches you created or trust
    data = torch.load('data/encoded_data.pt')
else: # shows how to create encoded_data.pt
    encoded_data = torch.tensor(encode(text), dtype=torch.long)
    torch.save(encoded_data, 'data/encoded_data.pt')
    data = encoded_data  # make the freshly built tensor available to the following cells

In [26]:
# Display the token-id tensor (truncated repr) together with the total number of tokens
data, len(data)

(tensor([4031,   13, 4061,  ...,   13,   13,   13]), 59211077)

### 6-) Define the model

In [28]:
# Split the token stream sequentially: the first 90% is for training, the final 10% for validation
data_size = len(data)
spl = int(0.9 * data_size)  # index of the first validation token
train_data, val_data = data[:spl], data[spl:]

print(f"Total data size: {data_size/1e6:.2f} Millions | Train data size: {len(train_data)/1e6:.2f} Millions | Validation data size: {len(val_data)/1e6:.2f}")

Total data size: 59.21 Millions | Train data size: 53.29 Millions | Validation data size: 5.92


In [30]:
# Each call produces batch_size windows of context_size tokens, i.e. an (8, 512) batch with the settings above
def get_batch(split):
    """Sample a random batch of input windows and their next-token targets.

    Args:
        split: "train" draws from `train_data`; any other value draws from `val_data`.

    Returns:
        A pair (X, y), both moved to `device`. Each row of X is a window of
        `context_size` consecutive tokens; the matching row of y is the same
        window shifted one token forward.
    """
    source = train_data if split == "train" else val_data
    # randint's upper bound is exclusive, so start + context_size + 1 never runs past the end of source
    starts = torch.randint(len(source) - context_size, (batch_size,))

    inputs, targets = [], []
    for start in starts:
        inputs.append(source[start:start + context_size])
        # e.g. if the input window is 1000:1512, the target window is 1001:1513
        targets.append(source[start + 1:start + context_size + 1])

    X = torch.stack(inputs)
    y = torch.stack(targets)
    return X.to(device), y.to(device)

In [32]:
# Draw one training batch and show the shapes of the inputs and targets
x,y = get_batch("train")
x.shape, y.shape

(torch.Size([8, 512]), torch.Size([8, 512]))

- The key idea: the target `y` is the input `x` shifted forward by one token, so at every position the model learns to predict the next token.

In [33]:
# Compare the first 10 tokens of the first sample: y is x shifted forward by one token
print(x[0][:10])
print(y[0][:10])

tensor([ 709,  379,  658,   13,   13, 3463,  442,  709,  379,  658],
       device='cuda:0')
tensor([ 379,  658,   13,   13, 3463,  442,  709,  379,  658,  299],
       device='cuda:0')
