# Building a GPT from scratch

In [3]:
cd '/content/drive/MyDrive/Data Projects/02 GPT from scratch'

/content/drive/MyDrive/Data Projects/02 GPT from scratch


In [4]:
#Loading the Shakespear dataset form input.txt
with open('input.txt', 'r', encoding = 'utf-8') as f:
  text = f.read()

In [7]:
#first 1000 characters
print(text[:1000])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for bread, not in thirst for revenge.



In [10]:
#total number of characters
print("Total number of characters:" ,len(text))

Total number of characters: 1115394


In [15]:
#unique characters in the dataset
chars = sorted(list(set(text)))
#vocabulary size
vocab_size = len(chars)
print("Character set:", *chars, sep = "", end = "\n\n")
print("Vocabulary size:", vocab_size)

Character set:
 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz

Vocabulary size: 65


## Building Tokenizer

In [18]:
##creating character map
char_map = list(enumerate(chars))
char_map[:10]

[(0, '\n'),
 (1, ' '),
 (2, '!'),
 (3, '$'),
 (4, '&'),
 (5, "'"),
 (6, ','),
 (7, '-'),
 (8, '.'),
 (9, '3')]

In [24]:
#encoding map
encode_map = {ch:i for i,ch in char_map}
#decoding map
decode_map = {i:ch for i,ch in char_map}
print(encode_map)
print(decode_map)

{'\n': 0, ' ': 1, '!': 2, '$': 3, '&': 4, "'": 5, ',': 6, '-': 7, '.': 8, '3': 9, ':': 10, ';': 11, '?': 12, 'A': 13, 'B': 14, 'C': 15, 'D': 16, 'E': 17, 'F': 18, 'G': 19, 'H': 20, 'I': 21, 'J': 22, 'K': 23, 'L': 24, 'M': 25, 'N': 26, 'O': 27, 'P': 28, 'Q': 29, 'R': 30, 'S': 31, 'T': 32, 'U': 33, 'V': 34, 'W': 35, 'X': 36, 'Y': 37, 'Z': 38, 'a': 39, 'b': 40, 'c': 41, 'd': 42, 'e': 43, 'f': 44, 'g': 45, 'h': 46, 'i': 47, 'j': 48, 'k': 49, 'l': 50, 'm': 51, 'n': 52, 'o': 53, 'p': 54, 'q': 55, 'r': 56, 's': 57, 't': 58, 'u': 59, 'v': 60, 'w': 61, 'x': 62, 'y': 63, 'z': 64}
{0: '\n', 1: ' ', 2: '!', 3: '$', 4: '&', 5: "'", 6: ',', 7: '-', 8: '.', 9: '3', 10: ':', 11: ';', 12: '?', 13: 'A', 14: 'B', 15: 'C', 16: 'D', 17: 'E', 18: 'F', 19: 'G', 20: 'H', 21: 'I', 22: 'J', 23: 'K', 24: 'L', 25: 'M', 26: 'N', 27: 'O', 28: 'P', 29: 'Q', 30: 'R', 31: 'S', 32: 'T', 33: 'U', 34: 'V', 35: 'W', 36: 'X', 37: 'Y', 38: 'Z', 39: 'a', 40: 'b', 41: 'c', 42: 'd', 43: 'e', 44: 'f', 45: 'g', 46: 'h', 47: 'i',

In [25]:
def encoder(text: str):
  encoded_arr = [encode_map[s] for s in text]
  return encoded_arr

In [26]:
def decoder(arr):
  decoded_text = ''.join([decode_map[i] for i in arr])
  return decoded_text

In [28]:
#testing encoder and decoder
arr = encoder("ABCD")
arr

[13, 14, 15, 16]

In [29]:
decoder(arr)

'ABCD'

## Tokenizing Shakespear dataset

In [32]:
encoded_shakespear = encoder(text)
encoded_shakespear[:20]

[18, 47, 56, 57, 58, 1, 15, 47, 58, 47, 64, 43, 52, 10, 0, 14, 43, 44, 53, 56]

In [34]:
#converting to tensor for parallel processing
import torch
data = torch.tensor(encoded_shakespear, dtype=torch.long)
print(data.shape, data.dtype)
print(data[:20])

torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56])


## Splitting data into training and validation sets

In [35]:
n = int(0.9*len(data))
train_data = data[:n] #first 90% will be train data
val_data = data[n:] #last 10% will be test data

## Manually creating batches

In [36]:
#setting batch size = 4 and block size = 8
batch_size = 4 #number of blocks to process parallely
block_size = 8 #size of each block/context length

In [38]:
torch.manual_seed(1337)

def get_batch():
  #generating 4 random numbers between between 0 and total data length - block_size
  ix = torch.randint(low=0, high=len(data)-block_size, size=(batch_size,))
  x = torch.stack([data[i: i+block_size] for i in ix])
  y = torch.stack([data[i+1: i+block_size+1] for i in ix])
  return x, y

xb, yb = get_batch()
print('inputs:')
print(xb.shape)
print(xb)
print('targets:')
print(yb.shape)
print(yb)

print('----')

for b in range(batch_size): # batch dimension
    for t in range(block_size): # time dimension
        context = xb[b, :t+1]
        target = yb[b,t]
        print(f"when input is {context.tolist()} the target: {target}")

inputs:
torch.Size([4, 8])
tensor([[59, 52, 49, 47, 52, 42,  1, 40],
        [53, 54, 43, 44, 59, 50,  1, 50],
        [27, 24, 33, 25, 26, 21, 13, 10],
        [47, 41, 43,  1, 53, 60, 43, 56]])
targets:
torch.Size([4, 8])
tensor([[52, 49, 47, 52, 42,  1, 40, 56],
        [54, 43, 44, 59, 50,  1, 50, 39],
        [24, 33, 25, 26, 21, 13, 10,  0],
        [41, 43,  1, 53, 60, 43, 56, 58]])
----
when input is [59] the target: 52
when input is [59, 52] the target: 49
when input is [59, 52, 49] the target: 47
when input is [59, 52, 49, 47] the target: 52
when input is [59, 52, 49, 47, 52] the target: 42
when input is [59, 52, 49, 47, 52, 42] the target: 1
when input is [59, 52, 49, 47, 52, 42, 1] the target: 40
when input is [59, 52, 49, 47, 52, 42, 1, 40] the target: 56
when input is [53] the target: 54
when input is [53, 54] the target: 43
when input is [53, 54, 43] the target: 44
when input is [53, 54, 43, 44] the target: 59
when input is [53, 54, 43, 44, 59] the target: 50
when input 

In [39]:
print(xb) # our input to the transformer

tensor([[59, 52, 49, 47, 52, 42,  1, 40],
        [53, 54, 43, 44, 59, 50,  1, 50],
        [27, 24, 33, 25, 26, 21, 13, 10],
        [47, 41, 43,  1, 53, 60, 43, 56]])
