# **Importing Text Data**

In [2]:
import urllib.request
# Raw-file URL of the-verdict.txt; the adjacent string literals are concatenated into one URL.
url = ("https://raw.githubusercontent.com/rasbt/"
"LLMs-from-scratch/main/ch02/01_main-chapter-code/"
"the-verdict.txt")
# NOTE(review): hard-coded absolute path under /content/drive — presumably a mounted
# Google Drive in Colab. urlretrieve does not create missing directories, so the
# target folder has to exist already. Consider a configurable data directory.
file_path = "/content/drive/My Drive/LLM/Data/the-verdict.txt"
# Downloads on every run and overwrites an existing file. The value displayed by the
# cell is urlretrieve's return value: a (local path, response headers) tuple.
urllib.request.urlretrieve(url, file_path)

('/content/drive/My Drive/LLM/Data/the-verdict.txt',
 <http.client.HTTPMessage at 0x79d7ef359c50>)

In [3]:
# Read the whole file into one string. The encoding is passed explicitly so the
# result does not depend on the platform's default encoding.
with open(file_path,"r",encoding="utf-8") as f:
  raw_text = f.read()

# Sanity check: total character count and the first 99 characters.
print("Total number of character: ", len(raw_text))
print(raw_text[:99])

Total number of character:  20479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


### Tokenization

In [4]:
import re

In [5]:
# Split on single punctuation characters, double dashes and whitespace. Because the
# pattern is wrapped in a capturing group, re.split keeps the delimiters in the
# result, so every punctuation mark becomes a token of its own.
preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', raw_text)
# Strip each piece and drop the ones that are empty or whitespace-only.
preprocessed = [item.strip() for item in preprocessed if item.strip()]
print(len(preprocessed))

4690


In [6]:
print(preprocessed[:20])

['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was']


### Assign token ids to tokens

In [7]:
# Vocabulary: the unique tokens in sorted order, followed by two special tokens.
all_words = sorted(set(preprocessed))
# '<|endoftext|>' is a separator placed between independent documents;
# '<|unk|>' stands in for any token that is not part of the vocabulary.
all_words.extend(['<|endoftext|>','<|unk|>'])
vocab_size = len(all_words)
print(vocab_size)

1132


In [8]:
vocab = {token:i for i,token in enumerate(all_words)}

In [9]:
print(list(vocab.items())[-5:])

[('younger', 1127), ('your', 1128), ('yourself', 1129), ('<|endoftext|>', 1130), ('<|unk|>', 1131)]


In [10]:
class SimpleTokenizer:
  """Punctuation/whitespace tokenizer backed by a fixed token -> id vocabulary."""

  def __init__(self, vocab) -> None:
    # Forward map for encoding; the inverse map turns ids (e.g. model output) back into tokens.
    self.token_to_id = vocab
    self.id_to_token = dict((idx, tok) for tok, idx in vocab.items())

  def encode(self, text):
    """Split `text` into tokens and return their ids; tokens missing from the vocabulary map to '<|unk|>'."""
    ids = []
    for piece in re.split(r'([,.:;?_!"()\']|--|\s)', text):
      token = piece.strip()
      if not token:
        # Empty and whitespace-only pieces are not tokens.
        continue
      key = token if token in self.token_to_id else '<|unk|>'
      ids.append(self.token_to_id[key])
    return ids

  def decode(self, ids):
    """Join the tokens for `ids` with single spaces, then drop the whitespace in front of punctuation."""
    joined = " ".join(self.id_to_token[i] for i in ids)
    return re.sub(r'\s+([,.:;?!"()\'])', r'\1', joined)

In [11]:
tokenizer = SimpleTokenizer(vocab)
text = """
A slight shade of constraint crossed Mrs. Gisburn's open countenance. "It's his ridiculous modesty, you know. He says they're not fit to have about; he's sent them all away except one--my portrait--and that I have to keep upstairs."
"""
encoded_ids = tokenizer.encode(text)
print(encoded_ids)

[11, 901, 873, 722, 286, 301, 67, 7, 38, 2, 850, 733, 295, 7, 1, 56, 2, 850, 549, 842, 679, 5, 1126, 596, 7, 48, 858, 994, 2, 819, 711, 443, 1016, 530, 118, 9, 533, 2, 850, 869, 990, 145, 187, 405, 729, 6, 697, 782, 6, 157, 987, 53, 530, 1016, 591, 1055, 7, 1]


In [12]:
decoded_text = tokenizer.decode(encoded_ids)
print(decoded_text)

A slight shade of constraint crossed Mrs. Gisburn' s open countenance." It' s his ridiculous modesty, you know. He says they' re not fit to have about; he' s sent them all away except one -- my portrait -- and that I have to keep upstairs."


In [13]:
unseen_text = "Hello, I was checking out if there's any update." # lets test our tokenizer from text outside the training data
print(tokenizer.decode(tokenizer.encode(unseen_text)))

<|unk|>, I was <|unk|> out if there' s any <|unk|>.


### Byte Pair Encoding (more sophisticated tokenizer)

In [14]:
!pip install tiktoken



In [15]:
import tiktoken

In [16]:
print(tiktoken.__version__)

0.9.0


In [17]:
tk_tokenizer = tiktoken.get_encoding("gpt2")

In [18]:
text = "Hello, I was checking out if there's any update. <|endoftext|> Mr. Rickham wanted to see it, she began, as if excusing herself."
# allowed_special lets the literal '<|endoftext|>' in the input be encoded as one
# special-token id (50256 in the output below). Without it, tiktoken's encode is
# expected to reject text that contains a special token — see the tiktoken docs.
encoded_text = tk_tokenizer.encode(text, allowed_special={'<|endoftext|>'})

print(encoded_text)

[15496, 11, 314, 373, 10627, 503, 611, 612, 338, 597, 4296, 13, 220, 50256, 1770, 13, 8759, 2763, 2227, 284, 766, 340, 11, 673, 2540, 11, 355, 611, 2859, 3500, 5223, 13]


In [19]:
decoded_text = tk_tokenizer.decode(encoded_text)

print(decoded_text)

Hello, I was checking out if there's any update. <|endoftext|> Mr. Rickham wanted to see it, she began, as if excusing herself.


Let's try the BPE tokenizer on an unknown word.

In [20]:
unknown_word = "Akwirw ier"
encoded_text = tk_tokenizer.encode(unknown_word)
print(encoded_text)

[33901, 86, 343, 86, 220, 959]


In [21]:
# Decode every id on its own to see which sub-word piece it stands for.
# The loop variable is `token_id` rather than `id`: a cell-level loop variable stays in
# the notebook namespace afterwards, and `id` would shadow the builtin in later cells.
for token_id in encoded_text:
  print(tk_tokenizer.decode([token_id]))

Ak
w
ir
w
 
ier


In [22]:
#reconstruct the original token from token ids
print(tk_tokenizer.decode(encoded_text))

Akwirw ier


### Data Sampling

In [23]:
#use Dataset and DataLoader classes from PyTorch for sampling
import torch
from torch.utils.data import Dataset, DataLoader

In [24]:
class GPTDataset(Dataset):
  """Sliding-window next-token dataset built from a single text.

  The text is tokenized once. Every sample is a window of `max_length` token ids
  paired with the same window shifted one position to the right (the targets).
  A text with at most `max_length` tokens yields an empty dataset.

  Args:
    text: Raw text to tokenize.
    tokenizer: Any object whose `encode(text)` returns a sequence of token ids.
    max_length: Number of token ids per window; must be positive.
    stride: Offset between the starts of consecutive windows; must be positive.

  Raises:
    ValueError: If `max_length` or `stride` is not positive.
  """

  def __init__(self, text, tokenizer, max_length, stride):
    # Fail early with a clear message instead of silently building empty or
    # zero-length windows (or hitting range()'s own error for stride == 0).
    if max_length <= 0:
      raise ValueError(f"max_length must be positive, got {max_length}")
    if stride <= 0:
      raise ValueError(f"stride must be positive, got {stride}")

    self.inputs = []
    self.targets = []

    token_ids = tokenizer.encode(text)

    # Stop at len - max_length so the shifted target window never runs past the end.
    for start in range(0, len(token_ids) - max_length, stride):
      end = start + max_length
      self.inputs.append(torch.tensor(token_ids[start:end]))
      self.targets.append(torch.tensor(token_ids[start + 1:end + 1]))

  def __len__(self):
    """Number of (input, target) windows."""
    return len(self.inputs)

  def __getitem__(self, index):
    """Return the (input, target) tensor pair at `index`."""
    return self.inputs[index], self.targets[index]

In [25]:
def create_dataloader(text, batch_size=8, max_length=4, stride=4, drop_last=True, num_workers=0, shuffle=True):
  """Tokenize `text` with the 'gpt2' tiktoken encoding and serve the resulting windows through a DataLoader.

  `max_length` and `stride` are passed to GPTDataset; the remaining keyword
  arguments are forwarded unchanged to torch's DataLoader.
  """
  return DataLoader(
    GPTDataset(text, tiktoken.get_encoding('gpt2'), max_length, stride),
    batch_size=batch_size,
    shuffle=shuffle,
    drop_last=drop_last,
    num_workers=num_workers,
  )

In [26]:
dataloader = create_dataloader(raw_text, shuffle=False)

In [27]:
data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)

[tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]]), tensor([[  367,  2885,  1464,  1807],
        [ 3619,   402,   271, 10899],
        [ 2138,   257,  7026, 15632],
        [  438,  2016,   257,   922],
        [ 5891,  1576,   438,   568],
        [  340,   373,   645,  1049],
        [ 5975,   284,   502,   284],
        [ 3285,   326,    11,   287]])]


### How to create Embeddings (or initial embeddings for LLM without word2vec)

In [28]:
#lets consider a sample vocab of size 6
sample_vocab_size = 6
sample_vector_dim = 3
sample_token_ids = torch.tensor([5,3,1,4])

In [37]:
#create a embedding layer with random initial weights
torch.manual_seed(123)
sample_embedding_layer = torch.nn.Embedding(sample_vocab_size, sample_vector_dim)
print(sample_embedding_layer.weight)

Parameter containing:
tensor([[ 0.3374, -0.1778, -0.1690],
        [ 0.9178,  1.5810,  1.3010],
        [ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-1.1589,  0.3255, -0.6315],
        [-2.8400, -0.7849, -1.4096]], requires_grad=True)


In [38]:
#convert token_ids to vectors
sample_vectors = sample_embedding_layer(sample_token_ids)

print(sample_vectors)

tensor([[-2.8400, -0.7849, -1.4096],
        [-0.4015,  0.9666, -1.1481],
        [ 0.9178,  1.5810,  1.3010],
        [-1.1589,  0.3255, -0.6315]], grad_fn=<EmbeddingBackward0>)


#### Another way to implement an embedding layer: a fully connected layer applied to one-hot encoded token ids

In [39]:
#lets create a linear layer which replicates the embedding layer
linear = torch.nn.Linear(sample_vocab_size, sample_vector_dim, bias=False)
# Linear keeps its weight as (out_features, in_features), i.e. the transpose of the
# embedding table. `.T` is only a view and Parameter() does not copy its argument, so
# detach().clone() is what makes this an independent copy; without it both layers
# would share the same underlying storage.
linear.weight = torch.nn.Parameter(sample_embedding_layer.weight.T.detach().clone())

#before passing the token ids to the linear layer, lets first encode it using one-hot encoding
one_hot_encoded = torch.nn.functional.one_hot(sample_token_ids, num_classes=sample_vocab_size).float()
print(one_hot_encoded)

tensor([[0., 0., 0., 0., 0., 1.],
        [0., 0., 0., 1., 0., 0.],
        [0., 1., 0., 0., 0., 0.],
        [0., 0., 0., 0., 1., 0.]])


In [36]:
#lets use this one_hot_encoded ids to get embeddings
print(linear(one_hot_encoded))

tensor([[-2.8400, -0.7849, -1.4096],
        [-0.4015,  0.9666, -1.1481],
        [ 0.9178,  1.5810,  1.3010],
        [-1.1589,  0.3255, -0.6315]], grad_fn=<MmBackward0>)


As both outputs show, the embedding layer and the one-hot + linear layer produce the same embeddings. The one-hot approach is shown only to illustrate how `torch.nn.Embedding` works under the hood: looking up row `i` of the weight matrix gives the same result as multiplying the one-hot vector for `i` by that matrix.

### Creating Embeddings and encoding word positions for our dataset

In [40]:
# Has to equal the window length of the batches built earlier (create_dataloader's
# max_length), because the positional table below has exactly one row per position.
max_length = 4
# NOTE(review): this rebinds `vocab_size`, which earlier held the size of the simple
# word-level vocabulary. 50257 is presumably the size of the GPT-2 BPE vocabulary
# (the largest id seen in the outputs above is 50256) — confirm against the encoding.
vocab_size = 50257
vector_dim = 256
# One learned vector per token id, and one per position inside a window.
# No seed is set in this cell, so the initial weights differ between runs.
token_embedding_layer = torch.nn.Embedding(vocab_size, vector_dim)
pos_embedding_layer = torch.nn.Embedding(max_length, vector_dim)

In [43]:
# first_batch is the [inputs, targets] pair produced by the dataloader.
inputs, targets = first_batch
# Look up one vector for every token id in the batch.
token_embedding = token_embedding_layer(inputs)
# Positions 0 .. max_length-1 are the same for every sample, so one 2-D table is enough.
pos_embedding = pos_embedding_layer(torch.arange(max_length))

print(token_embedding.shape)
print(pos_embedding.shape)

torch.Size([8, 4, 256])
torch.Size([4, 256])


In [42]:
# pos_embedding has no batch dimension; the addition broadcasts it across the
# leading (batch) dimension of token_embedding, so every sample gets the same offsets.
input_embedding = token_embedding + pos_embedding

print(input_embedding.shape)

torch.Size([8, 4, 256])


In [54]:
input_embedding

tensor([[[ 9.7434e-01, -1.3013e+00, -2.5114e-01,  ...,  3.3158e-02,
          -1.9717e+00, -3.4217e-01],
         [-3.5572e-01, -1.2831e+00,  2.2670e+00,  ..., -1.3845e+00,
          -5.1235e-02, -8.2005e-01],
         [-2.1654e-01, -3.2127e+00,  1.6109e-01,  ...,  1.4438e+00,
           2.0039e+00, -1.4922e+00],
         [-1.0840e+00, -1.3802e+00,  1.1380e+00,  ..., -6.9833e-01,
           2.9245e+00, -1.2730e+00]],

        [[-1.8876e-01,  9.7286e-01,  5.9313e-01,  ...,  1.6101e-01,
          -1.5308e+00, -4.1526e-01],
         [-1.6282e-02, -8.0122e-01, -2.1941e+00,  ..., -1.0970e+00,
           8.2290e-01, -3.1128e-01],
         [ 3.2220e-02, -1.6379e+00, -3.4350e-01,  ...,  7.3930e-01,
          -5.5135e-02, -1.0781e+00],
         [-2.4982e-01,  1.0538e+00,  6.3436e-01,  ..., -8.1458e-01,
           1.6490e+00, -1.7432e+00]],

        [[ 1.2460e+00, -5.1565e-01, -3.4308e-01,  ..., -6.7881e-01,
          -2.0225e+00,  3.2037e+00],
         [ 2.3916e-01,  5.9045e-01, -1.7812e+00,  .