In [10]:
# Reading the text file

# Load the whole text into memory as a single string. The encoding is given
# by its canonical codec name, "utf-8".
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()
# Preview the first 32 characters, then report the total character count.
print(f"Example of contents in our text file: \n {raw_text[:32]}\n")
print(f"Length of charachters in our raw text file = {len(raw_text)} charachters")

Example of contents in our text file: 
 I HAD always thought Jack Gisbur

Length of charachters in our raw text file = 20479 charachters


In [11]:
# Byte pair tokenization
import tiktoken

# Load tiktoken's "gpt2" encoding; the resulting object is used below to
# turn the raw text into token IDs.
tokenizer = tiktoken.get_encoding("gpt2")
# Print the encoding object to confirm which encoding was loaded.
print(tokenizer)

<Encoding 'gpt2'>


In [15]:
# Encoding the text

# Convert the entire raw text into a sequence of integer token IDs using
# the tokenizer created earlier.
token_ids = tokenizer.encode(raw_text)
# Number of tokens produced for the text (displayed as the cell's output).
len(token_ids)

5145

## Input-Target Pairs

- For each text chunk, we want the inputs and targets
- Since we want the model to predict the next word, the targets are the inputs shifted by one position to the right
- context size = How many tokens are included in the input
- Example:
   - X = [1, 2, 3, 4]
   - y = [2, 3, 4, 5]
     - If X = [1]          y = 2
     - If X = [1, 2]       y = 3
     - If X = [1, 2, 3]    y = 4
     - If X = [1, 2, 3, 4] y = 5
   - context size = 4       

In [20]:
# Creating a sample by removing the first 50 tokens

# Keep everything from index 50 onward; the examples that follow operate on
# this slice rather than on the full token list.
sample_tokens = token_ids[50:]

In [27]:
# Example of the technique for creating input-target pairs

# context_size is the number of tokens the model sees in one input window.
context_size = 4
# X is the first `context_size` tokens; y is the same window shifted one
# position to the right, so y[k] is the token that follows X[k].
# E.g. for tokens [1, 2, 3, 4, 5]: X = [1, 2, 3, 4] and y = [2, 3, 4, 5].
X, y = sample_tokens[:context_size], sample_tokens[1:context_size + 1]
print(f"f(x) = {X}")
print(f"f(y) =      {y}")
# Checking the conditions: each prefix of X is paired with the token after it
# X = [290]                  : y = 4920
# X = [290, 4920]            : y = 2241
# X = [290, 4920, 2241]      : y = 287
# X = [290, 4920, 2241, 287] : y = 257

f(x) = [290, 4920, 2241, 287]
f(y) =      [4920, 2241, 287, 257]


In [28]:
# Build the next-token prediction pairs: for every prefix length i from 1 to
# context_size, the input is the first i tokens and the target is the token
# immediately after them (the targets are the inputs shifted one position
# to the right).

for i in range(1, context_size + 1):
    # Input prefix and the single token the model should predict after it
    context, desired = sample_tokens[:i], sample_tokens[i]
    print(f"{context}  ---> {desired}")

[290]  ---> 4920
[290, 4920]  ---> 2241
[290, 4920, 2241]  ---> 287
[290, 4920, 2241, 287]  ---> 257
