# Char level


In [3]:
import numpy as np
import pandas as pd

In [4]:
with open("../dataset_research_paper_docs/input_text.txt", "r", encoding="utf-8") as f:
    text = f.read()

In [5]:
len(text)

1115393

In [6]:
print(text[:500])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor


In [7]:
chars = set(text)
print(len(chars))
chars

65


{'\n',
 ' ',
 '!',
 '$',
 '&',
 "'",
 ',',
 '-',
 '.',
 '3',
 ':',
 ';',
 '?',
 'A',
 'B',
 'C',
 'D',
 'E',
 'F',
 'G',
 'H',
 'I',
 'J',
 'K',
 'L',
 'M',
 'N',
 'O',
 'P',
 'Q',
 'R',
 'S',
 'T',
 'U',
 'V',
 'W',
 'X',
 'Y',
 'Z',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z'}

In [8]:
chars = list(set(text))
chars[:10]

['Y', '!', "'", 'k', ';', 'a', 'p', 'C', 'z', 't']

---


In [9]:
chars = sorted(list(set(text)))
vocab_size = len(chars)
print("".join(chars))
print(vocab_size)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


## `create` a mapping table for string to integer


In [10]:
strtoint = {ch: i for i, ch in enumerate(chars)}
inttostr = {i: ch for i, ch in enumerate(chars)}

encode_txt = lambda s: [strtoint[c] for c in s]
# returns list of integer for input string given

decode_txt = lambda l: "".join(inttostr[i] for i in l)
# returns string from given integers

In [11]:
list(strtoint.items())[-10:]  # lookuptable

[('q', 55),
 ('r', 56),
 ('s', 57),
 ('t', 58),
 ('u', 59),
 ('v', 60),
 ('w', 61),
 ('x', 62),
 ('y', 63),
 ('z', 64)]

In [12]:
list(inttostr.items())[-10:]  # lookuptable

[(55, 'q'),
 (56, 'r'),
 (57, 's'),
 (58, 't'),
 (59, 'u'),
 (60, 'v'),
 (61, 'w'),
 (62, 'x'),
 (63, 'y'),
 (64, 'z')]

---


Character level token


In [13]:
encode_txt("what zxy moountain ")

[61, 46, 39, 58, 1, 64, 62, 63, 1, 51, 53, 53, 59, 52, 58, 39, 47, 52, 1]

In [14]:
decode_txt([61, 46, 39, 58, 1, 64, 62, 63, 1, 51, 53, 53, 59, 52, 58, 39, 47, 52, 1])

'what zxy moountain '

In [15]:
print(encode_txt("hello people"))

enc_text = encode_txt("hello people")

print(decode_txt(enc_text))

[46, 43, 50, 50, 53, 1, 54, 43, 53, 54, 50, 43]
hello people


Google uses [sentencepiece](https://github.com/google/sentencepiece) for tokenization.

SentencePiece is an unsupervised text tokenizer and detokenizer mainly for Neural Network-based text generation systems where the vocabulary size is predetermined prior to the neural model training. SentencePiece implements subword units (e.g., byte-pair-encoding (BPE) [Sennrich et al.]) and unigram language model [Kudo.]) with the extension of direct training from raw sentences. SentencePiece allows us to make a purely end-to-end system that does not depend on language-specific pre/postprocessing.


OpenAI uses Byte Pair Encoding [BPE](https://github.com/openai/tiktoken) for tokenization.

BPE is a simple form of data compression that iteratively replaces the most frequent pair of bytes in a sequence with a single, unused byte. In the context of tokenization, BPE is used to create a vocabulary of subword units that can efficiently represent text data. The algorithm starts with a base vocabulary of individual characters and then merges the most frequent pairs of characters or subwords to form new tokens. This process continues until a predefined vocabulary size is reached. BPE is particularly effective for handling out-of-vocabulary words and capturing common patterns in text, making it a popular choice for tokenization in natural language processing tasks.


In [16]:
# !pip install tiktoken

In [17]:
import tiktoken

In [18]:
enc = tiktoken.get_encoding("gpt2")

In [19]:
print(enc.encode("Hi everyone"))
that = enc.encode("Hi everyone")
enc.decode(that)

[17250, 2506]


'Hi everyone'

---


`Encode` the whole shakespeare text


In [20]:
text[:50]

'First Citizen:\nBefore we proceed any further, hear'

In [21]:
import torch

# encode whole text
data = torch.tensor(encode_txt(text), dtype=torch.long)
print(data.shape, data.dtype)
# print first 500 character encoding
print(data[:500])

torch.Size([1115393]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59,  1, 39, 56, 43,  1, 39, 50, 50,
         1, 56, 43, 57, 53, 50, 60, 43, 42,  1, 56, 39, 58, 46, 43, 56,  1, 58,
        53,  1, 42, 47, 43,  1, 58, 46, 39, 52,  1, 58, 53,  1, 44, 39, 51, 47,
        57, 46, 12,  0,  0, 13, 50, 50, 10,  0, 30, 43, 57, 53, 50, 60, 43, 42,
         8,  1, 56, 43, 57, 53, 50, 60, 43, 42,  8,  0,  0, 18, 47, 56, 57, 58,
         1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 18, 47, 56, 57, 58,  6,  1, 63,
        53, 59,  1, 49, 52, 53, 61,  1, 15, 39, 47, 59, 57,  1, 25, 39, 56, 41,
      

# `split` the data to train test


In [22]:
print(data.shape)

torch.Size([1115393])


In [23]:
n = int(0.9 * len(data))
print(n)


# first 90% in the train and rest 10% in the val


train_data = data[:n]
val_data = data[n:]

1003853


while training we dont give the model the full sequence rather we give part of the sequence and do it in batches.


block size or context length : how many tokens the model can see at a time


In [24]:
block_size = 8
train_data[: block_size + 1]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [25]:
print(
    "given ->",
    train_data[:block_size],
    "predict ->",
    train_data[block_size],
    "total ->",
    train_data[: block_size + 1],
)

given -> tensor([18, 47, 56, 57, 58,  1, 15, 47]) predict -> tensor(58) total -> tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])


In [26]:
# x is the input to the transformer --first block size characters
# y is offset by 1 to x ----- next block size character. - y is the target for each position to the input

x = train_data[:block_size]
y = train_data[1 : block_size + 1]

for t in range(block_size):
    context = x[: t + 1]
    target = y[t]
    print("when input is ", context, "o/p ---> ", target)

when input is  tensor([18]) o/p --->  tensor(47)
when input is  tensor([18, 47]) o/p --->  tensor(56)
when input is  tensor([18, 47, 56]) o/p --->  tensor(57)
when input is  tensor([18, 47, 56, 57]) o/p --->  tensor(58)
when input is  tensor([18, 47, 56, 57, 58]) o/p --->  tensor(1)
when input is  tensor([18, 47, 56, 57, 58,  1]) o/p --->  tensor(15)
when input is  tensor([18, 47, 56, 57, 58,  1, 15]) o/p --->  tensor(47)
when input is  tensor([18, 47, 56, 57, 58,  1, 15, 47]) o/p --->  tensor(58)


there is a new dimension batch dimension
while training we dont give the model the full sequence rather we give part of the sequence and do it in batches.

batches of sequences of block size length are fed for efficiency to process in parallel

batch of sequence of block size length are stacked in tensor and fed to process


In [27]:
block_size = 8  # length of the input sequence
batch_size = 4  # no of input sequence to process in parallel

In [28]:
# four independent rows


ix = torch.randint(len(data) - block_size, (batch_size,))
torch.randint(len(data) - block_size, (batch_size,))

tensor([ 649750,  876736,  932932, 1062866])

in a batch,completely independent sequences are selected randomly of block size


In [29]:
x = torch.stack([data[i : i + block_size] for i in ix])
y = torch.stack([data[i + 1 : i + block_size + 1] for i in ix])
print(x)
print(y)

tensor([[61,  1, 51, 39, 56, 41, 46,  1],
        [50, 47, 41, 43,  6,  0, 18, 53],
        [50, 53, 60, 43, 42,  1, 58, 46],
        [ 1, 47, 57,  1, 41, 53, 51, 47]])
tensor([[ 1, 51, 39, 56, 41, 46,  1, 61],
        [47, 41, 43,  6,  0, 18, 53, 56],
        [53, 60, 43, 42,  1, 58, 46, 43],
        [47, 57,  1, 41, 53, 51, 47, 52]])


In [30]:
torch.manual_seed(1337)

block_size = 8  # length of the input sequence
batch_size = 4  # no of input sequence to process in parallel


def get_batch(split):
    data = train_data if split == "train" else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i : i + block_size] for i in ix])
    y = torch.stack([data[i + 1 : i + block_size + 1] for i in ix])
    return x, y


xb, yb = get_batch("train")
print("inputs: ", xb.shape)
print(xb)
print("----\n ")
print("targets: ", xb.shape)
print(yb)

# xb is the input to the transformer

inputs:  torch.Size([4, 8])
tensor([[53, 59,  6,  1, 58, 56, 47, 40],
        [49, 43, 43, 54,  1, 47, 58,  1],
        [13, 52, 45, 43, 50, 53,  8,  0],
        [ 1, 39,  1, 46, 53, 59, 57, 43]])
----
 
targets:  torch.Size([4, 8])
tensor([[59,  6,  1, 58, 56, 47, 40, 59],
        [43, 43, 54,  1, 47, 58,  1, 58],
        [52, 45, 43, 50, 53,  8,  0, 26],
        [39,  1, 46, 53, 59, 57, 43,  0]])


In [31]:
print(batch_size, block_size)

4 8


In [32]:
for i in range(batch_size):
    for j in range(block_size):
        context = xb[i, : j + 1]
        target = yb[i, j]
        print("when input is ", context.tolist(), "output --> ", target)

when input is  [53] output -->  tensor(59)
when input is  [53, 59] output -->  tensor(6)
when input is  [53, 59, 6] output -->  tensor(1)
when input is  [53, 59, 6, 1] output -->  tensor(58)
when input is  [53, 59, 6, 1, 58] output -->  tensor(56)
when input is  [53, 59, 6, 1, 58, 56] output -->  tensor(47)
when input is  [53, 59, 6, 1, 58, 56, 47] output -->  tensor(40)
when input is  [53, 59, 6, 1, 58, 56, 47, 40] output -->  tensor(59)
when input is  [49] output -->  tensor(43)
when input is  [49, 43] output -->  tensor(43)
when input is  [49, 43, 43] output -->  tensor(54)
when input is  [49, 43, 43, 54] output -->  tensor(1)
when input is  [49, 43, 43, 54, 1] output -->  tensor(47)
when input is  [49, 43, 43, 54, 1, 47] output -->  tensor(58)
when input is  [49, 43, 43, 54, 1, 47, 58] output -->  tensor(1)
when input is  [49, 43, 43, 54, 1, 47, 58, 1] output -->  tensor(58)
when input is  [13] output -->  tensor(52)
when input is  [13, 52] output -->  tensor(45)
when input is  [13

---


## start feeding to NN

- Bigram model
  - simple model for language modeling that predicts the next token based on the current token using a lookup table.
  - each token in the vocabulary has a corresponding embedding vector in the lookup table.


In [33]:
# table of size (vocab_size, vocab_size) where each row corresponds to a token in the vocabulary and contains the logits for predicting the next token.

# here each token is made to (65\*65)

# Embedding =  matrix of shape (num_embeddings, embedding_dim)

#when
# logits = self.token_embedding_table(idx)
# internally
# logits[b, t] = W[idx[b, t]]

# The embedding table is formed by initializing a (vocab_size Ã— vocab_size) matrix with random values and then gradually shaping each row through gradient descent so that it learns the logits for predicting the next token given the current token.


In [34]:
import torch
import torch.nn as nn
from torch.nn import functional as F


class BigramLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()

        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)  # 65*65

    def forward(self, idx, targets):
        # idx and target are both (B,T)tensor of integer B-batch ,T-time/block_size/context length, C-channel. (here b=4,T=8,C=vocabsize ie 65)

        logits = self.token_embedding_table(idx)
        return logits


m = BigramLanguageModel(vocab_size)
out = m(xb, yb)
print(out.shape)

#idx or xb =(4,8)
#returned logits= (4,8,65)(4batch of 8)

torch.Size([4, 8, 65])


In [35]:
xb.shape

torch.Size([4, 8])

In [36]:
x[1]

tensor([50, 47, 41, 43,  6,  0, 18, 53])

#returned logits= (4,8,65)(4batch of 65dim vector for each of the 8 tokens in the sequence)

- Each integer in the 8-length vector becomes a 65-length vector
- for x[4,8] 4 vec of 8dimlength each logits returns as (4,8,65) ,4batch of 65dim vector for each of the 8 tokens in the sequence
- [ 0, 32, 46, 53, 59,  1, 40, 43] each integer in 8-length vector becomes a 65-length vector


In [37]:
out[:1].shape

torch.Size([1, 8, 65])

In [38]:
out[:1]

tensor([[[-3.7245e-01, -2.8004e-01, -9.1454e-02,  5.1224e-01,  1.5810e+00,
          -2.0063e+00, -1.2925e+00,  2.5901e-01, -4.1397e-01,  4.6904e-01,
          -7.6566e-01,  1.9072e+00, -3.2599e-01, -3.4377e-01, -1.4415e+00,
           1.9263e+00, -1.9445e+00,  2.5717e-01, -1.5743e+00, -8.5837e-01,
          -3.8297e-01, -5.3723e-01, -1.2176e+00, -1.9508e-01, -8.3292e-01,
           1.3672e+00, -1.0501e+00,  3.0392e-01,  1.8110e+00,  6.3502e-01,
          -8.1953e-02,  1.0644e+00, -1.4712e-01, -1.6037e+00, -2.7010e-01,
           1.5030e-02, -2.8009e-01,  1.4896e+00, -4.6460e-01, -9.3095e-01,
           8.1198e-01,  7.4637e-01,  1.4684e-01,  3.1170e+00, -1.5428e+00,
          -2.2848e+00,  5.7553e-01,  1.2452e+00,  7.6347e-01,  5.0461e-02,
          -9.5784e-01, -9.6295e-01, -5.0778e-01,  1.0184e-01,  1.9141e+00,
          -3.2491e-01, -2.2191e-01,  9.7756e-03,  2.0640e+00, -1.6050e+00,
          -6.9845e-01, -6.7874e-01,  8.6619e-01, -9.5812e-01, -9.1966e-01],
         [-1.2796e+00,  

In [None]:
import torch
import torch.nn as nn
from torch.nn import functional as F


class BigramLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()

        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)  # 65*65

    def forward(self, idx, targets):
        # idx and target are both (B,T)tensor of integer B-batch ,T-time/block_size/context length, C-channel. (here b=4,T=8,C=vocabsize ie 65)

        logits = self.token_embedding_table(idx)    #(B,T,C)ie(4,8,65)
        B,T,C=logits.shape
        logits=logits.view(B*T,C)       #(32*65) stretching the vec

        targets=targets.view(B*T)      # (32)
        loss=F.cross_entropy(logits,targets)
        
        return logits,loss


m = BigramLanguageModel(vocab_size)
logits,loss = m(xb, yb)
print("logits",logits.shape,"\n loss= ",loss)

#idx or xb =(4,8)
#returned logits= (4,8,65)(4batch of 8)
# losscalculation
# The 65-logit vector represents a distribution over choices.
# The target integer selects the correct choice, and the loss measures how much probability the model assigned to that choice.

logits torch.Size([32, 65]) 
 loss=  tensor(4.2990, grad_fn=<NllLossBackward0>)


In [43]:
print(logits)

tensor([[ 1.4743,  0.7709, -1.5866,  ..., -0.3907,  1.1608, -2.4852],
        [-2.1540,  1.2059,  0.8688,  ...,  0.7107,  0.5719,  1.1146],
        [ 1.4189,  0.8547,  0.8418,  ..., -1.2843, -1.1059,  1.0624],
        ...,
        [-2.1540,  1.2059,  0.8688,  ...,  0.7107,  0.5719,  1.1146],
        [ 0.1980,  0.4785, -0.1131,  ...,  0.7403, -1.4363, -0.7458],
        [-0.9344, -0.0267, -0.4144,  ...,  0.1355, -1.4978, -1.0967]],
       grad_fn=<ViewBackward0>)


In [44]:
logits.shape

torch.Size([32, 65])

In [47]:
B,T=4,8
print(yb.view(B*T))
yb.view(B*T).shape


tensor([59,  6,  1, 58, 56, 47, 40, 59, 43, 43, 54,  1, 47, 58,  1, 58, 52, 45,
        43, 50, 53,  8,  0, 26, 39,  1, 46, 53, 59, 57, 43,  0])


torch.Size([32])