# Char level


In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.nn import functional as F

In [223]:
with open("../dataset_research_paper_docs/input_text.txt", "r", encoding="utf-8") as f:
    text = f.read()

In [224]:
len(text)

1115393

In [225]:
print(text[:500])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor


In [226]:
chars = set(text)
print(len(chars))
chars

65


{'\n',
 ' ',
 '!',
 '$',
 '&',
 "'",
 ',',
 '-',
 '.',
 '3',
 ':',
 ';',
 '?',
 'A',
 'B',
 'C',
 'D',
 'E',
 'F',
 'G',
 'H',
 'I',
 'J',
 'K',
 'L',
 'M',
 'N',
 'O',
 'P',
 'Q',
 'R',
 'S',
 'T',
 'U',
 'V',
 'W',
 'X',
 'Y',
 'Z',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z'}

In [227]:
chars = list(set(text))
chars[:10]

['S', 'e', 'B', 'N', '!', 'v', 'R', 'i', 'k', 'O']

---


In [228]:
chars = sorted(list(set(text)))
vocab_size = len(chars)
print("".join(chars))
print(vocab_size)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


## `create` a mapping table for string to integer


In [229]:
# Lookup tables between each character and its integer id (its index in `chars`).
strtoint = {ch: i for i, ch in enumerate(chars)}
inttostr = dict(enumerate(chars))


def encode_txt(s):
    """Return the list of integer ids for the characters of string `s`.

    Raises KeyError if `s` contains a character that is not a key of `strtoint`.
    """
    return [strtoint[c] for c in s]


def decode_txt(l):
    """Return the string spelled out by the iterable of integer ids `l`.

    Raises KeyError if an id is not a key of `inttostr`.
    """
    return "".join(inttostr[i] for i in l)

In [230]:
list(strtoint.items())[:10]  # lookuptable

[('\n', 0),
 (' ', 1),
 ('!', 2),
 ('$', 3),
 ('&', 4),
 ("'", 5),
 (',', 6),
 ('-', 7),
 ('.', 8),
 ('3', 9)]

In [231]:
list(strtoint.items())[-10:]  # lookuptable

[('q', 55),
 ('r', 56),
 ('s', 57),
 ('t', 58),
 ('u', 59),
 ('v', 60),
 ('w', 61),
 ('x', 62),
 ('y', 63),
 ('z', 64)]

In [232]:
list(inttostr.items())[-10:]  # lookuptable

[(55, 'q'),
 (56, 'r'),
 (57, 's'),
 (58, 't'),
 (59, 'u'),
 (60, 'v'),
 (61, 'w'),
 (62, 'x'),
 (63, 'y'),
 (64, 'z')]

---


Character level token


In [233]:
encode_txt("what zxy moountain ")

[61, 46, 39, 58, 1, 64, 62, 63, 1, 51, 53, 53, 59, 52, 58, 39, 47, 52, 1]

In [234]:
decode_txt([61, 46, 39, 58, 1, 64, 62, 63, 1, 51, 53, 53, 59, 52, 58, 39, 47, 52, 1])

'what zxy moountain '

In [235]:
print(encode_txt("hello people"))

enc_text = encode_txt("hello people")

print(decode_txt(enc_text))

[46, 43, 50, 50, 53, 1, 54, 43, 53, 54, 50, 43]
hello people


Google uses [sentencepiece](https://github.com/google/sentencepiece) for tokenization.

SentencePiece is an unsupervised text tokenizer and detokenizer mainly for Neural Network-based text generation systems where the vocabulary size is predetermined prior to the neural model training. SentencePiece implements subword units (e.g., byte-pair encoding (BPE) [Sennrich et al.] and the unigram language model [Kudo]) with the extension of direct training from raw sentences. SentencePiece allows us to make a purely end-to-end system that does not depend on language-specific pre/postprocessing.


OpenAI uses Byte Pair Encoding [BPE](https://github.com/openai/tiktoken) for tokenization.

BPE is a simple form of data compression that iteratively replaces the most frequent pair of bytes in a sequence with a single, unused byte. In the context of tokenization, BPE is used to create a vocabulary of subword units that can efficiently represent text data. The algorithm starts with a base vocabulary of individual characters and then merges the most frequent pairs of characters or subwords to form new tokens. This process continues until a predefined vocabulary size is reached. BPE is particularly effective for handling out-of-vocabulary words and capturing common patterns in text, making it a popular choice for tokenization in natural language processing tasks.


In [236]:
# !pip install tiktoken

In [237]:
import tiktoken

In [238]:
enc = tiktoken.get_encoding("gpt2")

In [239]:
print(enc.encode("Hi everyone"))
that = enc.encode("Hi everyone")
enc.decode(that)

[17250, 2506]


'Hi everyone'

---


`Encode` the whole shakespeare text


In [240]:
text[:50]

'First Citizen:\nBefore we proceed any further, hear'

In [241]:
# encode whole text
data = torch.tensor(encode_txt(text), dtype=torch.long)
print(data.shape, data.dtype)
# print first 500 character encoding
print(data[:500])

torch.Size([1115393]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59,  1, 39, 56, 43,  1, 39, 50, 50,
         1, 56, 43, 57, 53, 50, 60, 43, 42,  1, 56, 39, 58, 46, 43, 56,  1, 58,
        53,  1, 42, 47, 43,  1, 58, 46, 39, 52,  1, 58, 53,  1, 44, 39, 51, 47,
        57, 46, 12,  0,  0, 13, 50, 50, 10,  0, 30, 43, 57, 53, 50, 60, 43, 42,
         8,  1, 56, 43, 57, 53, 50, 60, 43, 42,  8,  0,  0, 18, 47, 56, 57, 58,
         1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 18, 47, 56, 57, 58,  6,  1, 63,
        53, 59,  1, 49, 52, 53, 61,  1, 15, 39, 47, 59, 57,  1, 25, 39, 56, 41,
      

# `split` the data to train test


In [242]:
print(data.shape)

torch.Size([1115393])


In [243]:
n = int(0.9 * len(data))
print(n)


# first 90% in the train and rest 10% in the val


train_data = data[:n]
val_data = data[n:]

1003853


While training, we don't give the model the full sequence; instead we give it chunks of the sequence and process them in batches.


block size or context length : how many tokens the model can see at a time


In [244]:
block_size = 8
train_data[: block_size + 1]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [245]:
print(
    "given ->",
    train_data[:block_size],
    "predict ->",
    train_data[block_size],
    "total ->",
    train_data[: block_size + 1],
)

given -> tensor([18, 47, 56, 57, 58,  1, 15, 47]) predict -> tensor(58) total -> tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])


In [246]:
# x is the input to the transformer --first block size characters
# y is offset by 1 to x ----- next block size character. - y is the target for each position to the input

x = train_data[:block_size]
y = train_data[1 : block_size + 1]

for t in range(block_size):
    context = x[: t + 1]
    target = y[t]
    print("when input is ", context, "o/p ---> ", target)

when input is  tensor([18]) o/p --->  tensor(47)
when input is  tensor([18, 47]) o/p --->  tensor(56)
when input is  tensor([18, 47, 56]) o/p --->  tensor(57)
when input is  tensor([18, 47, 56, 57]) o/p --->  tensor(58)
when input is  tensor([18, 47, 56, 57, 58]) o/p --->  tensor(1)
when input is  tensor([18, 47, 56, 57, 58,  1]) o/p --->  tensor(15)
when input is  tensor([18, 47, 56, 57, 58,  1, 15]) o/p --->  tensor(47)
when input is  tensor([18, 47, 56, 57, 58,  1, 15, 47]) o/p --->  tensor(58)


There is a new dimension: the batch dimension.
While training, we don't give the model the full sequence; instead we give it chunks of the sequence and process them in batches.

batches of sequences of block size length are fed for efficiency to process in parallel

A batch of sequences, each of block-size length, is stacked into a single tensor and fed to the model.


In [247]:
block_size = 8  # length of the input sequence
batch_size = 4  # no of input sequence to process in parallel

In [248]:
# four independent rows


# Random start offsets, one per sequence in the batch. The exclusive upper bound
# leaves room for block_size inputs plus the one-step-shifted targets.
ix = torch.randint(len(data) - block_size, (batch_size,))
ix  # show the offsets that are actually used below, not a second unrelated draw

tensor([131570, 130990, 417281, 491033])

Within a batch, the sequences are completely independent: each one is a block-size chunk taken from a randomly chosen position.


In [249]:
x = torch.stack([data[i : i + block_size] for i in ix])
y = torch.stack([data[i + 1 : i + block_size + 1] for i in ix])
print(x)
print(y)

tensor([[11,  0, 32, 46, 43,  1, 43, 39],
        [39, 60, 63,  1, 57, 53, 52,  6],
        [57, 41, 46, 53, 53, 50,  5, 42],
        [21, 26, 15, 17, 10,  0, 15, 53]])
tensor([[ 0, 32, 46, 43,  1, 43, 39, 56],
        [60, 63,  1, 57, 53, 52,  6,  0],
        [41, 46, 53, 53, 50,  5, 42,  8],
        [26, 15, 17, 10,  0, 15, 53, 51]])


In [324]:
print(x.shape,y.shape)

torch.Size([4, 8]) torch.Size([4, 8])


In [250]:
block_size = 8  # length of the input sequence
batch_size = 4  # no of input sequence to process in parallel


def get_batch(split):
    """Sample a random batch of (input, target) windows from one data split.

    `split == "train"` reads from `train_data`; any other value reads from
    `val_data`. Both returned tensors stack `batch_size` rows of length
    `block_size`; each target row is its input row shifted one position
    to the right in the source sequence.
    """
    source = val_data if split != "train" else train_data
    # Exclusive upper bound keeps the shifted target window inside `source`.
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs = torch.stack([source[s : s + block_size] for s in starts])
    targets = torch.stack([source[s + 1 : s + block_size + 1] for s in starts])
    return inputs, targets


# Draw one training batch and show the inputs next to their shifted targets.
xb, yb = get_batch("train")
print("inputs: ", xb.shape)
print(xb)
print("----\n ")
print("targets: ", yb.shape)  # previously printed xb.shape under the "targets" label
print(yb)

# xb is the input to the transformer

inputs:  torch.Size([4, 8])
tensor([[50, 50,  1, 61, 43,  1, 51, 39],
        [53,  1, 51, 53, 56, 43,  8,  0],
        [25, 59, 56, 42, 43, 56, 43, 56],
        [ 6,  0, 32, 46, 47, 57,  1, 54]])
----
 
targets:  torch.Size([4, 8])
tensor([[50,  1, 61, 43,  1, 51, 39, 56],
        [ 1, 51, 53, 56, 43,  8,  0, 28],
        [59, 56, 42, 43, 56, 43, 56, 10],
        [ 0, 32, 46, 47, 57,  1, 54, 56]])


In [251]:
print(batch_size, block_size)

4 8


In [252]:
for i in range(batch_size):
    for j in range(block_size):
        context = xb[i, : j + 1]
        target = yb[i, j]
        print("when input is ", context.tolist(), "output --> ", target)

when input is  [50] output -->  tensor(50)
when input is  [50, 50] output -->  tensor(1)
when input is  [50, 50, 1] output -->  tensor(61)
when input is  [50, 50, 1, 61] output -->  tensor(43)
when input is  [50, 50, 1, 61, 43] output -->  tensor(1)
when input is  [50, 50, 1, 61, 43, 1] output -->  tensor(51)
when input is  [50, 50, 1, 61, 43, 1, 51] output -->  tensor(39)
when input is  [50, 50, 1, 61, 43, 1, 51, 39] output -->  tensor(56)
when input is  [53] output -->  tensor(1)
when input is  [53, 1] output -->  tensor(51)
when input is  [53, 1, 51] output -->  tensor(53)
when input is  [53, 1, 51, 53] output -->  tensor(56)
when input is  [53, 1, 51, 53, 56] output -->  tensor(43)
when input is  [53, 1, 51, 53, 56, 43] output -->  tensor(8)
when input is  [53, 1, 51, 53, 56, 43, 8] output -->  tensor(0)
when input is  [53, 1, 51, 53, 56, 43, 8, 0] output -->  tensor(28)
when input is  [25] output -->  tensor(59)
when input is  [25, 59] output -->  tensor(56)
when input is  [25, 59

---


## start feeding to NN

- Bigram model
  - simple model for language modeling that predicts the next token based on the current token using a lookup table.
  - each token in the vocabulary has a corresponding embedding vector in the lookup table.


In [253]:
# table of size (vocab_size, vocab_size) where each row corresponds to a token in the vocabulary and contains the logits for predicting the next token.

# here the table is (65 x 65): each of the 65 tokens has its own row of 65 logits

# Embedding =  matrix of shape (num_embeddings, embedding_dim)

# when
# logits = self.token_embedding_table(idx)
# internally
# logits[b, t] = W[idx[b, t]]

# The embedding table is formed by initializing a (vocab_size x vocab_size) matrix with random values and then gradually shaping each row through gradient descent so that it learns the logits for predicting the next token given the current token.

In [None]:
torch.manual_seed(1337)


class BigramLanguageModel(nn.Module):
    """Bigram model, first step: a lookup table from a token id to next-token logits."""

    def __init__(self, vocab_size):
        super().__init__()
        # Row i holds the logits over the next token given current token i,
        # so the table is (vocab_size, vocab_size) -- 65 x 65 in this notebook.
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Return the logits for every position of `idx`.

        idx: integer tensor of token ids; (B, T) in this notebook, where B is
            the batch size and T the block size / context length.
        targets: not used at this step; optional so the model can be called
            with or without them.

        Returns a tensor of shape idx.shape + (vocab_size,), i.e. (B, T, C).
        """
        return self.token_embedding_table(idx)


m = BigramLanguageModel(vocab_size)
out = m(xb, yb)
print(out.shape)

# idx or xb =(4,8)
# returned logits= (4,8,65)(4batch of 8dim with 65vec)

torch.Size([4, 8, 65])


In [255]:
xb.shape

torch.Size([4, 8])

In [256]:
x[1]

tensor([39, 60, 63,  1, 57, 53, 52,  6])

Returned logits have shape (4, 8, 65): for each of the 4 sequences in the batch, one 65-dimensional vector for each of its 8 tokens.

- Each integer in the 8-length vector becomes a 65-length vector.
- For x of shape (4, 8), i.e. 4 sequences of 8 token ids each, the logits come back with shape (4, 8, 65).
- Example: in [ 0, 32, 46, 53, 59, 1, 40, 43], each of the 8 integers becomes a 65-length vector.


In [257]:
out[:1].shape

torch.Size([1, 8, 65])

In [258]:
out[:1]

tensor([[[-7.4696e-01, -1.4852e+00,  1.7144e-01, -6.0946e-01,  5.3037e-01,
          -1.1188e+00,  9.1132e-01, -1.7415e-02,  1.6884e-02,  6.2911e-01,
          -8.8912e-01,  6.4300e-01,  7.9472e-01, -3.8905e-01, -5.6550e-01,
           1.8625e-01,  4.4660e-01,  8.0538e-02,  8.7537e-01, -1.2676e+00,
           6.2803e-01,  1.8043e+00, -3.3579e+00, -8.3055e-01,  4.4504e-01,
          -9.5354e-01, -5.3696e-01, -2.1837e-01, -1.7932e+00,  5.1601e-01,
          -1.9304e+00,  7.6272e-01,  1.9272e-01, -1.6400e+00,  2.1917e-01,
          -4.7368e-02, -1.8540e+00,  2.3971e-01, -1.3184e-02,  1.8821e+00,
          -1.9880e-01,  3.4216e-01, -8.7237e-01, -2.3707e+00,  2.2814e-01,
           2.8507e+00,  3.0406e-01,  8.5270e-01,  3.8168e-01, -1.6806e-01,
           7.4757e-01,  2.6562e-01,  6.8140e-01,  4.0766e-01, -6.7112e-01,
          -7.7796e-01,  5.2343e-01, -9.2963e-01,  1.2592e+00,  1.0245e+00,
           2.5456e+00, -3.5598e-01,  2.2019e+00,  2.4498e+00,  6.3467e-01],
         [-7.4696e-01, -

In [None]:
torch.manual_seed(1337)


class BigramLanguageModel(nn.Module):
    """Bigram model, second step: the lookup table plus a cross-entropy loss."""

    def __init__(self, vocab_size):
        super().__init__()
        # One row of next-token logits per vocabulary entry (65 x 65 in this notebook).
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets):
        """Return `(logits, loss)` for inputs `idx` and same-shaped `targets`.

        The (B, T, C) logits are flattened to (B*T, C) and the targets to
        (B*T,) before being passed to `F.cross_entropy`; the flattened logits
        are what is returned.
        """
        per_token = self.token_embedding_table(idx)  # (B, T, C), e.g. (4, 8, 65)
        batch, time, channels = per_token.shape
        rows = batch * time
        flat_logits = per_token.view(rows, channels)  # e.g. (32, 65)
        flat_targets = targets.view(rows)  # e.g. (32,)
        return flat_logits, F.cross_entropy(flat_logits, flat_targets)


m = BigramLanguageModel(vocab_size)
logits, loss = m(xb, yb)
print("logits", logits.shape, "\n loss= ", loss)

# idx or xb =(4,8)
# returned logits= (4*8,65) stretched vec
# losscalculation
# The 65-logit vector represents a distribution over choices.
# The target integer selects the correct choice, and the loss measures how much probability the model assigned to that choice.

logits torch.Size([32, 65]) 
 loss=  tensor(4.6382, grad_fn=<NllLossBackward0>)


In [260]:
print(logits)

tensor([[-0.7470, -1.4852,  0.1714,  ...,  2.2019,  2.4498,  0.6347],
        [-0.7470, -1.4852,  0.1714,  ...,  2.2019,  2.4498,  0.6347],
        [ 0.5978, -0.0514, -0.0646,  ..., -1.4649, -2.0555,  1.8275],
        ...,
        [-0.5201,  0.2831,  1.0847,  ..., -0.0198,  0.7959,  1.6014],
        [ 0.5978, -0.0514, -0.0646,  ..., -1.4649, -2.0555,  1.8275],
        [-0.6787,  0.8662, -1.6433,  ...,  2.3671, -0.7775, -0.2586]],
       grad_fn=<ViewBackward0>)


In [261]:
logits.shape

torch.Size([32, 65])

In [262]:
B, T = 4, 8
print(yb.view(B * T))
yb.view(B * T).shape

tensor([50,  1, 61, 43,  1, 51, 39, 56,  1, 51, 53, 56, 43,  8,  0, 28, 59, 56,
        42, 43, 56, 43, 56, 10,  0, 32, 46, 47, 57,  1, 54, 56])


torch.Size([32])

loss calculation complete


---


# Generate text


1) Part 1  dimension calculation for single step of generation and explanation
2) Part 2  implementation

# Part 1

A walk-through for a single input tensor sent to the model to predict the next token.

In [None]:
# for single next token calculation ,max_new_tokens=1 

torch.manual_seed(1337)


class BigramLanguageModel(nn.Module):
    """Bigram model whose `generate` prints every intermediate tensor.

    The printing exists so that a single sampling step can be followed shape
    by shape.
    """

    def __init__(self, vocab_size):
        super().__init__()
        # (vocab_size, vocab_size) table: row i = next-token logits after token i.
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Return `(logits, loss)`.

        Without targets the logits keep the shape idx.shape + (C,) and the
        loss is None. With targets the (B, T, C) logits are flattened to
        (B*T, C), the targets to (B*T,), and the loss is their cross-entropy.
        """
        logits = self.token_embedding_table(idx)  # (B, T, C), e.g. (4, 8, 65)
        if targets is None:
            return logits, None

        B, T, C = logits.shape
        flat_logits = logits.view(B * T, C)  # e.g. (32, 65)
        flat_targets = targets.view(B * T)  # e.g. (32,)
        return flat_logits, F.cross_entropy(flat_logits, flat_targets)

    def generate(self, idx, max_new_tokens):
        """Append `max_new_tokens` sampled tokens to `idx`, printing each step.

        idx is a (B, T) tensor of token ids; the result is (B, T + max_new_tokens).
        Starting from a (1, 1) context the shapes of one step are:
        logits (1, 1, 65) -> last time step (1, 65) -> probabilities (1, 65)
        -> sampled id (1, 1). On later steps the context is longer, e.g.
        idx = [[31, 32]] gives logits of shape (1, 2, 65), but only the last
        time step is used.
        """
        divider = "---\n"
        for _step in range(max_new_tokens):
            # Score the whole running context.
            all_logits, _unused_loss = self(idx)
            print(divider)
            print("logit_shape_prev", all_logits.shape)
            print("logits_prev=", all_logits)

            # Keep only the last position along the time dimension: (B, C).
            last_logits = all_logits[:, -1, :]
            print(divider)
            print("logits_next=", last_logits)
            print("logit_shape_next", last_logits.shape)

            # Softmax turns the logits into a probability distribution, (B, C).
            probs = F.softmax(last_logits, dim=-1)
            print(divider)
            print("probs=", probs)
            print("probsshape", probs.shape)

            # Sample one id per row: likelier tokens are picked more often,
            # but the most likely one is not guaranteed to be picked.
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            print(divider)
            print("idx_next=", idx_next)
            print("idx_nextshape", idx_next.shape)

            # Grow the running sequence along the time dimension: (B, T + 1).
            idx = torch.cat([idx, idx_next], dim=1)
        return idx


m = BigramLanguageModel(vocab_size)
logits, loss = m(xb, yb)
print("logits", logits.shape, "\n loss= ", loss)


# --------
# generate
idx = torch.zeros((1, 1), dtype=torch.long)
# PyTorch expects a batch dimension in tensors, so even a single sequence must be shaped as (B, T) rather than just (T).
print("idx begin------")
print("idx=",idx)
print("idxshape",idx.shape)
ret_idx=m.generate(idx, max_new_tokens=1)[0].tolist()
print("ret_idx=",ret_idx)
print("len=",len(ret_idx))
print("generated_text",decode_txt(ret_idx))


logits torch.Size([32, 65]) 
 loss=  tensor(4.6382, grad_fn=<NllLossBackward0>)
idx begin------
idx= tensor([[0]])
idxshape torch.Size([1, 1])
---

logit_shape_prev torch.Size([1, 1, 65])
logits_prev= tensor([[[ 0.1808, -0.0700, -0.3596, -0.9152,  0.6258,  0.0255,  0.9545,
           0.0643,  0.3612,  1.1679, -1.3499, -0.5102,  0.2360, -0.2398,
          -0.9211,  1.5433,  1.3488, -0.1396,  0.2858,  0.9651, -2.0371,
           0.4931,  1.4870,  0.5910,  0.1260, -1.5627, -1.1601, -0.3348,
           0.4478, -0.8016,  1.5236,  2.5086, -0.6631, -0.2513,  1.0101,
           0.1215,  0.1584,  1.1340, -1.1539, -0.2984, -0.5075, -0.9239,
           0.5467, -1.4948, -1.2057,  0.5718, -0.5974, -0.6937,  1.6455,
          -0.8030,  1.3514, -0.2759, -1.5108,  2.1048,  2.7630, -1.7465,
           1.4516, -1.5103,  0.8212, -0.2115,  0.7789,  1.5333,  1.6097,
          -0.4032, -0.8345]]], grad_fn=<EmbeddingBackward0>)
---

logits_next= tensor([[ 0.1808, -0.0700, -0.3596, -0.9152,  0.6258,  0.0255, 

In [327]:
# the generated text should be "\nS", produced one character at a time; the "\n" shows up as a line break in the output

In [321]:
list(strtoint.items())[:5]  # lookuptable

[('\n', 0), (' ', 1), ('!', 2), ('$', 3), ('&', 4)]

In [322]:
torch.zeros((1, 1), dtype=torch.long).item()

0

In [289]:
decode_txt([torch.zeros((1), dtype=torch.long).item()])

'\n'

In [310]:
decode_txt([31])

'S'

In [None]:
# for max_new_tokens=3

torch.manual_seed(1337)


class BigramLanguageModel(nn.Module):
    """Bigram model whose `generate` prints a numbered trace of every step."""

    def __init__(self, vocab_size):
        super().__init__()
        # (vocab_size, vocab_size) table: row i = next-token logits after token i.
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Return `(logits, loss)`.

        Without targets the logits keep the shape idx.shape + (C,) and the
        loss is None. With targets the (B, T, C) logits are flattened to
        (B*T, C), the targets to (B*T,), and the loss is their cross-entropy.
        """
        logits = self.token_embedding_table(idx)  # (B, T, C), e.g. (4, 8, 65)
        if targets is None:
            return logits, None

        B, T, C = logits.shape
        flat_logits = logits.view(B * T, C)  # e.g. (32, 65)
        flat_targets = targets.view(B * T)  # e.g. (32,)
        return flat_logits, F.cross_entropy(flat_logits, flat_targets)

    def generate(self, idx, max_new_tokens):
        """Append `max_new_tokens` sampled tokens to `idx`, printing each step.

        idx is a (B, T) tensor of token ids; every iteration adds one column,
        so the result is (B, T + max_new_tokens). Each iteration is announced
        with a banner carrying its 1-based token number.
        """
        divider = "---\n"
        for token_no in range(1, max_new_tokens + 1):
            print("******" * 8, "\ntoken no= ", token_no)

            # Score the whole running context.
            all_logits, _unused_loss = self(idx)
            print(divider)
            print("logit_shape_prev", all_logits.shape)
            print("logits_prev=", all_logits)

            # Keep only the last position along the time dimension: (B, C).
            last_logits = all_logits[:, -1, :]
            print(divider)
            print("logits_next=", last_logits)
            print("logit_shape_next", last_logits.shape)

            # Softmax turns the logits into a probability distribution, (B, C).
            probs = F.softmax(last_logits, dim=-1)
            print(divider)
            print("probs=", probs)
            print("probsshape", probs.shape)

            # Sample one id per row from that distribution: (B, 1).
            idx_next = torch.multinomial(probs, num_samples=1)
            print(divider)
            print("idx_next=", idx_next)
            print("idx_nextshape", idx_next.shape)

            # Grow the running sequence along the time dimension: (B, T + 1).
            idx = torch.cat([idx, idx_next], dim=1)
        return idx


m = BigramLanguageModel(vocab_size)
logits, loss = m(xb, yb)
print("logits", logits.shape, "\n loss= ", loss)

# --------
# generate
idx = torch.tensor([[31]],dtype=torch.long)
print("idx=",idx)
print("idxshape",idx.shape)
ret_idx=m.generate((idx), max_new_tokens=3)[0].tolist()
print("ret_idx=",ret_idx)
print("len=",len(ret_idx))
print("generated_text",decode_txt(ret_idx))

logits torch.Size([32, 65]) 
 loss=  tensor(4.6382, grad_fn=<NllLossBackward0>)
idx= tensor([[31]])
idxshape torch.Size([1, 1])
************************************************ 
token no=  1
---

logit_shape_prev torch.Size([1, 1, 65])
logits_prev= tensor([[[-1.0699, -0.6119, -0.4034,  0.3025,  0.6852, -1.0045, -1.0104,
          -1.0886,  1.3292,  0.5912, -1.1082, -1.2869, -0.8170,  0.9682,
           1.6030, -0.0726, -0.4725, -1.1616,  0.5962,  1.3058, -0.7422,
          -1.2529,  0.6750,  1.5664, -0.9238, -0.0956, -1.5452, -0.1801,
           3.1838, -0.1277,  0.0910,  0.5422, -0.6110,  0.5220,  2.1368,
          -1.4166, -0.8557,  1.0129,  0.6503,  0.2432,  1.2588, -0.0644,
          -0.9707, -0.4880, -0.2550, -0.4089, -0.7687,  1.0953,  1.5294,
          -1.2395,  1.0547,  0.5108,  0.3854, -0.8898,  1.3468,  2.3590,
           0.1071, -1.2616,  0.7945, -0.7739, -0.1497, -0.6214,  1.0078,
           0.2930,  0.0943]]], grad_fn=<EmbeddingBackward0>)
---

logits_next= tensor([[-1.069

In [332]:
torch.tensor([[31]],dtype=torch.long)

tensor([[31]])

In [333]:
torch.zeros((1, 1), dtype=torch.long)


tensor([[0]])

----------

# Part 2

In [337]:
torch.manual_seed(1337)


class BigramLanguageModel(nn.Module):
    """Bigram language model: the next token is predicted from the current token only.

    The single parameter is a (vocab_size, vocab_size) embedding table whose
    row i holds the logits over the next token given current token i.
    """

    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)  # 65 x 65 in this notebook

    def forward(self, idx, targets=None):
        """Return `(logits, loss)` for a tensor of token ids.

        idx: integer tensor of token ids, (B, T) -- B = batch size,
            T = time / block size / context length.
        targets: optional integer tensor with the same shape as `idx`.

        Without targets the logits keep the shape idx.shape + (C,), with
        C = vocab_size, and the loss is None. With targets the (B, T, C)
        logits are flattened to (B*T, C), the targets to (B*T,), and the
        loss is their cross-entropy; the flattened logits are returned.
        """
        logits = self.token_embedding_table(idx)  # (B, T, C), e.g. (4, 8, 65)
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B * T, C)  # e.g. (32, 65)
            targets = targets.view(B * T)  # e.g. (32,)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        """Extend `idx` by `max_new_tokens` sampled tokens along the time dimension.

        idx: (B, T) integer tensor holding the current context, e.g. (1, 1).

        Returns a (B, T + max_new_tokens) tensor: the context followed by the
        sampled tokens. Sampling draws from the softmax distribution, so
        likelier tokens are picked more often but not always.
        """
        # Sampling never backpropagates, so do not build an autograd graph
        # for every step; the sampled ids are unaffected.
        with torch.no_grad():
            for _ in range(max_new_tokens):
                # Score the running context: (B, T, C), e.g. (1, 1, 65).
                logits, _loss = self(idx)
                # Only the last time step decides the next token: (B, C).
                logits = logits[:, -1, :]
                # Logits -> probabilities, still (B, C).
                probs = F.softmax(logits, dim=-1)
                # Draw one token id per row: (B, 1).
                idx_next = torch.multinomial(probs, num_samples=1)
                # Append it to the running sequence: (B, T + 1), e.g. [[31, 32]].
                idx = torch.cat((idx, idx_next), dim=1)
        return idx


m = BigramLanguageModel(vocab_size)
logits, loss = m(xb, yb)
print("logits", logits.shape, "\n loss= ", loss)


# --------
# generate
idx = torch.zeros((1, 1), dtype=torch.long)
# 0 index in vocab represents \n
# PyTorch expects a batch dimension in tensors, so even a single sequence must be shaped as (B, T) rather than just (T).
print("idx begin------")
print("idx=",idx)
print("idxshape",idx.shape)
ret_idx=m.generate(idx, max_new_tokens=100)[0].tolist()
print("ret_idx=",ret_idx)
print("len=",len(ret_idx))
print("generated_text",decode_txt(ret_idx))


logits torch.Size([32, 65]) 
 loss=  tensor(4.6382, grad_fn=<NllLossBackward0>)
idx begin------
idx= tensor([[0]])
idxshape torch.Size([1, 1])
ret_idx= [0, 31, 23, 21, 41, 24, 32, 11, 13, 41, 17, 24, 25, 53, 32, 40, 60, 38, 60, 1, 15, 12, 52, 55, 7, 29, 17, 9, 9, 10, 15, 22, 55, 49, 27, 23, 20, 7, 55, 11, 10, 50, 39, 2, 53, 47, 63, 61, 49, 20, 48, 45, 15, 46, 64, 40, 29, 12, 59, 2, 9, 40, 24, 21, 45, 61, 43, 60, 51, 63, 18, 22, 19, 33, 19, 54, 0, 61, 52, 37, 35, 51, 52, 62, 23, 35, 35, 43, 60, 7, 58, 16, 55, 36, 17, 56, 34, 23, 24, 45, 22]
len= 101
generated_text 
SKIcLT;AcELMoTbvZv C?nq-QE33:CJqkOKH-q;:la!oiywkHjgChzbQ?u!3bLIgwevmyFJGUGp
wnYWmnxKWWev-tDqXErVKLgJ


The result is mostly garbage because the next token is predicted only from the current token, without any context from the previous tokens.
For example, to predict the `T` in `SKIcLT`, the model sees only `L` and predicts `T` without any context from the preceding `SKIc`.
- The next task is to make the model use the previous tokens as context, not just the current token, when predicting the next token.

---------