In [1]:
# Load the whole training corpus into memory as a single string.
# The handle is deliberately NOT named `f`: the model cell further down binds
# `f` to `torch.nn.functional`, so re-running this cell afterwards would
# overwrite that alias with a closed file object and break the model code.
with open('input.txt','r',encoding='utf-8') as corpus_file:
    text = corpus_file.read()

In [2]:
# Preview the first 10 characters of the corpus.
text[:10]

'First Citi'

In [3]:
# Report the total number of characters in the corpus.
print(f"char size:({len(text)})")

char size:(1115394)


In [4]:
# Vocabulary: every distinct character that occurs in the corpus, in sorted
# order so that the character -> id assignment is deterministic.
unique_chars = set(text)
chars = sorted(unique_chars)
vocab_size = len(chars)
print(f"vocab size: ({vocab_size})")

vocab size: (65)


In [5]:
# Two-way lookup tables between characters and their integer ids.
# id -> character
itos = dict(enumerate(chars))
# character -> id (the inverse of `itos`)
stoi = {symbol: token_id for token_id, symbol in itos.items()}

In [6]:
def encoder(chars):
    """Map each character of ``chars`` to its integer id via ``stoi``.

    Args:
        chars: Iterable of characters (typically a ``str``).

    Returns:
        list of the ids looked up in ``stoi``, in input order.

    Raises:
        KeyError: If a character is not a key of ``stoi``.
    """
    token_ids = []
    for symbol in chars:
        token_ids.append(stoi[symbol])
    return token_ids

def decoder(nums):
    """Turn a sequence of integer ids back into text via ``itos``.

    Args:
        nums: Iterable of ids that are keys of ``itos``.

    Returns:
        str formed by concatenating the looked-up characters in order.

    Raises:
        KeyError: If an id is not a key of ``itos``.
    """
    pieces = [itos[token_id] for token_id in nums]
    return "".join(pieces)

# Round-trip check: decoding an encoded string should give back the same text.
decoder(encoder("hi there"))

'hi there'

In [7]:
import torch 
# Encode the whole corpus into a tensor of integer token ids (torch.long).
data = torch.tensor(encoder(text),dtype=torch.long)
print(f"shape: {data.shape}")

shape: torch.Size([1115394])


In [8]:
def train_test_split(data,percentage):
    """Split a sequence into a leading training part and a trailing validation part.

    The split is positional (no shuffling): the first
    ``int(len(data) * percentage)`` items go to training, the rest to validation.

    Args:
        data: Any sliceable sequence that supports ``len()`` (1-D tensor,
            list, string, ...).
        percentage: Fraction of ``data`` assigned to the training part; must
            lie in the closed interval [0, 1].

    Returns:
        dict with the keys ``"train"`` and ``"val"`` holding the two slices.

    Raises:
        ValueError: If ``percentage`` is outside [0, 1]. Such values would
            otherwise silently yield an empty or oddly wrapped split.
    """
    if not 0 <= percentage <= 1:
        raise ValueError(f"percentage must be within [0, 1], got {percentage!r}")
    # len() rather than .shape[0] so plain Python sequences work as well.
    n = int(len(data) * percentage)
    return {
        "train":data[:n],
        "val":data[n:]
    }


In [9]:
# Keep the first 90% of the encoded corpus for training and the rest for validation.
tts = train_test_split(data,0.9)
train_data, val_data = tts["train"], tts["val"]

In [10]:
# Context length: how many tokens are used as input for one training window.
block_size = 8
# Show block_size + 1 tokens: the extra token is the target for the last input position.
train_data[:block_size+1]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [11]:
# Illustrate how one window of block_size + 1 tokens yields block_size
# training examples: for every position t the input is x[:t+1] and the
# target is the token that follows it.
x = train_data[:block_size]
y = train_data[1:block_size+1]  # same window shifted one position to the right
for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print(f"context: {context}-target: {target}")


context: tensor([18])-target: 47
context: tensor([18, 47])-target: 56
context: tensor([18, 47, 56])-target: 57
context: tensor([18, 47, 56, 57])-target: 58
context: tensor([18, 47, 56, 57, 58])-target: 1
context: tensor([18, 47, 56, 57, 58,  1])-target: 15
context: tensor([18, 47, 56, 57, 58,  1, 15])-target: 47
context: tensor([18, 47, 56, 57, 58,  1, 15, 47])-target: 58


In [12]:
# Seed torch's global RNG so the random batch sampling below is repeatable.
torch.manual_seed(1337)
batch_size = 4 # number of independent sequences stacked and processed in parallel
block_size = 8 # maximum context length

def get_batch(split):
    """Sample a random batch of input/target windows from one data split.

    Args:
        split: ``"train"`` selects ``train_data``; any other value selects
            ``val_data``.

    Returns:
        Tuple ``(x, y)`` of stacked windows, ``batch_size`` windows of
        ``block_size`` tokens each. ``y`` holds the same windows as ``x``
        shifted one position to the right (the next-token targets).
    """
    if split == "train":
        source = train_data
    else:
        source = val_data
    # The upper bound is exclusive, so even the largest start offset leaves
    # room for block_size inputs plus the one extra token the targets need.
    starts = torch.randint(low = 0,high = len(source)-block_size,size=(batch_size,))
    inputs = []
    targets = []
    for start in starts:
        inputs.append(source[start:start+block_size])
        targets.append(source[start+1:start+block_size+1])
    return torch.stack(inputs),torch.stack(targets)

In [14]:
# Draw one training batch and display its inputs.
xb,yb = get_batch("train")
xb

tensor([[57, 43, 60, 43, 52,  1, 63, 43],
        [60, 43, 42,  8,  0, 25, 63,  1],
        [56, 42,  5, 57,  1, 57, 39, 49],
        [43, 57, 58, 63,  6,  1, 58, 46]])

In [15]:
# Spell out every (context, target) pair contained in the batch:
# row i, position j -> input xb[i, :j+1], target yb[i, j].
for i in range(batch_size):
    for j in range(block_size):
        context = xb[i,:j+1]
        target = yb[i,j]
        print(f"input: {context} -> target {target}")

input: tensor([24]) -> target 43
input: tensor([24, 43]) -> target 58
input: tensor([24, 43, 58]) -> target 5
input: tensor([24, 43, 58,  5]) -> target 57
input: tensor([24, 43, 58,  5, 57]) -> target 1
input: tensor([24, 43, 58,  5, 57,  1]) -> target 46
input: tensor([24, 43, 58,  5, 57,  1, 46]) -> target 43
input: tensor([24, 43, 58,  5, 57,  1, 46, 43]) -> target 39
input: tensor([44]) -> target 53
input: tensor([44, 53]) -> target 56
input: tensor([44, 53, 56]) -> target 1
input: tensor([44, 53, 56,  1]) -> target 58
input: tensor([44, 53, 56,  1, 58]) -> target 46
input: tensor([44, 53, 56,  1, 58, 46]) -> target 39
input: tensor([44, 53, 56,  1, 58, 46, 39]) -> target 58
input: tensor([44, 53, 56,  1, 58, 46, 39, 58]) -> target 1
input: tensor([52]) -> target 58
input: tensor([52, 58]) -> target 1
input: tensor([52, 58,  1]) -> target 58
input: tensor([52, 58,  1, 58]) -> target 46
input: tensor([52, 58,  1, 58, 46]) -> target 39
input: tensor([52, 58,  1, 58, 46, 39]) -> targe

In [37]:
import torch
import torch.nn as nn 
import torch.nn.functional as f

class BigramLanguageModel(nn.Module):
    """Bigram language model: the logits for the next token depend only on the current token.

    The only parameter is a ``vocab_size x vocab_size`` embedding table; row
    ``i`` holds the unnormalised next-token scores that follow token ``i``.
    """

    def __init__(self,vocab_size):
        """Create the lookup table.

        Args:
            vocab_size: Number of distinct token ids; used both as the number
                of embeddings and as the embedding dimension.
        """
        super().__init__()
        # look-up table
        self.token_embedding_table = nn.Embedding(vocab_size,vocab_size)

    def forward(self,idx,target=None):
        """Compute next-token logits and, if targets are given, the loss.

        Args:
            idx: Integer tensor of token ids with shape (batch, time).
            target: Optional integer tensor of target ids with the same
                shape as ``idx``.

        Returns:
            Tuple ``(logits, loss)``. ``logits`` has shape
            (batch, time, vocab_size). ``loss`` is ``None`` when ``target``
            is ``None``, otherwise the cross-entropy between logits and
            targets.
        """
        logits = self.token_embedding_table(idx)
        # Identity test: `== None` on a tensor goes through tensor comparison
        # machinery instead of simply asking "was a target supplied?".
        if target is None:
            loss = None
        else:
            # cross_entropy expects the class dimension second: (B, C, T).
            loss = f.cross_entropy(logits.permute(0,2,1),target)
        return logits,loss

    def generate(self,idx,max_new_tokens):
        """Extend ``idx`` by sampling ``max_new_tokens`` tokens one at a time.

        Args:
            idx: Integer tensor of shape (batch, time) used as the prompt.
            max_new_tokens: Number of tokens to sample and append.

        Returns:
            Integer tensor of shape (batch, time + max_new_tokens): the
            prompt followed by the sampled tokens.
        """
        for _ in range(max_new_tokens):
            logits,_ = self(idx)
            # Only the last time step predicts the token that comes next.
            last_logits = logits[:,-1,:]
            probs = f.softmax(last_logits,dim = -1)
            idx_next = torch.multinomial(probs,num_samples=1)
            idx = torch.cat((idx,idx_next),dim = 1)
        return idx

    # Backward-compatible alias for the original, misspelled method name.
    genrate = generate


In [48]:
# Instantiate the model and run one forward pass on the sampled batch to
# check the output shape and the initial loss value.
blm = BigramLanguageModel(vocab_size=vocab_size)
out,loss = blm(xb,yb)
print(out.shape)
print("shape: (Batch_dim,context_size,embedding_size/vocab_size)")
print(f"loss: {loss}")

torch.Size([4, 8, 65])
shape: (Batch_dim,context_size,embedding_size/vocab_size)
loss: 4.397454738616943


In [49]:
# Sample 100 new tokens starting from a single prompt token with id 0, then
# decode the first (only) sequence in the batch back to text.
# NOTE(review): no training step is visible above this cell, so the sampled
# text is presumably from randomly initialised weights -- confirm.
print(decoder(blm.genrate(torch.zeros((1,1),dtype=torch.long),max_new_tokens=100)[0].tolist()))


kVwiplUkLsY.r?I?mQSB.jZvYMnGsY$dHaI3Ngjzo!wmExFILBSznSmCi$ddyaMIPk&Bfo
mpiMILiCitLOvkJNsOpbBt!nCJiRF
