In [1]:
from datasets import load_dataset

# Build the OpenWebText dataset from a local loading script.
# NOTE(review): the script path is an absolute Windows path, so this cell only
# runs on a machine with that exact layout. The recorded run of this cell ended
# in a PermissionError (WinError 32) on a file under the Hugging Face cache's
# ".incomplete" directory, and `dataset` is not referenced by any later cell
# visible in this notebook.
dataset = load_dataset(r"E:\repo\openwebtext\openwebtext.py", trust_remote_code=True)

Downloading data:   0%|          | 0/21 [00:00<?, ?files/s]

Generating train split:   0%|          | 0/8013769 [00:00<?, ? examples/s]

PermissionError: [WinError 32] The process cannot access the file because it is being used by another process: 'C:/Users/s/.cache/huggingface/datasets/openwebtext/plain_text/1.0.0/6f68e85c16ccc770c0dd489f4008852ea9633604995addd0cd76e293aed9e521.incomplete\\openwebtext-train-00000-00060-of-NNNNN.arrow'

In [7]:
import torch

# Select the compute device. `torch.cuda.is_available` has to be *called*:
# the bare function object is always truthy, which would pick 'cuda' even on
# a machine without a usable GPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

# Read the training corpus and derive the character vocabulary from it.
chars = ""

with open("wizardofoz.txt", 'r', encoding='utf-8') as f:
    text = f.read()

# Every distinct character of the corpus, in sorted order.
chars = sorted(set(text))
print(len(text))


# ---- Hyperparameters --------------------------------------------------------

# Vocabulary: one token per distinct character in `chars`.
vocab_size = len(chars)

# Model shape.
embed_dim = 384   # width of the token and position embeddings
n_layer = 4       # number of stacked transformer blocks
n_head = 4        # attention heads per block
block_size = 32   # context length (size of the position table and causal mask)
dropout = 0.2     # dropout probability used throughout the model

# Optimisation.
learning_rate = 3e-4
max_iters = 10000
batch_size = 128
# Used both as the reporting interval of the training loop and as the number of
# batches averaged per split in `estimate_loss`.
eval_iters = 2500

# Character-level tokenizer: a character's id is its position in `chars`.
int_to_string = dict(enumerate(chars))
string_to_int = {ch: i for i, ch in int_to_string.items()}


def encode(s):
    """Return the list of integer ids for the characters of string `s`."""
    return [string_to_int[c] for c in s]


def decode(l):
    """Return the string spelled by the sequence of integer ids `l`."""
    return ''.join(int_to_string[i] for i in l)


class Head(torch.nn.Module):
    """One head of causal (masked) scaled dot-product self-attention."""

    def __init__(self, head_size):
        super().__init__()
        # Bias-free projections from the embedding width to this head's width.
        self.key = torch.nn.Linear(embed_dim, head_size, bias=False)
        self.query = torch.nn.Linear(embed_dim, head_size, bias=False)
        self.value = torch.nn.Linear(embed_dim, head_size, bias=False)
        # Lower-triangular matrix of ones; zeros mark positions that lie in the
        # future. Stored as a buffer so it moves with the module but is not trained.
        causal_mask = torch.ones(block_size, block_size).tril()
        self.register_buffer('tril', causal_mask)
        self.dropout = torch.nn.Dropout(dropout)

    def forward(self, x):
        """Attend over `x` of shape (B, T, C) and return a (B, T, head_size) tensor."""
        _, seq_len, _ = x.shape
        keys = self.key(x)
        queries = self.query(x)
        # Scale the raw scores by 1/sqrt(head_size).
        scale = keys.shape[-1] ** -0.5
        scores = (queries @ keys.transpose(-2, -1)) * scale
        # Block attention to later positions before normalising.
        is_future = self.tril[:seq_len, :seq_len] == 0
        scores = scores.masked_fill(is_future, float('-inf'))
        weights = torch.nn.functional.softmax(scores, dim=-1)
        weights = self.dropout(weights)
        return weights @ self.value(x)

class MultiHeadAttention(torch.nn.Module):
    """Several attention heads run side by side, concatenated and projected."""

    def __init__(self, num_heads, head_size):
        super().__init__()
        head_list = []
        for _ in range(num_heads):
            head_list.append(Head(head_size))
        self.heads = torch.nn.ModuleList(head_list)
        # Maps the concatenated head outputs back to the embedding width.
        concat_width = head_size * num_heads
        self.proj = torch.nn.Linear(concat_width, embed_dim)
        self.dropout = torch.nn.Dropout(dropout)

    def forward(self, x):
        """Apply every head to `x`, join the results on the last axis and project."""
        per_head = [head(x) for head in self.heads]
        merged = torch.cat(per_head, dim=-1)
        projected = self.proj(merged)
        return self.dropout(projected)
        
class FeedForward(torch.nn.Module):
    """Position-wise MLP: expand to 4x the width, ReLU, project back, dropout."""

    def __init__(self, embed_dim):
        super().__init__()
        hidden_dim = 4 * embed_dim
        layers = [
            torch.nn.Linear(embed_dim, hidden_dim),
            torch.nn.ReLU(),
            torch.nn.Linear(hidden_dim, embed_dim),
            torch.nn.Dropout(dropout),
        ]
        self.net = torch.nn.Sequential(*layers)

    def forward(self, x):
        """Run `x` through the MLP; the output has the same shape as the input."""
        out = self.net(x)
        return out

class Block(torch.nn.Module):
    """Transformer block: self-attention then feed-forward, each followed by
    a residual addition and a LayerNorm applied to the sum."""

    def __init__(self, embed_dim, n_head):
        super().__init__()
        # The embedding width is divided evenly (integer division) among the heads.
        self.sa = MultiHeadAttention(n_head, embed_dim // n_head)
        self.ffwd = FeedForward(embed_dim)
        self.ln1 = torch.nn.LayerNorm(embed_dim)
        self.ln2 = torch.nn.LayerNorm(embed_dim)

    def forward(self, x):
        """Return the block's output for `x`; the shape is unchanged."""
        attended = self.ln1(x + self.sa(x))
        return self.ln2(attended + self.ffwd(attended))

class GPTLM(torch.nn.Module):
    """Character-level GPT-style language model.

    Token and learned position embeddings are summed, passed through a stack
    of `n_layer` transformer blocks and a final LayerNorm, and projected to
    vocabulary logits.
    """

    def __init__(self, vocab_size, embed_dim):
        super().__init__()
        self.token_embedding_table = torch.nn.Embedding(vocab_size, embed_dim)
        # One learned vector per position in the context window.
        self.position_embedding_table = torch.nn.Embedding(block_size, embed_dim)
        self.blocks = torch.nn.Sequential(*[Block(embed_dim, n_head=n_head) for _ in range(n_layer)])

        self.ln_f = torch.nn.LayerNorm(embed_dim)
        self.lm_head = torch.nn.Linear(embed_dim, vocab_size)
        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Initialise Linear/Embedding weights from N(0, 0.02) and zero Linear biases."""
        if isinstance(module, torch.nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, torch.nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, index, targets=None):
        """Compute next-token logits and, when `targets` is given, the loss.

        Args:
            index: (B, T) tensor of token ids.
            targets: optional (B, T) tensor of token ids to predict.

        Returns:
            `(logits, loss)`. Without targets, logits have shape (B, T, vocab)
            and loss is None. With targets, logits are flattened to
            (B*T, vocab) and loss is the cross-entropy against the targets.
        """
        B, T = index.shape
        tok_emb = self.token_embedding_table(index)
        # Build the position ids on the same device as the input instead of
        # relying on a module-level `device` global, so the model works
        # wherever its inputs and parameters actually live.
        positions = torch.arange(T, device=index.device)
        pos_emb = self.position_embedding_table(positions)
        x = tok_emb + pos_emb
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.lm_head(x)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B * T, C)
            targets = targets.view(B * T)
            loss = torch.nn.functional.cross_entropy(logits, targets)
        return logits, loss

    @torch.no_grad()
    def generate(self, index, max_new_tokens):
        """Sample `max_new_tokens` tokens autoregressively after `index`.

        Args:
            index: (B, T) tensor of token ids used as the prompt.
            max_new_tokens: number of tokens to append.

        Returns:
            (B, T + max_new_tokens) tensor: the prompt followed by the samples.
        """
        # The position table holds exactly one entry per context position, so
        # its size is the longest context the model can be fed.
        context_len = self.position_embedding_table.num_embeddings
        # Sampling must not apply dropout. Switch to eval mode for the duration
        # of the call and restore whatever mode the caller had afterwards.
        was_training = self.training
        self.eval()
        try:
            for _ in range(max_new_tokens):
                index_cond = index[:, -context_len:]
                logits, _ = self.forward(index_cond)
                # Only the prediction for the last position is needed.
                logits = logits[:, -1, :]
                probs = torch.nn.functional.softmax(logits, dim=-1)
                index_next = torch.multinomial(probs, num_samples=1)
                index = torch.cat((index, index_next), dim=1)
        finally:
            self.train(was_training)
        return index

# Instantiate the language model and place it on the selected device.
# `m` is bound to the result of `model.to(device)`; later cells use both names.
model = GPTLM(vocab_size, embed_dim)
m = model.to(device)

# Smoke test before any training: continue a fixed prompt by 100 sampled
# characters and print the decoded text.
prompt = 'Hello! Can you see me?'
context = torch.tensor(encode(prompt), dtype=torch.long, device=device)
generated_chars = decode(
    m.generate(context[None, :], max_new_tokens=100)[0].tolist()
)
print(generated_chars)

cuda
207798
Hello! Can you see me?D&DTqbriy;fwLq-‘Q.nOGHR—ELwgpoveHM1M‘Zn;NG“g:‘ZRuaGrYMR’WLvKv??”!wDcAjz1XiXw“lR‘xWbFG;uDAqe(lmfgwpdK


In [25]:
# Scratch check of how slicing and reshaping act on a single embedding lookup.
x = torch.nn.Embedding(87, 6)
z = x(torch.tensor([[4]]))

# Last position along axis 1, then the full (1, 1, 6) lookup result.
print(z[:, -1, :])
print(z)

b, t, c = z.shape
print(b, t, c)

# The same values viewed as a (1, 6) matrix.
y = z.view(1, 6)
y

tensor([[ 1.9250,  0.9050, -1.1734,  1.0528, -0.1001, -0.0505]],
       grad_fn=<SliceBackward0>)
tensor([[[ 1.9250,  0.9050, -1.1734,  1.0528, -0.1001, -0.0505]]],
       grad_fn=<EmbeddingBackward0>)
1 1 6


tensor([[ 1.9250,  0.9050, -1.1734,  1.0528, -0.1001, -0.0505]],
       grad_fn=<ViewBackward0>)

In [8]:
# Encode the whole corpus as one 1-D tensor of token ids.
data = torch.tensor(encode(text), dtype=torch.long)

# Contiguous split: the first 80% is used for training, the rest for validation.
n = int(0.8 * len(data))
train_data, val_data = data[:n], data[n:]
def get_batch(split):
    """Draw a random batch of (input, target) windows from one data split.

    `split == 'train'` selects `train_data`; any other value selects
    `val_data`. Returns two (batch_size, block_size) tensors on `device`,
    where each target row is its input row shifted one position to the right.
    """
    source = train_data if split == 'train' else val_data
    # Random window starts; the upper bound leaves room for the shifted target.
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs, labels = [], []
    for start in starts:
        inputs.append(source[start:start + block_size])
        labels.append(source[start + 1:start + block_size + 1])
    return torch.stack(inputs).to(device), torch.stack(labels).to(device)
    
# AdamW over every parameter of `model`, with step size `learning_rate`.
optimizer=torch.optim.AdamW(model.parameters(),lr=learning_rate)

@torch.no_grad()
def estimate_loss():
    """Average the loss over `eval_iters` random batches for each split.

    The model is put in eval mode while measuring and switched back to
    training mode before returning. Returns a dict with keys 'train' and
    'val' whose values are 0-dim tensors holding the mean loss.
    """
    out = {}
    model.eval()
    for split in ('train', 'val'):
        batch_losses = []
        for _ in range(eval_iters):
            inputs, targets = get_batch(split)
            _, batch_loss = model(inputs, targets)
            batch_losses.append(batch_loss.item())
        out[split] = torch.tensor(batch_losses).mean()
    model.train()
    return out

# Training loop. Every `eval_iters` steps the averaged train/val losses are
# reported; every step draws one training batch and applies one AdamW update.
# The loop variable is named `step` so the builtin `iter` is not shadowed in
# the notebook's global namespace for the rest of the kernel session.
for step in range(max_iters):
    if step % eval_iters == 0:
        losses = estimate_loss()
        print(f"step:{step}, train loss, {losses['train']}, val loss, {losses['val']}")
    xb, yb = get_batch('train')
    # Call the module itself rather than `.forward` so that any registered
    # hooks run as part of the call.
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
# Loss of the final training batch (a single-batch value, not an average).
print(loss.item())

step:0, train loss, 4.337291240692139, val loss, 4.339147090911865
step:2500, train loss, 1.0280554294586182, val loss, 1.3671529293060303
step:5000, train loss, 0.763266384601593, val loss, 1.5050116777420044
step:7500, train loss, 0.5783995389938354, val loss, 1.7056875228881836
0.6622985601425171


In [10]:
# Continue the fixed prompt by 100 sampled characters and print the decoded
# text. This cell's execution count places it after the training cell.
prompt = 'Hello! Can you see me?'
context = torch.tensor(encode(prompt), dtype=torch.long, device=device)
generated_chars = decode(
    m.generate(context[None, :], max_new_tokens=100)[0].tolist()
)
print(generated_chars)

Hello! Can you see me?” she asked the man, who was also!” reto the Scarecrow and he had greenish skin what they talked so 


In [None]:
# Interactive completion loop: read a prompt, sample 150 more characters and
# print the result. Submitting an empty prompt ends the loop; previously the
# loop had no exit and an empty prompt crashed inside `generate`, because
# there is no last position to read logits from.
while True:
    prompt = input("Prompt:\n")
    if not prompt:
        break
    try:
        token_ids = encode(prompt)
    except KeyError as err:
        # `encode` looks each character up in the vocabulary; a character
        # that never occurred in the corpus has no id.
        print(f'Character {err.args[0]!r} is not in the vocabulary; please try another prompt.')
        continue
    context = torch.tensor(token_ids, dtype=torch.long, device=device)
    generated_chars = decode(m.generate(context.unsqueeze(0), max_new_tokens=150)[0].tolist())
    print(f'Completion:\n{generated_chars}')

Prompt:
 Hi


Completion:
His
few so beautiful legs, where theyes
were  severely:

“Can you are that Dorothy could not help us, Dorothy with all those other animals is the end my


In [15]:
# Character frequency table for the corpus.
# Reads `text` (the corpus string) and `chars` (its vocabulary) from an
# earlier cell.

# Count occurrences; keys appear in order of first occurrence in `text`.
frequency_dict = {}
for char in text:
    frequency_dict[char] = frequency_dict.get(char, 0) + 1

# (character, count) pairs ordered from rarest to most frequent.
sorted_items = sorted(frequency_dict.items(), key=lambda pair: pair[1])

# Keep only the pairs whose character belongs to `chars`.
result = dict((symbol, count) for symbol, count in sorted_items if symbol in chars)

print(frequency_dict)
type(frequency_dict.items())

{'O': 454, 'n': 53695, '\n': 16178, 't': 66385, 'h': 36315, 'e': 99167, ' ': 141128, 'r': 46845, 'i': 53816, 'g': 13352, 'o': 53788, 'f': 20788, 'S': 641, 'p': 13771, 'c': 25950, 's': 50243, 'B': 480, 'Y': 59, 'M': 440, 'E': 427, 'A': 936, 'N': 474, 'F': 304, 'T': 959, 'U': 114, 'R': 239, 'L': 238, 'C': 445, 'I': 1815, ',': 12587, 'H': 418, 'P': 262, 'V': 78, 'D': 216, 'G': 287, '.': 5451, 'y': 12662, 'a': 58480, 'l': 31034, 'w': 11633, 'æ': 179, ';': 1702, 'u': 18778, '‘': 8, 'J': 41, '’': 141, 'd': 27350, 'W': 510, ':': 405, '1': 545, '8': 237, '5': 227, '9': 227, '“': 71, 'm': 18241, '—': 339, 'v': 9042, 'b': 12303, 'x': 1829, '”': 71, '_': 328, 'k': 2655, '-': 999, 'K': 35, '2': 388, '3': 490, 'X': 11, '4': 355, '6': 205, '7': 200, '0': 235, 'q': 679, 'j': 376, '?': 107, '!': 46, '(': 172, ')': 172, 'ü': 3, 'z': 273, 'ä': 33, 'ö': 15, 'Z': 32, 'ë': 1, '°': 10, '/': 7, 'Q': 2, '>': 1}


dict_items