In [1]:
!pip install pyarrow pylance lightning wandb

Collecting pylance
  Downloading pylance-0.21.0-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (7.2 kB)
Collecting lightning
  Downloading lightning-2.5.0.post0-py3-none-any.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.4/40.4 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
Downloading pylance-0.21.0-cp39-abi3-manylinux_2_28_x86_64.whl (33.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m33.8/33.8 MB[0m [31m52.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading lightning-2.5.0.post0-py3-none-any.whl (815 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m815.2/815.2 kB[0m [31m43.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pylance, lightning
Successfully installed lightning-2.5.0.post0 pylance-0.21.0


In [2]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import wandb
import lightning
import lance
import pyarrow as pa
from lightning.pytorch.loggers import WandbLogger
from lightning.pytorch.callbacks import ModelCheckpoint

from tqdm.auto import tqdm
from transformers import GPT2TokenizerFast
from datasets import load_dataset

wandb.login(key="put your api key here") # or use .env to hide it

print("Imported all libraries!")

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33msarthak-bhu21[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Imported all libraries!


In [3]:
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]



In [4]:
dataset = load_dataset("roneneldan/TinyStories", data_files={"train": "TinyStoriesV2-GPT4-train.txt"})

README.md:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

TinyStoriesV2-GPT4-train.txt:   0%|          | 0.00/2.23G [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [5]:
all_tokens = []
total_rows = 1000

data = dataset["train"].select([x for x in range(total_rows)])

a = 0
l = 0
for row in tqdm(data["text"]):
    text = row.split(" ")
    print(text)
    l += len(text)
    a += 1
    if a>=3:
        break
print(l)

  0%|          | 0/1000 [00:00<?, ?it/s]

['']
['Once', 'upon', 'a', 'time', 'there', 'was', 'a', 'little', 'boy', 'named', 'Ben.', 'Ben', 'loved', 'to', 'explore', 'the', 'world', 'around', 'him.', 'He', 'saw', 'many', 'amazing', 'things,', 'like', 'beautiful', 'vases', 'that', 'were', 'on', 'display', 'in', 'a', 'store.', 'One', 'day,', 'Ben', 'was', 'walking', 'through', 'the', 'store', 'when', 'he', 'came', 'across', 'a', 'very', 'special', 'vase.', 'When', 'Ben', 'saw', 'it', 'he', 'was', 'amazed!', '', '']
['He', 'said,', '“Wow,', 'that', 'is', 'a', 'really', 'amazing', 'vase!', 'Can', 'I', 'buy', 'it?”', '']
74


In [6]:
i = 0
for row in tqdm(data["text"], total=len(data)):
    row = row.replace("<|endoftext|>", " ")
    encoded = tokenizer(row)["input_ids"]
    all_tokens.extend(encoded)


pa_table = pa.Table.from_arrays([all_tokens], names=["value"])
lance.write_dataset(pa_table, "tiny_stories_gpt4_encoded.lance", {"model": "create"})
print(f"Total tokens in the tokenized dataset: {len(all_tokens)}")

  0%|          | 0/1000 [00:00<?, ?it/s]

Total tokens in the tokenized dataset: 31603


In [7]:
class Config:
    vocab_size = 50304
    n_epochs = 50
    batch_size = 36
    lr = 3e-4
    wd = 1e-6
    n_embed = 256
    num_blocks = 12
    num_heads = 12
    head_size = n_embed//num_heads
    context_len = 224
    attn_dropout_val = 0.2
    mha_dropout_val = 0.2
    ffn_dropout_val = 0.2

In [8]:
class CasualAttentionHead(nn.Module):
    def __init__(self, config):
        super(CasualAttentionHead, self).__init__()
        self.config = config

        self.query = nn.Linear(config.n_embed, config.head_size, bias=False)
        self.key = nn.Linear(config.n_embed, config.head_size, bias=False)
        self.value = nn.Linear(config.n_embed, config.head_size, bias=False)
        self.attn_drop = nn.Dropout(config.attn_dropout_val)
        # mask for casual attention during training
        self.register_buffer("mask", torch.tril(torch.ones(config.context_len, config.context_len)))

    def forward(self, x):
        bs, context_len, embed_dim = x.shape
        q, k, v = self.query(x), self.key(x), self.value(x)
        attn_filter = torch.divide(torch.bmm(q, k.transpose(1, 2)), self.config.head_size)
        attn_filter = attn_filter.masked_fill(self.mask[:context_len, :context_len]==0, float("-inf"))
        attn_weights = F.softmax(attn_filter, dim=-1)
        attn_weights = self.attn_drop(attn_weights)
        output = torch.bmm(attn_weights, v)
        return output

class MultiHeadedAttention(nn.Module):
    def __init__(self, config):
        super(MultiHeadedAttention, self).__init__()
        self.config = config
        self.heads = nn.ModuleList(
            [CasualAttentionHead(config) for _ in range(config.num_heads)]
        )
        self.proj = nn.Linear(config.num_heads*config.head_size, config.n_embed)
        self.mha_drop = nn.Dropout(config.mha_dropout_val)

    def forward(self, x):
        mha_output = torch.cat([head(x) for head in self.heads], dim=-1)
        return self.mha_drop(self.proj(mha_output))

class FeedForwardNetwork(nn.Module):
    def __init__(self, config):
        super(FeedForwardNetwork, self).__init__()

        self.ffn = nn.Sequential(
            nn.Linear(config.n_embed, config.n_embed*4),
            nn.GELU(),
            nn.Linear(config.n_embed*4, config.n_embed),
            nn.Dropout()
        )
    def forward(self, x):
        return self.ffn(x)

In [9]:
class Block(nn.Module):
    def __init__(self, config):
        super(Block, self).__init__()
        self.mha = MultiHeadedAttention(config)
        self.ln1 = nn.LayerNorm(config.n_embed)
        self.ffn = FeedForwardNetwork(config)
        self.ln2 = nn.LayerNorm(config.n_embed)

    def forward(self, x):
        x = self.ln1(x+self.mha(x))
        x = self.ln2(x+self.ffn(x))
        return x

In [10]:
class GPT(lightning.LightningModule):
    def __init__(self, config):
        super(GPT, self).__init__()
        self.config = config
        self.save_hyperparameters()
        self.token_embedding = nn.Embedding(config.vocab_size, config.n_embed)
        self.positional_embedding = nn.Embedding(config.context_len, config.n_embed)
        self.backbone = nn.Sequential(*[Block(config) for _ in range(config.num_blocks)])
        self.lm_head = nn.Linear(config.n_embed, config.vocab_size)

    def forward(self, x):
        tok_emb = self.token_embedding(x)
        pos_emb = self.positional_embedding(torch.arange(x.shape[1], device=self.device))
        x = tok_emb+pos_emb
        x = self.backbone(x)
        logits = self.lm_head(x)
        return logits

    def get_loss(self, predictions, target):
        B, C, V = predictions.shape
        predictions = predictions.view(B*C, V)
        target = target.view(B*C)
        loss = F.cross_entropy(predictions, target)
        return loss

    def training_step(self, batch, batch_idx):
        text, target = batch
        text = text.long()
        target = target.long()
        logits = self(text)
        loss = self.get_loss(logits, target)

        self.log('loss', loss.item(), prog_bar=True)
        logs = {'loss': loss}

        return {"log": logs, "loss": loss}

    def training_end(self, outputs):
        avg_loss = torch.stack([x['log']['loss'] for x in outputs]).mean()
        logs = {"log": avg_loss}
        print(f"val_loss: {avg_loss}")
        return {"log": logs}

    def configure_optimizers(self):
        opt = torch.optim.AdamW(self.parameters(), lr=self.config.lr, weight_decay=self.config.wd)
        return [opt], []

In [11]:
def generate(model, prompt, max_tokens, temperature=0.7):
    model.eval()
    for _ in range(max_tokens):
        prompt = prompt[:, :config.context_len]
        logits = model(prompt)
        logits = logits[:, -1, :] / temperature
        logit_probs = nn.functional.softmax(logits, dim=-1)
        next_prompt = torch.multinomial(logit_probs, num_samples=1)
        prompt = torch.cat((prompt, next_prompt), dim=1)
    return prompt

In [12]:
class GPTDataset(Dataset):
    def __init__(self, dataset_path, context_len):
        self.ds = lance.dataset(dataset_path)
        self.context_len = context_len
        self.length = self.ds.count_rows()-context_len

    def __len__(self):
        return self.length

    def from_idxs(self, idxs):
        data = self.ds.take(idxs).to_pylist()
        data = torch.tensor(list(map(lambda x: x["value"], data)))
        return data

    def __getitem__(self, idx):
        current_window_idxs = np.arange(idx, idx+self.context_len+1)
        data = self.from_idxs(current_window_idxs)
        x = data[0:self.context_len]
        y = data[1:self.context_len+1]
        return x, y

In [13]:
if __name__ == "__main__":
    dataset_path = "tiny_stories_gpt4_encoded.lance"
    config = Config()
    gpt = GPT(config)
    dataset = GPTDataset(dataset_path, config.context_len)
    loader = DataLoader(
        dataset,
        batch_size=config.batch_size,
        shuffle=False
    )
    wandb_logger = WandbLogger(project="storyGPT")
    checkpoint_callback = ModelCheckpoint(
        dirpath="/kaggle/working/",
    )
    trainer = lightning.Trainer(default_root_dir="/kaggle/working/", accelerator="auto", max_epochs=config.n_epochs, logger=wandb_logger, callbacks=[checkpoint_callback])
    trainer.fit(gpt, loader)

INFO: GPU available: True (cuda), used: True
INFO: TPU available: False, using: 0 TPU cores
INFO: HPU available: False, using: 0 HPUs
[34m[1mwandb[0m: Tracking run with wandb version 0.19.1
[34m[1mwandb[0m: Run data is saved locally in [35m[1m./wandb/run-20250101_080333-6crcid7e[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33measy-puddle-2[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/sarthak-bhu21/storyGPT[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/sarthak-bhu21/storyGPT/runs/6crcid7e[0m
/usr/local/lib/python3.10/dist-packages/lightning/pytorch/callbacks/model_checkpoint.py:654: Checkpoint directory /kaggle/working exists and is not empty.
INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO: 
  | Name                 | Type       | Params | Mode 
------------------------------------------------------------
0 | token_embedding      | Embedding  | 12.9 M | train
1 | positi

Training: |          | 0/? [00:00<?, ?it/s]

INFO: `Trainer.fit` stopped: `max_epochs=50` reached.


In [14]:
prompt = "My dog is"
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
gpt = gpt.to('cuda')
prompt = tokenizer.encode(prompt, return_tensors='pt').to('cuda')
generated_text = generate(gpt, prompt, max_tokens=config.context_len, temperature=0.7)
generated_text = tokenizer.decode(generated_text.tolist()[0])
print(generated_text)



My dog is so big, Lily!"Anna was having a lot of fun. But she did not see the unknown boy who came to the sandbox. He was bigger than Anna and had a mean face. He saw Lily and wanted to take her. He grabbed Lily from the castle and said, "This is mine now!"Anna was shocked and scared. He took the dog feel the dog feel dizzy, "This is my angel!"Anna was shocked and scared. She cried and said, "No, no, no, no! Give Lily back! She is my angel!"The boy laughed and said, "No, she is mine! She is a stupid doll!"He ran after him. Anna ran after him. She shouted, "Mom, mom, help! She is her and saw what happened. She is her mom, "Mom, help! She is her angel!"The boy laughed and saw what happened. He dropped Lily and saw what happened. She ran after the boy too. He called Anna and said, "Hey, you! She is my angel!"The boy laughed and ran away.
