In [6]:
!wandb login

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit: 
Aborted!


In [21]:
# Data
import torch
import torch.nn as nn
import math
from datasets import load_dataset
import sentencepiece as spm
import os

vocab_size = 7600 # english has 26 * 2 + punctuation

dataset = load_dataset("roneneldan/TinyStories")

train_dataset = dataset['train'][:8000]

validation_dataset = dataset['validation'][:2000]
text_data = [entry for entry in train_dataset['text']]
validation_data = [entry for entry in validation_dataset['text']]

text_data_str = '\n'.join(text_data)
with open('temp.txt', 'w', encoding='utf-8') as f:
    f.write(text_data_str)

spm.SentencePieceTrainer.train(
        f'--input=temp.txt --model_prefix=stories --vocab_size={vocab_size} --character_coverage=1.0 --model_type=unigram'
    )
sp = spm.SentencePieceProcessor(model_file='./stories.model')

print('successfully trained sp')


Found cached dataset parquet (/home/recurrent/.cache/huggingface/datasets/roneneldan___parquet/roneneldan--TinyStories-6ac769f186d7da53/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
100%|██████████| 2/2 [00:00<00:00, 91.74it/s]
sentencepiece_trainer.cc(177) LOG(INFO) Running command: --input=temp.txt --model_prefix=stories --vocab_size=7600 --character_coverage=1.0 --model_type=unigram
sentencepiece_trainer.cc(77) LOG(INFO) Starts training with : 
trainer_spec {
  input: temp.txt
  input_format: 
  model_prefix: stories
  model_type: UNIGRAM
  vocab_size: 7600
  self_test_sample_size: 0
  character_coverage: 1
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_

successfully trained sp


eos_piece: </s>
  pad_piece: <pad>
  unk_surface:  ⁇ 
  enable_differential_privacy: 0
  differential_privacy_noise_level: 0
  differential_privacy_clipping_threshold: 0
}
normalizer_spec {
  name: nmt_nfkc
  add_dummy_prefix: 1
  remove_extra_whitespaces: 1
  escape_whitespaces: 1
  normalization_rule_tsv: 
}
denormalizer_spec {}
trainer_interface.cc(351) LOG(INFO) SentenceIterator is not specified. Using MultiFileSentenceIterator.
trainer_interface.cc(183) LOG(INFO) Loading corpus: temp.txt
trainer_interface.cc(407) LOG(INFO) Loaded all 40791 sentences
trainer_interface.cc(423) LOG(INFO) Adding meta_piece: <unk>
trainer_interface.cc(423) LOG(INFO) Adding meta_piece: <s>
trainer_interface.cc(423) LOG(INFO) Adding meta_piece: </s>
trainer_interface.cc(428) LOG(INFO) Normalizing sentences...
trainer_interface.cc(537) LOG(INFO) all chars count=6771738
trainer_interface.cc(558) LOG(INFO) Alphabet size=89
trainer_interface.cc(559) LOG(INFO) Final character coverage=1
trainer_interface.cc(5

In [22]:
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from torch.optim import Adam
from collections import Counter
import json

PAD_TOKEN = sp.piece_to_id('<unk>')
batch_size = 16

def target_story_to_tensor(story):
    tokens = torch.tensor(sp.encode_as_ids(story) + [sp.piece_to_id('</s>')], dtype=torch.long)
    return tokens

def input_story_to_tensor(story):
    tokens = torch.tensor([sp.piece_to_id('<s>')] + sp.encode_as_ids(story), dtype=torch.long)
    return tokens

class StoryDataset(Dataset):
    def __init__(self, stories):
        self.stories = stories

    def __len__(self):
        return len(self.stories)

    def __getitem__(self, idx):
        story = self.stories[idx]
        return input_story_to_tensor(story), target_story_to_tensor(story)

def collate_fn(batch):
    inputs, targets = zip(*batch)
    inputs = pad_sequence(inputs, batch_first=True, padding_value=PAD_TOKEN)
    targets = pad_sequence(targets, batch_first=True, padding_value=PAD_TOKEN)
    return inputs, targets

# Create dataset and dataloader
dataset = StoryDataset(text_data)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
val_data = StoryDataset(validation_data)
val_dataloader = DataLoader(val_data, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)


In [39]:
print(len(text_data))
print(len(validation_data))




8000
2000


In [40]:
# Hyperparameters
d_model = 256
dropout = 0.1 # 10% chance that any given neuron will be dropped out
n_heads = 8
n_layer = 8

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# device = 'cpu'


In [41]:
def create_mask(seq):
    seq_len = seq.size(1)
    mask = torch.triu(torch.ones(seq_len, seq_len, device=seq.device), diagonal=1).bool()
    return mask

class TransformerDecoder(nn.Module):
    def __init__(self, vocab_size, d_model):
        super(TransformerDecoder, self).__init__()

        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoder = PositionalEncoding(d_model)

        self.blocks = nn.Sequential(
            *[Block(n_heads) for _ in range(n_layer)]
        )
        self.fc = nn.Linear(d_model, vocab_size)

    def forward(self, input):
        # print('Im here')
        input = self.embedding(input)
        input = self.pos_encoder(input)
        blocks_output = self.blocks(input)

        logits = self.fc(blocks_output)
        return logits

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

class Block(nn.Module):
    def __init__(self, n_heads):
        super().__init__()
        head_size = d_model // n_heads
        self.multi_head_attention = MultiHeadAttention(n_heads, head_size)
        self.ffwd = nn.Sequential(
            nn.Linear(d_model, 4*d_model), # expanding and contracting the model for it to learn more intricate patterns
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(4*d_model, d_model)
        )

        self.ln1 = nn.LayerNorm(d_model)
        self.ln2 = nn.LayerNorm(d_model)

    def forward(self, x):
        # actually doing residual connection here by attn1_output + input
        # print('im in block forward')
        x = x + self.multi_head_attention(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        # print('Block shape', x.shape)
        return x

class MultiHeadAttention(nn.Module):
    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([SelfAttention(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        mask = create_mask(x).to(x.device)
        # print('im in multiheadattention')
        out = torch.cat([h(x, x, x, mask) for h in self.heads], dim=-1) # can parallelize it
        # print('multiheadattention out.shape', out.shape)
        out = self.dropout(self.proj(out))
        # here not printing
        # print('dropout out.shape', out.shape)
        return out

class SelfAttention(nn.Module):
    def __init__(self, head_size):
        super(SelfAttention, self).__init__()
        self.query = nn.Linear(d_model, head_size, bias=False)
        self.key = nn.Linear(d_model, head_size, bias=False)
        self.value = nn.Linear(d_model, head_size, bias=False)

    def forward(self, query, key, value, mask=None):
        # print('im in self attention')
        # print('head_size')
        q = self.query(query)
        k = self.key(key)
        v = self.value(value)

        # print('q.shape', q.shape)
        scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(d_model)
        # print('scores.shape', scores.shape)
        # print('mask', mask)
        if mask is not None:
            scores = scores.masked_fill(mask, float('-inf'))

        attention_weights = torch.nn.functional.softmax(scores, dim=-1)
        # print('attention_weights', attention_weights.shape)
        # print('v', v.shape)
        output = torch.matmul(attention_weights, v)
        # print('output.shape of selfattention', output.shape)
        return output

In [38]:
def save_checkpoint(epoch, model, optimizer, filename="checkpoint.pth.tar"):
    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
    }, filename)

def load_checkpoint(filename="checkpoint.pth.tar"):
    checkpoint = torch.load(filename)
    return checkpoint

In [44]:
import wandb
import time
from torch.cuda.amp import autocast, GradScaler

num_epochs = 30
lr = 0.001

name = "experiment-8-embedding-256-mixed-precision"

# Define the model, optimizer, and loss
decoder = TransformerDecoder(vocab_size, d_model).to(device)
optimizer = Adam(decoder.parameters(), lr=lr)
criterion = torch.nn.CrossEntropyLoss()
num_params = sum(p.numel() for p in decoder.parameters())

# Training loop

criterion = torch.nn.CrossEntropyLoss(ignore_index=PAD_TOKEN)

run = wandb.init(project="story_generator", name=name,
                 config={"optimizer": "Adam", "lr": lr, "epochs": num_epochs, "batch_size": batch_size, "device": device, "vocab_size": vocab_size, "dataloader": len(text_data), "n_heads": n_heads, "n_layer": n_layer, "d_model": d_model, "num_params": num_params})

wandb.watch(decoder)

patience = 5
stopping_counter = 0
train_iteration = 0
val_iteration = 0
scaler = GradScaler()
for epoch in range(num_epochs):
    decoder.train()
    train_loss = 0.0
    start_time = time.time()
    for inputs, targets in dataloader:
        # start_time = time.time()

        optimizer.zero_grad()
        inputs = inputs.to(device)
        targets = targets.to(device)
        with autocast():t
            outputs = decoder(inputs)
            torch.cuda.empty_cache()
            loss = criterion(outputs.view(-1, vocab_size), targets.view(-1))

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        train_loss += loss.item()
        train_iteration += 1

        # end_time = time.time()
        # iteration_duration = (end_time - start_time)
        wandb.log({"train_loss": loss.item()})

    end_time = time.time()  # <-- Record the end time
    epoch_duration = end_time - start_time  # <-- Calculate epoch duration

    avg_loss_train = train_loss / len(dataloader)
    print(f"train Epoch {epoch + 1}/{num_epochs} - Loss: {avg_loss_train:.4f}")
    wandb.log({"epoch": epoch, "train_loss": avg_loss_train, "time_taken_epoch_train": epoch_duration})


    # validation loop
    decoder.eval()
    val_loss = 0.0
    best_val_loss = float('inf')
    start_time = time.time()
    with torch.no_grad():
      for val_inputs, val_targets in val_dataloader:
        # start_time = time.time()
        val_inputs, val_targets = val_inputs.to(device), val_targets.to(device)
        val_outputs = decoder(val_inputs)
        loss = criterion(outputs.view(-1, vocab_size), targets.view(-1))
        val_loss += loss.item()
        
        val_iteration += 1
        end_time = time.time()
        iteration_duration = (end_time - start_time)
        wandb.log({"val_loss": loss.item()})
        
    end_time = time.time()
    avg_val_loss = val_loss / len(val_dataloader)
    epoch_duration = end_time - start_time
    wandb.log({"epoch": epoch, "val_loss": avg_val_loss, "time_taken_epoch_val": epoch_duration})
    avg_loss_val = val_loss / len(val_dataloader)
    print(f"val Epoch {epoch + 1}/{num_epochs} - Loss: {avg_loss_val:.4f}")
    
    
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        save_checkpoint(epoch, decoder, optimizer, "best_checkpoint.pth.tar")
        stopping_counter = 0
    else:
        stopping_counter += 1
        if stopping_counter >= patience:
            print("Early stopping triggered")
            break

# Save your model.
model_path = f'cuda-train-50000-{name}.pth'

torch.save(decoder.state_dict(), model_path)
artifact = wandb.Artifact('model', type='model')
artifact.add_file(model_path)
run.log_artifact(artifact)
run.finish()


# torch.save(decoder.state_dict(), model_path)
# wandb.save(model_path)


train Epoch 1/30 - Loss: 4.0466
val Epoch 1/30 - Loss: 3.5273
train Epoch 2/30 - Loss: 3.1992
val Epoch 2/30 - Loss: 3.0645
train Epoch 3/30 - Loss: 2.8545
val Epoch 3/30 - Loss: 2.7305
train Epoch 4/30 - Loss: 2.6342
val Epoch 4/30 - Loss: 2.4492
train Epoch 5/30 - Loss: 2.4737
val Epoch 5/30 - Loss: 2.3223
train Epoch 6/30 - Loss: 2.3450
val Epoch 6/30 - Loss: 2.5000
train Epoch 7/30 - Loss: 2.2417
val Epoch 7/30 - Loss: 2.4707
train Epoch 8/30 - Loss: 2.1550
val Epoch 8/30 - Loss: 2.2637
train Epoch 9/30 - Loss: 2.0821
val Epoch 9/30 - Loss: 2.3438
train Epoch 10/30 - Loss: 2.0172
val Epoch 10/30 - Loss: 2.2090
train Epoch 11/30 - Loss: 1.9614
val Epoch 11/30 - Loss: 2.1309
train Epoch 12/30 - Loss: 1.9099
val Epoch 12/30 - Loss: 1.8193
train Epoch 13/30 - Loss: 1.8627
val Epoch 13/30 - Loss: 2.0059
train Epoch 14/30 - Loss: 1.8165
val Epoch 14/30 - Loss: 1.9209
train Epoch 15/30 - Loss: 1.7801
val Epoch 15/30 - Loss: 1.8467
train Epoch 16/30 - Loss: 1.7390
val Epoch 16/30 - Loss: 1

0,1
epoch,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇████
time_taken_epoch_train,▃█▅▄▅▅▁▇▃▄▄▄▄▄▅▂▃▅▂▄▆▅▄▄▄▅▅▁▃▆
time_taken_epoch_val,▇▆▅▃▆▄▇▄▆▇▇▅▃▆▄▅▅▂█▅▃▃▃▅▄▁▆▅▄▅
train_loss,█▇▆▅▄▅▄▄▃▃▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▁▂▂▂▁▁▂▁▁▁▁
val_loss,█▆▆▅▄▄▄▅▄▄▄▄▄▃▃▂▃▃▃▂▃▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▂▂

0,1
epoch,29.0
time_taken_epoch_train,68.08528
time_taken_epoch_val,5.15923
train_loss,1.37548
val_loss,1.60254


In [45]:
print('vocab_size', vocab_size)

def generate_story(model, device, max_length=200):
    model.eval()
    with torch.no_grad():
        input_token = sp.piece_to_id('<s>')
        output_sequence = [input_token] # we'll always get the same name because we are using the same model and the same starter token

        for i in range(max_length):
            input_tensor = torch.tensor([output_sequence]).long().to(device) # Move tensor to the correct device
            logit_output = model(input_tensor)

            softmax = nn.Softmax(dim=-1)
            softmax_output = softmax(logit_output)
            # Taking the token with the highest probability for prediction
            predicted_token = softmax_output[0, -1, :].argmax().item()

            # Break if we predict the end-of-string token
            if predicted_token == sp.piece_to_id('</s>'):
                break

            output_sequence.append(predicted_token)

        # Convert token IDs back to strings
        print(output_sequence[1:])
        generated_story= sp.decode_ids(output_sequence[1:])

    return generated_story

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
generated_story = generate_story(decoder.to(device), device) # Make sure your model is on the correct device
print(generated_story)


vocab_size 7600
[47, 30, 51, 8, 37, 4, 39, 9, 8, 36, 56, 65, 53, 26, 3, 14, 84, 7, 58, 154, 22, 6, 107, 3, 46, 25, 4, 20, 44, 8, 2202, 592, 83, 12, 3, 17, 2202, 9, 41, 295, 5, 31, 8, 295, 890, 640, 3, 26, 61, 7, 443, 6, 2202, 4, 50, 10, 257, 103, 3, 26, 9, 119, 5, 145, 13, 40, 187, 102, 7, 120, 3, 169, 4, 20, 44, 8, 2202, 592, 770, 3, 17, 2202, 81, 26, 102, 9, 486, 5, 6, 2202, 19, 4, 11, 692, 79, 34, 119, 4, 36, 3923, 75, 26, 289, 4, 11, 115, 516, 179, 1718, 3, 57, 226, 13, 40, 187, 175, 7, 443, 10, 49, 17, 2202, 257, 103, 5, 26, 9, 119, 3, 26, 31, 516, 12, 1718, 5, 173, 13, 40, 368, 372, 3, 14, 31, 7, 96, 6, 2202, 3]
Once upon a time, there was a little girl named Lily. She loved to play outside in the park. One day, she saw a pigeon flying around her. The pigeon was very pretty and had a pretty purple hat. Lily wanted to catch the pigeon, but it flew away. Lily was sad and didn't know what to do. Suddenly, she saw a pigeon flying nearby. The pigeon asked Lily what was wrong and the p

In [None]:
# Import
import wandb

model_path = 'cuda-train-50000-epoch-3.pth'

# Save your model.
torch.save(decoder.state_dict(), model_path)
# Save as artifact for version control.
run = wandb.init(project='story-generator')
artifact = wandb.Artifact('model', type='model')
artifact.add_file(model_path)
run.log_artifact(artifact)
run.finish()



VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

VBox(children=(Label(value='5.954 MB of 5.954 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

In [None]:
#Load
import wandb
import torch
import os

run = wandb.init()

artifact = run.use_artifact('serena_chan/story-generator/model:v0', type='model')
artifact_dir = artifact.download()

model_path = os.path.join(artifact_dir, 'cuda-train-50000-epoch-3.pth')

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Initialize your model architecture
decoder = TransformerDecoder(vocab_size, d_model).to(device)
# Load the state dictionary
decoder.load_state_dict(torch.load(model_path))

run.finish()

[34m[1mwandb[0m:   1 of 1 files downloaded.  


FileNotFoundError: ignored

In [None]:
print('vocab_size', vocab_size)

temperature = 1.5

def temperature_sampling(logits):
    # Divide the logits by the temperature
    logits = logits / temperature
    # Create a distribution
    distribution = torch.nn.functional.softmax(logits, dim=-1)
    # Sample from the distribution
    choice = torch.multinomial(distribution, 1)
    token = choice.squeeze().item()
    return token

def generate_story(model, device, max_length=200):
    model.eval()
    with torch.no_grad():
        input_token = sp.piece_to_id('<s>')
        output_sequence = [input_token] # we'll always get the same name because we are using the same model and the same starter token

        for i in range(max_length):
            input_tensor = torch.tensor([output_sequence]).long().to(device) # Move tensor to the correct device
            logit_output = model(input_tensor)

            predicted_token = temperature_sampling(logit_output[0, -1, :])

            # Break if we predict the end-of-string token
            if predicted_token == sp.piece_to_id('</s>'):
                break

            output_sequence.append(predicted_token)

        generated_story= sp.decode_ids(output_sequence[1:])

    return generated_story

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
generated_story = generate_story(decoder, device) # Make sure your model is on the correct device
print(generated_story)


vocab_size 8000
Anna and his grandma like to tell ketchup car walls house aloud foot a learn thermometer in fair ear all their knees and watching herself rich squares activities for sweets. Lila stumble and Anna, beg trains, a wheel boil appeared more egg, cart of friends skip around and her in the floor when she pointed there, then they heard them fly sand a thoughtful empty noise of Happylanditail. Molly's dead sounded sour dog poking on there stuck in town stop playing balls! On arrived against many secrets almost had extra cucumber setting wide, salt while until she reached Helen to get a fan curiously, cell. Mrs the neighbourhood tried in front of time itself, butsle whirl curled everywhere he showed it sound. "Letogit I answer that will be mighty flaps cloud first and buried a trees and watch the Ca suddenNext effort wants to put it on its head mitts examine her own plant onbagsizard saying hello together: okay for shining than town happened," Lila flashedfully. They squeezed sta