In [277]:
import os
from pprint import pprint

import lightning as L
import matplotlib.pyplot as plt
import numpy as np
import pytorch_lightning as pl
import seaborn as sns
import torch
import torch.nn.functional as F
from datasets import DatasetDict
from datasets import load_dataset
from dotenv import load_dotenv
from IPython.display import display, HTML
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from torch import nn
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from transformers import BasicTokenizer

%load_ext tensorboard
%matplotlib inline

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


In [232]:
load_dotenv()

True

In [227]:
import io
import sys
from IPython.display import display, HTML


class OutputCapturer:
    """Context manager that captures printed output and renders it as a styled HTML box.

    While the ``with`` block is active, ``sys.stdout`` is redirected into an
    in-memory buffer. On exit the previous stream is restored and the captured
    text is shown via ``display(HTML(...))`` inside a scrollable ``<div>``.

    Args:
        height: Maximum height of the rendered box in pixels; taller output scrolls.
        line_numbers: When true, every captured line is prefixed with ``"<n> | "``.
            Defaults to ``False``, which shows the text exactly as it was printed.
    """

    def __init__(self, height: int = 400, line_numbers: bool = False):
        self.height = height
        self.line_numbers = line_numbers
        self.output_buffer = io.StringIO()

    def __enter__(self):
        # Remember the active stream so __exit__ can put it back.
        self.original_stdout = sys.stdout
        sys.stdout = self.output_buffer
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # Local import keeps this cell runnable on its own.
        from html import escape

        sys.stdout = self.original_stdout
        output_text = self.output_buffer.getvalue()
        if self.line_numbers:
            output_text = "\n".join(
                f"{i + 1} | {line}" for i, line in enumerate(output_text.splitlines())
            )
        # Escape the text so captured "<", ">" and "&" are shown literally
        # instead of being interpreted as markup by the browser.
        safe_text = escape(output_text, quote=False)
        markup = f"""
        <div style="
            width: calc(100% - 40px);
            max-height: {self.height}px;
            overflow: auto;
            background: rgb(0,0,0);
            background: linear-gradient(0deg, rgba(10,10,10,1) 0%, rgba(31,31,31,1) 100%);
            color: #E3D8F1;
            padding: 20px;
            font-family: Fira Code, monospace;
            font-size: 14pt;
            font-weight: 500;
            border: solid 2px #E3D8F1;
        ">
            <pre style="
                word-wrap: normal;
            ">{safe_text}</pre>
        </div>
        """
        # Nothing is returned, so an exception raised inside the block propagates.
        display(HTML(markup))

In [50]:
ds = load_dataset("roneneldan/TinyStories")

We do not want to start with a large training set, because our code may contain errors that are faster to find on a small amount of data. For this purpose I selected a small dataset consisting of short stories generated by `GPT-4`.

We will use the `datasets` library from HuggingFace for convenience. This library will download the selected dataset and produce train-test splits automatically for us. We will use the default settings for loading the dataset since the other features (streaming, memory-pinning, etc.) are not very interesting for our use case. This will result in a dictionary object with two keys, one for the training set and the other for validation:


In [51]:
ds

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 2119719
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 21990
    })
})

We will save the validation set for the evaluation phase later. There are over two million samples in the training set and over twenty thousand in the validation set:


In [24]:
ds["train"], ds["validation"]

(Dataset({
     features: ['text'],
     num_rows: 2119719
 }),
 Dataset({
     features: ['text'],
     num_rows: 21990
 }))

Let's take a look at some of the samples. You can set the `batch_size` parameter below to control how many samples are displayed to the screen and set the `index` parameter to select which batch you want to inspect.


In [21]:
# Number of stories to show and the offset of the first one in the training split.
batch_size = 4
index = 0
# Slice the training split and keep only the list of story strings.
sample = ds["train"][index : index + batch_size]["text"]
for i, s in enumerate(sample):
    # Draw a horizontal rule before each story so the samples are visually separated.
    display(
        HTML('<hr style="border: none; border-top: 3px solid #3083DC; width: 100%;">')
    )
    print(f"Sample {i}: {s}")

Sample 0: One day, a little girl named Lily found a needle in her room. She knew it was difficult to play with it because it was sharp. Lily wanted to share the needle with her mom, so she could sew a button on her shirt.

Lily went to her mom and said, "Mom, I found this needle. Can you share it with me and sew my shirt?" Her mom smiled and said, "Yes, Lily, we can share the needle and fix your shirt."

Together, they shared the needle and sewed the button on Lily's shirt. It was not difficult for them because they were sharing and helping each other. After they finished, Lily thanked her mom for sharing the needle and fixing her shirt. They both felt happy because they had shared and worked together.


Sample 1: Once upon a time, there was a little car named Beep. Beep loved to go fast and play in the sun. Beep was a healthy car because he always had good fuel. Good fuel made Beep happy and strong.

One day, Beep was driving in the park when he saw a big tree. The tree had many leaves that were falling. Beep liked how the leaves fall and wanted to play with them. Beep drove under the tree and watched the leaves fall on him. He laughed and beeped his horn.

Beep played with the falling leaves all day. When it was time to go home, Beep knew he needed more fuel. He went to the fuel place and got more healthy fuel. Now, Beep was ready to go fast and play again the next day. And Beep lived happily ever after.


Sample 2: One day, a little fish named Fin was swimming near the shore. He saw a big crab and wanted to be friends. "Hi, I am Fin. Do you want to play?" asked the little fish. The crab looked at Fin and said, "No, I don't want to play. I am cold and I don't feel fine."

Fin felt sad but wanted to help the crab feel better. He swam away and thought of a plan. He remembered that the sun could make things warm. So, Fin swam to the top of the water and called to the sun, "Please, sun, help my new friend feel fine and not freeze!"

The sun heard Fin's call and shone its warm light on the shore. The crab started to feel better and not so cold. He saw Fin and said, "Thank you, little fish, for making me feel fine. I don't feel like I will freeze now. Let's play together!" And so, Fin and the crab played and became good friends.


Sample 3: Once upon a time, in a land full of trees, there was a little cherry tree. The cherry tree was very sad because it did not have any friends. All the other trees were big and strong, but the cherry tree was small and weak. The cherry tree was envious of the big trees.

One day, the cherry tree felt a tickle in its branches. It was a little spring wind. The wind told the cherry tree not to be sad. The wind said, "You are special because you have sweet cherries that everyone loves." The cherry tree started to feel a little better.

As time went on, the cherry tree grew more and more cherries. All the animals in the land came to eat the cherries and play under the cherry tree. The cherry tree was happy because it had many friends now. The cherry tree learned that being different can be a good thing. And they all lived happily ever after.


Each story consists of a few sentences without special characters or many numeric values. We will include these in the final training run when we switch to a larger training set, but for now this data will suffice.


## Tokenization


We cannot feed the sequences of words found in the dataset directly into the model. We need to translate each sequence into atomic units of language we call _tokens_.


It is important that the tokenizer used to train the model is also used for inference. If a different tokenizer is used, a word might be split in a way the model does not expect, which will yield undesirable results. This is why in HuggingFace and other Machine Learning tools you will encounter tokenizers that are named after the model they are associated with (e.g. `T5Tokenizer`, `BertTokenizer`, etc.). The fact that a tokenizer was used to train a popular foundation language model (such as BERT) does not prevent you from using it for another NLP model, as long as you pre-train that model yourself with the same tokenizer.


In [3]:
# Build a tokenizer around a BPE model whose unknown-token is "[UNK]".
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
# Trainer configured with the special tokens to include in the vocabulary.
trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])

We will use a simple pre-tokenizer that splits on whitespace and separates punctuation from words.


In [25]:
from tokenizers.pre_tokenizers import Whitespace

tokenizer.pre_tokenizer = Whitespace()

In [26]:
tokenizer.train_from_iterator(ds["train"]["text"], trainer=trainer)






In [28]:
# Make sure the target directory exists before writing the tokenizer file.
os.makedirs("models", exist_ok=True)
tokenizer.save("models/tokenizer.json")

In [36]:
sentence = "Training a tokenizer is not hard when you have good libraries."
print(tokenizer.encode(sentence).tokens)

['T', 'raining', 'a', 'token', 'izer', 'is', 'not', 'hard', 'when', 'you', 'have', 'good', 'libraries', '.']


In [35]:
sentence = ds["train"][0]["text"]
print(tokenizer.encode(sentence).tokens)

['One', 'day', ',', 'a', 'little', 'girl', 'named', 'Lily', 'found', 'a', 'needle', 'in', 'her', 'room', '.', 'She', 'knew', 'it', 'was', 'difficult', 'to', 'play', 'with', 'it', 'because', 'it', 'was', 'sharp', '.', 'Lily', 'wanted', 'to', 'share', 'the', 'needle', 'with', 'her', 'mom', ',', 'so', 'she', 'could', 'sew', 'a', 'button', 'on', 'her', 'shirt', '.', 'Lily', 'went', 'to', 'her', 'mom', 'and', 'said', ',', '"', 'Mom', ',', 'I', 'found', 'this', 'needle', '.', 'Can', 'you', 'share', 'it', 'with', 'me', 'and', 'sew', 'my', 'shirt', '?"', 'Her', 'mom', 'smiled', 'and', 'said', ',', '"', 'Yes', ',', 'Lily', ',', 'we', 'can', 'share', 'the', 'needle', 'and', 'fix', 'your', 'shirt', '."', 'Together', ',', 'they', 'shared', 'the', 'needle', 'and', 'sewed', 'the', 'button', 'on', 'Lily', "'", 's', 'shirt', '.', 'It', 'was', 'not', 'difficult', 'for', 'them', 'because', 'they', 'were', 'sharing', 'and', 'helping', 'each', 'other', '.', 'After', 'they', 'finished', ',', 'Lily', 'thank

In [None]:
# Encode the first four training stories in one call and print each token
# list inside the scrollable OutputCapturer box instead of the raw cell output.
sentences = ds["train"][0:4]["text"]
encoded_batch = tokenizer.encode_batch(sentences)
with OutputCapturer() as capturer:
    for encoding in encoded_batch:
        print(encoding.tokens)

In [52]:
# NOTE: this rebinds `ds` from the DatasetDict loaded earlier to the train split only.
ds = load_dataset("roneneldan/TinyStories", split="train")

In [54]:
ds[0]

{'text': 'One day, a little girl named Lily found a needle in her room. She knew it was difficult to play with it because it was sharp. Lily wanted to share the needle with her mom, so she could sew a button on her shirt.\n\nLily went to her mom and said, "Mom, I found this needle. Can you share it with me and sew my shirt?" Her mom smiled and said, "Yes, Lily, we can share the needle and fix your shirt."\n\nTogether, they shared the needle and sewed the button on Lily\'s shirt. It was not difficult for them because they were sharing and helping each other. After they finished, Lily thanked her mom for sharing the needle and fixing her shirt. They both felt happy because they had shared and worked together.'}

In [88]:
def encode(samples):
    """Tokenize a batch of samples with the notebook-level ``tokenizer``.

    Args:
        samples: Mapping whose ``"text"`` entry holds the batch of strings.

    Returns:
        A dict with a single ``"tokens"`` key holding one token list per input text.
    """
    encodings = tokenizer.encode_batch(samples["text"])
    token_lists = []
    for encoding in encodings:
        token_lists.append(encoding.tokens)
    return {"tokens": token_lists}

In [89]:
# Run `encode` over the split in batches of 2,048 rows; `encode` returns the
# token strings of each story under the "tokens" key.
train_ds = ds.map(encode, batched=True, batch_size=2_048)

Map:   0%|          | 0/2119719 [00:00<?, ? examples/s]

Map: 100%|██████████| 2119719/2119719 [05:59<00:00, 5902.74 examples/s]


In [239]:
# NOTE: `ds` is rebound again and from here on refers to the validation split.
ds = load_dataset("roneneldan/TinyStories", split="validation")

In [245]:
test_ds = ds.map(encode, batched=True, batch_size=2_048)

In [231]:
with OutputCapturer() as capturer:
    pprint(train_ds[0], compact=True)

In [251]:
# Bundle both tokenized splits; note that the tokenized validation split
# (`test_ds`) is stored under the "test" key.
dataset = DatasetDict({"train": train_ds, "test": test_ds})

In [223]:
!huggingface-cli login --token $HUGGINGFACE_TOKEN

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/dwalker/.cache/huggingface/token
Login successful


In [236]:
!yes | huggingface-cli repo create gpt4_short_stories_with_tokens --type dataset --organization athena-ml

[90mgit version 2.34.1[0m
[90mgit-lfs/3.0.2 (GitHub; linux amd64; go 1.18.1)[0m

You are about to create [1mdatasets/athena-ml/gpt4_short_stories_with_tokens[0m
Proceed? [Y/n] 409 Client Error: Conflict for url: https://huggingface.co/api/repos/create (Request ID: Root=1-66d7b06e-67e498be1aba9301452914f2;49df577c-64b3-453f-9854-f73051d02a35)

You already created this dataset repo
[1m[31m{"error":"You already created this dataset repo","url":"https://huggingface.co/datasets/athena-ml/gpt4_short_stories_with_tokens"}[0m
yes: standard output: Broken pipe


In [252]:
dataset.push_to_hub("athena-ml/gpt4_short_stories_with_tokens")

Creating parquet from Arrow format: 100%|██████████| 193/193 [00:04<00:00, 45.92ba/s]
Creating parquet from Arrow format: 100%|██████████| 193/193 [00:04<00:00, 45.59ba/s]
Creating parquet from Arrow format: 100%|██████████| 193/193 [00:04<00:00, 45.88ba/s]
Creating parquet from Arrow format: 100%|██████████| 193/193 [00:04<00:00, 45.39ba/s]
Creating parquet from Arrow format: 100%|██████████| 193/193 [00:04<00:00, 46.64ba/s]
Creating parquet from Arrow format: 100%|██████████| 193/193 [00:04<00:00, 45.28ba/s]
Creating parquet from Arrow format: 100%|██████████| 193/193 [00:04<00:00, 44.71ba/s]
Creating parquet from Arrow format: 100%|██████████| 193/193 [00:04<00:00, 46.11ba/s]
Creating parquet from Arrow format: 100%|██████████| 193/193 [00:04<00:00, 46.14ba/s]
Creating parquet from Arrow format: 100%|██████████| 193/193 [00:04<00:00, 45.47ba/s]
Creating parquet from Arrow format: 100%|██████████| 193/193 [00:04<00:00, 45.39ba/s]
Uploading the dataset shards: 100%|██████████| 11/11 [

CommitInfo(commit_url='https://huggingface.co/datasets/athena-ml/gpt4_short_stories_with_tokens/commit/09c90ea6803d3bade5c4764bb7df059122c36d9a', commit_message='Upload dataset', commit_description='', oid='09c90ea6803d3bade5c4764bb7df059122c36d9a', pr_url=None, pr_revision=None, pr_num=None)

In [262]:
writer = SummaryWriter()

In [263]:
pl.seed_everything(577)

Seed set to 577


577

In [273]:
class W2V_CBOW(L.LightningModule):
    """Continuous Bag of Words (CBOW) flavour of Word2Vec as a Lightning module.

    Inputs are embedded, the embeddings are averaged along dimension 1, and a
    linear layer maps that average to one score per vocabulary entry.

    Args:
        vocab_size: Number of rows in the embedding table and of output scores.
        embedding_dim: Width of each embedding vector.
    """

    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        self.neighborhood_size = 2
        self.vocab_size = vocab_size
        self.projection_dim = embedding_dim
        # max_norm=1 caps the norm of the looked-up embedding vectors.
        self.projection_layer = nn.Embedding(vocab_size, embedding_dim, max_norm=1)
        self.linear_layer = nn.Linear(embedding_dim, vocab_size)

    def forward(self, x):
        """Return vocabulary scores for ``x``.

        Assumes ``x`` holds integer token ids shaped (batch, context) - TODO confirm.
        """
        averaged = self.projection_layer(x).mean(dim=1)
        return self.linear_layer(averaged)

    def training_step(self, batch, batch_idx):
        """Compute the cross-entropy loss for one ``(inputs, targets)`` batch."""
        inputs, targets = batch
        logits = self(inputs)
        # Flatten both sides so the loss sees one row of scores per target.
        return F.cross_entropy(logits.view(-1, self.vocab_size), targets.view(-1))

    def configure_optimizers(self):
        """Optimise all parameters with Adam using its default settings."""
        optimizer = torch.optim.Adam(self.parameters())
        return optimizer

In [None]:
##############################################################


def plot_dists(val_dict, color="C0", xlabel=None, stat="count", use_kde=True):
    """Draw one histogram per entry of ``val_dict`` side by side.

    Args:
        val_dict: Mapping from a label to an array of values. Entries are
            plotted in sorted key order.
        color: Colour passed to ``sns.histplot``.
        xlabel: Optional x-axis label applied to every subplot.
        stat: ``stat`` argument forwarded to ``sns.histplot``.
        use_kde: Whether to overlay a KDE; it is skipped automatically for
            entries whose values have (almost) no spread.

    Returns:
        The matplotlib figure holding the subplots.

    Raises:
        ValueError: If ``val_dict`` is empty.
    """
    columns = len(val_dict)
    if columns == 0:
        raise ValueError("val_dict must contain at least one entry to plot")
    # squeeze=False always yields a 2-D axes grid, so a single entry works too
    # (with the default, one column returns a bare Axes that cannot be indexed).
    fig, axes = plt.subplots(
        1, columns, figsize=(columns * 3, 2.5), squeeze=False
    )
    for key_ax, key in zip(axes[0], sorted(val_dict.keys())):
        values = val_dict[key]
        sns.histplot(
            values,
            ax=key_ax,
            color=color,
            bins=50,
            stat=stat,
            # Only plot kde if there is variance
            kde=use_kde and ((values.max() - values.min()) > 1e-8),
        )
        # For 2-D arrays, show the shape as "(in -> out)" in the title.
        hidden_dim_str = (
            r"(%i $\to$ %i)" % (values.shape[1], values.shape[0])
            if len(values.shape) > 1
            else ""
        )
        key_ax.set_title(f"{key} {hidden_dim_str}")
        if xlabel is not None:
            key_ax.set_xlabel(xlabel)
    fig.subplots_adjust(wspace=0.4)
    return fig


##############################################################


def visualize_weight_distribution(model, color="C0"):
    """Plot a histogram of the values of every non-bias parameter of ``model``.

    Args:
        model: Object exposing ``named_parameters()``.
        color: Colour forwarded to ``plot_dists``.
    """
    weights = {}
    for name, param in model.named_parameters():
        if name.endswith(".bias"):
            continue
        # Label each plot with the full parameter path (minus a trailing
        # ".weight") so differently named layers never overwrite each other.
        suffix = ".weight"
        label = name[: -len(suffix)] if name.endswith(suffix) else name
        weights[f"Layer {label}"] = param.detach().reshape(-1).cpu().numpy()

    # Plotting
    fig = plot_dists(weights, color=color, xlabel="Weight vals")
    fig.suptitle("Weight distribution", fontsize=14, y=1.05)
    plt.show()
    plt.close(fig)


##############################################################


def visualize_gradients(model, color="C0", print_variance=False, dataset=None):
    """Plot the distribution of weight gradients for one batch.

    A single batch of up to 1024 samples is pushed through ``model``, the
    cross-entropy loss is back-propagated, and the gradients of all parameters
    whose name contains "weight" are shown as histograms.

    Args:
        model: Network to inspect; called as ``model(inputs)``.
        color: Color in which we want to visualize the histogram.
        print_variance: If true, also print the gradient variance per parameter.
        dataset: Dataset yielding ``(inputs, labels)`` pairs. When omitted, the
            notebook-level ``train_set`` variable is used, as before.
    """
    if dataset is None:
        dataset = train_set  # backward-compatible fallback to the notebook global
    # Run on whatever device the model lives on instead of relying on a global.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    was_training = model.training
    model.eval()
    try:
        small_loader = DataLoader(dataset, batch_size=1024, shuffle=False)
        inputs, labels = next(iter(small_loader))
        inputs, labels = inputs.to(device), labels.to(device)

        # Pass one batch through the network, and calculate the gradients for the weights
        model.zero_grad()
        preds = model(inputs)
        loss = F.cross_entropy(preds, labels)
        loss.backward()
        # We limit our visualization to the weight parameters and exclude the bias
        # to reduce the number of plots. Parameters that took no part in the
        # forward pass have no gradient and are skipped.
        grads = {
            name: params.grad.reshape(-1).cpu().clone().numpy()
            for name, params in model.named_parameters()
            if "weight" in name and params.grad is not None
        }
        model.zero_grad()
    finally:
        # Do not leave the caller's model stuck in eval mode.
        model.train(was_training)

    # Plotting
    fig = plot_dists(grads, color=color, xlabel="Grad magnitude")
    fig.suptitle("Gradient distribution", fontsize=14, y=1.05)
    plt.show()
    plt.close(fig)

    if print_variance:
        for key in sorted(grads.keys()):
            print(f"{key} - Variance: {np.var(grads[key])}")


##############################################################


def visualize_activations(model, color="C0", print_variance=False, dataset=None):
    """Plot the distribution of the outputs of every ``nn.Linear`` layer.

    A single batch of up to 1024 samples is flattened to ``(batch, -1)`` and fed
    through ``model.layers`` one layer at a time; the output of each
    ``nn.Linear`` layer is recorded and shown as a histogram.

    Args:
        model: Network exposing an iterable ``layers`` attribute.
        color: Colour forwarded to ``plot_dists``.
        print_variance: If true, also print the activation variance per layer.
        dataset: Dataset yielding ``(inputs, labels)`` pairs. When omitted, the
            notebook-level ``train_set`` variable is used, as before.
    """
    if dataset is None:
        dataset = train_set  # backward-compatible fallback to the notebook global
    # Run on whatever device the model lives on instead of relying on a global.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    was_training = model.training
    model.eval()
    try:
        small_loader = DataLoader(dataset, batch_size=1024, shuffle=False)
        inputs, _ = next(iter(small_loader))
        inputs = inputs.to(device)

        # Pass one batch through the network layer by layer and record the
        # output of each linear layer (no gradients are needed here).
        feats = inputs.reshape(inputs.shape[0], -1)
        activations = {}
        with torch.no_grad():
            for layer_index, layer in enumerate(model.layers):
                feats = layer(feats)
                if isinstance(layer, nn.Linear):
                    activations[f"Layer {layer_index}"] = (
                        feats.reshape(-1).detach().cpu().numpy()
                    )
    finally:
        # Do not leave the caller's model stuck in eval mode.
        model.train(was_training)

    # Plotting
    fig = plot_dists(activations, color=color, stat="density", xlabel="Activation vals")
    fig.suptitle("Activation distribution", fontsize=14, y=1.05)
    plt.show()
    plt.close(fig)

    if print_variance:
        for key in sorted(activations.keys()):
            print(f"{key} - Variance: {np.var(activations[key])}")


##############################################################

In [274]:
def const_init(model, fill=0.0):
    """Overwrite every parameter of ``model`` in place with the constant ``fill``.

    Args:
        model: Module whose parameters are overwritten.
        fill: Value written to every element of every parameter.
    """
    # In-place writes under no_grad() replace the discouraged `.data` access.
    with torch.no_grad():
        for param in model.parameters():
            param.fill_(fill)

In [275]:
# The CBOW model class in this notebook is named W2V_CBOW; `Word2Vec` is not
# defined anywhere and fails with a NameError on a fresh kernel.
model = W2V_CBOW(vocab_size=100, embedding_dim=10)
# Start from identical constant weights so the distributions are easy to inspect.
const_init(model, fill=0.005)

In [259]:
%tensorboard --logdir=runs

ERROR: Timed out waiting for TensorBoard to start. It may still be running as pid 1667.