In [1]:
# Mount Google Drive inside the Colab VM at /content/drive so that the archives
# copied in the following cell are reachable from the local filesystem.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Copy the zipped checkpoint from the mounted Drive into the current working
# directory and unpack it there.
! cp /content/drive/MyDrive/ERAv1/S22/session22/iter-010915-ckpt.pth.zip .
! unzip iter-010915-ckpt.pth.zip


# Same for data1.zip.
# NOTE(review): the archive's contents are not visible here; presumably it
# provides the tokenizer directory and/or package used later - confirm.
! cp /content/drive/MyDrive/ERAv1/S22/session22/data1.zip .
! unzip data1.zip

In [3]:
# Install Lightning and SentencePiece quietly (-q).
# NOTE(review): versions are not pinned, so a later re-run may pull different releases.
! pip install lightning sentencepiece -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m805.2/805.2 kB[0m [31m16.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m776.3/776.3 kB[0m [31m17.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [46]:
# Imported here (in addition to the main import cell) because this cell sits above
# the import cells: the decorator below needs `torch` at definition time.
import torch


@torch.inference_mode()
def generate(
    model: "GPT",  # string annotation: GPT is imported in a later cell and must not be resolved at def time
    idx: torch.Tensor,
    max_returned_tokens: int,
    *,
    temperature: float = 1.0,
    top_k: int = None,
    eos_id: int = None,
) -> torch.Tensor:
    """Takes a conditioning sequence (prompt) as input and continues to generate as many tokens as requested.

    The implementation of this function is modified from A. Karpathy's nanoGPT.

    Args:
        model: The model to use. It is called as ``model(x, input_pos)`` and must expose ``max_seq_length``.
        idx: Tensor of shape (T) with indices of the prompt sequence.
        max_returned_tokens: The maximum number of tokens to return (given plus generated).
        temperature: Scales the predicted logits by 1 / temperature. A value of 0 (or below) disables
            sampling and picks the highest-scoring token at every step (greedy decoding).
        top_k: If specified, only sample among the tokens with the k highest probabilities. Non-integer
            numbers (e.g. a float coming from a UI slider) are truncated to int.
        eos_id: If specified, stop generating any more token once the <eos> token is triggered.

    Returns:
        A 1-D tensor with the prompt followed by the generated tokens. When ``eos_id`` is produced the
        sequence is cut right after it, so the EOS token is the last element.

    Raises:
        AssertionError: if ``max_returned_tokens`` does not exceed the prompt length.
        NotImplementedError: if ``model.max_seq_length`` is smaller than ``max_returned_tokens - 1``.
    """
    T = idx.size(0)
    assert max_returned_tokens > T
    if model.max_seq_length < max_returned_tokens - 1:
        # rolling the kv cache based on the `input_pos` value would be necessary. However, doing so would introduce a
        # data dependency on the `input_pos` tensor and impact model compilation. Since this setting is uncommon, we do
        # not support it to avoid negatively impacting the overall speed
        raise NotImplementedError(f"max_seq_length {model.max_seq_length} needs to be >= {max_returned_tokens - 1}")

    device, dtype = idx.device, idx.dtype
    # create an empty tensor of the expected final shape and fill in the current tokens
    empty = torch.empty(max_returned_tokens, dtype=dtype, device=device)
    empty[:T] = idx
    idx = empty
    # first step feeds the whole prompt; later steps feed only the newest position
    input_pos = torch.arange(0, T, device=device)

    # dividing by a zero temperature would yield inf/nan logits and make multinomial fail
    greedy = temperature <= 0.0

    # generate up to a fixed number of tokens
    for _ in range(max_returned_tokens - T):
        x = idx.index_select(0, input_pos).view(1, -1)

        # forward: keep only the logits of the last fed position
        logits = model(x, input_pos)
        logits = logits[0, -1]
        if not greedy:
            logits = logits / temperature

        # optionally crop the logits to only the top k options
        if top_k is not None:
            k = min(int(top_k), logits.size(-1))  # torch.topk requires an integer k
            v, _ = torch.topk(logits, k)
            logits = torch.where(logits < v[[-1]], -float("Inf"), logits)

        if greedy:
            idx_next = torch.argmax(logits, dim=-1, keepdim=True).to(dtype=dtype)
        else:
            probs = torch.nn.functional.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1).to(dtype=dtype)

        # advance: `input_pos` now holds the single index where the new token is stored
        input_pos = input_pos[-1:] + 1

        # concatenate the new generation
        idx = idx.index_copy(0, input_pos, idx_next)

        # if <eos> token is triggered, return the output (stop generation)
        if eos_id is not None and idx_next == eos_id:
            # the EOS token lives at index `input_pos`, so the slice must end one past it
            end = int(input_pos) + 1
            return idx[:end]

    return idx


In [31]:
import time
import torch
import lightning as L
from torch.utils.data import DataLoader
from lightning.fabric.loggers import CSVLogger
from lightning.fabric.strategies import FSDPStrategy

In [32]:
from tsai_gpt.model import GPT, Block, Config
from tsai_gpt.tokenizer import Tokenizer
from tsai_gpt.packed_dataset import CombinedDataset, PackedDataset
from tsai_gpt.speed_monitor import SpeedMonitorBase, estimate_flops, measure_flops
from tsai_gpt.speed_monitor import SpeedMonitorFabric as SpeedMonitor
from tsai_gpt.utils import chunked_cross_entropy, get_default_supported_precision, num_parameters, load_checkpoint, gptq_quantization

In [33]:
# Run-level settings.
# NOTE(review): within the visible cells only `model_name` is read (by
# Config.from_name); the remaining values look like leftovers from a training
# script - confirm before removing.
model_name = "pythia-160m"  # configuration name looked up via Config.from_name
name = "redpajama"
save_interval = 1000
eval_interval = 1000
eval_iters = 100
log_interval = 100

In [7]:
# Hyperparameters
# NOTE(review): no visible cell in this notebook reads these values; they appear
# to be carried over from a training script - confirm before relying on them.
learning_rate = 6e-3
batch_size = 32
micro_batch_size = 8
# Number of micro-batches per full batch (32 // 8 == 4 with the values above).
gradient_accumulation_steps = batch_size // micro_batch_size
assert gradient_accumulation_steps > 0
#max_iters = 600000  # num_epochs * (epoch_size // micro_batch_size) // devices
max_iters = 15000
weight_decay = 1e-1
beta1 = 0.9
beta2 = 0.95
grad_clip = 1.0
decay_lr = True
warmup_iters = 2000
lr_decay_iters = max_iters  # decay horizon is tied to the total iteration count
min_lr = 6e-6

In [34]:
import torch
import torch.nn as nn


def _init_weights(module: nn.Module) -> None:
        """Meant to be used with `gpt.apply(gpt._init_weights)`."""
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)



In [35]:
# Look up the configuration registered under `model_name` and build a GPT model from it.
config = Config.from_name(model_name)
model = GPT(config)

In [None]:
# Display the model's repr as the cell output.
model

In [37]:
# Sum of the first parameter tensor, used as a quick fingerprint of the current weights.
# Original scratch note: "-25 -2 -860" (presumably sums seen on earlier runs - unverified).
next(model.parameters()).sum() #-25 -2 -860

tensor(-86016., dtype=torch.bfloat16, grad_fn=<SumBackward0>)

In [None]:
# Run _init_weights on every sub-module of the model (nn.Module.apply visits them recursively).
model.apply(_init_weights)

In [None]:
# NOTE(review): bare attribute reference - nothing is called here, so no weights
# are loaded by this cell. The checkpoint is loaded later via load_checkpoint(...).
model.load_state_dict

In [37]:
from pathlib import Path

In [38]:
# Settings for the sampling cells below.
prompt= "Hello, my name is"  # text that the model continues
num_samples = 1  # how many independent continuations the sampling loop produces
max_new_tokens = 50  # added to the prompt length to get max_returned_tokens
top_k = 200  # forwarded to generate(..., top_k=...)
temperature = 0.8  # forwarded to generate(..., temperature=...)
checkpoint_dir = Path("/content/iter-010915-ckpt.pth")  # passed to load_checkpoint
quantize = None  # only the value "gptq.int4" turns on gptq_quantization when the model is built
strategy = "auto"  # forwarded to L.Fabric
devices = 1  # forwarded to L.Fabric
precision = None  # placeholder; overwritten by get_default_supported_precision before Fabric is created

In [39]:
import sys

# Pick the default precision for inference (training=False) and create the Fabric
# object that the following cells use for module setup, device placement and printing.
precision = get_default_supported_precision(training=False)
plugins = None
fabric = L.Fabric(devices=devices, precision=precision, strategy=strategy, plugins=plugins)
fabric.launch()
# Log the checkpoint path together with the full model configuration to stderr.
fabric.print(f"Loading model {str(checkpoint_dir)!r} with {config.__dict__}", file=sys.stderr)


Loading model '/content/iter-010915-ckpt.pth' with {'name': 'pythia-160m', 'hf_config': {'org': 'EleutherAI', 'name': 'pythia-160m-deduped'}, 'block_size': 2048, 'vocab_size': 50254, 'padding_multiple': 128, 'padded_vocab_size': 50304, 'n_layer': 12, 'n_head': 12, 'n_embd': 768, 'rotary_percentage': 0.25, 'parallel_residual': True, 'bias': True, 'lm_head_bias': False, 'n_query_groups': 12, 'shared_attention_norm': False, '_norm_class': 'LayerNorm', 'norm_eps': 1e-05, '_mlp_class': 'GptNeoxMLP', 'gelu_approximate': 'none', 'intermediate_size': 3072, 'rope_condense_ratio': 1, 'rope_base': 10000, 'head_size': 64, 'rope_n_elem': 16}


In [40]:
# Start the timer BEFORE building the model; taking it afterwards (as this cell
# used to) always reports 0.00 seconds.
t0 = time.perf_counter()
# empty_init=True is passed so Fabric can skip weight initialisation; the real
# weights are loaded from the checkpoint in the next cell. GPTQ quantization is
# only enabled when `quantize` equals "gptq.int4".
with fabric.init_module(empty_init=True), gptq_quantization(quantize=="gptq.int4"):
    model = GPT(config)

fabric.print(f"Time to instantiate model: {time.perf_counter() - t0:.02f} seconds.", file=sys.stderr)

Time to instantiate model: 0.00 seconds.


In [41]:
# Switch to evaluation mode and let Fabric wrap the module.
model.eval()
model = fabric.setup_module(model)

# Load the checkpoint into the model and report how long it took.
t0 = time.perf_counter()
load_checkpoint(fabric, model, checkpoint_dir)
fabric.print(f"Time to load the model weights: {time.perf_counter() - t0:.02f} seconds.", file=sys.stderr)

Time to load the model weights: 8.34 seconds.


In [43]:
# Build the tokenizer from the files under /content/tokenizer and encode the
# prompt on Fabric's device.
tokenizer = Tokenizer(Path('/content/tokenizer'))
encoded = tokenizer.encode(prompt, device=fabric.device)
prompt_length = encoded.size(0)
# Total length budget handed to generate(): prompt tokens plus newly generated tokens.
max_returned_tokens = prompt_length + max_new_tokens

In [44]:
# Assign inside fabric.init_tensor() so that any tensors created as a result of
# the assignment follow Fabric's tensor-init settings.
with fabric.init_tensor():
    # set the max_seq_length to limit the memory usage to what we need
    model.max_seq_length = max_returned_tokens

In [55]:
# Generate `num_samples` continuations of the encoded prompt, printing each one
# followed by its timing on stderr.
for i in range(num_samples):
    with fabric.init_tensor():
        # enable the kv cache
        model.set_kv_cache(batch_size=1)

    # Time only the generation call itself.
    t0 = time.perf_counter()
    y = generate(model, encoded, max_returned_tokens, temperature=temperature, top_k=top_k)
    t = time.perf_counter() - t0

    fabric.print(tokenizer.decode(y))
    # `y` holds prompt + generated tokens, so subtract the prompt length.
    tokens_generated = y.size(0) - prompt_length
    fabric.print(
        f"Time for inference {i + 1}: {t:.02f} sec total, {tokens_generated / t:.02f} tokens/sec", file=sys.stderr
    )

Hello, my name is Red Hat (aka "Dentals") and I think we got born here.
I agree with that same name, and I still get a name in Red Hat!
I would love their name and to the name of the Red Hat


Time for inference 1: 10.63 sec total, 4.70 tokens/sec


In [None]:
# Install Gradio for the demo UI built in the cells below.
# NOTE(review): the version is not pinned.
! pip install gradio

In [29]:
import random
import torch
from torch import nn
import lightning.pytorch as pl
from torch.nn import functional as F

# Prefer the GPU when CUDA is available.
# NOTE(review): no visible cell reads `device`; tensors are placed via fabric.device instead.
device     = 'cuda' if torch.cuda.is_available() else 'cpu'


def generate_dialogue(input_text, temperature, max_tokens, top_k):
    """Continue `input_text` with the loaded model and return the decoded text.

    Relies on the notebook-level `tokenizer`, `fabric`, `model` and `generate`
    objects created in earlier cells.

    Args:
        input_text: Prompt to continue.
        temperature: Sampling temperature forwarded to `generate`.
        max_tokens: Number of new tokens to generate on top of the prompt. UI
            sliders can deliver floats, so the value is truncated to int.
        top_k: Restrict sampling to the k highest-scoring tokens; truncated to
            int for the same reason.

    Returns:
        The decoded string for the prompt plus the generated continuation.
    """
    # A float here would break torch.empty(...) / torch.topk(...) inside generate().
    max_tokens = int(max_tokens)
    top_k = int(top_k)

    encoded = tokenizer.encode(input_text, device=fabric.device)
    max_returned_tokens = encoded.size(0) + max_tokens

    with fabric.init_tensor():
        # set the max_seq_length to limit the memory usage to what we need
        model.max_seq_length = max_returned_tokens
        # (re)build the kv cache for a single sequence
        model.set_kv_cache(batch_size=1)

    y = generate(model, encoded, max_returned_tokens, temperature=temperature, top_k=top_k)

    return tokenizer.decode(y)




# Prompts offered as one-click examples in the demo UI.
example_text = [
    "In a galaxy far, far away, an intergalactic council convenes to discuss the rising cost of lightsaber batteries. Among them is an unlikely representative: a droid with a penchant for economics...",
    "As Sherlock Holmes and Dr. Watson enter the world of social media influencers, they find their first case: the mysterious disappearance of a famous TikTok star's like button.",
    "In the midst of a zombie apocalypse, a group of survivors discovers a library with every book intact except for cookbooks. Their leader, a former TV chef, decides to write the ultimate survival recipe book titled...",
    "A time traveler accidentally attends Shakespeare's first play, but instead of a quill, she hands him a smartphone with autocorrect. The resulting play is...",
    "Amidst the chaos of a Hogwarts School reunion, a magical mishap swaps the voices of Professors Dumbledore and Snape, leading to an unexpected duet in the Great Hall that goes viral in the wizarding world."
]

# Local, seeded RNG: the example settings are identical on every run and the
# global `random` state is left alone.
_example_rng = random.Random(0)

# One row per prompt, in the argument order of generate_dialogue:
#   [input_text, temperature, max_tokens, top_k]
# The temperature starts at 0.1 rather than 0: a temperature of exactly 0.0 would
# make generate() divide the logits by zero while the examples are being cached.
examples = [
    [
        text,
        round(_example_rng.uniform(0.1, 1.0), 1),
        _example_rng.randint(50, 200),
        _example_rng.randint(100, 300),
    ]
    for text in example_text
]

In [53]:

import gradio as gr
import torch
from torch import nn
import lightning.pytorch as pl
from torch.nn import functional as F


# Static header (inline CSS + markup) rendered at the top of the demo through gr.HTML.
# The #app-header container is closed explicitly at the end of the template.
HTML_TEMPLATE = """
<style>

    #app-header {
        text-align: center;
        background: rgba(255, 255, 255, 0.3); /* Semi-transparent white */
        padding: 20px;
        border-radius: 10px;
        box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
        position: relative; /* To position the artifacts */
    }
    #app-header h1 {
        color: #FF0000;
        font-size: 2em;
        margin-bottom: 10px;
    }
    .concept {
        position: relative;
        transition: transform 0.3s;
    }
    .concept:hover {
        transform: scale(1.1);
    }
    .concept img {
        width: 100px;
        border-radius: 10px;
        box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
    }
    .concept-description {
        position: absolute;
        bottom: -30px;
        left: 50%;
        transform: translateX(-50%);
        background-color: #4CAF50;
        color: white;
        padding: 5px 10px;
        border-radius: 5px;
        opacity: 0;
        transition: opacity 0.3s;
    }
    .concept:hover .concept-description {
        opacity: 1;
    }
    /* Artifacts */

</style>
<div id="app-header">
    <!-- Artifacts -->
    <div class="artifact large"></div>
    <div class="artifact large"></div>
    <div class="artifact large"></div>
    <div class="artifact large"></div>
    <!-- Content -->
    <h1>GPT NEXT WORD GENERATOR</h1>
    <p>Generate dialogue for given some initial prompt for context.</p>
    <p>Model: GPT, Dataset: arxiv + book + cc, Parameter Count: 160M</p>
</div>
"""

# Gradio demo: header, prompt box, three sampling sliders, output box, a
# "Generate" button and a set of cached examples.
with gr.Blocks(theme=gr.themes.Glass(),css=".gradio-container {background: url('file=https://github.com/Delve-ERAV1/Conditional-Diffusion/assets/11761529/1ff9d2e1-798f-442a-a1e2-386fdd35010a')}") as interface:
    gr.HTML(value=HTML_TEMPLATE, show_label=False)

    # Seven empty Markdown blocks act as vertical spacing below the header.
    for _ in range(7):
        gr.Markdown("")

    with gr.Row():

        input_text = gr.Textbox(
            label="Input Text",
            value="Enter your prompt here: This text will set the context for the AI's response."
        )

        temperature_dropdown = gr.Slider(0, 1, value=0.8, label="Temperature", info="Set the creativity level: Higher values produce more varied results, lower values generate more predictable text.")
        top_k_dropdown = gr.Slider(50, 300, value=200, label="Top K", info="Control the randomness: Limits the AI to consider only the top K most likely next words.")
        # Named `max_tokens_slider` so the integer `max_new_tokens` setting defined
        # earlier in the notebook is not overwritten by a UI component. The upper
        # bound is 200 so the example rows (50-200 tokens) fit inside the slider range.
        max_tokens_slider = gr.Slider(1, 200, value=50, label="Max Tokens", info="Choose the length: This determines the maximum number of words the AI will generate.")

        outputs = gr.Textbox(
            label="Generated Dialogue"
        )
        # Values are passed positionally, so this order must match
        # generate_dialogue(input_text, temperature, max_tokens, top_k) and the
        # column order of `examples`.
        inputs = [input_text, temperature_dropdown, max_tokens_slider, top_k_dropdown]

    with gr.Column():
        button = gr.Button("Generate")
        button.click(generate_dialogue, inputs=inputs, outputs=outputs)

    with gr.Row():
        # cache_examples=True runs generate_dialogue on every example row while the UI is built.
        gr.Examples(examples=examples, inputs=inputs, outputs=outputs, fn=generate_dialogue, cache_examples=True)


Caching examples at: '/content/gradio_cached_examples/574'
Caching example 1/5
Caching example 2/5
Caching example 3/5
Caching example 4/5
Caching example 5/5


In [54]:
# Start the Gradio app. Per the log captured below, Colab turns sharing on
# automatically, and passing debug=True would surface errors in the notebook.
interface.launch()

Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://d6a4765be993bd4045.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


