In [1]:
import torch
import tiktoken
from src.shraygpt import ShrayGPT
torch.set_num_threads(1) # Prevents deadlocks with DataLoader and multiple workers

tokenizer = tiktoken.get_encoding("r50k_base")

def get_total_param_count(module):
    return sum(p.numel() for p in module.parameters())

d_model = 32*32
n_head = 32
d_head = 32
n_layers = 32
num_experts = 8
num_experts_per_tok = 1
block_size = 8192
batch_size = 1
lr = 3e-4

model = ShrayGPT(
    vocab_size=tokenizer.n_vocab, 
    block_size=block_size, 
    d_model=d_model,
    n_head=n_head, 
    d_head=d_head, 
    n_layers=n_layers, 
    num_experts=num_experts, 
    num_experts_per_tok=num_experts_per_tok
)
model.hparams.learning_rate = lr
model.hparams.aux_loss_weight = 1e-2
# model.compile(backend="inductor", dynamic=True, mode="reduce-overhead")

params = get_total_param_count(model)
print(f"Total parameters: {params/1e9:.2f}B")

Total parameters: 2.67B


In [2]:
from torch.utils.data import IterableDataset, DataLoader
from datasets import load_dataset
import torch.distributed as dist

class IterableTextDataset(IterableDataset):
    def __init__(self, tokenizer, hf_dataset, block_size):
        self.tokenizer = tokenizer
        self.hf_dataset = hf_dataset
        self.block_size = block_size

    def _rank_world(self):
        if dist.is_available() and dist.is_initialized():
            return dist.get_rank(), dist.get_world_size()
        return 0, 1

    def __iter__(self):
        rank, world = self._rank_world()

        # Shard the HF streaming dataset so each rank reads a disjoint slice
        ds = self.hf_dataset
        if hasattr(ds, "shard"):
            ds = ds.shard(num_shards=world, index=rank, contiguous=True)

        buffer = []
        for item in ds:
            if 'text' in item:
                tokenized = self.tokenizer.encode(item['text']) + [self.tokenizer.eot_token]
                buffer.extend(tokenized)
                while len(buffer) >= self.block_size + 1:
                    x = torch.tensor(buffer[:self.block_size], dtype=torch.long)
                    y = torch.tensor(buffer[1:self.block_size+1], dtype=torch.long)
                    yield x, y
                    buffer = buffer[self.block_size:]


full_train_stream = load_dataset('HuggingFaceFW/fineweb-edu', name='sample-350BT', split='train', streaming=True)
num_val_samples = 10000  # Let's reserve 10,000 samples for validation.

val_stream_full = full_train_stream.take(num_val_samples)
train_stream_full = full_train_stream.skip(num_val_samples)

train_dataset_full = IterableTextDataset(tokenizer, train_stream_full, block_size)
val_dataset_full = IterableTextDataset(tokenizer, val_stream_full, block_size)

train_loader = DataLoader(
    train_dataset_full, 
    batch_size=batch_size, 
    num_workers=2,
    prefetch_factor=2,  
    pin_memory=True # Helps speed up data transfer to the GPU
)
val_loader = DataLoader(
    val_dataset_full, 
    batch_size=batch_size, 
    num_workers=2,
    prefetch_factor=2,  
    pin_memory=True
)

Resolving data files:   0%|          | 0/2410 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/472 [00:00<?, ?it/s]

In [3]:
import torch
import evaluate
from datasets import load_dataset
from tqdm import tqdm

print("Loading HellaSwag validation set and accuracy metric...")
hellaswag_val = load_dataset("hellaswag", split="validation")
accuracy_metric = evaluate.load("accuracy")

# For a quick demonstration, let's use a small subset of the validation set.
# A full evaluation would run on the entire set.
subset_size = 100
hellaswag_subset = hellaswag_val.select(range(subset_size))

# 2. Create an evaluation function
def evaluate_on_hellaswag(model, tokenizer, dataset):
    """
    Evaluates a model on the HellaSwag dataset.
    """
    model.eval()
    
    predictions = []
    references = []
    
    # The core idea is to calculate the loss for the context concatenated with each ending.
    # The ending that results in the lowest loss is the model's prediction.
    
    for example in tqdm(dataset, desc="Evaluating HellaSwag"):
        context = example['ctx']
        endings = example['endings']
        correct_label = int(example['label'])
        
        context_tokens = tokenizer.encode(context)
        
        losses = []
        with torch.no_grad():
            for ending in endings:
                # Create the full input by combining context and the current ending
                full_text_tokens = context_tokens + tokenizer.encode(ending)
                
                # Prepare input and target tensors
                x = torch.tensor([full_text_tokens[:-1]], dtype=torch.long, device=model.device)
                y = torch.tensor([full_text_tokens[1:]], dtype=torch.long, device=model.device)
                
                # Get the loss for this specific continuation
                logits, _, aux_loss_ = model(x)
                total_loss, main_loss, aux_loss = model._calculate_loss(logits, y.to(model.device), aux_loss_)
                losses.append(total_loss.item())
        
        # The prediction is the index of the ending with the minimum loss
        prediction = torch.argmin(torch.tensor(losses)).item()
        
        predictions.append(prediction)
        references.append(correct_label)
        
    # 3. Compute the final score
    print("Computing final accuracy...")
    results = accuracy_metric.compute(predictions=predictions, references=references)
    return results

# hellaswag_results = evaluate_on_hellaswag(model, tokenizer, hellaswag_subset)
# hellaswag_results

Loading HellaSwag validation set and accuracy metric...


In [None]:
import lightning as L
from lightning.pytorch.callbacks import ModelCheckpoint, LearningRateMonitor
torch._dynamo.config.capture_scalar_outputs = True
torch.set_float32_matmul_precision('medium')
from huggingface_hub import login

login(token='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') 

class GenerateTextCallback(L.Callback):
    """A PyTorch Lightning callback to generate text samples at the end of each validation epoch."""
    def __init__(self, prompts, tokenizer, every_n_steps=100):
        super().__init__()
        self.prompts = prompts
        self.tokenizer = tokenizer
        self.every_n_steps = every_n_steps

    def on_validation_epoch_end(self, trainer, pl_module):
        if trainer.global_step == 0 or trainer.global_step % self.every_n_steps != 0:
            return
        if not trainer.is_global_zero:
            return  # only rank 0 prints/logs text
        pl_module.print(f"\n\n--- Generating text at step {trainer.global_step} ---")
        tb = getattr(trainer.logger, "experiment", None)
        
        for i, prompt in enumerate(self.prompts):
            start_tokens = self.tokenizer.encode(prompt)
            context = torch.tensor(start_tokens, dtype=torch.long, device=pl_module.device).unsqueeze(0)
            generated_tokens = pl_module.generate(context, max_new_tokens=100, temperature=0.8, top_k=20)
            generated_text = self.tokenizer.decode(generated_tokens[0].tolist())
            pl_module.print(f"PROMPT: '{prompt}'")
            pl_module.print(f"GENERATED: {generated_text}\n")
            if tb is not None and hasattr(tb, "add_text"):
                tb.add_text(f"samples/prompt_{i}", f"**Prompt:** {prompt}\n\n**Generated:** {generated_text}",
                            global_step=trainer.global_step)

class EvaluateHellaSwag(L.Callback):
    """A PyTorch Lightning callback to evaluate the LLM."""
    def __init__(self, every_n_steps=1000):
        super().__init__()
        self.every_n_steps = every_n_steps

    def on_validation_epoch_end(self, trainer, pl_module):
        if trainer.global_step == 0 or trainer.global_step % self.every_n_steps != 0:
            return
        # do heavy eval only on rank 0
        if not trainer.is_global_zero:
            return
        pl_module.print(f"\n\n--- Evaluating at step {trainer.global_step} ---")
        
        hellaswag_results = evaluate_on_hellaswag(model, tokenizer, hellaswag_subset)
        acc = hellaswag_results['accuracy']
        pl_module.print(f"\n\n--- Accuracy: {acc} at step {trainer.global_step} ---")
        pl_module.log("hellaswag/accuracy", acc, on_step=False, on_epoch=True, prog_bar=True, logger=True, sync_dist=False)


callback = GenerateTextCallback(prompts=["The verdict was", "In a shocking turn of events", "The jury decided to"], 
    tokenizer=tokenizer, every_n_steps=1000)
evalcallback = EvaluateHellaSwag(every_n_steps=1000)
checkpoint_cb = ModelCheckpoint(
    dirpath="checkpoints/",
    filename="shraygpt-{epoch:02d}-{step:05d}-{val_loss:.3f}",
    monitor="val_loss",
    mode="min",
    save_top_k=3,
    save_last=True,
)
lr_monitor = LearningRateMonitor(logging_interval="step")

trainer = L.Trainer(max_steps=200_000, accelerator='auto', devices=8, precision='16-mixed', strategy='auto', 
                    num_sanity_val_steps=0, limit_train_batches=1000, limit_val_batches=100,
                    callbacks=[callback, L.pytorch.callbacks.EarlyStopping(monitor='val_loss', mode='min', patience=100), evalcallback, checkpoint_cb, lr_monitor],
                    logger=L.pytorch.loggers.TensorBoardLogger("logs/"), log_every_n_steps=1) 

model.automatic_optimization = False
trainer.fit(model, train_loader, val_loader)

  _C._set_float32_matmul_precision(precision)
Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


Initializing distributed: GLOBAL_RANK: 1, MEMBER: 2/8
Initializing distributed: GLOBAL_RANK: 0, MEMBER: 1/8
Initializing distributed: GLOBAL_RANK: 2, MEMBER: 3/8
Initializing distributed: GLOBAL_RANK: 4, MEMBER: 5/8
Initializing distributed: GLOBAL_RANK: 5, MEMBER: 6/8
Initializing distributed: GLOBAL_RANK: 3, MEMBER: 4/8
Initializing distributed: GLOBAL_RANK: 7, MEMBER: 8/8
Initializing distributed: GLOBAL_RANK: 6, MEMBER: 7/8
----------------------------------------------------------------------------------------------------
distributed_backend=nccl
All distributed processes registered. Starting with 8 processes
----------------------------------------------------------------------------------------------------

/home/zeus/miniconda3/envs/cloudspace/lib/python3.12/site-packages/lightning/pytorch/callbacks/model_checkpoint.py:751: Checkpoint directory /teamspace/studios/this_studio/checkpoints exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]
LOCAL_RANK:

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]



--- Generating text at step 2000 ---
PROMPT: 'The verdict was'
GENERATED: The verdict was, who, and more likely, or more to be so as they can take the right-out.
-term pressure will be used to create a long-up, and other, or an issue, including the most of a good, and is not to be able to use the system of the same. It will be required to be used to be more likely to be more about and, but when they are not to be given a problem is not the best to be to be a problem

PROMPT: 'In a shocking turn of events'
GENERATED: In a shocking turn of events--c-c.
-cun, n.
- H, B, R. (19. (1996(7.
- J.
- C, R. E/s, F. (2004): 5:10. (2009). "A (2013).
- "The research is:
- E-1:1.
- "M. (2006). "G.
- E. (10.
- Klam, L.

PROMPT: 'The jury decided to'
GENERATED: The jury decided to, and, but in the Holy, and the world of the Jews, and the government.
In the most important to the United States.
The government
The American government
The government
by in the
The state and the government
The public secto


[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
Evaluating HellaSwag: 100%|██████████| 100/100 [00:20<00:00,  4.95it/s]


Computing final accuracy...


--- Accuracy: 0.22 at step 2000 ---


/home/zeus/miniconda3/envs/cloudspace/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/logger_connector/result.py:434: It is recommended to use `self.log('hellaswag/accuracy', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.


Validation: |          | 0/? [00:00<?, ?it/s]



--- Generating text at step 4000 ---
PROMPT: 'The verdict was'
GENERATED: The verdict was but again again that he was dead, and they could not put, as, was a matter of the way to do it.
The first thing of this was the one-to-day man, for the new man himself had done to the next morning, and the first Jew he had not heard it in the way of the Lord. He would say that "The Lord was so impressed by the Lord, to his wife, the King of the Lord, and they had to be a very

PROMPT: 'In a shocking turn of events'
GENERATED: In a shocking turn of events.
Dr, a team of scientists, who were doing himself, the team, the team of the group, and how he had the advantage he and told him. The team went on the team, and carried the team's decision.
The team said the team was not a researcher who had the team that the team would.
“I was asked, and how they wanted to work at that time. It had to take on a team to work on his team.
“

PROMPT: 'The jury decided to'
GENERATED: The jury decided to study the f


[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
Evaluating HellaSwag: 100%|██████████| 100/100 [00:20<00:00,  4.89it/s]


Computing final accuracy...


--- Accuracy: 0.25 at step 4000 ---


Validation: |          | 0/? [00:00<?, ?it/s]



--- Generating text at step 6000 ---
PROMPT: 'The verdict was'
GENERATED: The verdict was in the late-century German newspaper, which included the original-based survey of 1819. It was also known that in the late 19th century, and was part of New York City, a library of the former library, which is also a collection of printed and printed in 1847 and 1842.
On the early 1800s, the first newspaper, the newspaper of newspaper and newspaper, were also written in the newspaper. The newspaper was originally published in 1837, a newspaper in 18

PROMPT: 'In a shocking turn of events'
GENERATED: In a shocking turn of events, a highly-looking-growth technique of the ancient Greek of Greece, with a detailed report of its “unreal” of the Roman Empire, with the first known Roman Catholic, and a Roman Catholic in Greek (“The Greek) and a Roman Empire” (“the Greek”) by its first Greek, Greek, and Greek (ph.e. the Greek word for the Greek, and the Greek is also known as “Christian”, meaning

PROMPT


[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
Evaluating HellaSwag: 100%|██████████| 100/100 [00:20<00:00,  4.77it/s]


Computing final accuracy...


--- Accuracy: 0.32 at step 6000 ---


Validation: |          | 0/? [00:00<?, ?it/s]



--- Generating text at step 8000 ---
PROMPT: 'The verdict was'
GENERATED: The verdict was in the way that the user would have to have the same touch to touch and see if they had a hand-held hand on the hand if they had the hand, and not the hand, but that this would have been very effective.
This was the case in this story, which gave the user a different view of the button. If the user had to click, they were still working on the screen in the right way and to make them on the screen.
This was not the case

PROMPT: 'In a shocking turn of events'
GENERATED: In a shocking turn of events is a reminder and a statue of our ancestors. He is a teacher and has a voice to live in. He is a teacher, a great teacher, and a teacher who has a strong, and is a teacher. He is a writer and author of a book with the idea of anoree to be a teacher.
“The teacher is the speaker’s and the teacher is not a writer, but a teacher’s need.”
It is a writer who

PROMPT: 'The jury decided to'
GENERATED: The jury


[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
Evaluating HellaSwag: 100%|██████████| 100/100 [00:20<00:00,  4.92it/s]


Computing final accuracy...


--- Accuracy: 0.27 at step 8000 ---


Validation: |          | 0/? [00:00<?, ?it/s]



--- Generating text at step 10000 ---
PROMPT: 'The verdict was'
GENERATED: The verdict was a sign and a member of the American civil rights organization, which was the way forward to a goal of their own rights.
The first step in this step is to develop a strategy to tackle the issue of the National Park program, which included a group of people who were involved in the development of a plan that would be able to make a decision on the path of a new problem and in fact they would have to be taken. The first step would be to create a plan where a community would

PROMPT: 'In a shocking turn of events'
GENERATED: In a shocking turn of events in New York in New York’s New York City, New York, the first known place in New York City in the New City; The New York State Building in New York City, San Diego in New York Harbor, New York, New York City, New York, New York, New York, New York, New York, New York, New York in New York, New York and New York, New York, New York City, Cambridge, Ne


[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
Evaluating HellaSwag: 100%|██████████| 100/100 [00:20<00:00,  4.90it/s]


Computing final accuracy...


--- Accuracy: 0.27 at step 10000 ---


Validation: |          | 0/? [00:00<?, ?it/s]



--- Generating text at step 12000 ---
PROMPT: 'The verdict was'
GENERATED: The verdict was to take the old hand. To this point, the "whites") had been released for the "hats" before they started to create their "junk" in front of the "k" while the "hose" was moved.
"It's been a few months before the "H" was made, a "Z" in the "K" was called "K" because "hairs" of a "Z" was "so").
"The Sun was very

PROMPT: 'In a shocking turn of events'
GENERATED: In a shocking turn of events.
Another way people play more important in their lives, as adults and children play a part in all walks of life. If one of these people is in crisis, why do some people play a part in helping the children live together?
One of the most important benefits of this type of play is that it is an overall well being. For that reason, the more we work together to be a leader, we have to create a love of the world.
The play is designed to teach

PROMPT: 'The jury decided to'
GENERATED: The jury decided to explain them, 


[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
Evaluating HellaSwag: 100%|██████████| 100/100 [00:21<00:00,  4.69it/s]


Computing final accuracy...


--- Accuracy: 0.26 at step 12000 ---


Validation: |          | 0/? [00:00<?, ?it/s]



--- Generating text at step 14000 ---
PROMPT: 'The verdict was'
GENERATED: The verdict was moved to help students not need.
The team of the team is made from the DNA from the egg shell that it was processed to see how the DNA from the chicken could cause the infection.
"I understand the difference between the DNA that's going to get to the chicken meat or the chicken, and how it's going to get the information from its DNA," said study co-author Dr. Howard Follins, a medical doctor who tested the protein to get in the first place:

PROMPT: 'In a shocking turn of events'
GENERATED: In a shocking turn of events, and the “War” and “Power” music,” and the “Mrs” is an example of the way in which the music and music are organized in the early 19th century. It is a way of looking at the music that we are talking about in today.
Music is not something that happens in the last two decades in the “Bottomstaff” or, but one can use, because music is not a music part of the whole

PROMPT: 'The jur


[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
Evaluating HellaSwag: 100%|██████████| 100/100 [00:21<00:00,  4.72it/s]


Computing final accuracy...


--- Accuracy: 0.26 at step 14000 ---


Validation: |          | 0/? [00:00<?, ?it/s]



--- Generating text at step 16000 ---
PROMPT: 'The verdict was'
GENERATED: The verdict was found at the X-4
at the two-and-butaters were fired at the same time. The moon was thought to be much better and so the Lunar Age could be made more cragmented over a thousand years. The difference between the three-and-halfths is very unusual compared to the first four ever smaller moons, and the two-planet observations of both the two stars were slightly larger than the first two-planet.
One other odd tidiest of the other two

PROMPT: 'In a shocking turn of events'
GENERATED: In a shocking turn of events, and a view of ideas.
In the 18th century, Alexander believed that God, and his principles, had made him an angel who, according to his prophecy, “He was a God; and, in his sight, had the power to save the world bereveled by the world” (4:21; 16:22). As in Plato, God is a creator of perfect goodness, but the mystery of eternity is the ultimate redemption of pleasure.
Jesus was

PROMPT: 'The j


[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
Evaluating HellaSwag: 100%|██████████| 100/100 [00:20<00:00,  4.84it/s]


Computing final accuracy...


--- Accuracy: 0.26 at step 16000 ---


Validation: |          | 0/? [00:00<?, ?it/s]



--- Generating text at step 18000 ---
PROMPT: 'The verdict was'
GENERATED: The verdict was held in the 1950 century. It was not a question of the Jews in Palestine-based Palestine programme, but that they were also Jewish and wanted Jews to be deported. Many Jewish Jews were murdered in Palestine in 1967.
Those who belonged to Jews were Jewish. Many of them went on the grounds of the Jewish king and his son-in-law, Simon Hass ibn Hale, were Jewish. There were Jews who were the Gentist Rabbilim al-Nissim (not the first

PROMPT: 'In a shocking turn of events'
GENERATED: In a shocking turn of events, the sun being of the sun is a part of the atmosphere, and the energy that we have been trying to measure the intensity and temperature we have so high.
To measure the temperature, we need to measure the temperature of the atmosphere in order to measure the temperature at which we are measuring the values of the atmosphere. For instance, a temperature scale is one degree Celsius. A temperatu


[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
Evaluating HellaSwag: 100%|██████████| 100/100 [00:21<00:00,  4.76it/s]


Computing final accuracy...


--- Accuracy: 0.31 at step 18000 ---


Validation: |          | 0/? [00:00<?, ?it/s]



--- Generating text at step 20000 ---
PROMPT: 'The verdict was'
GENERATED: The verdict was lost by the need to keep the user from the box. Once it was replaced, the other cards were to stop.
7. Math Worksheets
Grab the second puzzle on a page. Fold the page down onto the site for an interesting bit about the Foldit. Fold the page with a red ribbon. Fold each slide into another quadrant on each slide. Fold it up and glue it on the next slide to show the backresting. Fold it and cut the bleed. Fold

PROMPT: 'In a shocking turn of events'
GENERATED: In a shocking turn of events, and the ‘old’ness’ is to do the work of building a building on the building, the contractors used by architects who can do the project on their own.
The builders of computer and computer who are looking for a "work" of computers can easily access.
For your convenience, you can have the time to think about your computer, electronic computer and electronic devices, and have the same problem. You have to have hundr


[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
Evaluating HellaSwag: 100%|██████████| 100/100 [00:20<00:00,  4.81it/s]


Computing final accuracy...


--- Accuracy: 0.25 at step 20000 ---


Validation: |          | 0/? [00:00<?, ?it/s]



--- Generating text at step 22000 ---
PROMPT: 'The verdict was'
GENERATED: The verdict was put.
"We agree to explore the new vocabulary for the new schema, the concepts of grammar, and the concepts of the Bible were developed, and when we all began to develop our grammar and grammar and symbolism. I hope you feel more confident in your new words and thought, "Look for the Standard Bible." (Munacqua)
"Nay" is the desire to learn a wide range of purposes, including the use of repetition and study and study of new subjects such as

PROMPT: 'In a shocking turn of events'
GENERATED: In a shocking turn of events and other effects upon the course of human activity, with much of the awareness of the health of the body. These are known as the Kamm (San K). Kaury (Arizona: 800-649) are described in the overall literature.
2. Rao Sidery (ed.) --Shivy, 17 (1985).
3. Zohypyenexperience: "The clinical interpretation of maternal mortality in relation to the overall picture of the health of the infa


[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
Evaluating HellaSwag: 100%|██████████| 100/100 [00:20<00:00,  4.85it/s]


Computing final accuracy...


--- Accuracy: 0.3 at step 22000 ---


Validation: |          | 0/? [00:00<?, ?it/s]



--- Generating text at step 24000 ---
PROMPT: 'The verdict was'
GENERATED: The verdict was used in 1963 to every day.
March to the church on the May 15, 1816-4, first children were encouraged to learn about the ritual that came from the pulpit of the day. Finally, the children were encouraged to discuss their own rites because they wanted to remember all the other spectators that had not been lit up by. The play was a very important part of the ritual.
To be a refresher, it is highly recommended that, after a thorough day, it is

PROMPT: 'In a shocking turn of events'
GENERATED: In a shocking turn of events and for the work. Here are all of this experience.
To include all the details, this site should take you to a project that you have chosen to plan and submit. Here’s a bit of effort that has helped you as you sit and stand in a circle and think about how to present your work. I’m sure you have all the details from your mentor all to go and follow him’ to reach everyone.
What are t


[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
Evaluating HellaSwag: 100%|██████████| 100/100 [00:20<00:00,  4.87it/s]


Computing final accuracy...


--- Accuracy: 0.26 at step 24000 ---


Validation: |          | 0/? [00:00<?, ?it/s]



--- Generating text at step 26000 ---
PROMPT: 'The verdict was'
GENERATED: The verdict was to describe the nature of the culture. In order to identify the culture and culture of the culture of culture, it is important to remember that some people are able to survive and that men may be ready to share their thoughts (without doubt, agrid). Therefore, some younger writers find this more plausible approach to the survival of the species of bacteria and this is referred to as "youth" (Acdo)") – dealing with bacterial species.
Here is an introduction to some of the concepts

PROMPT: 'In a shocking turn of events'
GENERATED: In a shocking turn of events, that’s not the case. There is not so many romantic statements that have been made on the basis of justifying Peace. The only reliable evidence that the lack of participation in leadership was the presence of infasteriquer in the political debate, and the right of support to the political party of the United States government in 1956 to the


[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
Evaluating HellaSwag: 100%|██████████| 100/100 [00:20<00:00,  4.89it/s]


Computing final accuracy...


--- Accuracy: 0.23 at step 26000 ---


Validation: |          | 0/? [00:00<?, ?it/s]



--- Generating text at step 28000 ---
PROMPT: 'The verdict was'
GENERATED: The verdict was built at the F bid which he became in existence of the Earth.
The goal of Nature is to trace of an ancient cos of Earth’s largest Mediterranean. It was once thought that Kepler began to adopt the theory of the Moon, to which, in turn, was never deeply inspired by his teacher. This in turn set about the sun Fourth Moon.
Humously, it was the first Greek astronomer to study the development of Egyptian civilization. The god was sent to him by King Louis

PROMPT: 'In a shocking turn of events'
GENERATED: In a shocking turn of events which records lived at Mount II.
"It's what is Diego, an event that doesn't produce a lot of Dee, but that doesn't make it a good thing in the aggregate," Dyson said. "But in Dorset's one one, Dreyster was blocking the delivery of the Internet," Dyson said.
Of the four stories this time in the Book of chapter 3, the hard problem is that Dijkstra had done a study conducte


[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
Evaluating HellaSwag: 100%|██████████| 100/100 [00:20<00:00,  4.85it/s]


Computing final accuracy...


--- Accuracy: 0.28 at step 28000 ---


Validation: |          | 0/? [00:00<?, ?it/s]



--- Generating text at step 30000 ---
PROMPT: 'The verdict was'
GENERATED: The verdict was used to the new moss project. Unlike this building is still in use. The process starts with a Google scanner, which can be read and transformed onto new capabilities and perform over the X-shunt. It’s the size of the microscope that is used for the job, including how it was used.
From this to the first airplane, the building has been changed to fit and can now be moved from the ‘Geplane’ to a virtual computer. This is the building

PROMPT: 'In a shocking turn of events'
GENERATED: In a shocking turn of events and other aspects, including the “aaamma” of “mãari” or “serious diagrams into its context.” in its head is the same source of plant growth in plant propagation. (The plant grows in various plant environments and grows in a drought season where drought and rains are possible. It has two growing seasons per plant activity and is used to any of the amount of water.
Because of its physical ch


[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
Evaluating HellaSwag: 100%|██████████| 100/100 [00:20<00:00,  4.82it/s]


Computing final accuracy...


--- Accuracy: 0.24 at step 30000 ---


Validation: |          | 0/? [00:00<?, ?it/s]



--- Generating text at step 32000 ---
PROMPT: 'The verdict was'
GENERATED: The verdict was accomplisheded to the In-time British Government.
The proposal behind both the proposal and the Prime Conflict Commons, which largely territory between the Majuar and the Bengque monarchy. The Wadi responded with the establishment of the Gladard Party and the battle which ended in an amnood. The leadership inspired the province as a man responsible for the independence of both within its powers. The crown of the Imperial Bank descended in the royal authority to the newly made strategic terms.
The chief political

PROMPT: 'In a shocking turn of events'
GENERATED: In a shocking turn of events and butterflies.
In addition to Bitcoin conversion and Whican currency, the Euroman of Vienna gained more than half the actual payment of his debrief. William At the end of a trade, after receiving a debt in exchange, took in 1740 to 1869. By the exchange, however, neither he nor knew the reputation of havin


[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
Evaluating HellaSwag: 100%|██████████| 100/100 [00:20<00:00,  4.91it/s]


Computing final accuracy...


--- Accuracy: 0.21 at step 32000 ---


Validation: |          | 0/? [00:00<?, ?it/s]



--- Generating text at step 34000 ---
PROMPT: 'The verdict was'
GENERATED: The verdict was written by the Vatican Dynastyhips by the New York Pirates. One unsuccessful driver was blamed for his earlier first live on the island.
The first in search to find a home with the family remains, however, was hidden before being released in May 1990. When the property was purchased by Holman himself in 1985, he ran his family’s business again and bought the property first, renamed Landlord. The landowner and uncle held property for worship. As a young child, a churchman

PROMPT: 'In a shocking turn of events'
GENERATED: In a shocking turn of events to Germany. (British) For the past five victims (reside the Administration’s Ministry)
Once in March, 1945, around the world’s population, cities grew up to approximately 60 countries, including Russia, Algeria, Germany, Syria, and Tanzania. The total number of towns and transit is a “completey” according to “The wealth of a single medium is [except


[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
Evaluating HellaSwag: 100%|██████████| 100/100 [00:20<00:00,  4.84it/s]


Computing final accuracy...


--- Accuracy: 0.26 at step 34000 ---


Validation: |          | 0/? [00:00<?, ?it/s]



--- Generating text at step 36000 ---
PROMPT: 'The verdict was'
GENERATED: The verdict was put to find additional photographs and show there’s new bathroom or do.
‘People are very happy and valued by as a male athlete who helps grow their lives in basically as a male.’’ (Chıtətowhu, IPAx�, 2016). In the face and the excess height is close to the actual pay for the sole male.
JOUREOH, 17 June 2020
The new proposal takes a new step by which mothers of

PROMPT: 'In a shocking turn of events'
GENERATED: In a shocking turn of events. The four-day enterprise sites also allows people with essential natural irrigation systems to survive raw water tanks during processing services.
On the streets, the building is performed on a half of 40,000-year-old residences and about 800 out the foundation.
In the final phase, 3,000 people are raised outside the station, and 4,000 water tanks are transported by the municipality. The service began operation in 1981 was cancelled in 1986 and is now used by 


[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
Evaluating HellaSwag: 100%|██████████| 100/100 [00:21<00:00,  4.69it/s]


Computing final accuracy...


--- Accuracy: 0.24 at step 36000 ---


Validation: |          | 0/? [00:00<?, ?it/s]



--- Generating text at step 38000 ---
PROMPT: 'The verdict was'
GENERATED: The verdict was granted to "ass the U. Karltati. President Franklin Roosevelt called the Holy Moon to "protect the West". Truman asked him to appropriate respect for the next four years. He was assimating "wha boy" — and symbolized the liberation of the Soviet "Great Man" — which he hoped to inspire "day the world's future".
|Defendes|
|EC1983 (/paróg||2.46|
|Four hundred years later|
|References

PROMPT: 'In a shocking turn of events'
GENERATED: In a shocking turn of events! Travel is a difficult time indicator when you think about the size of spacecraft (or be ingenious. It’s pretty fun.”Of Antbi ships flying in their plane and give up some offers to having a home containing a bunch of passengers per docking time, but you might prefer just a few bar tank mates this year.
In the less likely stage in their project, each operator chooses a time for workers who are trying to keep their micro-initiating project a


[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
Evaluating HellaSwag: 100%|██████████| 100/100 [00:21<00:00,  4.66it/s]


Computing final accuracy...


--- Accuracy: 0.24 at step 38000 ---


Validation: |          | 0/? [00:00<?, ?it/s]



--- Generating text at step 40000 ---
PROMPT: 'The verdict was'
GENERATED: The verdict was written.||But that everyone in the village were riding in the violence that was sent to support the evacuation system.
The Missing flag of the Provision was signed by Pope Edward VI as part of the pole as the Pope. And publicly hoped it back into future, toward the building of the Commonwealth of the Commonwealth Country.
The pictures were read and then read to the Pope one post. The faithful note of the Declaration of the law was read and read. This was the colour of the word

PROMPT: 'In a shocking turn of events'
GENERATED: In a shocking turn of events. She was a man, who also lived with many exels, prayers, and blessings. He is an Assawan account for how rulers are exchanging for everyone in a welfare district. I came into my e-mail tax and went into debt (with a bank debt owed). Once I went into my bank, I just went into that office and my credit was then back, then and I didn’t have this 


[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
Evaluating HellaSwag: 100%|██████████| 100/100 [00:21<00:00,  4.65it/s]


Computing final accuracy...


--- Accuracy: 0.25 at step 40000 ---


Validation: |          | 0/? [00:00<?, ?it/s]



--- Generating text at step 42000 ---
PROMPT: 'The verdict was'
GENERATED: The verdict was first by the Mass Ed of Tears James Gostelhaus Station. It shows the appearance of real decorocations and a monument of civincital features, which opened into the building along with the original remains of the Hotel skeleton, a remarkable period bearing in width, high integrity with 26 joints in the corners, the plan bridges, the size of the structure at the top of its height, are exceptional in marking the beginning of the year for the denominationors.
Five middles or states are useful

PROMPT: 'In a shocking turn of events'
GENERATED: In a shocking turn of events every lights and I don’t get credit with my grandchildren if I wasn was sick.<|endoftext|>On birth—one- or month
On the other hand, Christmas comes forward to the solemn occasion on May 15th. To celebrate this subject you’re asking you to be at your top of the ladder.
“Spring after Christmas, with the family on the way both Christma


[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
Evaluating HellaSwag: 100%|██████████| 100/100 [00:21<00:00,  4.70it/s]


Computing final accuracy...


--- Accuracy: 0.26 at step 42000 ---
