<a href="https://colab.research.google.com/github/rushilbhat/AIMO/blob/main/RM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# !pip install datasets transformers torch bitsandbytes peft
!pip install datasets transformers torch bitsandbytes peft xformers trl accelerate

# !pip install packaging ninja einops flash-attn xformers trl peft accelerate bitsandbytes datasets transformers
# !pip install --upgrade accelerate
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

Collecting datasets
  Downloading datasets-2.19.2-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.1/542.1 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
Collecting bitsandbytes
  Downloading bitsandbytes-0.43.1-py3-none-manylinux_2_24_x86_64.whl (119.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.8/119.8 MB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting peft
  Downloading peft-0.11.1-py3-none-any.whl (251 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m251.6/251.6 kB[0m [31m30.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting xformers
  Downloading xformers-0.0.26.post1-cp310-cp310-manylinux2014_x86_64.whl (222.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m222.7/222.7 MB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting trl
  Downloading trl-0.9.4-py3-none-any.whl (226 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m226.7/226.7

In [1]:
import time
import re
import pandas as pd
import torch
import torch.nn as nn
from datasets import load_dataset, load_from_disk
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig, AutoConfig, LlamaForCausalLM, TrainingArguments, Trainer, BitsAndBytesConfig, DataCollatorForSeq2Seq
from peft import get_peft_model, LoraConfig, TaskType, prepare_model_for_kbit_training
from unsloth import FastLanguageModel

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


In [2]:
def extract_code(entry):
  code_match = re.search(r'<llm-code>(.*?)</llm-code>', entry['generated_solution'], re.DOTALL)
  if code_match:
      entry['generated_solution'] = code_match.group(1).strip()
  else:
    entry['generated_solution'] = ""
  return entry

def tokenize_and_label(entry):
    question = entry['question']
    generated_solution = entry['generated_solution']
    is_correct = entry['is_correct']

    input_text = f"{question} {generated_solution}"
    input_ids = tokenizer.encode(input_text)
    question_ids = tokenizer.encode(question)

    labels = [-100.0] * len(question_ids) + [1.0 if is_correct else 0.0] * (len(input_ids) - len(question_ids)) # don't need to worry about token that spans across the end of the question and the beginning of the generated solution
    return {"input_ids": input_ids, "labels": labels}

In [65]:
class CustomLinearLayer(nn.Module):
    def __init__(self, model):
        super().__init__()
        self.model = model
        self.special_token_id = config.bos_token_id
        self.verifier_head = nn.Linear(1, 1, bias=True) # dtype is torch.float32 so that gradients are torch.float32

    def forward(self, input_ids, attention_mask=None, labels=None, **kwargs):
        outputs = self.model(input_ids, attention_mask=attention_mask) #labels=labels
        logits = outputs.logits
        # print(f"Shape of logits: {outputs.logits.shape}")

        special_token_logits = logits[:, :, self.special_token_id].unsqueeze(-1)
        # print(f"Shape of special token logits: {special_token_logits.shape}")
        verification_scores = self.verifier_head(special_token_logits).squeeze(-1)
        # print(f"Dtype of verification_scores: {verification_scores.dtype}")

        updated_logits = logits.clone()
        updated_logits[:, :, self.special_token_id] = verification_scores


        if labels is not None:
            # print(outputs.loss)
            mask = (labels != -100)
            labels = labels.float()
            # print(f"Dtype of labels: {labels.dtype}")
            # print(f"Dtype of verification_scores: {verification_scores.dtype}")
            loss_fct = nn.MSELoss()
            verification_loss = loss_fct(verification_scores[mask], labels[mask])
            # print(f"Dtype of verifcation_loss: {verification_loss.dtype}")
            outputs['loss'] = verification_loss

        outputs['logits'] = updated_logits

        return outputs

In [3]:
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = torch.float16 # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "deepseek-ai/deepseek-math-7b-rl", # Choose ANY! eg mistralai/Mistral-7B-Instruct-v0.2
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit
)

model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

==((====))==  Unsloth: Fast Llama patching release 2024.5
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.0+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. Xformers = 0.0.26.post1. FA = False.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
deepseek-ai/deepseek-math-7b-rl does not have a padding token! Will use pad_token = <|PAD_TOKEN|>.
Unsloth 2024.5 patched 30 layers with 30 QKV layers, 30 O layers and 30 MLP layers.


In [4]:
tokenizer

LlamaTokenizerFast(name_or_path='deepseek-ai/deepseek-math-7b-rl', vocab_size=100000, model_max_length=4096, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<｜begin▁of▁sentence｜>', 'eos_token': '<｜end▁of▁sentence｜>', 'pad_token': '<|PAD_TOKEN|>'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	100000: AddedToken("<｜begin▁of▁sentence｜>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	100001: AddedToken("<｜end▁of▁sentence｜>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	100002: AddedToken("<|PAD_TOKEN|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [4]:
from google.colab import drive
drive.mount('/content/drive')
verifier_dataset = load_from_disk('/content/drive/My Drive/verifier_dataset')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
model_name = "deepseek-ai/deepseek-math-7b-rl"

config = AutoConfig.from_pretrained(model_name)
# nf4_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_use_double_quant=True,
#     bnb_4bit_compute_dtype=torch.bfloat16
# )
# model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=nf4_config)

# model = prepare_model_for_kbit_training(model)

# lora_config = LoraConfig(
#     task_type=TaskType.CAUSAL_LM,   # Task type
#     r=8,                           # Rank of the low-rank matrices
#     lora_alpha=1,                 # Alpha scaling parameter
#     target_modules=['q_proj', 'v_proj'], # Target modules for LoRA
#     lora_dropout=0.1               # Dropout for LoRA
# )
# model = get_peft_model(model, lora_config)



# tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"


# dataset = load_dataset('nvidia/OpenMathInstruct-1', split='train')
# math_dataset = dataset.filter(lambda entry: entry['dataset'] == 'math')
# pot_math_dataset = math_dataset.filter(lambda entry: entry['error_message']=='').map(extract_code)
# verifier_dataset = pot_math_dataset.select(range(100000)).map(tokenize_and_label, remove_columns=pot_math_dataset.column_names)

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=None)

In [66]:
verifier = CustomLinearLayer(model)

In [15]:
for name, param in verifier.named_parameters():
    if param.requires_grad:
        print(f"{name}: {param.dtype}")

model.base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.weight: torch.float32
model.base_model.model.model.layers.0.self_attn.q_proj.lora_B.default.weight: torch.float32
model.base_model.model.model.layers.0.self_attn.k_proj.lora_A.default.weight: torch.float32
model.base_model.model.model.layers.0.self_attn.k_proj.lora_B.default.weight: torch.float32
model.base_model.model.model.layers.0.self_attn.v_proj.lora_A.default.weight: torch.float32
model.base_model.model.model.layers.0.self_attn.v_proj.lora_B.default.weight: torch.float32
model.base_model.model.model.layers.0.self_attn.o_proj.lora_A.default.weight: torch.float32
model.base_model.model.model.layers.0.self_attn.o_proj.lora_B.default.weight: torch.float32
model.base_model.model.model.layers.0.mlp.gate_proj.lora_A.default.weight: torch.float32
model.base_model.model.model.layers.0.mlp.gate_proj.lora_B.default.weight: torch.float32
model.base_model.model.model.layers.0.mlp.up_proj.lora_A.default.weight: torch.float32

In [7]:
overfit = verifier_dataset.select(range(10))

In [67]:
training_args = TrainingArguments(
    output_dir="verifier_output",
    num_train_epochs=40,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    learning_rate=3e-4,
    fp16=True,
    logging_steps = 1,
    per_device_eval_batch_size=1,
    eval_accumulation_steps=1
)

trainer = Trainer(
    model=verifier,
    args=training_args,
    train_dataset=overfit,
    data_collator = data_collator,
)

In [68]:
# torch.autograd.set_detect_anomaly(True)
trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 10 | Num Epochs = 40
O^O/ \_/ \    Batch size per device = 4 | Gradient Accumulation steps = 1
\        /    Total batch size = 4 | Total steps = 120
 "-____-"     Number of trainable parameters = 37,478,402


Outputs: CausalLMOutputWithPast(loss=tensor(9.0286, device='cuda:0', grad_fn=<MseLossBackward0>), logits=tensor([[[ 16.8750,  18.2969,  19.4688,  ...,   8.6172,   8.6016,   8.6172],
         [ 31.2031,  33.2812,  32.9062,  ...,  23.4844,  23.5469,  23.5156],
         [ 58.4375,  60.2500,  57.7500,  ...,  42.4688,  42.4062,  42.5000],
         ...,
         [ 22.2031,  23.5938,  22.5156,  ...,  12.4531,  12.5078,  12.5156],
         [ 22.2500,  23.6719,  22.5938,  ...,  12.5234,  12.5859,  12.5859],
         [ 22.1562,  23.5938,  22.5000,  ...,  12.4609,  12.5234,  12.5234]],

        [[ 16.8750,  18.2969,  19.4688,  ...,   8.6172,   8.6016,   8.6172],
         [ 68.3125,  65.7500,  64.9375,  ...,  51.4375,  51.4062,  51.4688],
         [ 65.1250,  67.3125,  64.0000,  ...,  53.0312,  52.9688,  53.0312],
         ...,
         [-47.5625, -48.4062, -44.9688,  ..., -54.3438, -54.1875, -54.3125],
         [ 28.5781,  30.3750,  33.2188,  ...,  18.2812,  18.2344,  18.1250],
         [ 18.1250

Step,Training Loss
1,9.0286
2,6.0396
3,11.7724
4,7.1125
5,9.8088
6,6.5471
7,9.4843
8,8.7135
9,5.6077
10,5.9979


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
         ...,
         [24.1250, 24.7656, 24.7656,  ..., 14.7031, 14.7734, 14.7656],
         [24.2031, 24.8281, 24.8281,  ..., 14.7500, 14.8125, 14.8125],
         [24.3281, 24.9219, 24.9062,  ..., 14.8125, 14.8828, 14.8750]],

        [[17.5938, 19.7969, 20.5781,  ...,  8.6797,  8.6562,  8.6719],
         [38.4062, 38.7188, 38.5312,  ..., 28.8750, 28.8750, 28.8750],
         [58.3750, 57.6562, 59.0938,  ..., 40.0625, 40.0312, 40.2188],
         ...,
         [27.0469, 27.5000, 27.0781,  ..., 16.0625, 16.1094, 16.1094],
         [26.8438, 27.2969, 26.8906,  ..., 15.9062, 15.9609, 15.9609],
         [26.6719, 27.1094, 26.7031,  ..., 15.7500, 15.7969, 15.8047]],

        [[17.5938, 19.7969, 20.5781,  ...,  8.6797,  8.6562,  8.6719],
         [64.6875, 62.7812, 61.5938,  ..., 48.5625, 48.5312, 48.5625],
         [62.9688, 65.2500, 61.8750,  ..., 51.0312, 50.9688, 51.0312],
         ...,
         [13.8047, 14.6250, 16.0000, 

TrainOutput(global_step=120, training_loss=0.9020113650511727, metrics={'train_runtime': 373.3743, 'train_samples_per_second': 1.071, 'train_steps_per_second': 0.321, 'total_flos': 0.0, 'train_loss': 0.9020113650511727, 'epoch': 40.0})

In [73]:
predictions = trainer.predict(test_dataset=dataset)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
           -1.1680e+00,  1.4404e-01],
          [ 2.4548e-01, -3.5107e-01, -9.8047e-01,  ...,  1.3799e+00,
           -4.9121e-01, -7.3828e-01],
          ...,
          [-4.2694e-02, -2.2095e-02, -3.2074e-02,  ...,  8.5083e-02,
           -1.5602e-02,  2.7130e-02],
          [-4.3457e-02, -2.1637e-02, -3.0823e-02,  ...,  8.5510e-02,
           -1.6846e-02,  2.8229e-02],
          [-4.2664e-02, -2.1515e-02, -3.0243e-02,  ...,  8.5022e-02,
           -1.7853e-02,  2.9022e-02]],

         ...,

         [[-2.4048e-02,  2.6215e-02,  5.1331e-02,  ..., -4.1840e-02,
           -4.6539e-04,  1.4595e-02],
          [ 6.2354e-01, -2.5708e-01, -2.0391e+00,  ...,  6.7969e-01,
           -5.9082e-01, -4.9585e-01],
          [-2.3682e-01, -6.0596e-01, -2.8979e-01,  ...,  1.0713e+00,
           -1.5645e+00,  1.9287e+00],
          ...,
          [ 3.3508e-02,  6.6772e-02,  2.6398e-02,  ...,  1.0666e-02,
           -5.7922e-02, -1.6632e

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
           -9.8877e-01, -1.9623e-02],
          [ 1.3000e-01,  1.2158e+00,  1.2178e+00,  ...,  4.3433e-01,
            7.7930e-01, -1.7012e+00],
          ...,
          [-2.4185e-02, -1.8326e-02, -3.8513e-02,  ...,  8.7280e-02,
           -2.6077e-02,  2.6184e-02],
          [-2.4887e-02, -1.9318e-02, -3.8696e-02,  ...,  8.7646e-02,
           -2.6276e-02,  2.5757e-02],
          [-2.5818e-02, -2.0538e-02, -3.9917e-02,  ...,  8.7158e-02,
           -2.7222e-02,  2.5421e-02]],

         ...,

         [[-2.4048e-02,  2.6215e-02,  5.1331e-02,  ..., -4.1840e-02,
           -4.6539e-04,  1.4595e-02],
          [ 1.5254e+00, -4.7150e-03,  3.1421e-01,  ...,  1.5576e+00,
           -5.7373e-01, -2.1191e+00],
          [ 1.5781e+00,  4.5959e-02, -6.4453e-02,  ...,  4.1846e-01,
           -6.1328e-01,  2.1777e+00],
          ...,
          [ 5.6458e-02,  7.3486e-02,  1.5564e-02,  ...,  2.5467e-02,
           -5.1971e-02, -1.2192e

In [75]:
import numpy as np

In [100]:
scores = np.array(predictions.predictions[0][:,:,tokenizer.bos_token_id])
print(scores.shape)
scores[1][10:131]

(10, 691)


array([-0.8149414 , -0.44580078,  0.6645508 , -1.390625  , -0.6928711 ,
        0.859375  ,  1.2099609 ,  1.0302734 ,  1.578125  ,  1.3652344 ,
        2.1953125 ,  1.0273438 , -0.25927734,  0.18408203, -0.23388672,
        2.7226562 ,  0.9736328 ,  1.0273438 ,  1.0039062 ,  1.0058594 ,
        1.0058594 ,  1.0126953 ,  1.0107422 ,  0.9980469 ,  1.0029297 ,
        1.0019531 ,  1.0107422 ,  1.0117188 ,  1.0048828 ,  0.9902344 ,
        1.        ,  1.0048828 ,  1.0107422 ,  1.0107422 ,  1.0087891 ,
        1.0107422 ,  1.0019531 ,  1.0039062 ,  1.0029297 ,  1.        ,
        0.9995117 ,  1.0107422 ,  1.0097656 ,  1.0205078 ,  1.015625  ,
        1.0078125 ,  1.0029297 ,  1.0117188 ,  1.0068359 ,  1.0097656 ,
        1.0087891 ,  1.0029297 ,  1.0097656 ,  1.0048828 ,  0.9946289 ,
        1.0087891 ,  1.0039062 ,  1.0058594 ,  1.        ,  0.99560547,
        0.9970703 ,  1.0039062 ,  0.99902344,  1.0097656 ,  1.0068359 ,
        1.0029297 ,  1.0009766 ,  1.0019531 ,  1.0019531 ,  1.00

In [98]:
batch['input_ids'][1][20:131]

tensor([ 1217,   363,    32,     3,  2564,    30,   977,   317,    62,  1107,
        21142,    62,  1956,    62,    24,     7, 13967,   266,  1780,   185,
          300,   972,  1183,  3998,  3028,   207,    24,  2318,   207,    15,
          185,   185,  1467,   245,   279,  3169,     7,    16,    15,  1780,
          185,   300,  1183,  3998,   403,  1406,     7,    18,   575,   207,
           16,    15,    15,   919,   245,   575,   207,    16,    15,   919,
          245,   575,   207,    16,    15,   919,   207,    16,     8,   185,
          300,   565,   317,    62,  1107, 21142,    62,  1956,    62,    24,
            7,   572,     7, 13967,   266, 46189,   185,   391,  3640,     7,
           69,     1,  3287,   338,   317,   509,    64,  1078,   254,  1183,
         3998,   207,    18,  5413,    16,   317, 69543,   457,   207,    24,
        29074])

In [97]:
batch['labels'][1][26:131]

tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1])

In [76]:
import torch

x = torch.tensor([[1.0, 2.0], [3.0, 4.0]], requires_grad=True)
y = x ** 2
z = y.sum()
z.backward()

print(x.grad)

tensor([[2., 4.],
        [6., 8.]])


In [62]:
batch['labels'][0].shape

torch.Size([691])

In [62]:
import numpy as np
y_pred = np.argmax(predictions.predictions[1], axis=1)

In [14]:
predicted_scores = predictions.predictions.squeeze()
true_labels = overfit["labels"]

for pred_score, true_label in zip(predicted_scores, true_labels):
    print(f"Predicted Score: {pred_score:.4f}, True Label: {true_label}")


AttributeError: 'tuple' object has no attribute 'squeeze'

In [34]:
for example in overfit:
  print(len(example['input_ids']))

88
131
67
57
109
147
691
236
156
190


In [69]:
batch = data_collator(overfit)

In [72]:
from datasets import Dataset
dataset = Dataset.from_dict(batch)
dataset

Dataset({
    features: ['input_ids', 'labels'],
    num_rows: 10
})

In [71]:
batch.pop('attention_mask')

tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])

In [24]:
overfit[0]

{'input_ids': [100000,
  774,
  245,
  22069,
  15043,
  1317,
  11,
  856,
  317,
  254,
  1604,
  280,
  8650,
  279,
  254,
  5013,
  473,
  254,
  6947,
  276,
  254,
  1420,
  65026,
  16,
  20,
  11,
  207,
  23,
  1026,
  30,
  473,
  4300,
  4027,
  1666,
  83130,
  185,
  185,
  87,
  16,
  11,
  320,
  16,
  11,
  1376,
  17,
  11,
  320,
  17,
  403,
  207,
  15,
  11,
  207,
  15,
  11,
  570,
  16,
  20,
  11,
  207,
  23,
  185,
  185,
  20457,
  403,
  83130,
  6034,
  87,
  17,
  570,
  1376,
  16,
  8,
  746,
  17,
  919,
  334,
  88,
  17,
  570,
  320,
  16,
  8,
  746,
  17,
  8,
  185,
  20457],
 'labels': [-100.0,
  -100.0,
  -100.0,
  -100.0,
  -100.0,
  -100.0,
  -100.0,
  -100.0,
  -100.0,
  -100.0,
  -100.0,
  -100.0,
  -100.0,
  -100.0,
  -100.0,
  -100.0,
  -100.0,
  -100.0,
  -100.0,
  -100.0,
  -100.0,
  -100.0,
  -100.0,
  -100.0,
  -100.0,
  -100.0,
  -100.0,
  -100.0,
  -100.0,
  -100.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1

In [32]:
{k: v.shape for k, v in batch.items()}

{'input_ids': torch.Size([10, 691]),
 'attention_mask': torch.Size([10, 691]),
 'labels': torch.Size([10, 691])}

In [12]:
overfit[0], first_batch[0]

NameError: name 'first_batch' is not defined

In [26]:
predictions.pre

{'eval_loss': 8.612680435180664,
 'eval_runtime': 3.5315,
 'eval_samples_per_second': 2.832,
 'eval_steps_per_second': 2.832}

In [None]:
from torch.utils.data import DataLoader

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=None)
train_dataloader = DataLoader(verifier_dataset, batch_size=4, collate_fn=data_collator)

In [26]:
first_batch = next(iter(train_dataloader))

In [29]:
first_batch['input_ids'].shape

torch.Size([4, 131])

In [33]:
first_batch

{'input_ids': tensor([[100000,    774,    245,  22069,  15043,   1317,     11,    856,    317,
            254,   1604,    280,   8650,    279,    254,   5013,    473,    254,
           6947,    276,    254,   1420,  65026,     16,     20,     11,    207,
             23,   1026,     30,    473,   4300,   4027,   1666,  83130,    185,
            185,     87,     16,     11,    320,     16,     11,   1376,     17,
             11,    320,     17,    403,    207,     15,     11,    207,     15,
             11,    570,     16,     20,     11,    207,     23,    185,    185,
          20457,    403,  83130,   6034,     87,     17,    570,   1376,     16,
              8,    746,     17,    919,    334,     88,     17,    570,    320,
             16,      8,    746,     17,      8,    185,  20457, 100001, 100001,
         100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001,
         100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001,
         10000

In [15]:
first_batch['labels']

tensor([[-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100],
        [-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, 

In [None]:
pot_math_dataset_sub1 = pot_math_dataset.select(range(100000)).map(tokenize)
pot_math_dataset_sub2 = pot_math_dataset.select(range(100000,200000)).map(tokenize)
pot_math_dataset_sub3 = pot_math_dataset.select(range(200000,300000)).map(tokenize)

df1 = pd.DataFrame(pot_math_dataset_sub1)
df2 = pd.DataFrame(pot_math_dataset_sub2)
df3 = pd.DataFrame(pot_math_dataset_sub3)

print(df1['tokens'].sum()/100000)
print(df2['tokens'].sum()/100000)
print(df3['tokens'].sum()/100000)

1463566 * 79

In [11]:
torch.cuda.empty_cache()