<a href="https://colab.research.google.com/github/rushilbhat/AIMO/blob/main/RM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# !pip install datasets transformers torch bitsandbytes peft
!pip install datasets transformers torch bitsandbytes peft xformers trl accelerate

# !pip install packaging ninja einops flash-attn xformers trl peft accelerate bitsandbytes datasets transformers
# !pip install --upgrade accelerate
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

Collecting datasets
  Downloading datasets-2.19.2-py3-none-any.whl (542 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/542.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.9/542.1 kB[0m [31m3.5 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m532.5/542.1 kB[0m [31m9.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.1/542.1 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
Collecting bitsandbytes
  Downloading bitsandbytes-0.43.1-py3-none-manylinux_2_24_x86_64.whl (119.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.8/119.8 MB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting peft
  Downloading peft-0.11.1-py3-none-any.whl (251 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m251.6/251.6 kB[0m [31m23.1 MB/s[0m eta [36m0:00

In [2]:
import time
import re
import pandas as pd
import torch
import torch.nn as nn
import numpy as np
from datasets import load_dataset, load_from_disk, Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig, AutoConfig, LlamaForCausalLM, TrainingArguments, Trainer, BitsAndBytesConfig, DataCollatorForSeq2Seq
from peft import get_peft_model, LoraConfig, TaskType, prepare_model_for_kbit_training
from unsloth import FastLanguageModel

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


In [2]:
def extract_code(entry):
    """Reduce ``entry['generated_solution']`` to the body of its first ``<llm-code>`` block.

    The entry is modified in place and also returned. If the text contains no
    ``<llm-code>...</llm-code>`` span, the field is set to the empty string.
    """
    found = re.search(r'<llm-code>(.*?)</llm-code>', entry['generated_solution'], re.DOTALL)
    # Keep only the whitespace-trimmed contents of the first match, if there is one.
    entry['generated_solution'] = found.group(1).strip() if found else ""
    return entry

def tokenize_and_label(entry):
    """Tokenize ``question + " " + solution`` and build per-token verifier labels.

    Uses the module-level ``tokenizer``. The first ``len(encode(question))``
    positions receive the ignore value ``-100.0``; each remaining position
    receives ``1.0`` when ``entry['is_correct']`` is truthy, else ``0.0``.

    Returns:
        dict with keys ``"input_ids"`` and ``"labels"``.
    """
    prompt = entry['question']
    full_ids = tokenizer.encode(f"{prompt} {entry['generated_solution']}")
    n_prompt = len(tokenizer.encode(prompt))

    # A token that straddles the question/solution boundary is not treated
    # specially: the split point is the token count of the question alone.
    target = 1.0 if entry['is_correct'] else 0.0
    labels = [-100.0] * n_prompt
    labels.extend([target] * (len(full_ids) - n_prompt))
    return {"input_ids": full_ids, "labels": labels}

In [3]:
class CustomLinearLayer(nn.Module):
    """Wrap a causal LM so the logit of one special token acts as a per-token verifier score.

    At every position, the wrapped model's logit for ``special_token_id`` is
    passed through a learnable 1 -> 1 affine head. The result is that
    position's verification score: it is regressed onto ``labels`` with MSE and
    written into the returned logits in place of the raw special-token logit.

    Args:
        model: Module called as ``model(input_ids, attention_mask=...)`` whose
            output exposes ``.logits`` and supports item assignment.
        special_token_id: Vocabulary index of the logit reused as the score.
            When omitted, the module-level ``config.bos_token_id`` is used, so
            ``config`` must exist before the layer is instantiated.
    """

    def __init__(self, model, special_token_id=None):
        super().__init__()
        self.model = model
        if special_token_id is None:
            # Backward-compatible default: read the id from the global `config`.
            special_token_id = config.bos_token_id
        self.special_token_id = special_token_id
        # dtype is torch.float32 so that gradients are torch.float32
        self.verifier_head = nn.Linear(1, 1, bias=True)

    def forward(self, input_ids, attention_mask=None, labels=None, **kwargs):
        """Run the wrapped model and attach verifier scores (and loss, if labelled).

        Args:
            input_ids: Token ids forwarded to the wrapped model.
            attention_mask: Optional mask forwarded to the wrapped model.
            labels: Optional per-token targets; positions equal to ``-100`` are
                excluded from the loss.
            **kwargs: Accepted for caller compatibility and ignored.

        Returns:
            The wrapped model's output object with ``'logits'`` replaced by the
            score-patched logits and, when ``labels`` is given, ``'loss'`` set
            to the MSE between scores and labels over the supervised positions.
        """
        outputs = self.model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        # (batch, seq) slice for the special token, given a trailing feature
        # dimension of size 1 so it can go through the Linear(1, 1) head.
        special_token_logits = logits[:, :, self.special_token_id].unsqueeze(-1)
        verification_scores = self.verifier_head(special_token_logits).squeeze(-1)

        # Write the scores into a copy so the model's own logits tensor (which
        # fed the verifier head) is not modified in place.
        updated_logits = logits.clone()
        updated_logits[:, :, self.special_token_id] = verification_scores

        if labels is not None:
            mask = (labels != -100)
            labels = labels.float()
            if mask.any():
                loss_fct = nn.MSELoss()
                verification_loss = loss_fct(verification_scores[mask], labels[mask])
            else:
                # No supervised position in this batch: a mean over an empty
                # selection would be NaN. Use a zero loss that is still attached
                # to the graph so that backward() remains valid.
                verification_loss = self.verifier_head.weight.sum() * 0.0
            outputs['loss'] = verification_loss

        outputs['logits'] = updated_logits

        return outputs

In [4]:
# Options for loading the base model through Unsloth.
max_seq_length = 2048   # maximum sequence length requested from Unsloth
dtype = torch.float16   # None = auto-detect; float16 for Tesla T4/V100, bfloat16 for Ampere+
load_in_4bit = True     # 4-bit quantization to reduce memory usage; can be False

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="deepseek-ai/deepseek-math-7b-rl",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

# LoRA adapter settings; collected in one mapping and unpacked into the call.
lora_settings = dict(
    r=16,  # LoRA rank (any value > 0; 8, 16, 32, 64, 128 are suggested)
    # Attention projections plus the MLP projections.
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj",],
    lora_alpha=16,
    lora_dropout=0,  # any value is supported, 0 is the optimized setting
    bias="none",     # any value is supported, "none" is the optimized setting
    # True or "unsloth"; the "unsloth" mode is advertised as using less VRAM
    # and is intended for very long contexts.
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    use_rslora=False,   # rank-stabilized LoRA is available but disabled
    loftq_config=None,  # LoftQ is available but disabled
)
model = FastLanguageModel.get_peft_model(model, **lora_settings)

config.json:   0%|          | 0.00/626 [00:00<?, ?B/s]

==((====))==  Unsloth: Fast Llama patching release 2024.5
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.0+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. Xformers = 0.0.26.post1. FA = False.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


model.safetensors.index.json:   0%|          | 0.00/23.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-000002.safetensors:   0%|          | 0.00/8.59G [00:00<?, ?B/s]

model-00002-of-000002.safetensors:   0%|          | 0.00/5.23G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/121 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.09k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/4.61M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
deepseek-ai/deepseek-math-7b-rl does not have a padding token! Will use pad_token = <|PAD_TOKEN|>.
Unsloth 2024.5 patched 30 layers with 30 QKV layers, 30 O layers and 30 MLP layers.


In [4]:
tokenizer

LlamaTokenizerFast(name_or_path='deepseek-ai/deepseek-math-7b-rl', vocab_size=100000, model_max_length=4096, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<｜begin▁of▁sentence｜>', 'eos_token': '<｜end▁of▁sentence｜>', 'pad_token': '<|PAD_TOKEN|>'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	100000: AddedToken("<｜begin▁of▁sentence｜>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	100001: AddedToken("<｜end▁of▁sentence｜>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	100002: AddedToken("<|PAD_TOKEN|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [5]:
from google.colab import drive

# Make Google Drive visible to the Colab filesystem before reading from it.
drive.mount('/content/drive')

# Location of the dataset previously saved to Drive.
verifier_dataset_path = '/content/drive/My Drive/verifier_dataset'
verifier_dataset = load_from_disk(verifier_dataset_path)


Mounted at /content/drive


In [6]:
# Load the model configuration, set up tokenizer padding and build the batch collator.
model_name = "deepseek-ai/deepseek-math-7b-rl"

# `config` is read as a global by CustomLinearLayer.__init__ (config.bos_token_id),
# so this cell has to run before the verifier is instantiated.
config = AutoConfig.from_pretrained(model_name)
# Disabled alternative: load the model with plain Transformers + bitsandbytes NF4
# quantization and attach LoRA through PEFT. The live `model`/`tokenizer` come
# from the FastLanguageModel cell instead.
# nf4_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_use_double_quant=True,
#     bnb_4bit_compute_dtype=torch.bfloat16
# )
# model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=nf4_config)

# model = prepare_model_for_kbit_training(model)

# lora_config = LoraConfig(
#     task_type=TaskType.CAUSAL_LM,   # Task type
#     r=8,                           # Rank of the low-rank matrices
#     lora_alpha=1,                 # Alpha scaling parameter
#     target_modules=['q_proj', 'v_proj'], # Target modules for LoRA
#     lora_dropout=0.1               # Dropout for LoRA
# )
# model = get_peft_model(model, lora_config)



# tokenizer = AutoTokenizer.from_pretrained(model_name)
# Pad with the EOS token; this replaces the `<|PAD_TOKEN|>` pad token that was
# reported when the tokenizer was loaded. Padding is applied on the right.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"


# Disabled: the steps that produce `verifier_dataset` from nvidia/OpenMathInstruct-1
# (keep the 'math' subset, drop rows with an error message, extract the code,
# tokenize and label the first 100000 rows). The dataset is loaded from disk instead.
# dataset = load_dataset('nvidia/OpenMathInstruct-1', split='train')
# math_dataset = dataset.filter(lambda entry: entry['dataset'] == 'math')
# pot_math_dataset = math_dataset.filter(lambda entry: entry['error_message']=='').map(extract_code)
# verifier_dataset = pot_math_dataset.select(range(100000)).map(tokenize_and_label, remove_columns=pot_math_dataset.column_names)

# Collator that pads both `input_ids` and `labels` of each batch to a common length.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=None)

In [19]:
# Wrap the LoRA-adapted model in the verifier module.
# CustomLinearLayer.__init__ reads the global `config`, so the cell defining it must have run.
verifier = CustomLinearLayer(model)

In [15]:
# List every trainable parameter of the verifier together with its dtype.
trainable_params = (
    (param_name, tensor)
    for param_name, tensor in verifier.named_parameters()
    if tensor.requires_grad
)
for param_name, tensor in trainable_params:
    print(f"{param_name}: {tensor.dtype}")

model.base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.weight: torch.float32
model.base_model.model.model.layers.0.self_attn.q_proj.lora_B.default.weight: torch.float32
model.base_model.model.model.layers.0.self_attn.k_proj.lora_A.default.weight: torch.float32
model.base_model.model.model.layers.0.self_attn.k_proj.lora_B.default.weight: torch.float32
model.base_model.model.model.layers.0.self_attn.v_proj.lora_A.default.weight: torch.float32
model.base_model.model.model.layers.0.self_attn.v_proj.lora_B.default.weight: torch.float32
model.base_model.model.model.layers.0.self_attn.o_proj.lora_A.default.weight: torch.float32
model.base_model.model.model.layers.0.self_attn.o_proj.lora_B.default.weight: torch.float32
model.base_model.model.model.layers.0.mlp.gate_proj.lora_A.default.weight: torch.float32
model.base_model.model.model.layers.0.mlp.gate_proj.lora_B.default.weight: torch.float32
model.base_model.model.model.layers.0.mlp.up_proj.lora_A.default.weight: torch.float32

In [20]:
# First 10 examples only; presumably a tiny subset to check that the model can overfit it.
overfit = verifier_dataset.select(range(10))

In [21]:
# Hyperparameters for training the verifier on the small `overfit` subset.
training_args = TrainingArguments(
    output_dir="verifier_output",
    # Optimisation
    learning_rate=3e-4,
    num_train_epochs=40,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    fp16=True,
    # Evaluation / prediction
    per_device_eval_batch_size=1,
    eval_accumulation_steps=1,
    # Logging: report the loss at every step
    logging_steps=1,
)

# No eval dataset is passed; the trainer is used for training and for predict().
trainer = Trainer(
    model=verifier,
    args=training_args,
    data_collator=data_collator,
    train_dataset=overfit,
)

In [22]:
# Debug aid (disabled): uncomment to have autograd report the operation that
# produced a NaN during the backward pass; it slows training down.
# torch.autograd.set_detect_anomaly(True)
# Run the training loop with the arguments configured for `trainer`.
trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 10 | Num Epochs = 40
O^O/ \_/ \    Batch size per device = 4 | Gradient Accumulation steps = 1
\        /    Total batch size = 4 | Total steps = 120
 "-____-"     Number of trainable parameters = 37,478,402


Step,Training Loss
1,34.7241
2,21.2904
3,45.6155
4,26.1714
5,37.338
6,24.4272
7,36.8559
8,32.4519
9,20.6062
10,36.1481


TrainOutput(global_step=120, training_loss=3.705559525290543, metrics={'train_runtime': 395.6971, 'train_samples_per_second': 1.011, 'train_steps_per_second': 0.303, 'total_flos': 0.0, 'train_loss': 3.705559525290543, 'epoch': 40.0})

In [23]:
# Collate all `overfit` examples into a single padded batch.
batch = data_collator(overfit)
# Remove the mask in place so the Dataset below holds only `input_ids` and `labels`.
# `batch` itself is read again later (its `labels`) when the scores are printed.
batch.pop('attention_mask')
# Re-wrap the already padded tensors as a Dataset that can be handed to trainer.predict.
dataset = Dataset.from_dict(batch)
dataset

Dataset({
    features: ['input_ids', 'labels'],
    num_rows: 10
})

In [24]:
# Run the trained verifier over the pre-padded `dataset` through the Trainer's predict loop.
predictions = trainer.predict(test_dataset=dataset)

In [None]:
# Slice the predicted logits at the BOS token id (presumably the same id the
# verifier writes its scores to), giving one value per sequence and position.
scores = np.array(predictions.predictions[0][:, :, tokenizer.bos_token_id])

# For each sequence, show the scores next to the labels at the supervised
# positions only (label != -100).
for seq_idx, seq_scores in enumerate(scores):
    seq_labels = batch['labels'][seq_idx]
    supervised = (seq_labels != -100)
    print(f"SEQUENCE {seq_idx}: {seq_scores[supervised]} \n {seq_labels[supervised]}")



In [32]:
{k: v.shape for k, v in batch.items()}

{'input_ids': torch.Size([10, 691]),
 'attention_mask': torch.Size([10, 691]),
 'labels': torch.Size([10, 691])}

In [None]:
# NOTE(review): this import sits mid-notebook; consider moving it to the import cell.
from torch.utils.data import DataLoader

# Rebuilds the same collator as the configuration cell and uses it to batch
# the full verifier dataset four examples at a time for manual inspection.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=None)
train_dataloader = DataLoader(verifier_dataset, batch_size=4, collate_fn=data_collator)

In [26]:
# Pull a single collated batch from the DataLoader for inspection.
first_batch = next(iter(train_dataloader))

In [29]:
first_batch['input_ids'].shape

torch.Size([4, 131])

In [33]:
first_batch

{'input_ids': tensor([[100000,    774,    245,  22069,  15043,   1317,     11,    856,    317,
            254,   1604,    280,   8650,    279,    254,   5013,    473,    254,
           6947,    276,    254,   1420,  65026,     16,     20,     11,    207,
             23,   1026,     30,    473,   4300,   4027,   1666,  83130,    185,
            185,     87,     16,     11,    320,     16,     11,   1376,     17,
             11,    320,     17,    403,    207,     15,     11,    207,     15,
             11,    570,     16,     20,     11,    207,     23,    185,    185,
          20457,    403,  83130,   6034,     87,     17,    570,   1376,     16,
              8,    746,     17,    919,    334,     88,     17,    570,    320,
             16,      8,    746,     17,      8,    185,  20457, 100001, 100001,
         100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001,
         100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001,
         10000

In [15]:
first_batch['labels']

tensor([[-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100],
        [-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, 

In [None]:
# Average value of the `tokens` column over three consecutive 100000-row slices.
# NOTE(review): `tokenize` is not defined anywhere in the visible notebook and
# `pot_math_dataset` is only assigned in commented-out code, so this cell
# raises NameError on a fresh kernel — confirm where these should come from.
pot_math_dataset_sub1 = pot_math_dataset.select(range(100000)).map(tokenize)
pot_math_dataset_sub2 = pot_math_dataset.select(range(100000,200000)).map(tokenize)
pot_math_dataset_sub3 = pot_math_dataset.select(range(200000,300000)).map(tokenize)

df1 = pd.DataFrame(pot_math_dataset_sub1)
df2 = pd.DataFrame(pot_math_dataset_sub2)
df3 = pd.DataFrame(pot_math_dataset_sub3)

# Sum of `tokens` divided by the slice size, i.e. the mean per row.
print(df1['tokens'].sum()/100000)
print(df2['tokens'].sum()/100000)
print(df3['tokens'].sum()/100000)

# NOTE(review): unexplained constants — presumably (number of rows) * (average
# tokens per row) as a total-token estimate; confirm and name them.
1463566 * 79

In [11]:
# Release cached GPU memory that PyTorch's allocator holds but no tensor is using.
torch.cuda.empty_cache()