## Boolq (reading comprehension)

In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from transformers import StoppingCriteria, StoppingCriteriaList
from tqdm import tqdm
device = 'cuda:0'

In [3]:
checkpoint = "mistralai/Mistral-7B-Instruct-v0.2"
checkpoint = "meta-llama/Llama-2-7b-hf"

tokenizer = AutoTokenizer.from_pretrained(checkpoint, use_fast = False)
model = AutoModelForCausalLM.from_pretrained(checkpoint,
                                             torch_dtype = torch.float16,       
                                             device_map=device,
                                             cache_dir='/mnt/esperanto/et/huggingface/hub'
                                            )

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
dataset = load_dataset('google/boolq')

In [5]:
class StopOnTokens(StoppingCriteria):
    def __init__(self, stop_words):
        self.stop_words = stop_words
    def __call__(self, input_ids, scores, **kwargs):
        last_word = tokenizer.decode(input_ids[0,-1])
        return last_word.lower() in self.stop_words

In [6]:
## This implementation evaluation is base on llm-harness (ref used by papers): https://github.com/EleutherAI/lm-evaluation-harness/blob/3326c547a733d598b4377e54be96e194861b964c/lm_eval/tasks/superglue.py#L69
accuracy = 0

for sample in tqdm(dataset['validation']):
    prompt = f"{sample['passage']}\nQuestion: {sample['question']}?\nAnswer:"

    inputs_yes = tokenizer.encode(prompt + " yes", return_tensors='pt').to(device)
    inputs_no = tokenizer.encode(prompt + " no", return_tensors='pt').to(device)

    with torch.no_grad():
        out_yes = model(inputs_yes).logits
        out_no = model(inputs_no).logits

    ll_yes = out_yes[0, len(tokenizer.encode(prompt)):].sum().item()
    ll_no = out_no[0, len(tokenizer.encode(prompt)):].sum().item()

    accuracy += 1.0 if (ll_yes > ll_no) == sample['answer'] else 0.0

accuracy /= len(dataset['validation'])

100%|██████████| 3270/3270 [03:10<00:00, 17.21it/s]


In [7]:
accuracy

0.6235474006116208

## Hellaswag (reasoning)

In [20]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from transformers import StoppingCriteria, StoppingCriteriaList
from tqdm import tqdm
device = 'cuda:0'

In [21]:
dataset = load_dataset('Rowan/hellaswag')

Downloading data: 100%|██████████| 24.4M/24.4M [00:04<00:00, 5.18MB/s]
Downloading data: 100%|██████████| 6.11M/6.11M [00:00<00:00, 24.2MB/s]
Downloading data: 100%|██████████| 6.32M/6.32M [00:00<00:00, 19.5MB/s]


Generating train split:   0%|          | 0/39905 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/10003 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10042 [00:00<?, ? examples/s]

## LoRA FT boolq

In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorWithPadding
from torch.utils.data import DataLoader
from peft import LoraConfig, get_peft_model
from datasets import load_dataset
import torch

device = 'cuda:0'

In [2]:
checkpoint = "mistralai/Mistral-7B-v0.1"

tokenizer = AutoTokenizer.from_pretrained(checkpoint, use_fast = False)
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(checkpoint,
                                             torch_dtype = torch.float16,       
                                             device_map=device,
                                             cache_dir='/mnt/esperanto/et/huggingface/hub'
                                            )

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
lora_config = LoraConfig(r=16,
                        lora_alpha=8,
                        lora_dropout=0.05,
                        bias="none",
                        target_modules=['q_proj', 'k_proj', 'v_proj', 'o_proj', 'up_proj', 'down_proj'])

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 32,505,856 || all params: 7,274,237,952 || trainable%: 0.44686269839526965


In [None]:
dataset = load_dataset('/mnt/esperanto/et/huggingface/datasets/super_glue/')

In [None]:
def prepare_data(sample):
    prompt = f"{sample['passage']}\nQuestion: {sample['question']}?\nAnswer: {'yes' if sample['label'] == 1 else 'no'}"
    return tokenizer(prompt, padding='max_length', truncation=True, max_length=512)

train_data = dataset['train'].map(prepare_data)
test_data = dataset['test'].map(prepare_data)


In [7]:
training_args = TrainingArguments(
    output_dir='./model/mistral-7b-v0.1_peft',
    auto_find_batch_size=True, 
    learning_rate= 1e-4,
    num_train_epochs=1,
    evaluation_strategy="steps",
    eval_steps=0.1,
    save_strategy="no",
    gradient_accumulation_steps=4,
    lr_scheduler_type='cosine',
    warmup_ratio=0.05,
    logging_steps=1,
    report_to='wandb'
)

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=test_data,
)

trainer.train()

In [None]:
trainer.model.save_pretrained('./models/mistral-7b-v0.1_peft')

In [2]:
checkpoint = "mistralai/Mistral-7B-v0.1"

tokenizer = AutoTokenizer.from_pretrained(checkpoint, use_fast = False)
model = AutoModelForCausalLM.from_pretrained(checkpoint,
                                             torch_dtype = torch.float16,       
                                             device_map=device,
                                             cache_dir='/mnt/esperanto/et/huggingface/hub'
                                            )

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
peft_model_id = './models/mistral-7b-v0.1_lora_boolq'

model.load_adapter(peft_model_id)

In [9]:
from peft import PeftModel
model = PeftModel.from_pretrained(model, peft_model_id, device_map=device)

In [14]:
model.merge_and_unload()

MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralSdpaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): MistralRotaryEmbedding()
        )
        (mlp): MistralMLP(
          (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): MistralRMSNorm()
        (post_attention_layernorm): MistralRMSNorm()
      )
    )
    (norm): MistralRMSNorm(

In [16]:
prompt = "Persian (/ˈpɜːrʒən, -ʃən/), also known by its endonym Farsi (فارسی fārsi (fɒːɾˈsiː) ( listen)), is one of the Western Iranian languages within the Indo-Iranian branch of the Indo-European language family. It is primarily spoken in Iran, Afghanistan (officially known as Dari since 1958), and Tajikistan (officially known as Tajiki since the Soviet era), and some other regions which historically were Persianate societies and considered part of Greater Iran. It is written in the Persian alphabet, a modified variant of the Arabic script, which itself evolved from the Aramaic alphabet.\nQuestion: do iran and afghanistan speak the same language?\nAnswer:"

inputs = tokenizer(prompt, return_tensors='pt').to(device)

with torch.no_grad():
    outputs = model.generate(**inputs, max_new_tokens=10)

tokenizer.decode(outputs[0])

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


'<s> Persian (/ˈpɜːrʒən, -ʃən/), also known by its endonym Farsi (فارسی fārsi (fɒːɾˈsiː) ( listen)), is one of the Western Iranian languages within the Indo-Iranian branch of the Indo-European language family. It is primarily spoken in Iran, Afghanistan (officially known as Dari since 1958), and Tajikistan (officially known as Tajiki since the Soviet era), and some other regions which historically were Persianate societies and considered part of Greater Iran. It is written in the Persian alphabet, a modified variant of the Arabic script, which itself evolved from the Aramaic alphabet.\nQuestion: do iran and afghanistan speak the same language?\nAnswer: yes</s>'

## LoRA FT hellaswag

In [7]:
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorWithPadding
from torch.utils.data import DataLoader
from peft import LoraConfig, get_peft_model
from datasets import load_dataset
import torch
import re

device = 'cuda:0'

In [None]:
checkpoint = "mistralai/Mistral-7B-v0.1"

tokenizer = AutoTokenizer.from_pretrained(checkpoint, use_fast = False)
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(checkpoint,
                                             torch_dtype = torch.float16,       
                                             device_map=device,
                                             cache_dir='/mnt/esperanto/et/huggingface/hub'
                                            )

In [12]:
dataset = load_dataset('Rowan/hellaswag', split='validation')

In [19]:
def _preprocess(text):
        text = text.strip()
        text = text.replace(" [title]", ". ")
        text = re.sub("\\[.*?\\]", "", text)
        text = text.replace("  ", " ")
        return text

def _process_doc(doc):
        ctx = doc["ctx_a"] + " " + doc["ctx_b"].capitalize()
        out_doc = {
            "query": _preprocess(doc["activity_label"] + ": " + ctx),
            "choices": [_preprocess(ending) for ending in doc["endings"]],
            "gold": int(doc["label"]),
        }
        return out_doc

def prepare_data(sample):
    out_doc = _process_doc(sample)
    prompt = out_doc['query'] + ' ' + out_doc['choices'][out_doc['gold']]
    inputs = tokenizer(prompt, padding='max_length', truncation=True, max_length=512)
    inputs.update({'labels': inputs['input_ids']})
    return inputs

In [22]:
dataset

Dataset({
    features: ['ind', 'activity_label', 'ctx_a', 'ctx_b', 'ctx', 'endings', 'source_id', 'split', 'split_type', 'label'],
    num_rows: 10042
})

In [20]:
tokenizer.decode(prepare_data(dataset[2])['input_ids'])

'<s> Canoeing: Two women in a child are shown in a canoe while a man pulls the canoe while standing in the water, with other individuals visible in the background. The child and a different man sit in a canoe while the man paddles.</s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s><

## LoRA FT gsm8k

In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorWithPadding
from torch.utils.data import DataLoader
from peft import LoraConfig, get_peft_model
from datasets import load_dataset
import torch
import re

device = 'cuda:0'

In [5]:
checkpoint = "mistralai/Mistral-7B-v0.1"

tokenizer = AutoTokenizer.from_pretrained(checkpoint, use_fast = False)
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(checkpoint,
                                             torch_dtype = torch.float16,       
                                             device_map=device,
                                             cache_dir='/mnt/esperanto/et/huggingface/hub'
                                            )

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
dataset = load_dataset('gsm8k', 'main', split='train')

Downloading data: 100%|██████████| 2.31M/2.31M [00:00<00:00, 10.0MB/s]
Downloading data: 100%|██████████| 419k/419k [00:00<00:00, 4.48MB/s]


Generating train split:   0%|          | 0/7473 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1319 [00:00<?, ? examples/s]

In [4]:
dataset

Dataset({
    features: ['question', 'answer'],
    num_rows: 7473
})

In [8]:
def prepare_data(sample):
    prompt = f"Question: {sample["question"]}\nAnswer: {sample["answer"]}"
    inputs = tokenizer(prompt, padding='max_length', truncation=True, max_length=512)
    inputs.update({'labels': inputs['input_ids']})
    return inputs

In [9]:
tokenizer.decode(prepare_data(dataset[0])['input_ids'])

'<s> Question: Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?\nAnswer: Natalia sold 48/2 = <<48/2=24>>24 clips in May.\nNatalia sold 48+24 = <<48+24=72>>72 clips altogether in April and May.\n#### 72</s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s>