<a href="https://colab.research.google.com/github/rishabhranawat/phi-2-csqa/blob/main/commonsense_qa_evaluation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install datasets
!pip uninstall -y transformers
!pip install git+https://github.com/huggingface/transformers
!pip install trl
!pip install transformers[torch]

Found existing installation: transformers 4.37.0.dev0
Uninstalling transformers-4.37.0.dev0:
  Successfully uninstalled transformers-4.37.0.dev0
Collecting git+https://github.com/huggingface/transformers
  Cloning https://github.com/huggingface/transformers to /tmp/pip-req-build-iaohp0bm
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers /tmp/pip-req-build-iaohp0bm
  Resolved https://github.com/huggingface/transformers to commit 121641cab1d894ec4f344dda3e80f44c05cbcd92
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: transformers
  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone
  Created wheel for transformers: filename=transformers-4.37.0.dev0-py3-none-any.whl size=8338115 sha256=cc31c3b03a6e422891cde4d42c4015239b84b7b1a6d6f427d4501c4689b2950c
  Stored in 

Collecting trl
  Downloading trl-0.7.9-py3-none-any.whl (141 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/141.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━[0m [32m71.7/141.1 kB[0m [31m1.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m141.1/141.1 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
Collecting accelerate (from trl)
  Downloading accelerate-0.26.1-py3-none-any.whl (270 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m270.9/270.9 kB[0m [31m19.7 MB/s[0m eta [36m0:00:00[0m
Collecting tyro>=0.5.11 (from trl)
  Downloading tyro-0.6.4-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.9/78.9 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
Collecting docstring-parser>=0.14.1 (from tyro>=0.5.11->trl)
  Downloading docstring_parser-0.15-py3-none-any.whl (36 kB)
Collecting shtab>

In [None]:
import torch
import datasets
from tqdm import tqdm

from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments
from datasets import load_dataset
from torch.utils.data import DataLoader
from collections import defaultdict
from trl import SFTTrainer


In [None]:
# Make CUDA the default device for newly created tensors. The generation
# helpers in this notebook pass tokenizer output straight to `model.generate`
# without an explicit `.to(...)`, so they depend on this global setting.
# NOTE(review): the SFT cell's recorded RuntimeError about a 'cpu' generator
# presumably stems from this global default — confirm.
torch.set_default_device(torch.device("cuda"))

In [None]:
# Load the `train` split of the `tau/commonsense_qa` dataset; later cells use
# it both for few-shot prompting and as the fine-tuning set.
csqa = load_dataset(path="tau/commonsense_qa", split="train")

In [None]:
# Load the phi-2 causal LM and its tokenizer from the same checkpoint id.
# NOTE(review): trust_remote_code=True allows code shipped with the checkpoint
# to be executed at load time — confirm it is still required.
model = AutoModelForCausalLM.from_pretrained(
    "microsoft/phi-2",
    torch_dtype="auto",
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(
    "microsoft/phi-2",
    trust_remote_code=True,
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# Reuse the end-of-sequence token for padding so that prompts of different
# lengths can be batched (the tokenizer is called with padding=True).
tokenizer.pad_token = tokenizer.eos_token
# Generation appends new tokens after the last position of every row, so pad
# on the left: each prompt then sits directly in front of its continuation
# instead of being separated from it by padding.
tokenizer.padding_side = "left"

In [None]:
def construct_few_shot_examples(batch):
    """Render solved multiple-choice questions as one few-shot prompt prefix.

    Args:
        batch: Mapping with parallel sequences under 'question', 'choices'
            and 'answerKey'. Every entry of 'choices' is itself a mapping
            with parallel 'label' and 'text' sequences.

    Returns:
        A single string: a "Beginning Of Examples" header, one
        "Instruct: ... Output: <answerKey>." block per question, and an
        "End of Examples" footer.
    """
    rendered = []
    for stem, options, key in zip(batch['question'], batch['choices'], batch['answerKey']):
        # One "<label>: <text>" line per answer option.
        option_lines = "\n".join(
            f"{label}: {text}" for label, text in zip(options['label'], options['text'])
        )
        rendered.append(
            f"Instruct: Choose the correct option: {stem} \n{option_lines}. \nOutput: {key}. \n \n"
        )
    return "Beginning Of Examples\n" + "".join(rendered) + "\nEnd of Examples"

# Tokenize a batch of examples
def tokenize_batch(batch, few_shot_examples=''):
    """Render every question of a batch as an instruction prompt and tokenize it.

    Args:
        batch: Mapping with parallel sequences under 'question' and 'choices'.
            Every entry of 'choices' is a mapping with parallel 'label' and
            'text' sequences.
        few_shot_examples: Text placed in front of every prompt (empty by
            default, i.e. zero-shot).

    Returns:
        The output of the module-level `tokenizer` for the prompts, as PyTorch
        tensors padded to the longest prompt and truncated to 512 tokens. It
        carries both the token ids and the attention mask.
    """
    questions = []
    for question, choices in zip(batch['question'], batch['choices']):
        choices_text = "\n".join(f"{label}: {text}" for label, text in zip(choices['label'], choices['text']))
        question_text = f"{few_shot_examples} \nInstruct: Choose the correct option: {question} \n{choices_text}. \nOutput:"
        questions.append(question_text)
    # The attention mask must be returned: the batch is padded, and because the
    # pad token is the EOS token the model cannot tell padding from content on
    # its own, so without the mask it would attend to the padding.
    tokenized_batch = tokenizer(
        questions,
        return_tensors='pt',
        return_attention_mask=True,
        padding=True,
        truncation=True,
        max_length=512,
    )
    return tokenized_batch

# Process a batch of examples through the model
def model_output_batch(model, batch, few_shot_examples, max_new_tokens=16):
    """Generate completions for a batch of questions and decode them to text.

    Args:
        model: Model exposing `generate`.
        batch: Mapping accepted by `tokenize_batch`.
        few_shot_examples: Text placed in front of every prompt.
        max_new_tokens: Upper bound on the number of tokens generated per
            prompt. The expected completion is a short answer label, so the
            default is small.

    Returns:
        List of decoded strings (prompt followed by the generated text), one
        per question, with special tokens removed.
    """
    tokenized_batch = tokenize_batch(batch, few_shot_examples)
    # Bound the number of *new* tokens. `max_length` counts prompt tokens too,
    # so a few-shot prompt longer than the limit would leave no room for an
    # answer.
    outputs = model.generate(
        **tokenized_batch,
        max_new_tokens=max_new_tokens,
        pad_token_id=tokenizer.pad_token_id,
    )
    return tokenizer.batch_decode(outputs, skip_special_tokens=True)

In [None]:
# Build the few-shot prompt from the first five rows of the dataset. Slicing
# the dataset once reads only those rows, instead of pulling every full column
# into memory and slicing it afterwards.
_few_shot_rows = csqa[:5]
few_shot_example = construct_few_shot_examples(
    {key: _few_shot_rows[key] for key in ['question', 'choices', 'answerKey']}
)

In [None]:
# Define batch size
batch_size = 8  # Adjust based on your GPU/CPU memory capacity

# Rows 0-4 of `csqa` are embedded (with their answers) in the few-shot prompt,
# so evaluating on them would leak the answers. Start right after them.
eval_start = 5
eval_size = 24  # number of rows to run through the model

# Process dataset in batches
results = []
for i in tqdm(range(eval_start, eval_start + eval_size, batch_size), desc="Batch"):
    # Slice the rows of this batch once rather than loading each full column
    # and slicing it afterwards.
    rows = csqa[i:i + batch_size]
    batch = {key: rows[key] for key in ['question', 'choices', 'answerKey']}
    results.extend(model_output_batch(model, batch, few_shot_example))

Batch: 0
Batch: 8
Batch: 16


In [None]:
# Disabled draft (kept for reference): would turn every row of a batch into one
# "Instruct: ... Output: <answerKey>." training string and return the list.
# def formatting_func(batch):
#   data = []
#   for question, choices, answer_key in zip(batch['question'], batch['choices'], batch['answerKey']):
#       choices_text = "\n".join([f"{label}: {text}" for label, text in zip(choices['label'], choices['text'])])
#       question_text = f"Instruct: Choose the correct option: {question} \n{choices_text}. \nOutput: {answer_key}. \n \n"
#       data.append(question_text)
#   return data

In [None]:
# Random generator created on the GPU and passed to a DataLoader over `csqa`.
# NOTE(review): the recorded output of this cell is an AttributeError, and
# `dataloader` is not referenced by any later cell shown here — confirm
# whether this cell is still needed.
generator = torch.Generator(device=torch.device("cuda"))
dataloader = DataLoader(dataset=csqa, generator=generator)


AttributeError: 'Dataset' object has no attribute 'to'

In [None]:
# Rebind `csqa` to a view of the same dataset formatted as torch tensors on
# the GPU.
csqa = csqa.with_format(type="torch", device="cuda")

### Supervised fine-tuning (SFT)

In [None]:
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM

# Marker that separates the prompt from the answer in every training string;
# the collator is meant to restrict the loss to what follows it.
# In the prompts built in this notebook "Output:" always comes right after a
# newline, never after a space. With a GPT-2-style BPE tokenizer a leading
# space is merged into the next token, so a template of " Output:" would be
# tokenized differently from the prompt text and never be found.
response_template = "Output:"
collator = DataCollatorForCompletionOnlyLM(response_template, tokenizer=tokenizer)

In [None]:
def format_for_sft(batch):
    """Render every row of a batch as one complete training string.

    Args:
        batch: Mapping with parallel sequences under 'question', 'choices'
            and 'answerKey'; every entry of 'choices' is a mapping with
            parallel 'label' and 'text' sequences.

    Returns:
        List with one "Instruct: ... Output: <answerKey>." string per row.
    """
    texts = []
    for question, choices, answer_key in zip(batch['question'], batch['choices'], batch['answerKey']):
        choices_text = "\n".join(f"{label}: {text}" for label, text in zip(choices['label'], choices['text']))
        texts.append(f"Instruct: Choose the correct option: {question} \n{choices_text}. \nOutput: {answer_key}. \n \n")
    return texts


# The notebook sets CUDA as the global default device. The training data
# sampler builds a CPU random generator, which clashes with that default
# ("Expected a 'cuda' device type for generator but found 'cpu'"). Scope the
# default device back to CPU for trainer construction and training only; the
# model itself is moved to the GPU explicitly.
with torch.device("cpu"):
    trainer = SFTTrainer(
        model.to("cuda"),
        # Hand over plain Python rows; the trainer tokenizes the text and moves
        # batches to the model's device itself.
        train_dataset=csqa.with_format(None),
        # Train on the full question/choices/answer text. Using the
        # 'answerKey' column as the text field would train on the bare answer
        # letter only.
        formatting_func=format_for_sft,
        max_seq_length=512
        # data_collator=collator,
    )
    trainer.train()

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/9741 [00:00<?, ? examples/s]

RuntimeError: Expected a 'cuda' device type for generator but found 'cpu'

In [None]:
# Leftover debugging probe: this bare expression evaluates to the
# `torch.device` class itself (see the recorded output), not to a device in use.
torch.device

torch.device

In [None]:
# Sanity check that a CUDA device is visible to PyTorch; as the last
# expression of the cell, the value is displayed as the cell's output.
torch.cuda.is_available()


True
