In [None]:
%%capture
#Installs Unsloth, Xformers (Flash Attention) and all other packages!
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

!pip install --pre -U xformers -q

#Import all Libraries.
!pip install -r "requirements.txt" -q

In [None]:
!export CUDA_VISIBLE_DEVICES=0,1

In [None]:
import torch
from torch.utils.data import DataLoader
import os
from tqdm import tqdm
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import default_data_collator, get_linear_schedule_with_warmup
from peft import get_peft_model, TaskType, PeftType, LoraConfig

In [None]:
# Please use your huggingface credentials
!huggingface-cli login --token "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"

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/jupyter/.cache/huggingface/token
Login successful


In [None]:
# Use the first CUDA device when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"
print(device)

# Hand cached, currently unused GPU memory back before the model is loaded.
torch.cuda.empty_cache()

cuda:0


In [None]:
max_seq_length = 512 # Maximum sequence length in tokens; handed to the SFT trainer further down.
dtype = torch.float16 # Float16 for Tesla T4, V100; Bfloat16 for Ampere+
load_in_4bit = True # NOTE(review): not passed to any call in this cell; the checkpoint name suggests it is already stored 4-bit quantized — confirm.

model_name_or_path = "unsloth/llama-2-7b-bnb-4bit"

# Load the base model with the dtype chosen above (previously the literal was
# repeated here, so changing `dtype` had no effect).
model = AutoModelForCausalLM.from_pretrained(model_name_or_path, torch_dtype=dtype, low_cpu_mem_usage=True, device_map="auto")

# Only construction-time options belong here. `padding`, `truncation` and
# `max_length` are per-call tokenization arguments, and `low_cpu_mem_usage` /
# `device_map` are model-loading arguments; none of them configure the tokenizer.
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, padding_side='right')
# if tokenizer.pad_token is None:
#     tokenizer.add_special_tokens({'pad_token': '[PAD]'})
#     model.resize_token_embeddings(len(tokenizer))

# Let gradients flow from the input embeddings, which is needed when only adapter
# weights are trained on top of frozen base weights (see gradient_checkpointing below).
model.enable_input_require_grads()

Unused kwargs: ['quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.


In [None]:
# Central place for the values a reader is most likely to tune.
hyperparam_config = dict(
    lr=1e-4,          # learning rate
    nepochs=2,        # number of training epochs
    batch_size=2,     # per-device batch size (train and eval)
    wd=1e-7,          # weight decay
    eps=0.1,          # epsilon handed to AdamW
    warmup_steps=0,   # scheduler warm-up steps
)

In [None]:
# LoRA adapter settings for causal language modelling.
lora_kwargs = dict(
    task_type=TaskType.CAUSAL_LM,
    r=16,                          # adapter rank
    lora_alpha=16,                 # LoRA scaling numerator
    use_rslora=True,               # rank-stabilised scaling variant
    lora_dropout=0.1,
    init_lora_weights="gaussian",
)
peft_config = LoraConfig(**lora_kwargs)

# Wrap the base model with the adapters, then print the parameter counts as a
# sanity check that only the LoRA weights are trainable.
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 8,388,608 || all params: 6,746,804,224 || trainable%: 0.12433454005023165


Load Datasets

In [None]:
# from datasets import interleave_datasets

# def pick_samples(dataset, n):
#   dataset_0 = dataset.filter(lambda example: example["answer"] == 0)
#   dataset_1 = dataset.filter(lambda example: example["answer"] == 1)
#   dataset_2 = dataset.filter(lambda example: example["answer"] == 2)
#   dataset_3 = dataset.filter(lambda example: example["answer"] == 3)

#   try:
#     dataset_0 = dataset_0.select(range(n))
#     dataset_1 = dataset_1.select(range(n))
#     dataset_2 = dataset_2.select(range(n))
#     dataset_3 = dataset_3.select(range(n))
#   except:
#     print("not enough samples")

#   target = interleave_datasets([dataset_0, dataset_1, dataset_2, dataset_3])
#   target = target.shuffle()
#   return target

In [None]:
# Locations of the permuted MMLU train / validation CSVs.
train_file = "mmlu_01/permuted_trainset_32k.csv"
eval_file = "mmlu_01/permuted_valset_32k.csv"
# test_file = "mmlu_01/varying_option/testset.csv"

# Each CSV is loaded on its own, so both are requested via the 'train' split.
train_dataset, eval_dataset = (
    load_dataset('csv', data_files=csv_path, split='train')
    for csv_path in (train_file, eval_file)
)
# test_dataset = load_dataset('csv', data_files=test_file, split='train')

# train_dataset = pick_samples(train_dataset, 300) # Uncomment for taking smaller training set to test the code

**Optimizer and LR scheduler**

In [None]:
# optimizer and lr scheduler
# Only hand the optimizer the parameters that are actually trained; frozen
# weights never receive gradients and need no optimizer state.
trainable_params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.AdamW(trainable_params,
                              lr=hyperparam_config['lr'],
                              weight_decay=hyperparam_config['wd'],
                              eps=hyperparam_config['eps'])

# The scheduler advances once per optimizer step, i.e. once per batch, not once
# per sample. Using len(train_dataset) directly overstated the horizon by a
# factor of batch_size, so the linear decay never reached zero.
# NOTE(review): assumes a single data-parallel replica and no gradient
# accumulation (neither is configured in this notebook) — confirm for your setup.
train_batch_size = hyperparam_config['batch_size']
steps_per_epoch = (len(train_dataset) + train_batch_size - 1) // train_batch_size  # ceil division
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=hyperparam_config['warmup_steps'],
    num_training_steps=steps_per_epoch * hyperparam_config['nepochs'],
)

In [None]:
# Sanity check: is the first model parameter stored on a CUDA device?
next(model.parameters()).device.type == "cuda"

True

**Training**

In [None]:
from transformers import TrainingArguments

per_device_batch = hyperparam_config['batch_size']

training_arg_values = dict(
    # Where checkpoints and logs are written.
    output_dir="train_logs_per2/llama-2-7b",
    # Optimisation settings mirrored from hyperparam_config.
    # NOTE(review): an explicit optimizer/scheduler pair is handed to the trainer
    # further down, so these two values are presumably informational — confirm.
    learning_rate=hyperparam_config['lr'],
    weight_decay=hyperparam_config['wd'],
    num_train_epochs=hyperparam_config['nepochs'],
    per_device_train_batch_size=per_device_batch,
    per_device_eval_batch_size=per_device_batch,
    # Evaluate and checkpoint once per epoch, then reload the best checkpoint.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Memory / throughput knobs.
    gradient_checkpointing=True,
    group_by_length=True,
    dataloader_pin_memory=True,
    dataloader_num_workers=4,
    dataloader_prefetch_factor=1,
)
training_args = TrainingArguments(**training_arg_values)

In [None]:
from trl import DataCollatorForCompletionOnlyLM

# Instruction header placed in front of every question (training and inference).
context_prompt = '''### Instruction: Dig into your knowledge and come up with an answer from the options A/B/C/D given below.
Then, choose the option that best completes the sentence regardless of its position. \n\n'''

def formatting_prompts_func(example):
    """Build the full training text for one dataset row.

    Args:
        example: Mapping with a 'question' string and a 'gold_answer' value.

    Returns:
        A dict with a single 'prompt' key holding the instruction header, the
        question and the gold answer after the "### Answer:" marker.
    """
    # The last 10 characters of the question are dropped.
    # NOTE(review): presumably a fixed-length trailing suffix in the CSV — confirm.
    stem = example['question'][:-10]
    gold = example['gold_answer']
    return {'prompt': f"{context_prompt} ### {stem}\n\n ### Answer: {gold}"}

# Marker that precedes the gold answer in the text built by formatting_prompts_func;
# the completion-only collator is constructed from it in the training cell.
response_template = "### Answer:"

# Add the 'prompt' column to both splits and write each one to CSV.
# The last to_csv call is the cell's final expression, so its return value is displayed.
train_dataset = train_dataset.map(formatting_prompts_func)
train_dataset.to_csv('train_dataset.csv')
eval_dataset = eval_dataset.map(formatting_prompts_func)
eval_dataset.to_csv('eval_dataset.csv')

Creating CSV from Arrow format:   0%|          | 0/32 [00:00<?, ?ba/s]

Creating CSV from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

671391

In [None]:
from trl import SFTTrainer

# Collator built from the "### Answer:" marker so that the loss targets the
# completion part of each prompt.
collator = DataCollatorForCompletionOnlyLM(response_template, tokenizer=tokenizer)

sft_kwargs = dict(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    # Data: the 'prompt' column produced by formatting_prompts_func.
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    dataset_text_field="prompt",
    max_seq_length=max_seq_length,
    data_collator=collator,
    # Reuse the optimizer / scheduler pair created earlier in the notebook.
    optimizers=(optimizer, lr_scheduler),
)
trainer = SFTTrainer(**sft_kwargs)

trainer.train()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Then, choose the option that best completes the sentence regardless of its position. 

 ### Question: This question refers to the following information

Epoch,Training Loss,Validation Loss
1,1.1197,1.926071


Then, choose the option that best completes the sentence regardless of its position. 

 ### Question: This question refers to the following information.
Every two months His Majesty sends from Lima 60,000 pesos to pay for the mita of the Indians. Up on the Huanacavelica range there are 3,000 or 4,000 Indians working in the mercury mine, with picks and hammers, breaking up the ore. And when they have filled up their little sacks, the poor fellows, loaded down, climb up those ladders and rigging, so distressing that a man can hardly get up them. That is the way they work in this mine, with many lights and the loud noise of the pounding and great confusion. Nor is that the greatest evil; that is due to thievish and undisciplined superintendents. According to His Majesty's warrant, the mine owners at Potosí have a right to the mita of 13,300 Indians. These mita Indians earn each day 4 reals. Besides these there are others not under obligation, who hire themselves out voluntarily: these eac

In [None]:
# Rebind `model` to the instance held by the trainer and upload it to the
# Hugging Face Hub under the repository name below.
# NOTE(review): load_best_model_at_end=True is set in the training arguments, so this
# is presumably the best checkpoint; and since `model` is a PEFT wrapper, presumably
# only the adapter weights are uploaded — confirm both.
model = trainer.model
model.push_to_hub("llama-2-7b-lora-tuned-per3")

In [None]:
model.save_pretrained("out_dir/llama-2-7b-lora-tuned-per3") # keep a local copy of the fine-tuned model under out_dir/

In [None]:
# Function to generate answers to questions
def generate_answer(question, vocab_id_A, vocab_id_B, vocab_id_C, vocab_id_D):
  """Greedily generate one token for a question and score the four option letters.

  Args:
    question: Raw question text; its last 10 characters are dropped before the
      prompt is built (same trimming as in formatting_prompts_func).
    vocab_id_A, vocab_id_B, vocab_id_C, vocab_id_D: Vocabulary ids of the tokens
      'A', 'B', 'C' and 'D'.

  Returns:
    A 5-tuple ``(answer, p_A, p_B, p_C, p_D)`` where ``answer`` is the decoded
    text of the generated token and the four floats are the model's
    probabilities for the option tokens, renormalised to sum to 1.
  """
  max_new_tokens = 1
  question = question[:-10]
  prompt = f"{context_prompt} ### {question}\n\n ### Answer: "
  # NOTE(review): the prompt ends with a space, whereas the training text has the
  # answer directly after "### Answer: "; confirm the tokenizer yields the bare
  # letter tokens looked up by the caller as the next token.

  # Put the inputs on whatever device the model lives on instead of assuming CUDA.
  input_ids = tokenizer.encode(prompt, return_tensors="pt").to(model.device)

  # `return_dict` is a forward() argument, not a generate() one, so it is not
  # passed here; `return_dict_in_generate` is what exposes `.scores`/`.sequences`.
  outputs = model.generate(input_ids, max_new_tokens=max_new_tokens,
                           return_dict_in_generate=True, output_scores=True, do_sample=False)

  # Decode only the newly generated token(s). Slicing the decoded *string* by
  # max_new_tokens would return just the last character of a longer token.
  new_token_ids = outputs.sequences[0, input_ids.shape[1]:]
  answer = tokenizer.decode(new_token_ids, skip_special_tokens=True).strip()

  # scores is a tuple with one (1, vocab_size) tensor per generated step.
  # A softmax over just the four option logits equals p_i / (p_A+p_B+p_C+p_D)
  # of the full-vocabulary softmax, but cannot divide by zero when the
  # half-precision probabilities underflow.
  option_ids = [vocab_id_A, vocab_id_B, vocab_id_C, vocab_id_D]
  option_logits = outputs.scores[0][0, option_ids].float()
  prob_A, prob_B, prob_C, prob_D = option_logits.softmax(-1).tolist()

  return answer, prob_A, prob_B, prob_C, prob_D

# Function to process CSV file containing questions

def process_csv(input_file, output_file):
  """Answer every question in a CSV and write the rows back out with predictions.

  Args:
    input_file: Path of the CSV to read. The first row is treated as the header
      and the first column of every other row as the question text.
    output_file: Path of the CSV to write; its parent directory is created when
      missing. Each row is the input row followed by the five values returned
      by generate_answer.

  Processing stops at the first blank row or row whose question cell is empty.
  An input file without a header row produces an empty output file.
  """
  import csv  # local import: keeps the function usable before the cell that imports csv has run

  out_dir = os.path.dirname(output_file)
  if out_dir:
    os.makedirs(out_dir, exist_ok=True)

  # The option-letter ids are the same for every row, so look them up once
  # instead of rebuilding the whole vocabulary per question.
  vocab = tokenizer.get_vocab()
  option_ids = [vocab[letter] for letter in ("A", "B", "C", "D")]

  # Use the paths given as arguments (the previous version read module-level
  # globals and ignored them). newline="" is what the csv module expects so that
  # quoted fields containing line breaks round-trip unchanged.
  with open(input_file, "r", newline="", encoding="utf-8") as src, \
       open(output_file, "w", newline="", encoding="utf-8") as dst:
    reader = csv.reader(src)
    writer = csv.writer(dst)

    # Read and write headers
    header = next(reader, None)
    if header is None:
      return
    writer.writerow(header + ["Predicted_token_ID", "Normalized_A_probs", "Normalized_B_probs", "Normalized_C_probs", "Normalized_D_probs"])

    # Process each row in the CSV file
    for row in reader:
      if not row or row[0] == '':
        break

      answer, A_norm_prob, B_norm_prob, C_norm_prob, D_norm_prob = generate_answer(row[0], *option_ids)
      writer.writerow(row + [answer, A_norm_prob, B_norm_prob, C_norm_prob, D_norm_prob])

In [None]:
# CSV stems evaluated at inference time: individual subject files ...
subject_tables = [
  "professional_law",
  "prehistory",
  "philosophy",
  "high_school_mathematics",
  "conceptual_physics",
  "college_medicine",
  "abstract_algebra",
]
# ... preceded by the permuted test split.
data_tables = ["permuted_testset_32k", *subject_tables]

In [None]:
# Generating inference

import csv

# Input and output file paths
# Root folders: inputs are read from mmlu_01/, predictions are written to mmlu_02/<model_name>/.
model_name = "llama-2-7b"
mmlu_01 = "mmlu_01/"
mmlu_02 = "mmlu_02/"

# Both experiment variants are evaluated, the varying-option one first.
variants = ((True, "varying_option"), (False, "varying_position"))

for is_varying_option, sub_folder in variants:
    for file_name in data_tables:
      input_csv_file = f"{mmlu_01}{sub_folder}/{file_name}.csv"
      output_csv_file = f"{mmlu_02}{model_name}/{sub_folder}/{file_name}.csv"

      # Run the model over every question in the file and store the predictions.
      process_csv(input_csv_file, output_csv_file)

      print("Answers generated and saved to:", output_csv_file)

In [None]:
""" Load trained model """

# from peft import PeftModel, PeftConfig


# peft_config = LoraConfig(
#     task_type=TaskType.CAUSAL_LM,
#     r=16,
#     lora_alpha=16,
#     use_rslora=True,
#     lora_dropout=0.1,
#     init_lora_weights="gaussian")


# max_seq_length = 512 # Choose any! We auto support RoPE Scaling internally!
# # dtype = torch.float16 # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
# load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# peft_model_id = "out_dir/llama-2-7b-lora-tuned-per"

# config = PeftConfig.from_pretrained(peft_model_id, inference_mode=True)
# model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path, torch_dtype=torch.float16, low_cpu_mem_usage=True, device_map="auto")
# tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path, padding_side='right', padding=True,
#                                          truncation=True, max_length=512, low_cpu_mem_usage=True, device_map="auto")

# model = PeftModel.from_pretrained(model, peft_model_id, inference_mode=True)
