# Final Project
## ADSP 32021 IP01 Machine Learning Operations
### 4. Instruct-Tuning LLM
#### Group 2: Maria Clarissa Fionalita, Kajal Shukla, Mia Zhang, Priya Suvvaru Venkata

In [1]:
!python --version

Python 3.10.13


In [3]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

In [4]:
# from transformers import logging

# logging.set_verbosity_warning()

In [5]:
# from huggingface_hub import notebook_login
# # https://huggingface.co/settings/tokens

# notebook_login()

# Load Training Data

https://huggingface.co/docs/datasets/loading

In [6]:
import json
from pathlib import Path
from pprint import pprint

import datasets
from datasets import load_dataset

from transformers import DefaultDataCollator, AutoTokenizer

In [7]:
data_files = {
              "train": "gs://capstone-team-green/mlops_data/finetune_data/training_data_v2.json",
              "validation": "gs://capstone-team-green/mlops_data/finetune_data/validation_data_v2.json",
              "test": "gs://capstone-team-green/mlops_data/finetune_data/test_data_v2.json"
    }

data = load_dataset("json", data_files = data_files)

In [8]:
data

DatasetDict({
    train: Dataset({
        features: ['context', 'question', 'is_impossible', 'answers_text', 'url', 'id', 'answers'],
        num_rows: 47717
    })
    validation: Dataset({
        features: ['context', 'question', 'is_impossible', 'answers_text', 'url', 'id', 'answers'],
        num_rows: 6273
    })
    test: Dataset({
        features: ['context', 'question', 'is_impossible', 'answers_text', 'url', 'id', 'answers'],
        num_rows: 6107
    })
})

# Training Data Pre-Processing

https://huggingface.co/docs/transformers/tasks/question_answering

## Load Tokenizer

In [9]:
model_name = "facebook/opt-125m"

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"
tokenizer.add_eos_token = True
tokenizer.add_bos_token, tokenizer.add_eos_token

(True, True)

## Create a preprocess_function to 
Tokenize the input text and label

[Alpaca LoRA's finetuning method](https://github.com/tloen/alpaca-lora/blob/main/finetune.py)nsors.

In [10]:
cutoff_len: int = 256

def tokenize(prompt, add_eos_token=True):
    # there's probably a way to do this with the tokenizer settings
    # but again, gotta move fast
    result = tokenizer(
        prompt,
        truncation = True,
        max_length = cutoff_len,
        padding = "max_length",
        return_tensors=None,
    )
    if (
        result["input_ids"][-1] != tokenizer.eos_token_id
        and len(result["input_ids"]) < cutoff_len
        and add_eos_token
    ):
        result["input_ids"].append(tokenizer.eos_token_id)
        result["attention_mask"].append(1)

    result["labels"] = result["input_ids"].copy()

    return result

def format_prompt(sample):
    prompt = f"""### Instruction: You are a helpful assistant that can answer medical questions. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know.

    ### Context information is below:
    {sample['context']}

    ### Given the context information and not prior knowledge, answer the question: {sample['question']}
    
    ### answer: {sample["answers_text"]}
    """
    return prompt

def preprocess_function(examples):
    full_prompt = format_prompt(examples)

    tokenized_full_prompt = tokenize(full_prompt)
    
    return tokenized_full_prompt

In [11]:
%%time

train_data = data["train"].map(preprocess_function, remove_columns=list(data["train"].features))
validation_data = data["validation"].map(preprocess_function, remove_columns=list(data["validation"].features))

Map:   0%|          | 0/6273 [00:00<?, ? examples/s]

CPU times: user 35.1 s, sys: 4.37 s, total: 39.5 s
Wall time: 27.5 s


In [12]:
# create data collator
data_collator = DefaultDataCollator()

In [13]:
# https://discuss.huggingface.co/t/the-model-did-not-return-a-loss-from-the-inputs-only-the-following-keys-logits-for-reference-the-inputs-it-received-are-input-values/25420/9
train_data

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 47717
})

# Train

## Load OPT-125M

In [14]:
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
from optimum.bettertransformer import BetterTransformer

In [15]:
model = AutoModelForCausalLM.from_pretrained(model_name)
model.config.use_cache = False
# model = BetterTransformer.transform(model, keep_original_model=True) # https://huggingface.co/docs/optimum/bettertransformer/tutorials/convert#training-compatibility

## Fine-Tune

In [16]:
from transformers import TrainingArguments, Trainer
import wandb


Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
CUDA SETUP: CUDA runtime path found: /usr/local/cuda/lib64/libcudart.so
CUDA SETUP: Highest compute capability among GPUs detected: 7.5
CUDA SETUP: Detected CUDA version 113
CUDA SETUP: Loading binary /opt/conda/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda113.so...


  warn(msg)


In [17]:
training_args = TrainingArguments(
    output_dir="model/result_v2/",
    push_to_hub=False,
    evaluation_strategy = "no",
    use_cpu = False,
    per_device_train_batch_size = 25, # i want to speed up the training
    learning_rate = 2e-4 
)

wandb.init(
    # set the wandb project where this run will be logged
    project="MLOps_OPT_125_v2",
    
    # track hyperparameters and run metadata
    config = training_args
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset = train_data,
    eval_dataset = validation_data,
    data_collator = data_collator
)

trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mmariafshan[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
500,1.3924
1000,0.7763
1500,0.4919
2000,0.3341
2500,0.2179
3000,0.1674
3500,0.1359
4000,0.106
4500,0.0766
5000,0.067


TrainOutput(global_step=5727, training_loss=0.3359923796523027, metrics={'train_runtime': 9622.5015, 'train_samples_per_second': 14.877, 'train_steps_per_second': 0.595, 'total_flos': 1.8702107836416e+16, 'train_loss': 0.3359923796523027, 'epoch': 3.0})

In [18]:
new_model_name = "model/opt_125_data_v2"

trainer.save_model(new_model_name)
wandb.finish()

VBox(children=(Label(value='0.030 MB of 0.030 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
train/epoch,▁▂▂▃▄▄▅▆▆▇██
train/global_step,▁▂▂▃▄▄▅▆▆▇██
train/learning_rate,█▇▇▆▅▄▄▃▂▂▁
train/loss,█▅▃▂▂▂▁▁▁▁▁
train/total_flos,▁
train/train_loss,▁
train/train_runtime,▁
train/train_samples_per_second,▁
train/train_steps_per_second,▁

0,1
train/epoch,3.0
train/global_step,5727.0
train/learning_rate,1e-05
train/loss,0.0583
train/total_flos,1.8702107836416e+16
train/train_loss,0.33599
train/train_runtime,9622.5015
train/train_samples_per_second,14.877
train/train_steps_per_second,0.595


# Test the New Model

In [20]:
new_model = AutoModelForCausalLM.from_pretrained(new_model_name)
new_model.config.use_cache = False

In [21]:
def inference(text, model, tokenizer, max_input_tokens = 1000, max_output_tokens = 100):
    device = model.device
    # Tokenize
    input_ids = tokenizer.encode(text, return_tensors="pt", truncation=True, max_length=max_input_tokens).to(device)

    # Generate
    generated_tokens = model.generate(input_ids=input_ids.to(device), max_length=max_output_tokens, temperature = 0.4, pad_token_id=tokenizer.eos_token_id, do_sample = True)

    # Decode
    generated_text = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
    
    # Strip the prompt
    generated_text_answer = generated_text[0][len(text):]
    
    return generated_text_answer

def qa_gen(text, model, tokenizer, max_output_tokens = 100):
    # instruction = "instruction: please answer the following question\n"
    question = "question: " + str(text) + "\n"
    prompt = question + "answer:"
    print(prompt)
    print("-------------------BELOW IS GENERATED BY LANGUAGE MODEL---------------------------")
    print(inference(text = prompt, model = model, tokenizer = tokenizer, max_output_tokens = max_output_tokens))
    print("-------------------END OF TEXT GENERATED BY LANGUAGE MODEL------------------------")

## Zero-Shot

In [22]:
%%time

test_prompt = ["What types of exercise are best for people with asthma?", "How is obsessive-compulsive disorder diagnosed?", "When are you more likely to get a blood clot?", "How should you lift objects to prevent back pain?", "How can you be smart with antibiotics?"]

test_prompt[0]

CPU times: user 17 µs, sys: 0 ns, total: 17 µs
Wall time: 21.7 µs


'What types of exercise are best for people with asthma?'

In [23]:
for prompt in test_prompt:
    qa_gen(text = prompt, model = model, tokenizer = tokenizer, max_output_tokens = 100)
    print()

question: What types of exercise are best for people with asthma?
answer:
-------------------BELOW IS GENERATED BY LANGUAGE MODEL---------------------------




 Yes, those cheap calories give you some relief from inflammation. But when you have an asthma attack, it's important to know what you can do to prevent an attack so you can stop it before it starts. Here are some simple tips that can help you prevent an asthma attack. Call your doctor if you have any of these symptoms: Difficulty breathing or shortness of breath or wheezing Coughing up blood
-------------------END OF TEXT GENERATED BY LANGUAGE MODEL------------------------

question: How is obsessive-compulsive disorder diagnosed?
answer:
-------------------BELOW IS GENERATED BY LANGUAGE MODEL---------------------------
 A very strange or unusual explanation is given after a person has a seizure: probably a very gradual decline in behavior.

    ### answer: A very strange or strange explanation is given during a seizure: probably a consequence of the medicine being used for someone who has a seizure.
     ### answer: So after a seizure, you are likely to have some other problems, incl