# Final Project
## ADSP 32021 IP01 Machine Learning Operations
### 4. Instruct-Tuning LLM
#### Group 2: Maria Clarissa Fionalita, Kajal Shukla, Mia Zhang, Priya Suvvaru Venkata

In [None]:
!python --version

Python 3.10.13


In [None]:
import os
# Set CUDA_VISIBLE_DEVICES to "0" for this process so that only device index 0 is advertised.
# NOTE(review): this presumably has to run before any library initialises CUDA
# in order to take effect — confirm the cell order on a fresh kernel.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [None]:
# from transformers import logging

# logging.set_verbosity_warning()

In [None]:
# from huggingface_hub import notebook_login
# # https://huggingface.co/settings/tokens

# notebook_login()

# Load Training Data

https://huggingface.co/docs/datasets/loading

In [None]:
import json
from pathlib import Path
from pprint import pprint

import datasets
from datasets import load_dataset

from transformers import DefaultDataCollator, AutoTokenizer

In [None]:
# One JSON file per split; the keys become the split names of the loaded dataset.
data_files = dict(
    train="gs://capstone-team-green/mlops_data/finetune_data/training_data_v1.json",
    validation="gs://capstone-team-green/mlops_data/finetune_data/validation_data_v1.json",
    test="gs://capstone-team-green/mlops_data/finetune_data/test_data_v1.json",
)

# Load all three splits with the generic "json" builder.
data = load_dataset("json", data_files=data_files)

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/132M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [None]:
# Display the loaded splits with their column names and row counts.
data

DatasetDict({
    train: Dataset({
        features: ['answers_text', 'question', 'id', 'is_impossible', 'answers', 'context', 'url'],
        num_rows: 19989
    })
    validation: Dataset({
        features: ['answers_text', 'question', 'id', 'is_impossible', 'answers', 'context', 'url'],
        num_rows: 2686
    })
    test: Dataset({
        features: ['answers_text', 'question', 'id', 'is_impossible', 'answers', 'context', 'url'],
        num_rows: 2614
    })
})

# Training Data Pre-Processing

https://huggingface.co/docs/transformers/tasks/question_answering

## Load Tokenizer

In [None]:
# Base checkpoint that is fine-tuned in this notebook.
model_name = "facebook/opt-125m"

# `trust_remote_code` is intentionally left at its default (disabled): enabling it lets
# code hosted in the Hub repository run locally, and a stock checkpoint does not need it.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Reuse the EOS token for padding and pad on the right-hand side of each sequence.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# NOTE(review): this only sets an attribute on the tokenizer object; whether this
# tokenizer class actually honours `add_eos_token` is not visible here — confirm.
# `tokenize` below contains its own EOS handling.
tokenizer.add_eos_token = True

# Display both flags as the cell output.
tokenizer.add_bos_token, tokenizer.add_eos_token

(True, True)

## Create a `preprocess_function`
It formats each example as a prompt, then tokenizes the prompt text and builds the labels.

Reference: [Alpaca LoRA's finetuning method](https://github.com/tloen/alpaca-lora/blob/main/finetune.py).

In [None]:
# Fixed sequence length (in tokens) of every training example.
cutoff_len: int = 256

def tokenize(prompt, add_eos_token=True):
    """Tokenize ``prompt`` into a fixed-length causal-LM training example.

    The prompt is truncated to ``cutoff_len`` tokens. If there is room left and
    ``add_eos_token`` is true, a single EOS token is appended so the model sees
    where the text ends. The sequence is then right-padded to ``cutoff_len``.

    Padding is done here rather than by the tokenizer so that padded positions
    can be told apart from the real EOS token (both share the same id when the
    pad token is set to the EOS token): padded positions get attention mask 0
    and label -100, the value the loss ignores, so no loss is computed on them.

    Args:
        prompt: The text to tokenize.
        add_eos_token: Whether to append EOS when the text is shorter than
            ``cutoff_len`` and does not already end with EOS.

    Returns:
        The tokenizer output with ``input_ids``, ``attention_mask`` and
        ``labels``, each a list of exactly ``cutoff_len`` integers.
    """
    # Tokenize without padding so the true end of the text is known.
    result = tokenizer(
        prompt,
        truncation=True,
        max_length=cutoff_len,
        padding=False,
        return_tensors=None,
    )
    input_ids = list(result["input_ids"])
    attention_mask = list(result["attention_mask"])

    ends_with_eos = bool(input_ids) and input_ids[-1] == tokenizer.eos_token_id
    if add_eos_token and not ends_with_eos and len(input_ids) < cutoff_len:
        input_ids.append(tokenizer.eos_token_id)
        attention_mask.append(1)

    # Labels mirror the real tokens (including the appended EOS) ...
    labels = input_ids.copy()

    # ... while padding is masked out of both attention and the loss.
    pad_len = cutoff_len - len(input_ids)
    if pad_len > 0:
        pad_id = tokenizer.pad_token_id
        if pad_id is None:
            pad_id = tokenizer.eos_token_id
        input_ids = input_ids + [pad_id] * pad_len
        attention_mask = attention_mask + [0] * pad_len
        labels = labels + [-100] * pad_len

    result["input_ids"] = input_ids
    result["attention_mask"] = attention_mask
    result["labels"] = labels

    return result

def format_prompt(sample):
    """Render one example as the instruction / context / question / answer prompt.

    Args:
        sample: Mapping with the keys ``context``, ``question`` and ``answers_text``.

    Returns:
        The prompt text. Every line after the first carries a four-space
        indent, and the text ends with a newline followed by four spaces.
    """
    context = sample['context']
    question = sample['question']
    answer = sample["answers_text"]

    prompt_lines = [
        "### Instruction: You are a helpful assistant that can answer medical questions. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know.",
        "",
        "    ### Context information is below:",
        f"    {context}",
        "",
        f"    ### Given the context information and not prior knowledge, answer the question: {question}",
        "",
        f"    ### answer: {answer}",
        "    ",
    ]
    return "\n".join(prompt_lines)

def preprocess_function(examples):
    """Turn one dataset row into tokenized model inputs.

    The row is rendered with ``format_prompt`` and the resulting text is passed
    to ``tokenize``; the tokenized result is returned unchanged.
    """
    return tokenize(format_prompt(examples))

In [None]:
%%time

# Tokenize the train and validation splits row by row, dropping every original
# column so that only the outputs of `preprocess_function` remain.
train_data, validation_data = (
    data[split].map(preprocess_function, remove_columns=list(data[split].features))
    for split in ("train", "validation")
)

Map:   0%|          | 0/19989 [00:00<?, ? examples/s]

Map:   0%|          | 0/2686 [00:00<?, ? examples/s]

CPU times: user 1min 56s, sys: 15.3 s, total: 2min 11s
Wall time: 1min 33s


In [None]:
# Create the data collator used by the Trainer to assemble batches.
# `tokenize` already pads every example to `cutoff_len`, so no padding collator is used here.
data_collator = DefaultDataCollator()

In [None]:
# Inspect the tokenized training set and check that a `labels` column is present.
# Background thread on the Trainer "model did not return a loss" error:
# https://discuss.huggingface.co/t/the-model-did-not-return-a-loss-from-the-inputs-only-the-following-keys-logits-for-reference-the-inputs-it-received-are-input-values/25420/9
train_data

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 19989
})

# Train

## Load OPT-125M

In [None]:
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
from optimum.bettertransformer import BetterTransformer

In [None]:
# Load the pretrained causal-LM checkpoint named by `model_name`; no quantization config is passed.
model = AutoModelForCausalLM.from_pretrained(model_name)
# Switch off the `use_cache` flag on the model config before fine-tuning.
# NOTE(review): the flag stays off on this object afterwards — confirm that is
# intended if the same object is later used for generation.
model.config.use_cache = False

## Fine-Tune

In [None]:
from transformers import TrainingArguments, Trainer
import wandb


Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
CUDA SETUP: CUDA runtime path found: /usr/local/cuda/lib64/libcudart.so
CUDA SETUP: Highest compute capability among GPUs detected: 7.5
CUDA SETUP: Detected CUDA version 113
CUDA SETUP: Loading binary /opt/conda/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda113.so...


  warn(msg)


In [None]:
# Training configuration. Anything not set here keeps the TrainingArguments default.
# NOTE(review): `evaluation_strategy` is "no", so the `eval_dataset` passed to the
# Trainer below is presumably never evaluated during `train()` — confirm this is intended.
# NOTE(review): the recorded run logged 1200 steps over 3 epochs of 19,989 rows
# (about 50 rows per step), twice `per_device_train_batch_size` — confirm how many
# devices were actually visible to the run.
training_args = TrainingArguments(
    output_dir="model/result/",
    push_to_hub=False,
    evaluation_strategy = "no",
    use_cpu = False,
    per_device_train_batch_size = 25, # larger per-device batch chosen to speed up training
    learning_rate = 2e-4
)

# Start the Weights & Biases run before the Trainer is created.
wandb.init(
    # set the wandb project where this run will be logged
    project = "MLOps_OPT_125_v1",

    # track hyperparameters and run metadata
    config = training_args
)

# Wire the model, arguments, tokenized datasets and collator into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset = train_data,
    eval_dataset = validation_data,
    data_collator = data_collator
)

# Run fine-tuning; the returned TrainOutput is displayed as the cell output.
trainer.train()

VBox(children=(Label(value='0.026 MB of 0.026 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011113334044441418, max=1.0…

Step,Training Loss
500,1.0398
1000,0.4072




TrainOutput(global_step=1200, training_loss=0.6470496114095052, metrics={'train_runtime': 2265.8896, 'train_samples_per_second': 26.465, 'train_steps_per_second': 0.53, 'total_flos': 7834449641472000.0, 'train_loss': 0.6470496114095052, 'epoch': 3.0})

In [None]:
# Directory that receives the fine-tuned model.
new_model_name = "model/opt_125_data_v1"

# Alternative that was tried: trainer.model.save_pretrained(new_model_name)
# Save through the Trainer instead.
# NOTE(review): no tokenizer was handed to the Trainer, so tokenizer files are
# presumably not written to this directory — confirm before loading it elsewhere.
trainer.save_model(new_model_name)
# Close the Weights & Biases run that was opened with wandb.init.
wandb.finish()

VBox(children=(Label(value='0.003 MB of 0.003 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
train/epoch,▁▆█
train/global_step,▁▆█
train/learning_rate,█▁
train/loss,█▁
train/total_flos,▁
train/train_loss,▁
train/train_runtime,▁
train/train_samples_per_second,▁
train/train_steps_per_second,▁

0,1
train/epoch,3.0
train/global_step,1200.0
train/learning_rate,3e-05
train/loss,0.4072
train/total_flos,7834449641472000.0
train/train_loss,0.64705
train/train_runtime,2265.8896
train/train_samples_per_second,26.465
train/train_steps_per_second,0.53


# Test the New Model

In [None]:
# Same directory the fine-tuned model was saved to.
new_model_name = "model/opt_125_data_v1"

# Reload the fine-tuned weights from disk into a fresh model object.
new_model = AutoModelForCausalLM.from_pretrained(new_model_name)
# NOTE(review): `use_cache = False` presumably disables key/value caching during
# `generate()`, which would slow inference — confirm this is wanted for a model
# that is only used for generation.
new_model.config.use_cache = False

In [None]:
def inference(text, model, tokenizer, max_input_tokens = 1000, max_output_tokens = 100):
    """Generate a continuation of ``text`` and return only the newly generated part.

    Args:
        text: Prompt to continue.
        model: Model exposing ``device`` and ``generate``.
        tokenizer: Tokenizer exposing ``encode``, ``batch_decode`` and ``eos_token_id``.
        max_input_tokens: The prompt is truncated to at most this many tokens.
        max_output_tokens: Maximum number of tokens to generate *after* the prompt.

    Returns:
        The decoded text of the generated tokens only (special tokens skipped).
        Sampling is enabled, so repeated calls can return different text.
    """
    device = model.device

    # Tokenize (and, if necessary, truncate) the prompt, then move it to the model's device.
    input_ids = tokenizer.encode(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=max_input_tokens,
    ).to(device)
    prompt_len = input_ids.shape[-1]

    # `max_new_tokens` bounds the generated part only. The previous `max_length`
    # also counted the prompt, so long prompts left little or no room for output.
    generated_tokens = model.generate(
        input_ids=input_ids,
        # Single unpadded sequence: every position is a real token.
        attention_mask=torch.ones_like(input_ids),
        max_new_tokens=max_output_tokens,
        temperature=0.4,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
    )

    # Drop the prompt by token count. Slicing the decoded string by len(text)
    # breaks when the prompt was truncated or does not round-trip through decode.
    new_tokens = generated_tokens[:, prompt_len:]

    generated_text = tokenizer.batch_decode(new_tokens, skip_special_tokens=True)

    return generated_text[0]

def qa_gen(text, model, tokenizer, max_output_tokens = 100):
    """Print a question prompt followed by the model's answer between two banner lines.

    The prompt is ``"question: <text>\\nanswer:"``; it is printed, then passed to
    ``inference`` and the returned text is printed. Nothing is returned.
    """
    banner_open = "-------------------BELOW IS GENERATED BY LANGUAGE MODEL---------------------------"
    banner_close = "-------------------END OF TEXT GENERATED BY LANGUAGE MODEL------------------------"

    prompt = f"question: {text!s}\nanswer:"

    print(prompt)
    print(banner_open)
    answer = inference(
        text=prompt,
        model=model,
        tokenizer=tokenizer,
        max_output_tokens=max_output_tokens,
    )
    print(answer)
    print(banner_close)

## Zero-Shot

In [None]:
%%time

# Sample medical questions used to probe the model.
test_prompt = [
    "What types of exercise are best for people with asthma?",
    "How is obsessive-compulsive disorder diagnosed?",
    "When are you more likely to get a blood clot?",
    "How should you lift objects to prevent back pain?",
    "How can you be smart with antibiotics?",
]

# Preview the first question as the cell output.
test_prompt[0]

CPU times: user 14 µs, sys: 4 µs, total: 18 µs
Wall time: 23.1 µs


'What types of exercise are best for people with asthma?'

In [None]:
# Query `new_model`, the checkpoint reloaded from disk in this section, so the test
# exercises the saved artifact instead of the in-memory object used for training.
for prompt in test_prompt:
    qa_gen(text = prompt, model = new_model, tokenizer = tokenizer, max_output_tokens = 100)
    print()

question: What types of exercise are best for people with asthma?
answer:
-------------------BELOW IS GENERATED BY LANGUAGE MODEL---------------------------




 If you're allergic to one type of exercise, you may want to avoid it because it raises your risk of allergies. Exercise may also cause your symptoms. You can help prevent your allergies with medications, home remedies, and other simple things. The best thing you can do is to avoid these triggers. They can make you more likely to get allergies. Talk to your doctor about what you can do to avoid them.
-------------------END OF TEXT GENERATED BY LANGUAGE MODEL------------------------

question: How is obsessive-compulsive disorder diagnosed?
answer:
-------------------BELOW IS GENERATED BY LANGUAGE MODEL---------------------------
 Your doctor will help you know. But you may not know that you have it because you're not actually diagnosed.

    ### Context information is below:
    If you have obsessive compulsive disorder ( OCD), you may have noticed that certain things, especially things like eating too much sugar or having certain behaviors, make you more likely to have an episode of o