In [1]:
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig, TrainingArguments, Trainer
import torch
import time
# import evaluate
import pandas as pd
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
model_name='google/flan-t5-base'

original_model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained(model_name)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
original_model.to(device)

model = original_model

In [14]:
# Test apply_chat_template // https://huggingface.co/docs/transformers/main/en/chat_templating

def format_choices(choices):
    a = zip(list(choices.keys()), choices.values())
    final_answers = []
    for x,y in a:
        final_answers.append(f'[{x}] : {y}')
    return "\n".join(final_answers)

prompt_template = f'''
{{question}} \n
{{choices}}
'''

question = "A 23-year-old pregnant woman at 22 weeks gestation presents with burning upon urination. She states it started 1 day ago and has been worsening despite drinking more water and taking cranberry extract. She otherwise feels well and is followed by a doctor for her pregnancy. Her temperature is 97.7°F (36.5°C), blood pressure is 122/77 mmHg, pulse is 80/min, respirations are 19/min, and oxygen saturation is 98% on room air. Physical exam is notable for an absence of costovertebral angle tenderness and a gravid uterus. Which of the following is the best treatment for this patient?"
choices = {
"A": "Ampicillin",
"B": "Ceftriaxone",
"C": "Doxycycline",
"D": "Nitrofurantoin"
}

formated_choices = format_choices(choices)

model_prompt = prompt_template.format(question = question, choices = formated_choices)

model_prompt += "Given four answer candidates, A, B, C, D, choose the best answer choice and provide explanation.\n"
model_prompt += "Provide intermediate steps to solve the problem.\n"
model_prompt += "The answer is:\n"

print(model_prompt)

inputs = tokenizer(model_prompt, return_tensors = "pt", padding=False).to(device)

outputs = model.generate(
                inputs["input_ids"],
                max_new_tokens = 500,
                attention_mask=inputs["attention_mask"],  # Use attention mask
                pad_token_id=tokenizer.eos_token_id,  # Set pad token ID
                do_sample = True)
output_text = tokenizer.batch_decode(outputs, skip_special_tokens = True)[0]

print(output_text)


A 23-year-old pregnant woman at 22 weeks gestation presents with burning upon urination. She states it started 1 day ago and has been worsening despite drinking more water and taking cranberry extract. She otherwise feels well and is followed by a doctor for her pregnancy. Her temperature is 97.7°F (36.5°C), blood pressure is 122/77 mmHg, pulse is 80/min, respirations are 19/min, and oxygen saturation is 98% on room air. Physical exam is notable for an absence of costovertebral angle tenderness and a gravid uterus. Which of the following is the best treatment for this patient? 

[A] : Ampicillin
[B] : Ceftriaxone
[C] : Doxycycline
[D] : Nitrofurantoin
Given four answer candidates, A, B, C, D, choose the best answer choice and provide explanation.
Provide intermediate steps to solve the problem.
The answer is:

D


In [13]:
# Prepare data from UltraMedical for model to train

from datasets import load_dataset

# chat = [
#     {"role": "user", "content": "Can you format the answer in JSON?"},
#     {"role": "assistant", "content": '{"name": "'},
# ]
# print(tokenizer.apply_chat_template(chat, add_generation_prompt = False, tokenize = False, continue_final_message = True))

ultraMedical = load_dataset("TsinghuaC3I/UltraMedical")

data_list = ultraMedical["train"]

first_N_data = data_list.select(range(0, 100))

def preprocess_function(data):
    # print("data:", data)
    # input_text = [f"Give a proper answer to the question:\nquestion: {conversation[0]['value']}\n\nThe answer is: " for conversation in data["conversations"]] # (context) + question + options
    # target_text = [f"{conversation[1]['value']}" for conversation in data["conversations"]]  # CoT + final answer
    # # target_text = [f"A" for conversation in data["conversations"]]  # CoT + final answer (TODO)
    
    prompt_added_text = """
Given four answer candidates, A, B, C, D, choose the best answer choice and provide explanation.
Provide intermediate steps to solve the problem.
The answer is:
"""

    input_text = [conversation[0]['value'] + prompt_added_text for conversation in data['conversations']] # (context) + question + options
    target_text = [conversation[1]['value'] for conversation in data["conversations"]]  # CoT + final answer


    # print("input:", input_text)
    # print("input_text:", input_text)
    # print("target_text:", target_text)
    
    # prompt = f"Give a proper answer to the question:\nquestion: {question}\n\nThe answer is: "

    inputs = tokenizer(input_text, padding="max_length", truncation=True, max_length=1024)
    targets = tokenizer(target_text, padding="max_length", truncation=True, max_length=1024)
    inputs["labels"] = targets["input_ids"]  # Set the target tokens as the 'labels'

    return inputs

    # prompt = input_text
    # data['input_ids'] = tokenizer(prompt, padding="max_length", truncation=True, return_tensors="pt").input_ids
    # data['labels'] = tokenizer(target_text, padding="max_length", truncation=True, return_tensors="pt").input_ids
    
    # return data

tokenized_datasets = first_N_data.map(preprocess_function, batched=True)


In [5]:
# output_dir = f'./dialogue-summary-training-{str(int(time.time()))}'
output_dir = f'./dialogue-summary-training'

training_args = TrainingArguments(
    output_dir=output_dir,
    save_strategy = "no", # TODO, now don't save checkpoints #"epoch",
    learning_rate=1e-4,
    num_train_epochs=3,
    # weight_decay=0.01,
    logging_steps=1,
    per_device_train_batch_size = 4, # As specified in the paper: batch_size: 32
    # max_steps=1
)

trainer = Trainer(
    model=original_model,
    args=training_args,
    train_dataset=tokenized_datasets#['train'],
    # eval_dataset=tokenized_datasets['validation']
)

trainer.train()

original_model.to(torch.bfloat16)  # Convert to bfloat16
original_model.save_pretrained(output_dir, 
                        # save_function=torch.save,  # Use standard PyTorch save
                        # state_dict=model.state_dict(),  # Only save the model weights
                        # safe_serialization=True,  # More efficient serializationsave_optimizer_state=False
                     )
tokenizer.save_pretrained(output_dir,
                         # legacy_format=False  # Use newer, more efficient format
                     )

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss
1,28.25
2,32.25
3,28.875
4,26.75
5,21.875
6,22.5
7,23.625
8,24.0
9,22.5
10,20.875


('./dialogue-summary-training/tokenizer_config.json',
 './dialogue-summary-training/special_tokens_map.json',
 './dialogue-summary-training/tokenizer.json')

In [9]:
instruct_model = AutoModelForSeq2SeqLM.from_pretrained('./dialogue-summary-training', torch_dtype=torch.bfloat16).to(device)
model = instruct_model