In [22]:
!pip install -q transformers datasets torch sentencepiece accelerate
!pip install -q --upgrade huggingface_hub
!pip install -q -U google-colab

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.6 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m64.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [23]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [24]:
import json
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling
)
from datasets import Dataset, load_dataset
import pandas as pd
from google.colab import drive

In [25]:
drive.mount('/content/drive')
dataset_path = "/content/drive/MyDrive/medical_faq.json"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [26]:
with open(dataset_path, 'r') as f:
    medical_faq = json.load(f)

In [29]:
df = pd.DataFrame(medical_faq['medical_qa'])


In [30]:
def format_instruction(sample):
    return f"""<start_of_turn>user
Question: {sample['question']}<end_of_turn>
<start_of_turn>model
Answer: {sample['answer']}<end_of_turn>"""

In [31]:
df['text'] = df.apply(format_instruction, axis=1)

In [32]:
dataset = Dataset.from_pandas(df[['text']])
dataset = dataset.train_test_split(test_size=0.1)


In [33]:
MODEL_NAME = "google/gemma-2b"
TOKENIZER_NAME = "google/gemma-2b"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

In [34]:
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME)
tokenizer.pad_token = tokenizer.eos_token

tokenizer_config.json:   0%|          | 0.00/33.6k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

In [36]:
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, max_length=512)

# Tokenize dataset
tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["text"],
)


Map:   0%|          | 0/143 [00:00<?, ? examples/s]

Map:   0%|          | 0/16 [00:00<?, ? examples/s]

In [37]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False
)

In [38]:
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=torch.float16
)


config.json:   0%|          | 0.00/627 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/13.5k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

In [39]:
model.gradient_checkpointing_enable()


In [40]:
model_size = sum(t.numel() for t in model.parameters())
print(f"Model size: {model_size/1000**2:.1f}M parameters")


Model size: 1515.3M parameters


In [41]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# Prepare model for k-bit training
model = prepare_model_for_kbit_training(model)

# LoRA configuration
peft_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"]
)

# Apply LoRA
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 1,843,200 || all params: 2,508,015,616 || trainable%: 0.0735


In [45]:
training_args = TrainingArguments(
    output_dir="gemma-2b-medical-qa",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,
    eval_strategy="epoch",  # Changed from evaluation_strategy
    save_strategy="epoch",
    logging_steps=10,
    learning_rate=2e-4,
    weight_decay=0.01,
    warmup_ratio=0.03,
    fp16=True,
    optim="paged_adamw_8bit",
    report_to="none",
    save_total_limit=2,
    max_grad_norm=0.3
)

In [43]:
!pip install transformers --upgrade



In [46]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
)


No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [47]:
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)


Epoch,Training Loss,Validation Loss
1,3.8313,2.424863
2,2.2333,1.848866
3,1.8144,1.746557


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


TrainOutput(global_step=54, training_loss=2.5148724450005426, metrics={'train_runtime': 118.6909, 'train_samples_per_second': 3.614, 'train_steps_per_second': 0.455, 'total_flos': 272707002826752.0, 'train_loss': 2.5148724450005426, 'epoch': 3.0})

In [48]:
trainer.save_model("gemma-2b-medical-qa-finetuned")
!cp -r gemma-2b-medical-qa-finetuned "/content/drive/MyDrive/"

In [49]:
from peft import PeftModel

base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=torch.float16
)

model = PeftModel.from_pretrained(base_model, "gemma-2b-medical-qa-finetuned")
model = model.merge_and_unload()

def generate_answer(question):
    prompt = f"""<start_of_turn>user
Question: {question}<end_of_turn>
<start_of_turn>model
Answer:"""

    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

    outputs = model.generate(
        **inputs,
        max_new_tokens=256,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        repetition_penalty=1.1
    )

    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return answer.split("Answer:")[-1].strip()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



In [51]:
test_question = "How can I reduce my risk of kidney stones?"
print(generate_answer(test_question))

Kidney stones are not just a problem for older adults. They also affect younger people, and the risk increases with age.
But there are steps you can take to keep your kidneys healthy and prevent stones from forming.
The most important thing you can do is drink plenty of fluids every day. That will help you stay hydrated and flush out the minerals that form stones.
Eat foods high in citrate, which helps dissolve calcium. These include fruits and vegetables such as oranges, limes, lemons, cantaloupe, tomatoes, strawberries, pineapples, broccoli, spinach and peppers.
If you're prone to stone formation, ask your doctor about medicine or supplements to treat your condition.


In [52]:
test_question = "How can I prevent dry eyes?"
print(generate_answer(test_question))

There are a few ways to reduce the symptoms of dry eye syndrome. One option is to use artificial tears, which can be purchased over the counter. However, it is important to use these only as directed by a doctor, and to avoid using them too frequently.

Another option is to wear protective eyewear, such as glasses or sunglasses, which can help to reduce the amount of wind and other irritants that can cause dry eyes.

It is also important to keep the eyes clean and healthy by washing them regularly with a mild cleanser. Finally, it is important to stay hydrated, as dry eyes can be caused by a lack of moisture in the eyes.


In [53]:
model.save_pretrained("gemma-2b-medical-qa-merged")
tokenizer.save_pretrained("gemma-2b-medical-qa-merged")
!cp -r gemma-2b-medical-qa-merged "/content/drive/MyDrive/"