In [1]:
#Install libraries 
%pip install -q "transformers>=4.45" "datasets>=2.20" "accelerate>=0.34" "trl>=0.9" "peft>=0.12" bitsandbytes einops huggingface_hub

Note: you may need to restart the kernel to use updated packages.


In [2]:
#Check GPU
import torch, subprocess

print("PyTorch:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU:", torch.cuda.get_device_name(0))
    try:
        print(subprocess.check_output(["nvidia-smi","-L"], text=True))
        print(subprocess.check_output(["nvidia-smi"], text=True))
    except Exception as e:
        print("nvidia-smi not available:", e)

PyTorch: 2.8.0+cu128
CUDA available: True
GPU: Tesla V100-PCIE-16GB
GPU 0: Tesla V100-PCIE-16GB (UUID: GPU-ae197c2c-be7d-4020-50e9-6dcdfa798789)

Thu Oct  2 16:23:02 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 575.57.08              Driver Version: 575.57.08      CUDA Version: 12.9     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla V100-PCIE-16GB           Off |   00000000:18:00.0 Off |                    0 |
| N/A   30C    P0             24W /  250W |       4MiB /  16384MiB |      0%      Default |
|                                         |                        |                  

In [3]:
DATA_PATH = "train.txt" 

In [4]:
#Parse prompt/response pairs and build a TRL chat dataset
import re, random, json
from datasets import Dataset

with open(DATA_PATH, "r", encoding="utf-8") as f:
    raw = f.read()

pair_re = re.compile(
    r'Prompt:\s*"(?P<prompt>.*?)"\s*[\r\n]+Response:\s*"(?P<response>.*?)"',
    re.DOTALL | re.IGNORECASE
)
pairs = pair_re.findall(raw)
print(f"Found {len(pairs)} prompt/response pairs")

records = []
for p, a in pairs:
    records.append({"messages": [
        {"role": "user", "content": p.strip()},
        {"role": "assistant", "content": a.strip()},
    ]})
random.shuffle(records)

dataset = Dataset.from_list(records)
print(dataset[0])


Found 7260 prompt/response pairs
{'messages': [{'content': 'Tell me the common excipient combinations for a Chewable tablet drug containing the active ingredient famotidine, calcium carbonate, magnesium hydroxide.', 'role': 'user'}, {'content': 'A Chewable tablet drug containing famotidine, calcium carbonate, magnesium hydroxide typically uses excipients such as anhydrous lactose, aspartame, dextrates, ferric oxide red, glyceryl monostearate, lactose monohydrate, magnesium stearate, microcrystalline cellulose, polysorbate 80, povidone, unspecified, talc.', 'role': 'assistant'}]}


In [5]:
import torch
use_cuda = torch.cuda.is_available()
cc = torch.cuda.get_device_capability(0) if use_cuda else (0, 0)

bf16_supported = False                 # V100 (7.0) doesn't support bf16
fp16_supported = use_cuda              # yes

print("Capability:", cc, "bf16:", bf16_supported, "fp16:", fp16_supported)
model_dtype = torch.float16 if fp16_supported else torch.float32

Capability: (7, 0) bf16: False fp16: True


In [6]:
#Load Tokenizer and model
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

MODEL_ID = "google/gemma-3-1b-it"

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)

bnb_cfg = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype="float16",   # <-- important for V100
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)

base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    quantization_config=bnb_cfg,
    device_map="auto",
)

import os, gc
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
try:
    base_model.config.use_cache = False
except Exception:
    pass
gc.collect(); torch.cuda.empty_cache()

In [7]:
from trl import SFTConfig

sft_config = SFTConfig(
    output_dir="gemma3-1b-it-excipients-lora",
    per_device_train_batch_size=1,      
    gradient_accumulation_steps=16,     
    max_length=1024,
    packing=True,
    group_by_length=True,

    optim="paged_adamw_8bit",
    learning_rate=5e-5,
    lr_scheduler_type="cosine",
    warmup_ratio=0.05,

    bf16=False,                        
    fp16=True,                         
    gradient_checkpointing=True,       
    num_train_epochs=5,
    logging_steps=10,
    save_strategy="epoch",
    report_to="none",
)


In [8]:
#define peft-config
from peft import LoraConfig, get_peft_model
peft_cfg = LoraConfig(
    r=8,                     
    lora_alpha=16,            
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"]
)

In [9]:
import re, random
from datasets import Dataset

def load_dataset_from_txt(path):
    with open(path, "r", encoding="utf-8") as f:
        raw = f.read()

    pair_re = re.compile(
        r'Prompt:\s*"(?P<prompt>.*?)"\s*[\r\n]+Response:\s*"(?P<response>.*?)"',
        re.DOTALL | re.IGNORECASE
    )
    pairs = pair_re.findall(raw)

    records = []
    for p, a in pairs:
        records.append({"messages": [
            {"role": "user", "content": p.strip()},
            {"role": "assistant", "content": a.strip()},
        ]})
    random.shuffle(records)
    return Dataset.from_list(records)

val_dataset   = load_dataset_from_txt("val.txt")
test_dataset  = load_dataset_from_txt("test.txt")

print(len(val_dataset), len(test_dataset))

908 908


In [10]:
#Fine-tune model on data 
from trl import SFTTrainer

def formatting_prompts_func(example):
    # Convert the chat messages to a single string using the tokenizer’s chat template
    return tokenizer.apply_chat_template(
        example["messages"], tokenize=False, add_generation_prompt=False
    )

trainer = SFTTrainer(
    model=base_model,
    args=sft_config,
    train_dataset=dataset,
    eval_dataset=val_dataset,
    formatting_func=formatting_prompts_func,
    processing_class=tokenizer,
    peft_config=peft_cfg,
)

import gc, torch
gc.collect(); torch.cuda.empty_cache()

trainer.train()


Padding-free training is enabled, but the attention implementation is not set to 'flash_attention_2'. Padding-free training flattens batches into a single sequence, and 'flash_attention_2' is the only known attention mechanism that reliably supports this. Using other implementations may lead to unexpected behavior. To ensure compatibility, set `attn_implementation='flash_attention_2'` in the model configuration, or verify that your attention mechanism can handle flattened sequences.
You are using packing, but the attention implementation is not set to 'flash_attention_2' or 'kernels-community/vllm-flash-attn3'. Packing flattens batches into a single sequence, and Flash Attention is the only known attention mechanisms that reliably support this. Using other implementations may lead to cross-contamination between batches. To avoid this, either disable packing by setting `packing=False`, or set `attn_implementation='flash_attention_2'` or `attn_implementation='kernels-community/vllm-flash

Applying formatting function to train dataset:   0%|          | 0/7260 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/7260 [00:00<?, ? examples/s]

Packing train dataset:   0%|          | 0/7260 [00:00<?, ? examples/s]

Applying formatting function to eval dataset:   0%|          | 0/908 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/908 [00:00<?, ? examples/s]

Packing eval dataset:   0%|          | 0/908 [00:00<?, ? examples/s]

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 1}.
It is strongly recommended to train Gemma3 models with the `eager` attention implementation instead of `sdpa`. Use `eager` with `AutoModelForCausalLM.from_pretrained('<path-to-checkpoint>', attn_implementation='eager')`.
  return fn(*args, **kwargs)


Step,Training Loss
10,3.6952
20,2.1212
30,1.5204
40,1.2363
50,1.0751
60,0.8868
70,0.7665
80,0.6807
90,0.6377
100,0.5824


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


TrainOutput(global_step=205, training_loss=0.9156916967252405, metrics={'train_runtime': 1540.8579, 'train_samples_per_second': 2.109, 'train_steps_per_second': 0.133, 'total_flos': 1.382507412292224e+16, 'train_loss': 0.9156916967252405, 'entropy': 0.4992420447839273, 'num_tokens': 3271035.0, 'mean_token_accuracy': 0.8862284325264596, 'epoch': 5.0})

In [11]:
## Save full HF model 
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer, AutoModelForCausalLM
from pathlib import Path

ADAPTER_DIR = "/group/sbms003/spinelli/models/gemma3-1b-it-excipients-lora/latest"  #adapter folder
OUT_DIR = "/group/sbms003/spinelli/models/gemma3-1b-it-excipients-finetuned"        #merged model folder

Path(OUT_DIR).mkdir(parents=True, exist_ok=True)

tok = AutoTokenizer.from_pretrained(ADAPTER_DIR, use_fast=True)
peft_model = AutoPeftModelForCausalLM.from_pretrained(ADAPTER_DIR, device_map="cpu")

merged = peft_model.merge_and_unload()  

merged.save_pretrained(OUT_DIR, safe_serialization=True) 
tok.save_pretrained(OUT_DIR)

print("Saved full model to:", OUT_DIR)

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Saved full model to: /group/sbms003/spinelli/models/gemma3-1b-it-excipients-finetuned


In [None]:
## Load Full Model 
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
tok = AutoTokenizer.from_pretrained(OUT_DIR, use_fast=True)
model = AutoModelForCausalLM.from_pretrained(OUT_DIR, device_map="auto")
pipe = pipeline("text-generation", model=model, tokenizer=tok)

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`
Device set to use cuda:0


In [None]:
# After trainer.train()
model = trainer.model.eval()  # PEFT-wrapped; already on device
tok = tokenizer               # reuse your tokenizer

from transformers import pipeline
pipe = pipeline("text-generation", model=model, tokenizer=tok, device_map="auto")

messages = [
    {"role": "user", "content": "Tell me the common excipient combinations for a Powder for suspension drug containing the active ingredient amoxicillin, clavulanic acid."}
]
prompt = tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

out = pipe(
    prompt,
    max_new_tokens=256,
    temperature=0.7,
    top_p=0.9,
    do_sample=True,
)
response = out[0]["generated_text"][len(prompt):].strip()
print(response)


Device set to use cuda:0


A Powder for suspension drug containing amoxicillin, clavulanic acid typically uses excipients such as methylparaben, sucrose, sucrose, water, carboxymethylcellulose sodium, sodium phosphate, tribasic, hypromellose, sucralose, fd&c red no. 40, fd&c red no. 40.


In [12]:
!pip install rouge_score
!pip install evaluate
!pip install evaluate bert-score

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [13]:
import math

def compute_perplexity(model, tokenizer, dataset, max_length=512):
    model.eval()
    losses = []

    for example in dataset:
        # Turn messages into a chat string using tokenizer’s template
        text = tokenizer.apply_chat_template(
            example["messages"],
            tokenize=False,
            add_generation_prompt=False
        )

        enc = tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            max_length=max_length
        ).to(model.device)

        with torch.no_grad():
            out = model(**enc, labels=enc["input_ids"])
        losses.append(out.loss.item())

    mean_loss = sum(losses) / len(losses)
    perplexity = math.exp(mean_loss)
    return mean_loss, perplexity

In [14]:
val_loss, val_ppl = compute_perplexity(trainer.model, tokenizer, val_dataset)
test_loss, test_ppl = compute_perplexity(trainer.model, tokenizer, test_dataset)

print(f"Validation Loss: {val_loss:.4f}, Perplexity: {val_ppl:.2f}")
print(f"Test Loss: {test_loss:.4f}, Perplexity: {test_ppl:.2f}")


Validation Loss: 0.7537, Perplexity: 2.12
Test Loss: 0.7541, Perplexity: 2.13


In [None]:
 import evaluate

# Load metrics
bleu = evaluate.load("bleu")
rouge = evaluate.load("rouge")
bertscore = evaluate.load("bertscore")

def evaluate_generation(model, tokenizer, dataset, num_samples=100, max_new_tokens=150):
    references = []
    predictions = []

    # pick a subset (avoid evaluating on huge dataset at once)
    subset = dataset.select(range(min(num_samples, len(dataset))))

    for example in subset:
        user_prompt = example["messages"][0]["content"]
        reference = example["messages"][1]["content"]

        # Prepare input
        input_text = tokenizer.apply_chat_template(
            [{"role": "user", "content": user_prompt}],
            tokenize=False,
            add_generation_prompt=True
        )
        inputs = tokenizer(input_text, return_tensors="pt").to(model.device)

        # Generate
        with torch.no_grad():
            gen = model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                do_sample=True,
                temperature=0.7,
                top_p=0.9,
            )
        prediction = tokenizer.decode(
            gen[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True
        ).strip()

        predictions.append(prediction)
        references.append(reference)

    # Compute metrics
    bleu_score = bleu.compute(predictions=predictions, references=[[r] for r in references])
    rouge_score = rouge.compute(predictions=predictions, references=references)
    bert_score = bertscore.compute(predictions=predictions, references=references, lang="en")

    return {
        "BLEU": bleu_score["bleu"],
        "ROUGE-L": rouge_score["rougeL"],
        "BERTScore_F1": sum(bert_score["f1"]) / len(bert_score["f1"])
    }

results = evaluate_generation(trainer.model, tokenizer, test_dataset, num_samples=100)
print(results)


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]