In [9]:
#Install libraries
%pip install -q "transformers>=4.45" "datasets>=2.20" "accelerate>=0.34" "trl>=0.9" "peft>=0.12" bitsandbytes einops huggingface_hub

In [10]:
import os

os.environ["HF_TOKEN"] = "" #enter your token


In [11]:
#Check GPU
import torch, subprocess

print("PyTorch:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU:", torch.cuda.get_device_name(0))
    try:
        print(subprocess.check_output(["nvidia-smi","-L"], text=True))
        print(subprocess.check_output(["nvidia-smi"], text=True))
    except Exception as e:
        print("nvidia-smi not available:", e)

PyTorch: 2.8.0+cu126
CUDA available: True
GPU: Tesla T4
GPU 0: Tesla T4 (UUID: GPU-9e45cc04-0393-7fba-b79d-9082b2bf14ae)

Sun Sep 28 06:48:03 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   47C    P8             11W /   70W |       2MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------

In [12]:
DATA_PATH = "train.txt"

In [13]:
#Parse prompt/response pairs and build a TRL chat dataset
import re, random, json
from datasets import Dataset

with open(DATA_PATH, "r", encoding="utf-8") as f:
    raw = f.read()

pair_re = re.compile(
    r'Prompt:\s*"(?P<prompt>.*?)"\s*[\r\n]+Response:\s*"(?P<response>.*?)"',
    re.DOTALL | re.IGNORECASE
)
pairs = pair_re.findall(raw)
print(f"Found {len(pairs)} prompt/response pairs")

records = []
for p, a in pairs:
    records.append({"messages": [
        {"role": "user", "content": p.strip()},
        {"role": "assistant", "content": a.strip()},
    ]})
random.shuffle(records)

dataset = Dataset.from_list(records)
print(dataset[0])


Found 7260 prompt/response pairs
{'messages': [{'content': 'Tell me the common excipient combinations for a ODT tablet drug containing the active ingredient loratadine.', 'role': 'user'}, {'content': 'A ODT tablet drug containing loratadine typically uses excipients such as mannitol, anhydrous citric acid, gelatin.', 'role': 'assistant'}]}


In [14]:
import torch
use_cuda = torch.cuda.is_available()
cc = torch.cuda.get_device_capability(0) if use_cuda else (0, 0)

bf16_supported = False                 # V100 (7.0) doesn't support bf16
fp16_supported = use_cuda              # yes

print("Capability:", cc, "bf16:", bf16_supported, "fp16:", fp16_supported)
model_dtype = torch.float16 if fp16_supported else torch.float32

Capability: (7, 5) bf16: False fp16: True


In [15]:
#Load Tokenizer and model
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

MODEL_ID = "google/gemma-3-1b-it"

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)

bnb_cfg = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype="float16",   # <-- important for V100
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)

base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    quantization_config=bnb_cfg,
    device_map="auto",
)

import os, gc
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
try:
    base_model.config.use_cache = False
except Exception:
    pass
gc.collect(); torch.cuda.empty_cache()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.16M [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.69M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/35.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/899 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.00G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/215 [00:00<?, ?B/s]

In [16]:
from trl import SFTConfig

sft_config = SFTConfig(
    output_dir="gemma3-1b-it-excipients-lora",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=16,
    max_length=1024,
    packing=True,
    group_by_length=True,

    optim="paged_adamw_8bit",
    learning_rate=5e-5,
    lr_scheduler_type="cosine",
    warmup_ratio=0.05,

    bf16=False,
    fp16=True,
    gradient_checkpointing=True,
    num_train_epochs=5,
    logging_steps=10,
    save_strategy="epoch",
    report_to="none",
)


In [17]:
#define peft-config
from peft import LoraConfig, get_peft_model
peft_cfg = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"]
)

 **new code  in below  cell**

In [18]:
#new code added

import re, random
from datasets import Dataset

def load_dataset_from_txt(path):
    with open(path, "r", encoding="utf-8") as f:
        raw = f.read()

    pair_re = re.compile(
        r'Prompt:\s*"(?P<prompt>.*?)"\s*[\r\n]+Response:\s*"(?P<response>.*?)"',
        re.DOTALL | re.IGNORECASE
    )
    pairs = pair_re.findall(raw)

    records = []
    for p, a in pairs:
        records.append({"messages": [
            {"role": "user", "content": p.strip()},
            {"role": "assistant", "content": a.strip()},
        ]})
    random.shuffle(records)
    return Dataset.from_list(records)

val_dataset   = load_dataset_from_txt("val.txt")
test_dataset  = load_dataset_from_txt("test.txt")

print(len(val_dataset), len(test_dataset))


908 908


**New line added below IMPORTANT**

In [19]:
#Fine-tune model on data
from trl import SFTTrainer

def formatting_prompts_func(example):
    # Convert the chat messages to a single string using the tokenizer’s chat template
    return tokenizer.apply_chat_template(
        example["messages"], tokenize=False, add_generation_prompt=False
    )

trainer = SFTTrainer(
    model=base_model,
    args=sft_config,
    train_dataset=dataset,
    eval_dataset=val_dataset,  # new code added
    formatting_func=formatting_prompts_func,
    processing_class=tokenizer,
    peft_config=peft_cfg,
)

import gc, torch
gc.collect(); torch.cuda.empty_cache()

trainer.train()




Applying formatting function to train dataset:   0%|          | 0/7260 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/7260 [00:00<?, ? examples/s]

Packing train dataset:   0%|          | 0/7260 [00:00<?, ? examples/s]

Applying formatting function to eval dataset:   0%|          | 0/908 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/908 [00:00<?, ? examples/s]

Packing eval dataset:   0%|          | 0/908 [00:00<?, ? examples/s]

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 1}.
It is strongly recommended to train Gemma3 models with the `eager` attention implementation instead of `sdpa`. Use `eager` with `AutoModelForCausalLM.from_pretrained('<path-to-checkpoint>', attn_implementation='eager')`.
  return fn(*args, **kwargs)


Step,Training Loss
10,3.6601
20,2.1153
30,1.5327
40,1.218
50,1.0233
60,0.8952
70,0.7752
80,0.6997
90,0.6223
100,0.5833


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


TrainOutput(global_step=205, training_loss=0.9096639551767489, metrics={'train_runtime': 3255.9643, 'train_samples_per_second': 0.997, 'train_steps_per_second': 0.063, 'total_flos': 1.382507412292224e+16, 'train_loss': 0.9096639551767489, 'entropy': 0.5129931667079665, 'num_tokens': 3271035.0, 'mean_token_accuracy': 0.8816304794729573, 'epoch': 5.0})

In [20]:
# Save adapter + tokenizer
## FOR SARAH USE
import os, time, subprocess, sys, pathlib
STAMP = time.strftime("%Y%m%d-%H%M")
MODEL_ROOT = "/group/sbms003/spinelli/models/gemma3-1b-it-excipients-lora"
OUT_DIR = os.path.join(MODEL_ROOT, STAMP)
os.makedirs(OUT_DIR, exist_ok=True)

trainer.model.save_pretrained(OUT_DIR)
tokenizer.save_pretrained(OUT_DIR)

with open(os.path.join(OUT_DIR, "training_args.json"), "w") as f:
    f.write(trainer.args.to_json_string())
with open(os.path.join(OUT_DIR, "pip-freeze.txt"), "w") as f:
    f.write(subprocess.check_output([sys.executable, "-m", "pip", "freeze"], text=True))

latest = pathlib.Path(MODEL_ROOT) / "latest"
latest.parent.mkdir(parents=True, exist_ok=True)
if latest.exists() or latest.is_symlink():
    latest.unlink()
latest.symlink_to(STAMP)
print("Saved:", OUT_DIR)


Saved: /group/sbms003/spinelli/models/gemma3-1b-it-excipients-lora/20250928-0744


In [21]:
## FOR SARAH USE
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer, pipeline

ADAPTER_DIR = "/group/sbms003/spinelli/models/gemma3-1b-it-excipients-lora/latest"
tok = AutoTokenizer.from_pretrained(ADAPTER_DIR, use_fast=True)
model = AutoPeftModelForCausalLM.from_pretrained(ADAPTER_DIR, device_map="auto")

pipe = pipeline("text-generation", model=model, tokenizer=tok)

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`
Device set to use cuda:0


In [22]:
import torch
model = trainer.model.eval()
tok = tokenizer

prompt = "Tell me the common excipient combinations for a Solution drug containing the active ingredient meperidine hydrochloride."
chat = [{"role": "user", "content": prompt}]

text_in = tok.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
inputs = tok(text_in, return_tensors="pt").to(model.device)

with torch.no_grad():
    gen = model.generate(
        **inputs,
        max_new_tokens=200,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
    )

reply = tok.decode(gen[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
print(reply.strip())


A Solution drug containing meperidine hydrochloride typically uses excipients such as citric acid monohydrate, glycerin, propylene glycol, water, sodium benzoate, sodium citrate, sucralose, xanthan gum.


# Evaluation

In [23]:
!pip install rouge_score
!pip install evaluate
!pip install evaluate bert-score


Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=fcabd170a3e524f19102c0457696f797c600a0a8e8787650379d047cf94193a5
  Stored in directory: /root/.cache/pip/wheels/85/9d/af/01feefbe7d55ef5468796f0c68225b6788e85d9d0a281e7a70
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2
Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6
Collecting bert-score
  Downloading bert

In [24]:
import math

def compute_perplexity(model, tokenizer, dataset, max_length=512):
    model.eval()
    losses = []

    for example in dataset:
        # Turn messages into a chat string using tokenizer’s template
        text = tokenizer.apply_chat_template(
            example["messages"],
            tokenize=False,
            add_generation_prompt=False
        )

        enc = tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            max_length=max_length
        ).to(model.device)

        with torch.no_grad():
            out = model(**enc, labels=enc["input_ids"])
        losses.append(out.loss.item())

    mean_loss = sum(losses) / len(losses)
    perplexity = math.exp(mean_loss)
    return mean_loss, perplexity


In [25]:
val_loss, val_ppl = compute_perplexity(trainer.model, tokenizer, val_dataset)
test_loss, test_ppl = compute_perplexity(trainer.model, tokenizer, test_dataset)

print(f"Validation Loss: {val_loss:.4f}, Perplexity: {val_ppl:.2f}")
print(f"Test Loss: {test_loss:.4f}, Perplexity: {test_ppl:.2f}")


Validation Loss: 0.7459, Perplexity: 2.11
Test Loss: 0.7459, Perplexity: 2.11


Explanation:

**Loss measures** how well the model predicts the next token in a sequence. Lower values indicate better predictions.

**Perplexity (PPL)** indicates the model’s uncertainty when predicting the next token. A PPL of 2.11 means the model is highly confident in its predictions.

**Our Interpretation**
These low values show that the model has learned the underlying language patterns very well and can reliably predict text similar to the training data.since validation and test losses are very close suggests model is not overfitting

In [26]:
 import evaluate

# Load metrics
bleu = evaluate.load("bleu")
rouge = evaluate.load("rouge")
bertscore = evaluate.load("bertscore")

def evaluate_generation(model, tokenizer, dataset, num_samples=100, max_new_tokens=150):
    references = []
    predictions = []

    # pick a subset (avoid evaluating on huge dataset at once)
    subset = dataset.select(range(min(num_samples, len(dataset))))

    for example in subset:
        user_prompt = example["messages"][0]["content"]
        reference = example["messages"][1]["content"]

        # Prepare input
        input_text = tokenizer.apply_chat_template(
            [{"role": "user", "content": user_prompt}],
            tokenize=False,
            add_generation_prompt=True
        )
        inputs = tokenizer(input_text, return_tensors="pt").to(model.device)

        # Generate
        with torch.no_grad():
            gen = model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                do_sample=True,
                temperature=0.7,
                top_p=0.9,
            )
        prediction = tokenizer.decode(
            gen[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True
        ).strip()

        predictions.append(prediction)
        references.append(reference)

    # Compute metrics
    bleu_score = bleu.compute(predictions=predictions, references=[[r] for r in references])
    rouge_score = rouge.compute(predictions=predictions, references=references)
    bert_score = bertscore.compute(predictions=predictions, references=references, lang="en")

    return {
        "BLEU": bleu_score["bleu"],
        "ROUGE-L": rouge_score["rougeL"],
        "BERTScore_F1": sum(bert_score["f1"]) / len(bert_score["f1"])
    }

results = evaluate_generation(trainer.model, tokenizer, test_dataset, num_samples=100)
print(results)


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'BLEU': 0.40141230695599095, 'ROUGE-L': np.float64(0.5507008875261891), 'BERTScore_F1': 0.9067209345102311}


**BLEU Score**
BLEU: 0.401

Explanation:BLEU measures the exact n-gram overlap between generated text and reference text.It ranges from 0 to 1, with 1 being a perfect match.

Our Interpretation:

A BLEU score of 0.401 indicates that the model reproduces parts of the reference text correctly.BLEU is strict; slight wording differences can reduce the score even if the generated text is reasonable.

**ROUGE-L ScoreROUGE-L: 0.55**

Explanation:
ROUGE-L evaluates the longest common subsequence between the generated and reference text.It captures both content and word order similarity, focusing on recall.

Our Interpretation:

A score of 0.55 suggests the model retains about half of the reference content in a fluent manner.This shows the model produces coherent and contextually relevant text.



**BERTScore (F1)BERTScore F1: 0.906**

Explanation:

BERTScore uses contextual embeddings to measure semantic similarity between generated and reference text.Higher scores indicate stronger semantic alignment, regardless of exact word matches.

Our Interpretation:

A score of 0.906 indicates the model generates text that is very close in meaning to the reference, even if wording differs.This is a strong indicator of high-quality, semantically accurate generation.

## OVERALL EVALUATION

The model achieves low loss and perplexity, indicating accurate language modeling.

BERTScore is very high, showing excellent semantic understanding.

BLEU and ROUGE-L are moderate, reflecting minor variations in wording but good content coverage.

Conclusion:

The fine-tuned model is high-quality for generating fluent, contextually correct responses. It demonstrates strong understanding of the training data and can be effectively used for downstream tasks such as text completion or question answering in the pharmaceutical domain.