In [59]:
!pip install -q transformers datasets peft accelerate evaluate rouge-score nltk sacrebleu torch optimum[onnxruntime]

In [60]:
from datasets import load_dataset

In [61]:
# Load the MedDialog-EN-100k dataset by its Hub identifier; later cells index it by split name.
dataset = load_dataset("BinKhoaLe1812/MedDialog-EN-100k")

In [62]:
# Carve a validation split out of the training data when the dataset ships without one.
# The fixed seed keeps the 90/10 partition identical across runs.
has_validation = 'validation' in dataset
if not has_validation:
    split_dataset = dataset['train'].train_test_split(test_size=0.1, seed=42)
    dataset['train'], dataset['validation'] = split_dataset['train'], split_dataset['test']

In [63]:
# Report how many examples each split holds.
for label, split_name in (("Training", "train"), ("Validation", "validation")):
    print(f"{label}: {len(dataset[split_name]):,}")

Training: 100,948
Validation: 11,217


### Load Tokenizer

In [64]:
!pip install sacremoses

Defaulting to user installation because normal site-packages is not writeable


In [65]:
from transformers import AutoTokenizer

In [66]:
# Load the tokenizer that matches the "microsoft/biogpt" checkpoint used for the model below.
tokenizer = AutoTokenizer.from_pretrained("microsoft/biogpt")

In [67]:
# Preprocessing pads every sequence, which needs a pad token; fall back to the EOS
# token only when the tokenizer does not define one of its own.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [68]:
import torch
from transformers import AutoModelForCausalLM

In [69]:
# Load the BioGPT causal-LM weights in half precision and let `device_map="auto"`
# choose the device placement.
# NOTE(review): the weights are fp16 here and the TrainingArguments cell also sets
# fp16=True. Mixed-precision training normally expects the trainable parameters in
# fp32, so confirm the LoRA adapter weights end up in fp32 with the installed PEFT version.
model = AutoModelForCausalLM.from_pretrained(
    "microsoft/biogpt",
    torch_dtype=torch.float16,
    device_map="auto",
)

### Setup LoRA

In [70]:
from peft import LoraConfig, get_peft_model, TaskType

In [71]:
# LoRA adapter configuration for causal-LM fine-tuning.
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    # Adapt all four attention projections rather than only query/value.
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
    # Adapter rank and scaling numerator (alpha / r = 2).
    r=16,
    lora_alpha=32,
    lora_dropout=0.1,
    # Leave bias terms untouched.
    bias="none",
)

In [72]:
# Wrap the base model with the LoRA adapters; `model` now refers to the PEFT wrapper.
model = get_peft_model(model, lora_config)
# Presumably needed so gradients can flow through the frozen embeddings when
# gradient checkpointing is enabled in TrainingArguments - confirm.
model.enable_input_require_grads()

In [73]:
# Count all parameters and the trainable subset in one pass over the model.
trainable = 0
total = 0
for param in model.parameters():
    n_elements = param.numel()
    total += n_elements
    if param.requires_grad:
        trainable += n_elements

In [74]:
# Show the trainable parameter count and its share of all parameters, in percent.
print(f"Trainable: {trainable:,} ({100*trainable/total:.2f}%)")

Trainable: 2,359,296 (0.68%)


### Preprocess

In [80]:
def make_preprocess_function(tokenizer, max_length=512):
    """Build a batched ``Dataset.map`` callback bound to ``tokenizer``.

    Args:
        tokenizer: Callable tokenizer. It is invoked with the list of texts plus
            ``truncation``, ``max_length``, ``padding`` and ``return_tensors``
            keyword arguments and must return a mapping containing ``input_ids``
            (and normally ``attention_mask``) tensors.
        max_length: Fixed length every sequence is truncated/padded to.

    Returns:
        A function that takes a batch with ``instruction`` and ``output`` columns
        (``input`` is optional) and returns the tokenizer output with an added
        ``labels`` entry.
    """
    def preprocess_function(examples):
        instructions = examples['instruction']
        # The `input` column is optional; treat a missing column as all-empty.
        inputs = examples['input'] if 'input' in examples else [None] * len(instructions)
        outputs = examples['output']

        texts = []
        for instruction, input_text, output_text in zip(instructions, inputs, outputs):
            # Any field may be None in the raw data; normalise to empty strings.
            instruction = instruction or ""
            input_text = input_text or ""
            output_text = output_text or ""

            prompt = f"{instruction} {input_text}".strip() if input_text else instruction.strip()
            texts.append(f"{prompt}\n{output_text}")

        tokenized = tokenizer(
            texts,
            truncation=True,
            max_length=max_length,
            padding="max_length",
            return_tensors="pt"
        )

        # Causal-LM labels are the input ids, but padded positions must not contribute
        # to the loss. Mask them with -100 (the ignore index) so the labels are correct
        # even when the collator does not re-mask padding itself.
        labels = tokenized["input_ids"].clone()
        if "attention_mask" in tokenized:
            labels[tokenized["attention_mask"] == 0] = -100
        tokenized["labels"] = labels
        return tokenized
    return preprocess_function

# Bind the tokenizer once; the resulting callback is reused for both splits.
preprocess_fn = make_preprocess_function(tokenizer)

# Tokenize the training split in two worker processes, dropping the raw text columns
# so only the tokenizer outputs remain.
train_columns = dataset['train'].column_names
tokenized_train = dataset['train'].map(
    preprocess_fn,
    batched=True,
    num_proc=2,
    desc="Train",
    remove_columns=train_columns,
)


Train (num_proc=2):   0%|          | 0/100948 [00:00<?, ? examples/s]

In [81]:
# Tokenize the validation split the same way as the training split.
validation_columns = dataset['validation'].column_names
tokenized_eval = dataset['validation'].map(
    preprocess_fn,
    batched=True,
    num_proc=2,
    desc="Val",
    remove_columns=validation_columns,
)

Val (num_proc=2):   0%|          | 0/11217 [00:00<?, ? examples/s]

### Balanced Training Setup

In [82]:
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling
import numpy as np




In [84]:
training_args = TrainingArguments(
    # Where checkpoints are written; nothing is pushed or reported externally.
    output_dir="./biogpt-lora-balanced",
    push_to_hub=False,
    report_to="none",

    # Optimisation: 8 sequences per device x 2 accumulation steps = 16 per optimizer step.
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,
    learning_rate=2e-4,
    warmup_steps=100,
    num_train_epochs=1,  # a single pass over the training split

    # Memory / speed trade-offs.
    fp16=True,
    gradient_checkpointing=True,
    dataloader_num_workers=2,
    dataloader_pin_memory=True,

    # Logging, evaluation and checkpointing share a 500-step cadence; the checkpoint
    # with the lowest eval loss is restored when training ends.
    logging_steps=50,
    eval_strategy="steps",
    eval_steps=500,
    per_device_eval_batch_size=8,
    save_strategy="steps",
    save_steps=500,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)

In [85]:
# mlm=False selects causal-LM (next-token) collation instead of masked-LM.
# NOTE(review): this collator is believed to rebuild `labels` from `input_ids`,
# overriding the `labels` column produced during preprocessing - confirm.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [86]:
class PerplexityTrainer(Trainer):
    """Trainer whose evaluation metrics also include perplexity."""

    def evaluate(self, eval_dataset=None, ignore_keys=None, metric_key_prefix="eval"):
        """Run the standard evaluation, then add ``<prefix>_perplexity = exp(<prefix>_loss)``.

        The perplexity entry is only added when the parent's metrics contain
        ``<prefix>_loss``; otherwise the metrics are returned unchanged.
        """
        results = super().evaluate(eval_dataset, ignore_keys, metric_key_prefix)
        loss_key = f"{metric_key_prefix}_loss"
        if loss_key not in results:
            return results
        results[f"{metric_key_prefix}_perplexity"] = np.exp(results[loss_key])
        return results

In [87]:
# Assemble the trainer from the LoRA-wrapped model, the tokenized splits and the collator.
trainer = PerplexityTrainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
)

### Train

In [88]:
# Run fine-tuning; the result's `training_loss` is printed in the next cell.
train_result = trainer.train()

`use_cache=True` is incompatible with gradient checkpointing`. Setting `use_cache=False`...


Step,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
# Training loss reported by the finished run.
print(f"Loss: {train_result.training_loss:.4f}")

### Evaluate

In [None]:
# Evaluate on the validation split; PerplexityTrainer adds `eval_perplexity` next to `eval_loss`.
eval_results = trainer.evaluate()

In [None]:
# Validation loss and the perplexity derived from it (exp of the loss).
print(f"Loss: {eval_results['eval_loss']:.4f}")
print(f"Perplexity: {eval_results['eval_perplexity']:.4f}")

### Save LoRA Adapter + Merged Model

In [None]:
# Save the LoRA adapter (small) together with the tokenizer.
# NOTE(review): this is the same directory as TrainingArguments.output_dir, so any
# checkpoint folders written during training presumably sit next to the adapter - confirm.
trainer.save_model("./biogpt-lora-balanced")
tokenizer.save_pretrained("./biogpt-lora-balanced")

In [None]:
# Save a standalone merged model for CPU inference: fold the LoRA weights into the
# base model, cast to float32, and write it out with the tokenizer.
model_merged = model.merge_and_unload()
# NOTE(review): `.to(...)` on a module usually returns the same object, so `model_merged`
# (used by the evaluation cells below) is presumably float32 as well after this line - confirm.
model_merged_fp32 = model_merged.to(torch.float32)
model_merged_fp32.save_pretrained("./biogpt-lora-balanced-merged")
tokenizer.save_pretrained("./biogpt-lora-balanced-merged")

### Comprehensive Metrics (20 samples)

In [None]:
import evaluate
from tqdm import tqdm

In [None]:
# Load the three text-overlap metrics used by the evaluation cell below.
bleu_metric = evaluate.load("sacrebleu")
rouge_metric = evaluate.load("rouge")
meteor_metric = evaluate.load("meteor")

In [None]:
# Score the merged model on a small random sample of the validation split.
EVAL_SAMPLE_SIZE = 20
EVAL_SEED = 42

model_merged.eval()

# Seed both the sample selection and the stochastic decoding so reruns are comparable.
rng = np.random.default_rng(EVAL_SEED)
torch.manual_seed(EVAL_SEED)
num_validation = len(dataset['validation'])
sample_indices = rng.choice(
    num_validation,
    size=min(EVAL_SAMPLE_SIZE, num_validation),
    replace=False,
)

predictions = []
references = []

for idx in tqdm(sample_indices, desc="Evaluating"):
    sample = dataset['validation'][int(idx)]

    # Fields may be None in the raw data; normalise them the same way as in preprocessing.
    instruction = sample.get('instruction') or ''
    input_text = sample.get('input') or ''
    output_text = sample.get('output') or ''

    prompt = f"{instruction} {input_text}".strip() if input_text else instruction.strip()

    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=256)
    inputs = {k: v.to(model_merged.device) for k, v in inputs.items()}
    prompt_length = inputs["input_ids"].shape[1]

    with torch.no_grad():
        outputs = model_merged.generate(
            **inputs,
            max_new_tokens=100,
            temperature=0.7,  # Balanced creativity
            top_p=0.9,
            do_sample=True,
            repetition_penalty=1.2,  # Avoid repetition
            pad_token_id=tokenizer.pad_token_id
        )

    # Decode only the newly generated tokens. Slicing the decoded string by
    # len(prompt) is unreliable because decoding need not reproduce the prompt
    # character-for-character, which would leak prompt text into the prediction.
    prediction = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True).strip()

    predictions.append(prediction or "No response")
    references.append(output_text.strip())

# Calculate metrics. A metric that fails is reported as NaN with a visible warning,
# so a failure cannot be mistaken for a genuine score of zero.
metrics = {}

try:
    bleu = bleu_metric.compute(predictions=predictions, references=[[r] for r in references])
    metrics['BLEU'] = bleu['score']
except Exception as exc:
    print(f"WARNING: BLEU computation failed: {exc!r}")
    metrics['BLEU'] = float('nan')

try:
    rouge = rouge_metric.compute(predictions=predictions, references=references)
    metrics['ROUGE-1'] = rouge['rouge1']
    metrics['ROUGE-2'] = rouge['rouge2']
    metrics['ROUGE-L'] = rouge['rougeL']
except Exception as exc:
    print(f"WARNING: ROUGE computation failed: {exc!r}")
    metrics['ROUGE-1'] = metrics['ROUGE-2'] = metrics['ROUGE-L'] = float('nan')

try:
    meteor = meteor_metric.compute(predictions=predictions, references=references)
    metrics['METEOR'] = meteor['meteor']
except Exception as exc:
    print(f"WARNING: METEOR computation failed: {exc!r}")
    metrics['METEOR'] = float('nan')

In [None]:
# Print every metric collected above with four decimals.
for key, value in metrics.items():
    print(f"{key}: {value:.4f}")

### Quality Test with Analysis

In [None]:
# Hand-written questions, each with keywords a good answer is expected to mention.
test_cases = [
    {
        "question": "What are the symptoms of diabetes?",
        "expected_keywords": ["thirst", "urination", "fatigue", "weight", "vision"],
    },
    {
        "question": "How to treat high blood pressure?",
        "expected_keywords": ["lifestyle", "diet", "exercise", "medication", "salt"],
    },
    {
        "question": "What causes migraine headaches?",
        "expected_keywords": ["trigger", "stress", "food", "hormonal", "sleep"],
    },
]

# Decoding samples stochastically; fix the seed so reruns are comparable.
torch.manual_seed(42)

total_score = 0

for i, test in enumerate(test_cases, 1):
    prompt = test["question"]
    keywords = test["expected_keywords"]

    inputs = tokenizer(prompt, return_tensors="pt")
    inputs = {k: v.to(model_merged.device) for k, v in inputs.items()}
    prompt_length = inputs["input_ids"].shape[1]

    with torch.no_grad():
        outputs = model_merged.generate(
            **inputs,
            max_new_tokens=150,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            repetition_penalty=1.2,
            pad_token_id=tokenizer.pad_token_id
        )

    # Decode only the generated continuation. Cutting the decoded string at
    # len(prompt) assumes decoding reproduces the prompt exactly; when it does not,
    # the answer is truncated or contains part of the question.
    answer = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True).strip()
    response = answer.lower()

    # Check keyword coverage (case-insensitive substring match).
    found_keywords = [kw for kw in keywords if kw in response]
    coverage = len(found_keywords) / len(keywords) * 100 if keywords else 0.0
    total_score += coverage

    print(f"\n{i}. Q: {prompt}")
    print(f"   A: {answer}")
    print(f"   Keyword coverage: {coverage:.0f}% ({len(found_keywords)}/{len(keywords)})")
    print(f"   Found: {', '.join(found_keywords)}")

avg_score = total_score / len(test_cases)
print("\n" + "=" * 70)
print(f"QUALITY SCORE: {avg_score:.1f}%")
print("=" * 70)
print("\n Expected: 60-80% coverage = Good quality")
print("   Higher coverage = More comprehensive responses")

### Download

In [None]:
import shutil
from google.colab import files

In [None]:
# Small download: zip the adapter directory and send it to the browser (Colab only).
# NOTE(review): the zipped directory is also the Trainer's output_dir, so the archive
# may include training checkpoints and exceed the ~20MB adapter size - confirm.
shutil.make_archive('biogpt-lora', 'zip', './biogpt-lora-balanced')
files.download('biogpt-lora.zip')

In [None]:
# For CPU inference: zip the merged float32 model and send it to the browser (Colab only).
# The parameter count printed earlier implies roughly 347M parameters, i.e. about
# 1.4 GB of float32 weights before compression, well above the ~400MB originally noted here.
shutil.make_archive('biogpt-merged', 'zip', './biogpt-lora-balanced-merged')
files.download('biogpt-merged.zip')