In [None]:
%%capture
# Install/upgrade the fine-tuning stack into the kernel's environment;
# %%capture hides pip's output for this cell.
# NOTE(review): every package is upgraded to its latest release (-U) with no
# version pin, so library API changes can break later cells — consider pinning
# the versions the notebook was last verified against.
%pip install -U bitsandbytes
%pip install -U transformers
%pip install -U accelerate
%pip install -U peft
%pip install -U trl
%pip install -U datasets

In [None]:
import getpass
import os

import bitsandbytes as bnb
import torch
import wandb
from datasets import load_dataset
from huggingface_hub import login
from peft import LoraConfig, get_peft_model
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    logging
)
from trl import SFTConfig, SFTTrainer, setup_chat_format

In [None]:
import getpass
import os

# SECURITY: the argument to getpass.getpass() is the *prompt* and is echoed in
# the cell output, so it must never contain the secret itself. Any token that
# was previously embedded here has been exposed and should be revoked.
# Credentials are read from the environment when available (so "Run All" works
# unattended) and otherwise requested interactively without echoing the input.

# Hugging Face login
hf_token = os.environ.get("HF_TOKEN") or getpass.getpass("Hugging Face token: ")
from huggingface_hub import login
login(token=hf_token)

# Weights & Biases login
wb_token = os.environ.get("WANDB_API_KEY") or getpass.getpass("Weights & Biases API key: ")
import wandb
wandb.login(key=wb_token)

# Start the W&B run that the trainer reports to.
run = wandb.init(project='Fine-tune Gemma-2-2b-it on Medical Dataset', job_type="training", anonymous="allow")


[REDACTED]··········
[REDACTED]··········




In [None]:
# Model configurations
base_model = "google/gemma-2-2b-it"
new_model = "Gemma-2-2b-it-ChatDoctor-HealthCareMagicQA"
dataset_name = "lavita/ChatDoctor-HealthCareMagic-100k"

# Adjust precision and attention based on GPU
if torch.cuda.get_device_capability()[0] >= 8:
    torch_dtype = torch.bfloat16
    attn_implementation = "flash_attention_2"
    !pip install -qqq flash-attn  # Install flash attention if supported
else:
    torch_dtype = torch.float16
    attn_implementation = "eager"

# BitsAndBytes configuration for memory-efficient model loading
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch_dtype,
    bnb_4bit_use_double_quant=True,
)

In [None]:
# Load model with quantization and optimized attention
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    device_map="auto",
    attn_implementation=attn_implementation
)
# trust_remote_code is intentionally not enabled: it allows code shipped in the
# model repository to run locally and is not needed to load this tokenizer.
tokenizer = AutoTokenizer.from_pretrained(base_model)

# Efficient LoRA fine-tuning configuration
def find_all_linear_names(model):
    """Return the leaf names of all bitsandbytes 4-bit linear layers in ``model``.

    Walks ``model.named_modules()``, keeps the last dotted component of every
    module that is a ``bnb.nn.Linear4bit``, de-duplicates the names and drops
    ``lm_head`` so it is not used as a LoRA target.
    """
    quantized_linear = bnb.nn.Linear4bit
    leaf_names = {
        qualified_name.split('.')[-1]
        for qualified_name, layer in model.named_modules()
        if isinstance(layer, quantized_linear)
    }
    # Exclude lm_head for 16-bit
    leaf_names.discard('lm_head')
    return list(leaf_names)

# Names of the quantized linear layers; used as LoRA `target_modules` below.
modules = find_all_linear_names(model)

config.json:   0%|          | 0.00/838 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/24.2k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/241M [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/47.0k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

In [None]:
# Reset the chat template to prevent duplication error, then let TRL install
# its chat format on both the model and the tokenizer.
tokenizer.chat_template = None
model, tokenizer = setup_chat_format(model, tokenizer)

# LoRA adapter settings applied to the linear layers collected in `modules`.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=modules,
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

# Wrap the chat-formatted model with the LoRA adapters.
model = get_peft_model(model, peft_config)

In [None]:
import re
from datasets import load_dataset

# Load and preprocess dataset
NUM_SAMPLES = 2000   # size of the demo subset; raise for a fuller run
SHUFFLE_SEED = 42    # fixed seed so the same subset is drawn on every run

dataset = load_dataset(dataset_name, split="all", cache_dir="./cache")
# Shuffle deterministically and keep at most NUM_SAMPLES rows
# (bounded by the dataset size so the selection cannot index past the end).
dataset = dataset.shuffle(seed=SHUFFLE_SEED).select(range(min(NUM_SAMPLES, len(dataset))))

# Define a cleaning function to remove unwanted artifacts
_URL_RE = re.compile(r'\b(?:www\.[^\s]+|http\S+)')
# The dots are escaped so only the literal ".com" / ".in" suffixes are removed;
# an unescaped "." matches any character and would eat the start of the next
# word (e.g. "Chat Doctor comes" -> "es").
_SITE_NAME_RE = re.compile(r'\b(?:Chat Doctor(?:\.com)?(?:\.in)?|www\.(?:google|yahoo)\S*)')
_WHITESPACE_RE = re.compile(r'\s+')


def clean_text(text):
    """Strip URLs and "Chat Doctor" site names from ``text`` and normalize whitespace.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string with URLs and site names removed, runs of whitespace
        collapsed to a single space, and leading/trailing whitespace stripped.
    """
    text = _URL_RE.sub('', text)          # Remove URLs
    text = _SITE_NAME_RE.sub('', text)    # Remove site names
    text = _WHITESPACE_RE.sub(' ', text)  # Collapse multiple spaces
    return text.strip()

# Format and clean the data
def format_chat_template(row):
    """Clean one dataset row and render it as chat text in ``row["text"]``.

    The ``instruction``, ``input`` and ``output`` fields are cleaned with
    ``clean_text`` and used as the system, user and assistant messages
    respectively. The messages are rendered to a string with the tokenizer's
    chat template (no tokenization) and the updated row is returned.
    """
    role_to_field = (
        ("system", "instruction"),
        ("user", "input"),
        ("assistant", "output"),
    )
    # Each field is cleaned separately, in system -> user -> assistant order.
    messages = [
        {"role": role, "content": clean_text(row[field])}
        for role, field in role_to_field
    ]
    row["text"] = tokenizer.apply_chat_template(messages, tokenize=False)
    return row

# Apply cleaning and formatting to each row in parallel
dataset = dataset.map(format_chat_template, num_proc=4)

# Split into train/test; the seed is fixed (same value as the shuffle above)
# so the train/eval partition is identical on every run.
dataset = dataset.train_test_split(test_size=0.1, seed=42)

# NOTE(review): this callable is not handed to the trainer in the visible cells,
# and it indexes `batch` by column name, i.e. it expects a mapping with a "text"
# entry rather than a list of examples — confirm intended usage before wiring it in.
data_collator = lambda batch: tokenizer(batch["text"], return_tensors="pt", padding=True, truncation=True)

README.md:   0%|          | 0.00/542 [00:00<?, ?B/s]

(…)-00000-of-00001-5e7cb295b9cff0bf.parquet:   0%|          | 0.00/70.5M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/112165 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/2000 [00:00<?, ? examples/s]

In [None]:
# Training arguments
# SFTConfig (a TrainingArguments subclass) is used because current TRL releases
# take the SFT-specific options (`packing`, `dataset_text_field`) from the config
# object rather than from SFTTrainer keyword arguments.
training_args = SFTConfig(
    output_dir=new_model,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,
    optim="paged_adamw_32bit",
    num_train_epochs=1,
    eval_strategy="steps",
    eval_steps=200,
    save_steps=500,  # Keep save_steps as 500
    logging_steps=1,
    warmup_steps=10,
    logging_strategy="steps",
    learning_rate=2e-4,
    # Mixed precision follows the dtype chosen for the GPU in the config cell,
    # so it matches the bitsandbytes compute dtype (bf16 on capability >= 8,
    # fp16 otherwise) instead of always forcing fp16.
    fp16=(torch_dtype == torch.float16),
    bf16=(torch_dtype == torch.bfloat16),
    group_by_length=True,
    report_to="wandb",
    load_best_model_at_end=False,  # Disable loading best model at the end
    packing=False,
    dataset_text_field="text",  # Column produced by format_chat_template
)


# Supervised fine-tuning trainer.
# - The tokenizer is passed as `processing_class`; the former `tokenizer`
#   keyword is rejected by current TRL releases
#   ("unexpected keyword argument 'tokenizer'").
# - `peft_config` is not passed: `model` was already wrapped with the LoRA
#   adapters via get_peft_model() in an earlier cell.
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    processing_class=tokenizer,
)

# Disable caching during training for gradient computation efficiency
model.config.use_cache = False

TypeError: SFTTrainer.__init__() got an unexpected keyword argument 'tokenizer'