In [32]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
model_name = "google/gemma-2b"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",           # Automatically split model between GPU and CPU
    load_in_4bit=True,           # Quantize weights to 4-bit
    torch_dtype=torch.float16,    # Use FP16 where possible
    offload_folder="offload"      # Temporary CPU offload folder
)


The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [2]:
from datasets import load_dataset

# Example: English only
dataset = load_dataset("FreedomIntelligence/medical-o1-reasoning-SFT", "en_mix")



In [3]:
print(dataset)
print(dataset['train'][0])  # look at one sample

DatasetDict({
    train: Dataset({
        features: ['Question', 'Complex_CoT', 'Response'],
        num_rows: 24887
    })
})
{'Question': 'Given the symptoms of sudden weakness in the left arm and leg, recent long-distance travel, and the presence of swollen and tender right lower leg, what specific cardiac abnormality is most likely to be found upon further evaluation that could explain these findings?', 'Complex_CoT': "Okay, let's see what's going on here. We've got sudden weakness in the person's left arm and leg - and that screams something neuro-related, maybe a stroke?\n\nBut wait, there's more. The right lower leg is swollen and tender, which is like waving a big flag for deep vein thrombosis, especially after a long flight or sitting around a lot.\n\nSo, now I'm thinking, how could a clot in the leg end up causing issues like weakness or stroke symptoms?\n\nOh, right! There's this thing called a paradoxical embolism. It can happen if there's some kind of short circuit in the

In [4]:
# Set pad token if missing
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


In [5]:
def tokenize_function(examples):
    texts = [
        f"Question: {q}\nReasoning: {c}\nAnswer: {r}"
        for q, c, r in zip(examples["Question"], examples["Complex_CoT"], examples["Response"])
    ]
    return tokenizer(texts, truncation=True, padding="max_length", max_length=512)


tokenized_datasets = dataset.map(tokenize_function, batched=True)


In [14]:
from transformers import AutoTokenizer

# Example: using gemma tokenizer
tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b")

# Step 1: Create a single "text" column combining prompt + answer
def preprocess(example):
    # Combine Question + Complex_CoT as input
    prompt = example["Question"] + "\n" + example["Complex_CoT"] + "\nAnswer: "
    # Tokenize the prompt and the response together
    full_text = prompt + example["Response"]
    tokenized = tokenizer(
        full_text,
        truncation=True,
        max_length=512,  # small max len to fit in 8GB GPU
    )
    tokenized["labels"] = tokenized["input_ids"].copy()  # causal LM needs labels
    return tokenized

tokenized_datasets = tokenized_datasets.map(preprocess, batched=False)


Map:   0%|          | 0/24887 [00:00<?, ? examples/s]

In [6]:
from peft import LoraConfig, get_peft_model

lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)


In [7]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./fine_tuned_model",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    num_train_epochs=3,
    logging_steps=10,
    save_steps=100,
    fp16=True
)


In [15]:
from transformers import Trainer, TrainingArguments
from peft import LoraConfig, get_peft_model

# LoRA config
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],  # adjust based on model
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, lora_config)

# Training arguments (corrected)
training_args = TrainingArguments(
    output_dir="./gemma_finetune",
    per_device_train_batch_size=1,    # must be tiny for 8GB GPU
    gradient_accumulation_steps=4,    # simulates larger batch
    learning_rate=3e-4,
    fp16=True,                        # mixed precision
    optim="paged_adamw_32bit",
    save_strategy="steps",
    save_steps=100,
    logging_steps=10,
    max_steps=500,                     # for testing
    save_total_limit=2,
    report_to="none"                   # disables logging to wandb/tensorboard
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"]
)





In [16]:
trainer.train()


Step,Training Loss
10,2.1325
20,2.092
30,2.0468
40,1.9833
50,1.9132
60,1.8549
70,1.9089
80,1.82
90,1.8483
100,1.868


TrainOutput(global_step=500, training_loss=1.8326661529541015, metrics={'train_runtime': 809.2528, 'train_samples_per_second': 2.471, 'train_steps_per_second': 0.618, 'total_flos': 1.1810714899501056e+16, 'train_loss': 1.8326661529541015, 'epoch': 0.08036324185317635})

In [26]:
from peft import PeftModel

# Merge LoRA into base or save as PEFT
model.save_pretrained("./gemma_finetune")
tokenizer.save_pretrained("./gemma_finetune")



('./gemma_finetune/tokenizer_config.json',
 './gemma_finetune/special_tokens_map.json',
 './gemma_finetune/tokenizer.json')

In [33]:
merged_model = PeftModel.from_pretrained(model, "./gemma_finetune")
merged_model = merged_model.merge_and_unload()
merged_model.save_pretrained("./gemma_merged")
tokenizer.save_pretrained("./gemma_merged")


('./gemma_merged/tokenizer_config.json',
 './gemma_merged/special_tokens_map.json',
 './gemma_merged/tokenizer.json')

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

tokenizer = AutoTokenizer.from_pretrained("./gemma_merged")
test_model = AutoModelForCausalLM.from_pretrained(
    "./gemma_merged",
    load_in_4bit=True,
    device_map="auto",
    llm_int8_enable_fp32_cpu_offload=True  # allow some layers on CPU
)


# Put model in evaluation mode
test_model.eval()

# Ask a question
prompt = "Given sudden weakness in left arm and leg and a swollen right leg, what cardiac abnormality might be present?"

# Tokenize input
inputs = tokenizer(prompt, return_tensors="pt").to(test_model.device)

# Generate output
with torch.no_grad():
    outputs = test_model.generate(
        **inputs,
        max_new_tokens=200,
        temperature=0.7,
        do_sample=True,
        top_p=0.9
    )

# Decode and print
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(response)


In [17]:
import torch
print(torch.cuda.is_available())
print(torch.version.cuda)
print(torch.backends.cudnn.version())


True
12.4
90100


In [None]:
import os
print(os.environ.get("CUDA_VISIBLE_DEVICES"))


None


In [36]:
import torch
torch.cuda.empty_cache()
torch.cuda.reset_peak_memory_stats()

