In [1]:
# Install Unsloth
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

Collecting unsloth@ git+https://github.com/unslothai/unsloth.git (from unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git)
  Cloning https://github.com/unslothai/unsloth.git to /tmp/pip-install-4k_15sc4/unsloth_e460fdf92b154384b6b9dfd19bff6adb
  Running command git clone --filter=blob:none --quiet https://github.com/unslothai/unsloth.git /tmp/pip-install-4k_15sc4/unsloth_e460fdf92b154384b6b9dfd19bff6adb
  Resolved https://github.com/unslothai/unsloth.git to commit 1c0ad844f170f67c7cdf6f7a9465bafb0f9627df
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting unsloth_zoo>=2025.11.3 (from unsloth@ git+https://github.com/unslothai/unsloth.git->unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git)
  Downloading unsloth_zoo-2025.11.3-py3-none-any.whl.metadata (32 kB)
Collecting tyro (from unsloth@ git+https://github.com/unslothai/unsloth.gi

In [2]:
from unsloth import FastLanguageModel
import torch

# Loader settings. `max_seq_length` is reused by the tokenization and trainer
# cells further down, so it stays a module-level name.
max_seq_length = 2048
dtype = None  # presumably lets the loader choose the dtype — confirm in Unsloth docs
load_in_4bit = True

# Load the checkpoint that the rest of the notebook continues pre-training.
# Note that this is the *Instruct* variant of SmolLM2-135M.
model_load_options = {
    "model_name": "unsloth/SmolLM2-135M-Instruct",
    "max_seq_length": max_seq_length,
    "dtype": dtype,
    "load_in_4bit": load_in_4bit,
}
model, tokenizer = FastLanguageModel.from_pretrained(**model_load_options)

print("Model loaded for continued pre-training!")

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.11.2: Fast Llama patching. Transformers: 4.57.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/269M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/158 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/423 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Model loaded for continued pre-training!


In [3]:
# Wrap the loaded model with LoRA adapters for continued pre-training.
# NOTE: this rebinds `model`, so the cell is meant to run once per kernel.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",      # attention projections
    "gate_proj", "up_proj", "down_proj",         # MLP projections
]
lora_settings = dict(
    r=16,
    target_modules=lora_target_modules,
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    use_rslora=False,
)
model = FastLanguageModel.get_peft_model(model, **lora_settings)

print("LoRA configured!")
# Report how many parameters the adapters make trainable.
model.print_trainable_parameters()

Unsloth 2025.11.2 patched 30 layers with 30 QKV layers, 30 O layers and 30 MLP layers.


LoRA configured!
trainable params: 4,884,480 || all params: 139,400,064 || trainable%: 3.5039


In [4]:
from datasets import Dataset

# Option 1: a tiny hand-written corpus of medical definitions (ten sentences).
base_medical_texts = [
    "Hypertension is a medical condition where blood pressure remains elevated over time. Common symptoms include headaches and dizziness.",
    "Diabetes mellitus is a metabolic disorder characterized by high blood glucose levels. Type 1 diabetes is autoimmune while Type 2 is lifestyle-related.",
    "Cardiovascular disease refers to conditions affecting the heart and blood vessels. Risk factors include smoking, obesity, and high cholesterol.",
    "Pneumonia is an infection that inflames air sacs in the lungs. Symptoms include cough, fever, and difficulty breathing.",
    "Osteoporosis is a bone disease characterized by decreased bone density. It increases fracture risk, especially in elderly patients.",
    "Asthma is a chronic respiratory condition causing airway inflammation. Triggers include allergens, exercise, and cold air.",
    "Arthritis involves inflammation of joints causing pain and stiffness. Common types include rheumatoid and osteoarthritis.",
    "Migraine is a neurological condition causing severe headaches. Symptoms include nausea, light sensitivity, and visual disturbances.",
    "Chronic kidney disease involves gradual loss of kidney function. Causes include diabetes and hypertension.",
    "Depression is a mood disorder characterized by persistent sadness. Treatment includes therapy and medication.",
]

# The same ten sentences are duplicated to reach 200 rows. This adds volume
# for the demo but no new information.
repeat_count = 20
medical_texts = base_medical_texts * repeat_count

# Wrap the raw strings in a single-column ("text") dataset.
dataset = Dataset.from_dict({"text": medical_texts})

print(f"Dataset created: {len(dataset)} text samples")
print(f"\nSample text:\n{dataset[0]['text']}")

Dataset created: 200 text samples

Sample text:
Hypertension is a medical condition where blood pressure remains elevated over time. Common symptoms include headaches and dizziness.


In [5]:
# For continued pre-training, we just need raw text
# No special formatting like Alpaca or chat templates

def tokenize_function(examples):
    """Tokenize the "text" field of a batch for plain language modeling.

    Args:
        examples: a batch mapping that has a "text" entry.

    Returns:
        Whatever the notebook-level `tokenizer` returns for those texts,
        truncated to `max_seq_length` and left unpadded.
    """
    raw_texts = examples["text"]
    encoded = tokenizer(
        raw_texts,
        truncation=True,
        max_length=max_seq_length,
        padding=False,
    )
    return encoded

# Tokenize every row in batches and drop the original columns.
# NOTE: the trainer cell below is given the raw `dataset`, not
# `tokenized_dataset`, so this result is only inspected here.
original_columns = dataset.column_names
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=original_columns,
)

print("Dataset tokenized for continued pre-training!")
print(f"Total samples: {len(tokenized_dataset)}")

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Dataset tokenized for continued pre-training!
Total samples: 200


In [6]:
from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForLanguageModeling

# Collator for causal language modeling. With mlm=False the masked-LM
# objective is turned off, so tokens are NOT randomly masked.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Mixed precision: bf16 when the GPU supports it, fp16 otherwise.
use_bf16 = torch.cuda.is_bf16_supported()

# Optimisation hyper-parameters for a short demo run.
training_args = TrainingArguments(
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    warmup_steps=5,
    max_steps=30,  # Quick training for demo
    learning_rate=2e-4,
    fp16=not use_bf16,
    bf16=use_bf16,
    logging_steps=1,
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=3407,
    output_dir="outputs",
    report_to="none",
)

# The trainer receives the raw-text `dataset` and reads its "text" column.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    data_collator=data_collator,
    dataset_num_proc=2,
    packing=False,
    args=training_args,
)

print("Trainer configured for continued pre-training!")

Unsloth: We found double BOS tokens - we shall remove one automatically.


Unsloth: Tokenizing ["text"] (num_proc=6):   0%|          | 0/200 [00:00<?, ? examples/s]

Trainer configured for continued pre-training!


In [7]:
# Run continued pre-training and report how long it took.
import time

# perf_counter() is a monotonic clock meant for measuring elapsed time;
# unlike time.time() it cannot jump if the system clock is adjusted mid-run.
start_time = time.perf_counter()

trainer_stats = trainer.train()

training_time = time.perf_counter() - start_time  # seconds
print(f"\nContinued pre-training completed in {training_time/60:.2f} minutes")

The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 200 | Num Epochs = 2 | Total steps = 30
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 4,884,480 of 139,400,064 (3.50% trained)


Step,Training Loss
1,3.389
2,2.7564
3,3.1611
4,2.9064
5,2.8181
6,2.6154
7,2.8922
8,3.1849
9,2.8812
10,2.5097



Continued pre-training completed in 1.42 minutes


In [8]:
# Enable inference
FastLanguageModel.for_inference(model)

# Generation below samples (do_sample=True). Seed the RNG with the same seed
# used for LoRA and training so a re-run prints the same completions.
torch.manual_seed(3407)

# Sentence stems to be completed by the model.
test_prompts = [
    "Hypertension is",
    "Diabetes mellitus refers to",
    "Osteoporosis affects",
    "Asthma is characterized by",
    "Treatment for depression includes",
]

print("="*70)
print("TESTING CONTINUED PRE-TRAINED MODEL")
print("="*70)
print("The model learned medical terminology from raw text\n")

for prompt in test_prompts:
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    outputs = model.generate(
        **inputs,
        max_new_tokens=80,
        temperature=0.3,
        do_sample=True,
        top_p=0.9,
    )

    # The full sequence is decoded, so the printed completion starts with the prompt.
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)

    print(f"Prompt: {prompt}")
    print(f"Completion: {response}")
    print("-"*70)

TESTING CONTINUED PRE-TRAINED MODEL
The model learned medical terminology from raw text

Prompt: Hypertension is
Completion: Hypertension is a medical condition characterized by high blood pressure. It's a common cause of vision loss.

The most common cause of vision loss is diabetic retinopathy. It's a complication of diabetes.

The most common cause of blindness is macular degeneration. It's a complication of diabetes.

The most common cause of blindness is optic atrophy. It's a complication of diabetes.

The most
----------------------------------------------------------------------
Prompt: Diabetes mellitus refers to
Completion: Diabetes mellitus refers to a group of disorders characterized by high blood sugar levels. These disorders include type 1 and type 2 diabetes mellitus. Type 1 diabetes mellitus is an autoimmune disease characterized by insulin deficiency. Type 2 diabetes mellitus is an autoimmune disease characterized by insulin resistance.

Other disorders include Cushing'

In [9]:
# Ask question-style prompts and print only the newly generated text.
# NOTE(review): no base-model outputs are generated in this notebook, so the
# "improved" message printed at the end is not backed by a comparison here.
print("="*70)
print("DOMAIN KNOWLEDGE DEMONSTRATION")
print("="*70)

comparison_prompts = [
    "What is hypertension?",
    "Explain diabetes.",
    "Describe asthma symptoms.",
]

# Generation samples (do_sample=True); seed it so the responses are reproducible.
torch.manual_seed(3407)

for prompt in comparison_prompts:
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    outputs = model.generate(
        **inputs,
        max_new_tokens=100,
        temperature=0.5,
        do_sample=True,
    )

    # Remove the prompt by token count instead of by len(prompt) characters:
    # the decoded text is not guaranteed to begin with the prompt verbatim,
    # so character slicing can cut into (or leave part of) the response.
    prompt_token_count = inputs["input_ids"].shape[1]
    generated_ids = outputs[0][prompt_token_count:]
    clean_response = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

    print(f"\nQuestion: {prompt}")
    print(f"Response: {clean_response[:200]}")  # cap the printed text at 200 characters
    print("-"*70)

print("\nThe model shows improved medical terminology understanding!")

DOMAIN KNOWLEDGE DEMONSTRATION

Question: What is hypertension?
Response: 
----------------------------------------------------------------------

Question: Explain diabetes.
Response: Some people may have diabetes without any symptoms. However, if they do have it, they may experience symptoms such as frequent urination, thirst, and blurred vision. If you suspect you have diabetes, 
----------------------------------------------------------------------

Question: Describe asthma symptoms.
Response: How does asthma symptoms differ between children and adults?
----------------------------------------------------------------------

The model shows improved medical terminology understanding!


In [10]:
# Save continued pre-trained model
save_dir = "smollm2_medical_continued_pretrain"

model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

print(f"Model saved to: {save_dir}")

# Check size
!du -sh {save_dir}

Model saved to: smollm2_medical_continued_pretrain
24M	smollm2_medical_continued_pretrain
