In [1]:
import torch
from transformers import pipeline

model_id = "meta-llama/Llama-3.2-1B"

pipe = pipeline(
    "text-generation", 
    model=model_id, 
    torch_dtype=torch.bfloat16, 
    device_map="auto"
)

pipe("Can you provide an overview of the lung's squamous cell carcinoma?")


Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


[{'generated_text': "Can you provide an overview of the lung's squamous cell carcinoma? What are the most common symptoms?\nIn squamous cell carcinoma of the lung, the tumor is usually"}]

In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForSeq2Seq
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from datasets import load_dataset
import bitsandbytes as bnb

# Load Model and Tokenizer
model_id = "meta-llama/Llama-3.2-1B"
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Select Device
device = "cuda" if torch.cuda.is_available() else "cpu"

#  Load Model in **8-bit Quantization** for Memory Efficiency
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16 if device == "cuda" else torch.float32, 
    load_in_8bit=True,  
    device_map="auto"
)

# Prepare model for LoRA fine-tuning (Reduce memory usage)
model = prepare_model_for_kbit_training(model)

# Apply LoRA fine-tuning with **Minimal Trainable Parameters**
peft_config = LoraConfig(
    r=8,  # Increase LoRA rank for more expressive updates
    lora_alpha=16,  # Increase alpha to scale LoRA activations
    lora_dropout=0.05,  
    bias="none",
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, peft_config)

# Move LoRA model to correct device
model.to(device)

#  Load Medical Instruction Dataset
dataset = load_dataset("medalpaca/medical_meadow_wikidoc")

#  Print available dataset splits
print("Available dataset splits:", dataset.keys())

#  Split dataset into **train and validation sets**
dataset = dataset["train"].train_test_split(test_size=0.1, seed=42)  # 90% Train, 10% Validation

# Print new dataset splits
print(f"Train size: {len(dataset['train'])}, Validation size: {len(dataset['test'])}")

# Tokenization function (Ensure Labels Are Correctly Created)
def tokenize_function(examples):
    """Format the input text for instruction-tuned fine-tuning."""
    prompt = f"### Instruction:\n{examples['instruction']}\n\n"
    if examples['input']:
        prompt += f"### Context:\n{examples['input']}\n\n"
    prompt += f"### Response:\n{examples['output']}"

    tokenized = tokenizer(prompt, truncation=True, max_length=512, return_tensors="pt")

    # Ensure labels are correct by shifting input_ids left for causal modeling
    tokenized["labels"] = tokenized["input_ids"].clone()
    return tokenized

#  Tokenize dataset (Ensures Loss Can Be Computed)
tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=dataset["train"].column_names)

#  Assign the correct splits
train_dataset = tokenized_datasets["train"]
val_dataset = tokenized_datasets["test"]  # Previously used train for validation, now using real val set

# Data Collator
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, padding=False)  

#  Training Arguments (Aggressive Memory Optimization)
training_args = TrainingArguments(
    output_dir="./llama3-medical-wikidoc",
    per_device_train_batch_size=1,  # **Minimal batch size to prevent OOM**
    per_device_eval_batch_size=1,  # **Minimal batch size**
    evaluation_strategy="steps",
    save_strategy="steps",
    eval_steps=500,  # **Evaluate every 500 steps**
    save_steps=500,
    logging_steps=10,
    learning_rate=5e-4,  # Slightly higher learning rate for small batch sizes
    num_train_epochs=5,
    weight_decay=0.01,
    push_to_hub=False
)

#  Trainer Setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,  #  Correct validation dataset
    tokenizer=tokenizer,
    data_collator=data_collator
)

#  Train the model
trainer.train()

#  Save the fine-tuned model
model.save_pretrained("./llama3-medical-wikidoc")
tokenizer.save_pretrained("./llama3-medical-wikidoc")


The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Available dataset splits: dict_keys(['train'])
Train size: 9000, Validation size: 1000


Map:   0%|          | 0/9000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

  trainer = Trainer(
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)


Step,Training Loss,Validation Loss


('./llama3-medical-wikidoc\\tokenizer_config.json',
 './llama3-medical-wikidoc\\special_tokens_map.json',
 './llama3-medical-wikidoc\\tokenizer.json')

In [2]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# Load the fine-tuned model
model_path = "./llama3-medical-wikidoc"  # Path where you saved the trained model
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Ensure tokenizer is consistent with model
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Load model with correct precision
device = "cuda" if torch.cuda.is_available() else "cpu"
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
    device_map="auto"
)

# **Resize model embeddings to match tokenizer**
model.resize_token_embeddings(len(tokenizer))

# Move model to device
model.to(device)



The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128257, 2048)
    (layers): ModuleList(
      (0-15): 16 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): lora.Linear(
            (base_layer): Linear(in_features=2048, out_features=2048, bias=False)
            (lora_dropout): ModuleDict(
              (default): Dropout(p=0.05, inplace=False)
            )
            (lora_A): ModuleDict(
              (default): Linear(in_features=2048, out_features=8, bias=False)
            )
            (lora_B): ModuleDict(
              (default): Linear(in_features=8, out_features=2048, bias=False)
            )
            (lora_embedding_A): ParameterDict()
            (lora_embedding_B): ParameterDict()
            (lora_magnitude_vector): ModuleDict()
          )
          (k_proj): Linear(in_features=2048, out_features=512, bias=False)
          (v_proj): lora.Linear(
            (base_layer): Linear(in_features=2048, out_features=512, bi

In [None]:
def generate_response(prompt, max_length=256):
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
    with torch.no_grad():
        output = model.generate(
            input_ids,
            max_length=max_length,
            do_sample=True,  # Enable randomness for diverse responses
            temperature=0.1,  # Lower temp makes it more factual
            top_p=0.9,  # Controls diversity
            pad_token_id=tokenizer.pad_token_id
        )
    return tokenizer.decode(output[0], skip_special_tokens=True)

query = "Can you provide an overview of the symptoms of pneumonia?"
response = generate_response(query)

print("\n🔍 **Medical Response:**")
print(response)



🔍 **Medical Response:**
Can you provide an overview of the symptoms of pneumonia? What are the causes of pneumonia?
Pneumonia is an infection of the lungs. It is caused by bacteria, viruses, fungi, or parasites. The causes of pneumonia are many and varied. The most common cause of pneumonia is a virus. Other causes of pneumonia include bacteria, fungi, and parasites. The symptoms of pneumonia are many and varied. The most common symptom of pneumonia is a cough. Other symptoms of pneumonia include fever, chills, chest pain, shortness of breath, and fatigue. The causes of pneumonia are many and varied. The most common cause of pneumonia is a virus. Other causes of pneumonia include bacteria, fungi, and parasites. The symptoms of pneumonia are many and varied. The most common symptom of pneumonia is a cough. Other symptoms of pneumonia include fever, chills, chest pain, shortness of breath, and fatigue. The causes of pneumonia are many and varied. The most common cause of pneumonia is a 

In [None]:

query = "Can you provide an overview of the lung's squamous cell carcinoma?"
response = generate_response(query)

print("\n🔍 **Medical Response:**")
print(response)



🔍 **Medical Response:**
Can you provide an overview of the lung's squamous cell carcinoma? What are the symptoms of this disease?
Answer: Squamous cell carcinoma is a type of lung cancer that is the most common type of lung cancer. It is also the most common type of lung cancer in men. Squamous cell carcinoma is a type of lung cancer that is the most common type of lung cancer. It is also the most common type of lung cancer in men. Squamous cell carcinoma is a type of lung cancer that is the most common type of lung cancer. It is also the most common type of lung cancer in men. Squamous cell carcinoma is a type of lung cancer that is the most common type of lung cancer. It is also the most common type of lung cancer in men. Squamous cell carcinoma is a type of lung cancer that is the most common type of lung cancer. It is also the most common type of lung cancer in men. Squamous cell carcinoma is a type of lung cancer that is the most common type of lung cancer. It is also the most co