In [None]:
%%capture
# Installs Unsloth, Xformers (Flash Attention) and all other packages!
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate bitsandbytes

In [None]:
import os

from unsloth import FastLanguageModel
import torch
# Settings shared by the model-loading and trainer cells of this notebook.
max_seq_length = 4000 # Maximum sequence length passed to the model loader and the trainer. Choose any; RoPE scaling is handled internally by Unsloth.
dtype = None # None for auto detection. Float16 for Tesla T4, V100; Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

ModuleNotFoundError: No module named 'unsloth'

In [None]:
# Load the base model together with its tokenizer through Unsloth, using the
# max_seq_length / dtype / load_in_4bit settings defined earlier in the notebook.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "aaditya/Llama3-OpenBioLLM-8B",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # only needed for gated models like meta-llama/Llama-2-7b-hf
)

==((====))==  Unsloth 2024.8: Fast Llama patching. Transformers = 4.43.3.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.1+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.26.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


pytorch_model.bin.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

pytorch_model-00001-of-00004.bin:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

pytorch_model-00002-of-00004.bin:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

pytorch_model-00003-of-00004.bin:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

pytorch_model-00004-of-00004.bin:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/449 [00:00<?, ?B/s]

aaditya/Llama3-OpenBioLLM-8B does not have a padding token! Will use pad_token = <|reserved_special_token_250|>.


In [None]:
# Attach LoRA adapters to the loaded model; the returned model is the one handed to the trainer.
# NOTE(review): `model` is rebound to the result, so re-running this cell passes the
# already-adapted model back into get_peft_model.
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # LoRA rank. Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",], # modules that receive adapters
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407, # fixed seed for the adapter setup
    use_rslora = False,  # Rank stabilized LoRA is supported but not used here
    loftq_config = None, # LoftQ is supported but not used here
)

Unsloth 2024.8 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [None]:
# Alpaca-style prompt template used for both training and inference.
# The instruction text is fixed; the two {} slots are filled, in order,
# with the input and the response.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
You are a highly knowledgeable and empathetic dermatologist. Provide clear, accurate, and helpful information about various skin diseases, skin care routines, treatments, and related dermatological advice.

### Input:
{}

### Response:
{}
"""

EOS_TOKEN = tokenizer.eos_token  # End-of-sequence token; appended to every formatted training example

def formatting_prompts_func(examples):
    """Build the training text for every row of a batch.

    `examples` is indexed by column name and must provide the "Topic" and
    "Information" columns as parallel sequences. Each pair is inserted into
    `alpaca_prompt` and `EOS_TOKEN` is appended to the result.

    Returns a dict with the single key "text" holding the formatted strings.
    """
    topic_info_pairs = zip(examples["Topic"], examples["Information"])
    # EOS_TOKEN has to be appended, otherwise generation will go on forever.
    return {
        "text": [
            alpaca_prompt.format(topic, information) + EOS_TOKEN
            for topic, information in topic_info_pairs
        ]
    }

from datasets import load_dataset
# Load the train split of the Skin_diseases_and_care dataset, then run
# formatting_prompts_func over it in batches to produce the "text" field
# that the trainer reads.
dataset = load_dataset("brucewayne0459/Skin_diseases_and_care", split="train")
dataset = dataset.map(formatting_prompts_func, batched=True)


Downloading readme:   0%|          | 0.00/2.14k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/477k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/828 [00:00<?, ? examples/s]

Map:   0%|          | 0/828 [00:00<?, ? examples/s]

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Supervised fine-tuning trainer that reads the "text" field of the formatted dataset.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Packing can make training 5x faster for short sequences; disabled here.
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4, # 2 x 4 = total batch size of 8 on a single GPU
        warmup_steps = 5,
        max_steps = 120, # fixed step budget; overrides any num_train_epochs value
        learning_rate = 2e-4,
        fp16 = not is_bfloat16_supported(), # fall back to fp16 only when bf16 is unavailable
        bf16 = is_bfloat16_supported(),
        logging_steps = 1, # report the training loss at every step
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
    ),
)

Map (num_proc=2):   0%|          | 0/828 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


In [None]:
# Run the fine-tuning loop and keep whatever train() returns in trainer_stats.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 828 | Num Epochs = 2
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 120
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss
1,2.6382
2,2.8897
3,2.6666
4,2.3612
5,2.2058
6,2.1482
7,1.7816
8,1.5631
9,1.5229
10,1.5433


In [None]:
from transformers import TextStreamer

# Reuses alpaca_prompt from the prompt-template cell.
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

# A single prompt: the question fills the input slot and the response slot
# stays empty so the model generates it.
inputs = tokenizer(
    [alpaca_prompt.format("What are the precautions should be taken before going outside?", "")],
    return_tensors = "pt",
).to("cuda")

# Stream the generated tokens to the cell output as they are produced.
text_streamer = TextStreamer(tokenizer)
_ = model.generate(max_new_tokens = 128, streamer = text_streamer, **inputs)

Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
You are a highly knowledgeable and empathetic dermatologist. Provide clear, accurate, and helpful information about various skin diseases, skin care routines, treatments, and related dermatological advice.

### Input:
What are the precautions should be taken becore going outside?

### Response:

Before going outside, carefully consider the following precautions to reduce your risk of skin conditions. 

**Sun Protection:** Use sunscreen with an SPF of 30 or higher every day, even on cloudy days. Apply it on all exposed skin, and remember to reapply every two hours, especially if you're swimming or sweating.

**Hives:**  Prevent hives by avoiding heat, cold, stress, and certain foods and medications. 

**Skin Cancer:**  Protect your skin from UV radiation by seeking shade, avoiding tanning beds, and wearing protect

In [None]:
# Save the model and tokenizer locally, then upload both to the Hub.
# The Hub token is read from the HF_TOKEN environment variable so that no
# credential is written into the notebook; if the variable is unset the
# lookup raises KeyError instead of sending a placeholder string.
model.save_pretrained("OpenBio-derm") # Local saving
tokenizer.save_pretrained("OpenBio-derm")
model.push_to_hub("brucewayne0459/OpenBioLLm-Derm", token = os.environ["HF_TOKEN"]) # Online saving
tokenizer.push_to_hub("brucewayne0459/OpenBioLLm-Derm", token = os.environ["HF_TOKEN"]) # Online saving

Saved model to https://huggingface.co/brucewayne0459/OpenBioLLm-Derm


In [None]:
from huggingface_hub import ModelCard, ModelCardData
from huggingface_hub.utils import EntryNotFoundError
from typing import Dict, List, Optional, Tuple, Union
def create_and_tag_model_card(
    repo_id: str,
    tags: Optional[List[str]] = None,
    token: Optional[str] = None,
    ignore_metadata_errors: bool = False,
):
    """
    Loads the model card of `repo_id`, or creates a default one, and makes sure it carries `tags`.

    Args:
        repo_id (`str`):
            The repo_id where to look for the model card.
        tags (`List[str]`, *optional*):
            The list of tags to add in the model card. Tags already present on the card are not added again.
            The list itself is never modified and is not shared with the returned card.
        token (`str`, *optional*):
            Authentication token, obtained with `huggingface_hub.HfApi.login` method. Will default to the stored token.
        ignore_metadata_errors (`bool`, *optional*, defaults to `False`):
            If True, errors while parsing the metadata section will be ignored. Some information might be lost during
            the process. Use it at your own risk.

    Returns:
        The loaded or newly created `ModelCard`, with `data.tags` containing every requested tag. The card is
        only changed in memory; this function does not save or upload it.
    """
    # Work on a private copy so the caller's list can never be aliased by the card data.
    requested_tags = [] if tags is None else list(tags)

    try:
        # Check if the model card is present on the remote repo
        model_card = ModelCard.load(repo_id, token=token, ignore_metadata_errors=ignore_metadata_errors)
    except EntryNotFoundError:
        # Otherwise create a simple model card from template
        model_description = "This is the model card of a 🤗 transformers model that has been pushed on the Hub. This model card has been automatically generated."
        card_data = ModelCardData(tags=list(requested_tags), library_name="transformers")
        model_card = ModelCard.from_template(card_data, model_description=model_description)

    # An existing card may have no tags at all; normalise to a list before appending.
    if model_card.data.tags is None:
        model_card.data.tags = []

    for model_tag in requested_tags:
        if model_tag not in model_card.data.tags:
            model_card.data.tags.append(model_tag)

    return model_card


In [None]:
# Create and tag model card
repo_id = "brucewayne0459/OpenBioLLm-Derm"
tags = ["text-generation-inference", "transformers", "unsloth", "dermatology"]
model_card = create_and_tag_model_card(repo_id=repo_id, tags=tags, token="your_token")


In [None]:
# Merge the LoRA weights into 16-bit weights and upload the merged model to repo_id.
# The Hub token is read from the HF_TOKEN environment variable instead of a literal
# in the notebook; an unset variable raises KeyError before the merge starts.
model.push_to_hub_merged(repo_id, tokenizer, save_method="merged_16bit", token=os.environ["HF_TOKEN"])


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 4.13 out of 12.67 RAM for saving.


100%|██████████| 32/32 [01:35<00:00,  2.97s/it]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving model... This might take 5 minutes for Llama-7b...
Unsloth: Saving OpenBioLLm-Derm/pytorch_model-00001-of-00004.bin...
Unsloth: Saving OpenBioLLm-Derm/pytorch_model-00002-of-00004.bin...
Unsloth: Saving OpenBioLLm-Derm/pytorch_model-00003-of-00004.bin...
Unsloth: Saving OpenBioLLm-Derm/pytorch_model-00004-of-00004.bin...


README.md:   0%|          | 0.00/5.36k [00:00<?, ?B/s]

  0%|          | 0/4 [00:00<?, ?it/s]

pytorch_model-00001-of-00004.bin:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

pytorch_model-00003-of-00004.bin:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

pytorch_model-00004-of-00004.bin:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

pytorch_model-00002-of-00004.bin:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

Done.
Saved merged model to https://huggingface.co/brucewayne0459/OpenBioLLm-Derm


In [None]:
# Convert the model to GGUF and upload it to the -gguf repository. No
# quantization_method is passed; the recorded output of this cell shows q8_0.
# The Hub token is read from the HF_TOKEN environment variable instead of a literal
# in the notebook; an unset variable raises KeyError before the conversion starts.
model.push_to_hub_gguf("brucewayne0459/OpenBioLLm-Derm-gguf", tokenizer, token = os.environ["HF_TOKEN"])


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 4.23 out of 12.67 RAM for saving.


100%|██████████| 32/32 [02:19<00:00,  4.37s/it]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving model... This might take 5 minutes for Llama-7b...
Unsloth: Saving brucewayne0459/OpenBioLLm-Derm-gguf/pytorch_model-00001-of-00004.bin...
Unsloth: Saving brucewayne0459/OpenBioLLm-Derm-gguf/pytorch_model-00002-of-00004.bin...
Unsloth: Saving brucewayne0459/OpenBioLLm-Derm-gguf/pytorch_model-00003-of-00004.bin...
Unsloth: Saving brucewayne0459/OpenBioLLm-Derm-gguf/pytorch_model-00004-of-00004.bin...
Done.


Unsloth: Converting llama model. Can use fast conversion = False.


==((====))==  Unsloth: Conversion from QLoRA to GGUF information
   \\   /|    [0] Installing llama.cpp will take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GGUF 16bits will take 3 minutes.
\        /    [2] Converting GGUF 16bits to ['q8_0'] will take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: [0] Installing llama.cpp. This will take 3 minutes...
Unsloth: [1] Converting model at brucewayne0459/OpenBioLLm-Derm-gguf into q8_0 GGUF format.
The output location will be ./brucewayne0459/OpenBioLLm-Derm-gguf/unsloth.Q8_0.gguf
This will take 3 minutes...
INFO:hf-to-gguf:Loading model: OpenBioLLm-Derm-gguf
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:gguf: loading model weight map from 'pytorch_model.bin.index.json'
INFO:hf-to-gguf:gguf: loading model part 'pytorch_model-00001-of-00004.bin'
INFO:hf-to-gguf:token_embd.weight,           torch.float16 --> Q8_0, shape = 

  0%|          | 0/1 [00:00<?, ?it/s]

unsloth.Q8_0.gguf:   0%|          | 0.00/8.54G [00:00<?, ?B/s]

Saved GGUF to https://huggingface.co/brucewayne0459/OpenBioLLm-Derm-gguf


In [None]:
# Upload a q4_k_m-quantized GGUF build to the same -gguf repository.
# The Hub token is read from the HF_TOKEN environment variable instead of a literal
# in the notebook; an unset variable raises KeyError before the conversion starts.
model.push_to_hub_gguf(
    "brucewayne0459/OpenBioLLm-Derm-gguf",
    tokenizer,
    quantization_method = "q4_k_m",
    token = os.environ["HF_TOKEN"],
)


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 4.73 out of 12.67 RAM for saving.


100%|██████████| 32/32 [01:26<00:00,  2.69s/it]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving model... This might take 5 minutes for Llama-7b...
Unsloth: Saving brucewayne0459/OpenBioLLm-Derm-gguf/pytorch_model-00001-of-00004.bin...
Unsloth: Saving brucewayne0459/OpenBioLLm-Derm-gguf/pytorch_model-00002-of-00004.bin...
Unsloth: Saving brucewayne0459/OpenBioLLm-Derm-gguf/pytorch_model-00003-of-00004.bin...
Unsloth: Saving brucewayne0459/OpenBioLLm-Derm-gguf/pytorch_model-00004-of-00004.bin...
Done.
==((====))==  Unsloth: Conversion from QLoRA to GGUF information
   \\   /|    [0] Installing llama.cpp will take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GGUF 16bits will take 3 minutes.
\        /    [2] Converting GGUF 16bits to ['q4_k_m'] will take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: [0] Installing llama.cpp. This will take 3 minutes...
Unsloth: [1] Converting model at brucewayne0459/OpenBioLLm-Derm-gguf into f16 GGUF format.
The output location will be ./brucewayne045

  0%|          | 0/1 [00:00<?, ?it/s]

unsloth.F16.gguf:   0%|          | 0.00/16.1G [00:00<?, ?B/s]

Saved GGUF to https://huggingface.co/brucewayne0459/OpenBioLLm-Derm-gguf
Unsloth: Uploading GGUF to Huggingface Hub...


  0%|          | 0/1 [00:00<?, ?it/s]

unsloth.Q4_K_M.gguf:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

Saved GGUF to https://huggingface.co/brucewayne0459/OpenBioLLm-Derm-gguf


In [None]:
# Disabled on purpose (`if False`): uploading an f16 GGUF build from an explicit
# temporary location. Change the guard to True to run it.
# The Hub token is read from the HF_TOKEN environment variable instead of a literal
# in the notebook.
if False:
    model.push_to_hub_gguf(
        "brucewayne0459/OpenBioLLm-Derm-gguf",
        tokenizer,
        quantization_method = "f16",
        token = os.environ["HF_TOKEN"],
        temporary_location = "/content/brucewayne0459/OpenBioLLm-Derm-gguf",
    )

In [None]:
# Upload only the LoRA adapters (save_method = "lora") to a separate -lora repository.
# The Hub token is read from the HF_TOKEN environment variable instead of a literal
# in the notebook; an unset variable raises KeyError before the upload starts.
model.push_to_hub_merged(
    "brucewayne0459/OpenBioLLm-Derm-lora",
    tokenizer,
    save_method = "lora",
    token = os.environ["HF_TOKEN"],
)

Unsloth: Saving LoRA adapters. Please wait...


README.md:   0%|          | 0.00/5.28k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/704 [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/168M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.29k [00:00<?, ?B/s]

Saved lora model to https://huggingface.co/brucewayne0459/OpenBioLLm-Derm-lora
