<a href="https://colab.research.google.com/github/rezzie-rich/colab-notebooks/blob/main/unsloth_coco_se(step2).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install Required Packages

In [1]:
# %%capture
!pip install torchdata -f https://download.pytorch.org/whl/nightly/cu118.html
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps xformers "trl<0.9.0" transformers datasets peft accelerate bitsandbytes safetensors portalocker urllib3

import torch
print("CUDA Available:", torch.cuda.is_available())
print("CUDA Version:", torch.version.cuda)
print("cuDNN Version:", torch.backends.cudnn.version())

Looking in links: https://download.pytorch.org/whl/nightly/cu118.html
Collecting torchdata
  Downloading torchdata-0.7.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.7/4.7 MB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=2->torchdata)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=2->torchdata)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=2->torchdata)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=2->torchdata)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=2->torchd

# Importing Libraries

In [2]:
import os
import urllib3
from unsloth import FastLanguageModel, is_bfloat16_supported
from datasets import load_dataset, DatasetDict, Dataset, concatenate_datasets
from torch.utils.data import DataLoader
from trl import SFTTrainer
from transformers import TrainingArguments, TextStreamer
from unsloth.chat_templates import get_chat_template

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


# Define local storage path

In [3]:
local_storage_path = "/content/unsloth-models"

# Custom function to handle HTTP range requests using urllib3

In [4]:
def fetch_data_with_range_requests(url, start, end):
    http = urllib3.PoolManager()
    headers = {'Range': f'bytes={start}-{end}'}

    try:
        response = http.request('GET', url, headers=headers, preload_content=False)
        if response.status == 206:
            data = response.read()
            response.release_conn()
            return data
        else:
            raise ValueError("Range requests are not supported, falling back to full download.")
    except ValueError as e:
        # Fallback: download the entire file
        print(e)
        response = http.request('GET', url, preload_content=False)
        data = response.read()
        response.release_conn()
        return data
    except Exception as e:
        log_error("fetching data", e)
        raise

# Function to log error messages

In [5]:
def log_error(stage, error):
    print(f"Error during {stage}: {error}")

# Initialize global tokenizer variable

In [6]:
tokenizer = None

# Dynamic function to load and prepare datasets from Hugging Face

In [7]:
def load_and_prepare_dataset(dataset_name, splits=None, chat_template="phi-3", training_args=None):
    try:
        global tokenizer
        if splits:
            datasets = [load_dataset(dataset_name, split=split, streaming=True, block_size=0, trust_remote_code=True) for split in splits]
            dataset = concatenate_datasets(datasets)
        else:
            dataset = load_dataset(dataset_name, streaming=True, block_size=0, trust_remote_code=True)
            splits = [None]

        # Check if the dataset has enough samples
        sample_count = sum(1 for _ in dataset)
        if sample_count < training_args.per_device_train_batch_size:
            raise ValueError("Dataset does not have enough samples to yield at least one packed sequence.")

        # Get the first sample to determine the field name
        sample = next(iter(dataset))
        field = list(sample.keys())[0]  # Assuming the first field is the one we need

        tokenizer = get_chat_template(
            tokenizer,
            chat_template=chat_template,
            mapping={"role": "from", "content": "value", "user": "human", "assistant": "gpt"}
        )

        def formatting_prompts_func(examples):
            texts = examples[field]
            formatted_texts = [tokenizer.apply_chat_template({"from": "human", "value": text}, tokenize=False, add_generation_prompt=False) for text in texts]
            return {field: formatted_texts}

        dataset = dataset.map(formatting_prompts_func, batched=True)
        train_loader = DataLoader(dataset, batch_size=training_args.per_device_train_batch_size, collate_fn=lambda x: tokenizer(x[field], padding=True, truncation=True, return_tensors="pt"))
        return dataset, train_loader, field
    except Exception as e:
        log_error("loading or preparing the dataset", e)
        raise

# Model Configuration

In [8]:
max_seq_length = 4096
dtype = None
load_in_4bit = True

# Initialize model and tokenizer

In [9]:
try:
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name="unsloth/Phi-3-medium-4k-instruct-bnb-4bit",
        max_seq_length=max_seq_length,
        dtype=dtype,
        load_in_4bit=load_in_4bit
    )
except Exception as e:
    log_error("model initialization", e)
    raise

config.json:   0%|          | 0.00/1.16k [00:00<?, ?B/s]

==((====))==  Unsloth: Fast Mistral patching release 2024.5
   \\   /|    GPU: NVIDIA L4. Max memory: 22.168 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.0+cu121. CUDA = 8.9. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. Xformers = 0.0.26.post1. FA = False.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


model.safetensors.index.json:   0%|          | 0.00/165k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/3.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.72G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/172 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/3.14k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


# Determine the appropriate dtype for mixed precision training

In [10]:
use_fp16 = not is_bfloat16_supported()
use_bf16 = is_bfloat16_supported()

# Add LoRA adapters for the model

In [11]:
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_alpha=16,
    lora_dropout=0.4,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3399,
    use_rslora=True,
    loftq_config=None
)

Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.4.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.
Unsloth 2024.5 patched 40 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


# Verify dataset and load the first dataset

In [12]:
dataset_name1 = "allenai/dolma"
splits1 = ["train"]  # Specify the split you want to use
try:
    training_args1 = TrainingArguments(
        per_device_train_batch_size=4,
        gradient_accumulation_steps=16,
        warmup_steps=800,
        max_steps=300000,
        learning_rate=1e-4,
        fp16=use_fp16,
        bf16=use_bf16,
        logging_steps=7,
        optim="adamw_8bit",
        weight_decay=0.07,
        lr_scheduler_type="cosine",
        seed=3399,
        output_dir=f"{local_storage_path}/outputs_phase1",
    )
    dataset1, train_loader1, dataset_field1 = load_and_prepare_dataset(dataset_name1, splits1, training_args=training_args1)
except Exception as e:
    log_error("verifying or loading the first dataset", e)
    raise

Downloading builder script:   0%|          | 0.00/4.83k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/8.75k [00:00<?, ?B/s]

Error during loading or preparing the dataset: BuilderConfig BuilderConfig(name='v1_7', version='1.7.0', data_dir=None, data_files=None, description='Dolma: an Open Corpus of Three Trillion Tokens for Language Model Pretraining Research\n (Apr 2024)') doesn't have a 'block_size' key.
Error during verifying or loading the first dataset: BuilderConfig BuilderConfig(name='v1_7', version='1.7.0', data_dir=None, data_files=None, description='Dolma: an Open Corpus of Three Trillion Tokens for Language Model Pretraining Research\n (Apr 2024)') doesn't have a 'block_size' key.


ValueError: BuilderConfig BuilderConfig(name='v1_7', version='1.7.0', data_dir=None, data_files=None, description='Dolma: an Open Corpus of Three Trillion Tokens for Language Model Pretraining Research\n (Apr 2024)') doesn't have a 'block_size' key.

# Training with the first dataset

In [None]:
try:
    trainer1 = SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        train_dataset=train_loader1,
        dataset_text_field=dataset_field1,
        max_seq_length=max_seq_length,
        dataset_num_proc=4,
        packing=True,
        args=training_args1,
    )

    trainer1.train()
except Exception as e:
    log_error("training with the first dataset", e)
    raise

# Save the intermediate model

In [None]:
try:
    os.makedirs(f"{local_storage_path}/intermediate_model", exist_ok=True)
    model.save_pretrained(f"{local_storage_path}/intermediate_model")
except Exception as e:
    log_error("saving the intermediate model", e)
    raise

# Load the intermediate model

In [None]:
try:
    model = FastLanguageModel.from_pretrained(f"{local_storage_path}/intermediate_model")
except Exception as e:
    log_error("loading the intermediate model", e)
    raise

# Add LoRA adapters for the model for the second dataset

In [None]:
model = FastLanguageModel.get_peft_model(
    model,
    r=32,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_alpha=32,
    lora_dropout=0.2,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3399,
    use_rslora=True,
    loftq_config=None
)

# Verify dataset and load the second dataset

In [None]:
dataset_name2 = "xingyaoww/code-act"
splits2 = ["codeact", "general"]  # Specify the splits you want to use
try:
    training_args2 = TrainingArguments(
        per_device_train_batch_size=16,
        gradient_accumulation_steps=2,
        warmup_steps=50,
        max_steps=1000,
        learning_rate=1e-4,
        fp16=use_fp16,
        bf16=use_bf16,
        logging_steps=7,
        optim="adamw_8bit",
        weight_decay=0.07,
        lr_scheduler_type="cosine",
        seed=3399,
        output_dir=f"{local_storage_path}/outputs_phase2",
    )
    dataset2, train_loader2, dataset_field2 = load_and_prepare_dataset(dataset_name2, splits2, training_args=training_args2)
except Exception as e:
    log_error("verifying or loading the second dataset", e)
    raise

# Training with the second dataset

In [None]:
try:
    trainer2 = SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        train_dataset=train_loader2,
        dataset_text_field=dataset_field2,
        max_seq_length=max_seq_length,
        dataset_num_proc=4,
        packing=True,
        args=training_args2,
    )

    trainer2.train()
except Exception as e:
    log_error("training with the second dataset", e)
    raise

# Inference

In [None]:
tokenizer = get_chat_template(
    tokenizer,
    chat_template="phi-3",
    mapping={"role": "from", "content": "value", "user": "human", "assistant": "gpt"}
)

FastLanguageModel.for_inference(model)

messages = [
    {"from": "human", "value": "Continue the Fibonacci sequence: 1, 1, 2, 3, 5, 8,"},
]
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_tensors="pt",
).to("cuda")

streamer = TextStreamer(tokenizer)
outputs = model.generate(input_ids=inputs, max_new_tokens=64, use_cache=True, streamer=streamer)

response = tokenizer.batch_decode(outputs, skip_special_tokens=True)
print("Generated response:", response[0])

# Save the final model in FP16 safetensors format

In [None]:
final_model_name01 = "coco-se-phi3-medium-pro-4k"
try:
    model = model.to(dtype=torch.float16)
    model.push_to_hub(
        repo_id=f"{final_model_name01}",
        use_auth_token="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",  # Replace with your Hugging Face token
        safe_serialization=True
    )
except Exception as e:
    log_error("pushing FP16 safetensors model", e)
    raise

# Quantize and save Q8 model in safetensors format

In [None]:
final_model_name02 = "coco-se-phi3-medium-4k"
try:
    quantized_model = model.quantize(
        bits=8,
        dtype=torch.float16,
        quantization_method="q8_0"
    )
    quantized_model.push_to_hub(
        repo_id=f"{final_model_name02}",
        use_auth_token="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",  # Replace with your Hugging Face token
        safe_serialization=True
    )
except Exception as e:
    log_error("pushing Q8 safetensors model", e)
    raise

# Quantize and save Q8 model in GGUF format

In [None]:
final_model_name03 = "coco-se-phi3-medium-4k-GGUF"
try:
    model.push_to_hub_gguf(
        f"{final_model_name03}",
        quantization_method="q8_0",
        use_auth_token="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"  # Replace with your Hugging Face token
    )
except Exception as e:
    log_error("pushing GGUF model", e)
    raise