In [1]:
import os
# Environment detection: the joined names of all environment variables are
# searched for the substring "COLAB_"; if it is absent we treat this as a
# non-Colab machine.
if "COLAB_" not in "".join(os.environ.keys()):
  # Not on Colab: plain install, letting pip resolve unsloth's dependencies.
  !pip install unsloth
else:
  # On Colab: install the listed packages with --no-deps so pip does not pull
  # in their dependencies (presumably to leave Colab's preinstalled stack
  # untouched — confirm). xformers is pinned to an exact version.
  !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl triton cut_cross_entropy unsloth_zoo
  # Remaining helper packages are installed normally; datasets is constrained to >=3.4.1,<4.0.0.
  !pip install sentencepiece protobuf "datasets>=3.4.1,<4.0.0" huggingface_hub hf_transfer
  !pip install --no-deps unsloth
# Runs in both branches: pin transformers to an exact version.
!pip install transformers==4.52.3

Collecting bitsandbytes
  Downloading bitsandbytes-0.46.1-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting xformers==0.0.29.post3
  Downloading xformers-0.0.29.post3-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (1.0 kB)
Collecting trl
  Downloading trl-0.19.1-py3-none-any.whl.metadata (10 kB)
Collecting cut_cross_entropy
  Downloading cut_cross_entropy-25.1.1-py3-none-any.whl.metadata (9.3 kB)
Collecting unsloth_zoo
  Downloading unsloth_zoo-2025.7.7-py3-none-any.whl.metadata (8.1 kB)
Downloading xformers-0.0.29.post3-cp311-cp311-manylinux_2_28_x86_64.whl (43.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.4/43.4 MB[0m [31m19.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading bitsandbytes-0.46.1-py3-none-manylinux_2_24_x86_64.whl (72.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.9/72.9 MB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trl-0.19.1-py3-none-any.whl (376 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
# Load the CSM-1B checkpoint and its processor through Unsloth's FastModel loader.
# The import order of the original cell is kept (unsloth first).
from unsloth import FastModel
import torch
from transformers import CsmForConditionalGeneration

model, processor = FastModel.from_pretrained(
    model_name="unsloth/csm-1b",
    max_seq_length=2048,  # maximum sequence length handed to the loader
    dtype=None,  # None: let the loader auto-detect a dtype for the GPU
    auto_model=CsmForConditionalGeneration,  # model class to instantiate
    load_in_4bit=False,  # no 4-bit quantisation, so LoRA runs in 16-bit
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.7.5: Fast Mimi patching. Transformers: 4.52.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: QLoRA and full finetuning all not selected. Switching to 16bit LoRA.


model.safetensors:   0%|          | 0.00/4.15G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/264 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/449 [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

unsloth/csm-1b does not have a padding token! Will use pad_token = <|PAD_TOKEN|>.


## LoRA Adapters
- LoRA adapters let us fine-tune the model by updating only a small fraction (about 1 to 10%) of all parameters.

In [4]:
# Wrap the loaded model with LoRA adapters so only the adapter weights train.
# NOTE(review): this rebinds `model` to the wrapped model, so re-running the
# cell without reloading would pass an already-wrapped model back in.
model = FastModel.get_peft_model(
    model,  # the model returned by FastModel.from_pretrained
    r=16,  # LoRA rank: larger means more adapter capacity but more memory
    # Modules that receive adapters: the attention projections (q/k/v/o) and
    # the MLP projections (up/down/gate).
    target_modules=['q_proj', 'k_proj', 'v_proj', 'o_proj', 'up_proj', 'down_proj', 'gate_proj'],
    lora_alpha=16,  # LoRA scaling numerator
    lora_dropout=0,  # no dropout on the adapter path
    bias='none',  # do not train bias terms
    use_gradient_checkpointing='unsloth',  # Unsloth's checkpointing mode, trades compute for VRAM
    random_state=44,  # seed for adapter initialisation
    # Fixed keyword spelling: the rank-stabilised LoRA option is `use_rslora`.
    # The original `use_rlora` did not name that option. The value is unchanged
    # (False = standard LoRA scaling).
    use_rslora=False,
    loftq_config=None,  # no LoftQ initialisation
)

Unsloth: Making `model.base_model.model.backbone_model` require gradients


## Dataset Prep

In [13]:
#@title Dataset Prep functions
from datasets import load_dataset, Audio, Dataset
import os
from transformers import AutoProcessor
# Processor used to turn text + audio conversations into model inputs.
# NOTE(review): this rebinds `processor`, which the model-loading cell also
# assigns from FastModel.from_pretrained — confirm the replacement is intended.
processor = AutoProcessor.from_pretrained("unsloth/csm-1b")

raw_ds = load_dataset("MrDragonFox/Elise", split="train")

# Choose the column that identifies the speaker (used as the conversation
# role during preprocessing). Preference order: an existing "source" column,
# then "speaker_id"; if neither exists, add a constant "source" column of "0".
speaker_key = "source"
if "source" not in raw_ds.column_names:
    if "speaker_id" in raw_ds.column_names:
        speaker_key = "speaker_id"
    else:
        print("Unsloth: No speaker found, adding default \"source\" of 0 for all examples")
        new_column = ["0"] * len(raw_ds)
        raw_ds = raw_ds.add_column("source", new_column)

# Have the dataset decode/resample audio at a single fixed rate.
target_sampling_rate = 24000
raw_ds = raw_ds.cast_column("audio", Audio(sampling_rate=target_sampling_rate))

def preprocess_example(example):
    """Convert one raw dataset row into the tensors used for training.

    The row's text and audio are wrapped in a single-turn conversation whose
    role is the speaker id, then passed through
    ``processor.apply_chat_template`` with fixed-length padding.

    Reads the module-level names ``processor``, ``speaker_key``,
    ``target_sampling_rate`` and ``torch``.

    Args:
        example: A dataset row providing ``example[speaker_key]``,
            ``example["text"]`` and ``example["audio"]["array"]``.

    Returns:
        A dict with the keys ``input_ids``, ``attention_mask``, ``labels``,
        ``input_values`` and ``input_values_cutoffs`` (element 0 of each
        processor output), or ``None`` — after printing a message — when the
        processor raises, a required key is missing, or a value is not a
        ``torch.Tensor``.

    NOTE(review): how ``Dataset.map`` treats a ``None`` returned for an
    individual row depends on the installed ``datasets`` version — confirm
    that failed rows are handled the way you intend.
    """
    # Single-turn conversation: speaker id as the role, text + waveform as content.
    conversation = [
        {
            "role": str(example[speaker_key]),
            "content": [
                {"type": "text", "text": example["text"]},
                {"type": "audio", "path": example["audio"]["array"]},
            ],
        }
    ]

    try:
        model_inputs = processor.apply_chat_template(
            conversation,
            tokenize=True,
            return_dict=True,
            output_labels=True,
            text_kwargs={
                "padding": "max_length",  # always pad to max_length
                # NOTE(review): the original comment described this as "the max
                # length of audio" although it sits in text_kwargs — confirm
                # which sequence this 256 actually bounds.
                "max_length": 256,
                "pad_to_multiple_of": 8,
                "padding_side": "right",
            },
            audio_kwargs={
                # Use the same rate the dataset's audio column was cast to,
                # instead of repeating the literal, so the two cannot drift.
                "sampling_rate": target_sampling_rate,
                # Described in the original as the longest input_values in the
                # dataset; 240001 samples is ~10 s at 24 kHz.
                "max_length": 240001,
                "padding": "max_length",
            },
            common_kwargs={"return_tensors": "pt"},
        )
    except Exception as e:
        # Deliberate best-effort: report the failing row and skip it.
        print(f"Error processing example with text '{example['text'][:50]}...': {e}")
        return None

    required_keys = ["input_ids", "attention_mask", "labels", "input_values", "input_values_cutoffs"]
    processed_example = {}
    for key in required_keys:
        if key not in model_inputs:
            print(f"Warning: Required key '{key}' not found in processor output for example.")
            return None

        # Keep element 0 of each output (presumably dropping the batch
        # dimension added by return_tensors="pt" — confirm).
        processed_example[key] = model_inputs[key][0]

    # Final sanity check: every collected value must be a tensor.
    if not all(isinstance(value, torch.Tensor) for value in processed_example.values()):
        print(f"Error: Not all required keys are tensors in final processed example. Keys: {list(processed_example.keys())}")
        return None

    return processed_example

# Run preprocess_example over every row of raw_ds. All original columns are
# removed, so the resulting dataset holds only what preprocess_example returns.
processed_ds = raw_ds.map(
    preprocess_example,
    remove_columns=raw_ds.column_names,
    desc="Preprocessing dataset",  # label shown on the progress bar
)

Unsloth: No speaker found, adding default "source" of 0 for all examples


Preprocessing dataset:   0%|          | 0/1195 [00:00<?, ? examples/s]

## Model Training:

In [16]:
from transformers import TrainingArguments, Trainer
from unsloth import is_bfloat16_supported

# Decide the mixed-precision mode once: bf16 when the GPU supports it, fp16 otherwise.
use_bf16 = is_bfloat16_supported()

training_args = TrainingArguments(
    # Batching: 2 examples per device x 4 accumulation steps = 8 examples
    # contributing to each optimizer step on a single GPU.
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,

    # Schedule and optimizer.
    warmup_steps=5,
    max_steps=60,  # counted in optimizer steps, not epochs
    learning_rate=2e-4,
    optim="adamw_8bit",
    weight_decay=0.01,
    # lr_scheduler_type is left at the library default (it was commented out
    # in the original cell).

    # Precision: exactly one of the two flags is True.
    fp16=not use_bf16,
    bf16=use_bf16,

    # Logging, reproducibility and output.
    logging_steps=1,  # log the training loss at every step
    seed=44,
    output_dir="outputs",  # directory for checkpoints and logs
    report_to="none",  # no external experiment tracker
)

trainer = Trainer(
    model=model,
    train_dataset=processed_ds,
    args=training_args,
)

In [17]:
# Run the fine-tuning loop; whatever train() returns is kept in trainer_stats.
trainer_stats=trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 1,195 | Num Epochs = 1 | Total steps = 60
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 14,516,224 of 1,646,616,385 (0.88% trained)
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,4.5084
2,4.9894
3,4.9111
4,5.1426
5,5.6272
6,5.1014
7,5.1716
8,4.7915
9,5.0954
10,5.3728
