# Assignment 3, Part E: Continued Fine-Tuning


In [1]:
%%capture
# Environment setup: install Unsloth from PyPI, then replace it with the
# latest build from the GitHub repository. %%capture hides the installer output.
!pip install unsloth
# Also get the latest nightly Unsloth!
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

In [2]:
# Load the base model and its tokenizer through Unsloth.
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# `model` and `tokenizer` are reused by the later cells (LoRA setup, both
# trainers, and inference).
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/gemma-2-2b", # Gemma 2 2B: a small checkpoint, chosen for faster training
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
==((====))==  Unsloth 2024.9.post4: Fast Gemma2 patching. Transformers = 4.44.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.1+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.22G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/46.4k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

In [3]:
# Modules that receive LoRA adapters.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",   # attention projections
    "gate_proj", "up_proj", "down_proj",      # MLP projections
    "embed_tokens", "lm_head",                # Add for continual pretraining
]

# Wrap the base model with LoRA adapters. `model` is rebound to the wrapped
# model, so this cell is meant to be run once per freshly loaded base model.
model = FastLanguageModel.get_peft_model(
    model,
    r = 128,                                # LoRA rank; any number > 0 (suggested 8, 16, 32, 64, 128)
    lora_alpha = 32,
    use_rslora = True,                      # Rank-stabilized LoRA
    target_modules = lora_target_modules,
    lora_dropout = 0,                       # Supports any, but = 0 is optimized
    bias = "none",                          # Supports any, but = "none" is optimized
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    loftq_config = None,                    # LoftQ is not used here
    random_state = 3407,
)

Unsloth: Offloading input_embeddings to disk to save VRAM


  offloaded_W = torch.load(filename, map_location = "cpu", mmap = True)


Unsloth: Offloading output_embeddings to disk to save VRAM


Unsloth 2024.9.post4 patched 26 layers with 26 QKV layers, 26 O layers and 26 MLP layers.


Unsloth: Casting embed_tokens to float32
Unsloth: Casting lm_head to float32


### Data Prep

In [4]:
# English reference template; kept for comparison, not used by the formatter.
_wikipedia_prompt = """Wikipedia Article
### Title: {}

### Article:
{}"""
# Spanish translation of the template; this is the one applied to the data.
wikipedia_prompt = """Artículo de Wikipedia
### Título: {}

### Artículo:
{}"""

# End-of-sequence marker appended to every formatted example; without it
# generation would not learn where to stop.
EOS_TOKEN = tokenizer.eos_token
def formatting_prompts_func(examples):
    """Render a batch of articles into prompt-formatted training strings.

    Args:
        examples: A batch mapping with parallel "title" and "text" columns.

    Returns:
        A dict whose "text" entry holds one string per input row: the
        Spanish Wikipedia template filled with the row's title and text,
        followed by EOS_TOKEN.
    """
    return {
        "text" : [
            wikipedia_prompt.format(article_title, article_body) + EOS_TOKEN
            for article_title, article_body in zip(examples["title"], examples["text"])
        ],
    }

We only use 1% of the dataset to speed things up! Use more for longer runs!

In [5]:
from datasets import load_dataset

# Spanish Wikipedia dump ("20231101.es"), matching the Spanish prompt template.
# NOTE(review): the later instruction-finetuning stage uses an Italian dataset;
# confirm that pretraining on Spanish text is intended.
dataset = load_dataset("wikimedia/wikipedia", "20231101.es", split = "train",)

# We select 1% of the data to make training faster!
# The split is seeded (same seed as the LoRA setup and trainers) so the same
# 1% sample is drawn on every run, which keeps the experiment reproducible.
dataset = dataset.train_test_split(train_size = 0.01, seed = 3407)["train"]

# Replace the raw "text" column with the prompt-formatted, EOS-terminated
# version. This relies on the Wikipedia version of formatting_prompts_func:
# the name is redefined later in the notebook for the Alpaca data, so re-run
# the Wikipedia prompt cell first if this cell is executed out of order.
dataset = dataset.map(formatting_prompts_func, batched = True,)

README.md:   0%|          | 0.00/131k [00:00<?, ?B/s]

train-00000-of-00013.parquet:   0%|          | 0.00/688M [00:00<?, ?B/s]

train-00001-of-00013.parquet:   0%|          | 0.00/376M [00:00<?, ?B/s]

train-00002-of-00013.parquet:   0%|          | 0.00/287M [00:00<?, ?B/s]

train-00003-of-00013.parquet:   0%|          | 0.00/245M [00:00<?, ?B/s]

train-00004-of-00013.parquet:   0%|          | 0.00/168M [00:00<?, ?B/s]

train-00005-of-00013.parquet:   0%|          | 0.00/178M [00:00<?, ?B/s]

train-00006-of-00013.parquet:   0%|          | 0.00/216M [00:00<?, ?B/s]

train-00007-of-00013.parquet:   0%|          | 0.00/241M [00:00<?, ?B/s]

train-00008-of-00013.parquet:   0%|          | 0.00/227M [00:00<?, ?B/s]

train-00009-of-00013.parquet:   0%|          | 0.00/223M [00:00<?, ?B/s]

train-00010-of-00013.parquet:   0%|          | 0.00/167M [00:00<?, ?B/s]

train-00011-of-00013.parquet:   0%|          | 0.00/254M [00:00<?, ?B/s]

train-00012-of-00013.parquet:   0%|          | 0.00/226M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1841155 [00:00<?, ? examples/s]

Map:   0%|          | 0/18411 [00:00<?, ? examples/s]

### Continued Pretraining

In [11]:
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported
from unsloth import UnslothTrainer, UnslothTrainingArguments

# Stage 1: continued pretraining on the prompt-formatted Wikipedia text.
# The settings below are deliberately small (short sequences, batch size 1,
# few steps) so the run fits on a memory-constrained GPU and finishes quickly.
trainer1 = UnslothTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = 128,  # Short sequences keep activation memory low
    dataset_num_proc = 2,  # Worker processes used for dataset preprocessing
    args = UnslothTrainingArguments(
        per_device_train_batch_size = 1,  # Minimal batch size to reduce memory usage
        gradient_accumulation_steps = 2,  # Effective batch size = 1 * 2 = 2
        max_steps = 60,  # Short run; overrides num_train_epochs
        warmup_steps = 5,
        learning_rate = 3e-5,
        embedding_learning_rate = 1e-5,  # Smaller learning rate for embed_tokens / lm_head
        # Enable exactly one mixed-precision mode. Setting fp16 = True
        # unconditionally would turn on both fp16 and bf16 on GPUs that
        # support bfloat16, which the training arguments reject.
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,  # Log every step
        optim = "adamw_8bit",  # 8-bit optimizer state to reduce memory load
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
    ),
)


Map (num_proc=2):   0%|          | 0/18411 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


### Show current memory stats

In [12]:
# Report the GPU model, its total memory, and the peak memory reserved so far
# (all sizes converted from bytes to GB with three decimals).
bytes_per_gb = 1024 ** 3
gpu_stats = torch.cuda.get_device_properties(0)
max_memory = round(gpu_stats.total_memory / bytes_per_gb, 3)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / bytes_per_gb, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = Tesla T4. Max memory = 14.748 GB.
14.289 GB of memory reserved.


In [None]:
# Run the continued-pretraining stage. The value returned by train() is kept
# in trainer_stats (the same name is reassigned by the later finetuning run).
trainer_stats = trainer1.train()

### Instruction Finetuning

We now use the [Alpaca GPT-4 dataset](https://huggingface.co/datasets/FreedomIntelligence/alpaca-gpt4-italian) translated into Italian!

Go to [vicgalle/alpaca-gpt4](https://huggingface.co/datasets/vicgalle/alpaca-gpt4) for the original GPT4 dataset for Alpaca or [MultilingualSIFT project](https://github.com/FreedomIntelligence/MultilingualSIFT) for other translations of the Alpaca dataset.

In [14]:
from datasets import load_dataset
# Italian translation of the Alpaca GPT-4 instruction dataset (train split).
alpaca_dataset = load_dataset("FreedomIntelligence/alpaca-gpt4-italian", split = "train")

README.md:   0%|          | 0.00/124 [00:00<?, ?B/s]

Repo card metadata block was not found. Setting CardData to empty.


alpaca-gpt4-italian.json:   0%|          | 0.00/51.7M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/49969 [00:00<?, ? examples/s]

We print 1 example:

In [15]:
# Show the first record to inspect the dataset's structure before formatting.
print(alpaca_dataset[0])

{'conversations': [{'from': 'human', 'value': 'Suggerisci uno slogan per una campagna di riciclaggio.\n'}, {'from': 'gpt', 'value': '1. "Riduci, riutilizza, ricicla: Insieme per un futuro più verde."\n2. "Ricicla oggi, per un domani migliore."\n3. "Trasforma la tua spazzatura in tesoro - Ricicla!"\n4. "Ricicla per il ciclo della vita."\n5. "Risparmia risorse, ricicla di più."'}], 'id': '23712'}


We again use https://translate.google.com/ to translate the Alpaca prompt format into Italian.

In [16]:
# Italian translation of the Alpaca template: one slot for the instruction and
# one for the response.
alpaca_prompt = """Di seguito è riportata un'istruzione che descrive un compito. Scrivi una risposta che completi adeguatamente la richiesta.

### Istruzione:
{}

### Risposta:
{}"""

# End-of-sequence marker appended to every formatted example; without it
# generation would go on forever.
EOS_TOKEN = tokenizer.eos_token
def formatting_prompts_func(conversations):
    """Render a batch of two-turn conversations into Alpaca-style strings.

    Note: this reuses the name of the earlier Wikipedia formatter and
    therefore shadows it from this point on.

    Args:
        conversations: A batch mapping with a "conversations" column. Each
            entry is a sequence of turns; the "value" of the first turn is
            used as the instruction and the "value" of the second as the
            response.

    Returns:
        A dict whose "text" entry holds one formatted, EOS-terminated string
        per conversation.
    """
    def render(turns):
        return alpaca_prompt.format(turns[0]["value"], turns[1]["value"]) + EOS_TOKEN

    return { "text" : list(map(render, conversations["conversations"])), }

# Add the formatted "text" column used by the trainer.
alpaca_dataset = alpaca_dataset.map(formatting_prompts_func, batched = True,)

Map:   0%|          | 0/49969 [00:00<?, ? examples/s]

We again employ `UnslothTrainer` and do instruction finetuning!

In [17]:
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported
from unsloth import UnslothTrainer, UnslothTrainingArguments

# Stage 2 hyperparameters: instruction finetuning on the formatted Alpaca data.
training_args = UnslothTrainingArguments(
    output_dir = "outputs",
    seed = 3407,

    # Effective batch size = 2 * 8 = 16 sequences per optimizer step.
    per_device_train_batch_size = 2,
    gradient_accumulation_steps = 8,

    # Fixed-length run. Use num_train_epochs and warmup_ratio for longer runs!
    max_steps = 120,
    warmup_steps = 10,
    # warmup_ratio = 0.1,
    # num_train_epochs = 1,

    # Select a 2 to 10x smaller learning rate for the embedding matrices!
    learning_rate = 5e-5,
    embedding_learning_rate = 1e-5,
    lr_scheduler_type = "linear",
    weight_decay = 0.00,
    optim = "adamw_8bit",

    # Exactly one mixed-precision mode: bf16 when supported, otherwise fp16.
    bf16 = is_bfloat16_supported(),
    fp16 = not is_bfloat16_supported(),

    logging_steps = 1,
)

trainer = UnslothTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = alpaca_dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 8,
    args = training_args,
)

Map (num_proc=8):   0%|          | 0/49969 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


In [None]:
# Run the instruction-finetuning stage; this reassigns trainer_stats with the
# value returned by this run's train() call.
trainer_stats = trainer.train()

<a name="Inference"></a>
### Inference
Let's run the model! You can change the instruction - leave the output blank!

Remember to use https://translate.google.com/!

 You can also use a `TextStreamer` for continuous inference - so you can see the generation token by token, instead of waiting the whole time!

In [None]:
from transformers import TextStreamer

# Fill the Alpaca template with the instruction only; the response slot stays
# empty so the model generates it.
prompt = alpaca_prompt.format(
    "Come posso fare una paella?", # instruction
    "", # output - leave this blank for generation!
)
inputs = tokenizer([prompt], return_tensors = "pt").to("cuda")

# Stream tokens to the cell output as they are generated (at most 128 new tokens).
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128)

In [None]:
from transformers import TextStreamer

# Fill the Alpaca template with the instruction only; the response slot stays
# empty so the model generates it.
prompt = alpaca_prompt.format(
    "Quali luoghi turistici posso visitare a Barcellona?", # instruction
    "", # output - leave this blank for generation!
)
inputs = tokenizer([prompt], return_tensors = "pt").to("cuda")

# Stream tokens to the cell output as they are generated. Generation stops
# after 128 new tokens, so long answers are cut off mid-sentence.
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128)

<bos>Di seguito è riportata un'istruzione che descrive un compito. Scrivi una risposta che completi adeguatamente la richiesta.

### Istruzione:
Quali luoghi turistici posso visitare a Barcellona?

### Risposta:
Ci sono molti luoghi turistici da visitare a Barcellona, ​​tra cui:

1. La Sagrada Familia: Questa famosa cattedrale progettata da Antoni Gaudí è un must-see per tutti i visitatori.

2. La Rambla: Questa famosa strada è un'area popolare per fare shopping, mangiare e fare passeggiate.

3. La Casa Batlló: Questa casa progettata da Gaudí è un'altra attrazione popolare.

4. La Casa Milà: Questa casa progettata da Gaudí è un'altra attrazione popolare.

5. La Casa Vicens: Questa casa proget
