### Summary
- Fine tuning NLLB using GaLore 
### version 1: 
- 


In [1]:
%pip install --upgrade pip
# Restart kernel

Collecting pip
  Using cached pip-24.2-py3-none-any.whl (1.8 MB)
Installing collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 22.2.2
    Uninstalling pip-22.2.2:
      Successfully uninstalled pip-22.2.2
Successfully installed pip-24.2
Note: you may need to restart the kernel to use updated packages.


In [1]:
%pip install -q -U bitsandbytes
%pip install git+https://github.com/jiaweizzhao/GaLore
%pip install -U git+https://github.com/huggingface/transformers.git
%pip install -q -U accelerate
%pip install -q -U datasets
%pip install -q -U trl
%pip install tensorly
%pip install -U flash-attn
%pip install boto3

Note: you may need to restart the kernel to use updated packages.
Collecting git+https://github.com/jiaweizzhao/GaLore
  Cloning https://github.com/jiaweizzhao/GaLore to /tmp/pip-req-build-g0vyc9tm
  Running command git clone --filter=blob:none --quiet https://github.com/jiaweizzhao/GaLore /tmp/pip-req-build-g0vyc9tm
  Resolved https://github.com/jiaweizzhao/GaLore to commit 2cc66f88cce189e505affbb91042a8e77f5bf4e9
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: galore-torch
  Building wheel for galore-torch (setup.py) ... [?25ldone
[?25h  Created wheel for galore-torch: filename=galore_torch-1.0-py3-none-any.whl size=14682 sha256=0ee70f3808c0ee20fbdc25d415cf8bb4bb6e4fa80625144be0b6ad9e8ba9f3d2
  Stored in directory: /tmp/pip-ephem-wheel-cache-edngv0cq/wheels/88/47/b5/ca5f75e9f8a2eef76440b7070f8e82f0099831c3d13ebbe221
Successfully built galore-torch
Installing collected packages: galore-torch
Successfully installed galore-torch-1.0
Note: you may

In [2]:
import torch
from datasets import load_dataset
from trl import SFTTrainer
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, Seq2SeqTrainingArguments, Seq2SeqTrainer

In [4]:
model_name = "facebook/nllb-200-3.3B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.padding_side = 'right' 

In [5]:
import json
import os

with open('root/credentials.json') as f:
    credentials = json.load(f)

# Set environment variables
for key, value in credentials.items():
    if not isinstance(value, str):
        value = json.dumps(value)  # Convert dicts or lists to string
    os.environ[key] = value

huggingface_api_key = os.getenv('HUGGINGFACE_API_KEY')
aws_access_key = os.getenv('AWS_ACCESS_KEY')
aws_secret_key = os.getenv('AWS_SECRET_KEY')

In [7]:
%pip install -q nbconvert
# Import Task from clearml
from clearml import Task
task = Task.init(project_name="HuggingFace Transformers",
    task_name="GaLore NLLB",
    output_uri=False) # don’t save any of the models to clearml
task.set_parameters_as_dict({ 'save_checkpoints': False })    

Note: you may need to restart the kernel to use updated packages.
ClearML Task: created new task id=790323c33bab41c698922a60b6f7abc5
2024-10-06 22:29:46,012 - clearml.Task - INFO - Storing jupyter notebook directly as code
ClearML results page: https://app.sil.hosted.allegro.ai/projects/96152c47119843d7a026576ef08e348f/experiments/790323c33bab41c698922a60b6f7abc5/output/log


In [8]:
################### read jsonl file into dataset #################################
import json
from datasets import Dataset, DatasetDict

language = 'hejazi'

# Initialize a dictionary to hold the lists for each field
train = {'model_inputs': [], 'completion': []}
val = {'model_inputs': [], 'completion': []}

# Open the file and read line by line
with open('/root/all_llm_data/'+language+'_train_data.jsonl', 'r', encoding='utf-8') as file:
    for line in file:
        # Each line is a complete JSON object
        json_object = json.loads(line)
        # Append each field to the appropriate list
        train['model_inputs'].append(json_object.get('model_inputs', ''))  
        train['completion'].append(json_object.get('completion', ''))  

# Open the file and read line by line
with open('/root/all_llm_data/'+language+'_val_data.jsonl', 'r', encoding='utf-8') as file:
    for line in file:
        # Each line is a complete JSON object
        json_object = json.loads(line)
        # Append each field to the appropriate list
        val['model_inputs'].append(json_object.get('model_inputs', ''))  
        val['completion'].append(json_object.get('completion', ''))         

# Convert the dictionary of lists into a `Dataset`
dataset_train = Dataset.from_dict(train)
dataset_val = Dataset.from_dict(val)

# Create a `DatasetDict` for a train/val split
dataset_dict = DatasetDict({'train': dataset_train,'val': dataset_val})

# Print the first entry of the dataset to check its structure
print(dataset_dict['train'][0])
print(dataset_dict['val'][0])

{'model_inputs': 'translate arb_Arab to acw_Arab: فِي الْبَدْءِ خَلَقَ اللهُ السَّمَاوَاتِ وَالأَرْضَ،', 'completion': 'فِي الْبَدْاية خَلَقَ اللهُ السَّمَاوَاتِ وَالأَرْضَ،\r\n'}
{'model_inputs': 'translate arb_Arab to acw_Arab: كَانَ قَدْ تَلَقَّنَ طَرِيقَ الرَّبِّ. فَبَدَأَ يَخْطُبُ بِحَمَاسَةٍ شَدِيدَةٍ، وَيُعَلِّمُ الْحَقَائِقَ الْمُخْتَصَّةَ بِيَسُوعَ تَعْلِيماً صَحِيحاً. وَمَعَ أَنَّهُ لَمْ يَكُنْ يَعْرِفُ سِوَى مَعْمُودِيَّةِ يُوحَنَّا،', 'completion': 'وكان يعرف طريق الرب. وصار يتكلم بحماس شديد، ويعلم الحقايق اللي تخص يسوع تعليم صحيح. ورغم أنو كان يعرف معمودية يوحنا بس،\r\n'}


In [9]:
src_lang = "arb_Arab"  # Hejazi code or general Arabic code
tgt_lang = "arb_Arab" 


In [10]:
def tokenize_function(examples):
    tokenizer.src_lang = src_lang
    tokenizer.tgt_lang = tgt_lang
    
    inputs = tokenizer(
        examples['model_inputs'], 
        max_length=1024,  
        truncation=True, 
        padding="max_length"
    )
    
    labels = tokenizer(
        text_target=examples['completion'], 
        max_length=1024, 
        truncation=True, 
        padding="max_length"
    ).input_ids
    
    inputs['labels'] = labels
    
    return inputs 

# Apply the tokenization function to the dataset
tokenized_datasets = dataset_dict.map(
    tokenize_function, 
    batched=True, 
    remove_columns=['model_inputs', 'completion'] 
)

print(tokenized_datasets['train'][0])
print(tokenized_datasets['val'][0])


Map:   0%|          | 0/17582 [00:00<?, ? examples/s]

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

{'input_ids': [256011, 3292, 28064, 248059, 256011, 202, 762, 248083, 248120, 248085, 28560, 248144, 365, 248151, 248127, 241, 248135, 248129, 248109, 248117, 248135, 248850, 248151, 396, 248109, 248106, 248109, 248187, 248109, 4171, 248176, 5084, 577, 248111, 248109, 1412, 248109, 471, 248151, 141, 248109, 45283, 248109, 248104, 248135, 248670, 248109, 248238, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

In [11]:
attn_implementation='flash_attention_2'
torch_dtype = torch.float16

model = AutoModelForSeq2SeqLM.from_pretrained(
          model_name, device_map={"": 0},  attn_implementation=attn_implementation, torch_dtype=torch_dtype
)
model.gradient_checkpointing_enable()

Attention with Flash Attention 2 does not support `layer_head_mask`. If you need this feature, please use standard attention.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

2024-10-06 22:31:00,160 - clearml.model - INFO - Selected model id: 1ee663f7272943ffa31c4d0a3b4005d1
2024-10-06 22:31:05,371 - clearml.model - INFO - Selected model id: 9f7fe73c058147c28f1eabe98028f3bd
2024-10-06 22:31:11,147 - clearml.model - INFO - Selected model id: 44632fc281c0481a97cd6b7fb9c02e98


In [12]:
training_arguments = Seq2SeqTrainingArguments(
        output_dir="/root/galore_adamw_nllb_8bit_layerwise_r1024_1e-5_3ep/",
        eval_strategy="steps",   
        do_eval=True,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        log_level="debug",
        optim="galore_adamw_8bit_layerwise",
        optim_args="rank=1024, update_proj_gap=100, scale=2",
        optim_target_modules=[r".*attn.*", r".*mlp.*"],
        save_strategy = 'epoch',
        logging_steps=200,
        learning_rate=1e-5,   
        eval_steps=200,
        #fp16= False,
        bf16= torch.cuda.is_bf16_supported(),
        num_train_epochs=1,
        warmup_ratio=0.05,
        lr_scheduler_type="linear",
        max_grad_norm=1.0,              
        weight_decay=0.01,
)

In [13]:
trainer =  Seq2SeqTrainer(
        model=model,
        train_dataset=tokenized_datasets['train'],  
        eval_dataset=tokenized_datasets['val'],  
        tokenizer=tokenizer,
        args=training_arguments,
)

trainer.train()

  trainer =  Seq2SeqTrainer(
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
Using auto half precision backend
Currently training with a batch size of: 4
Activated GaLoRE fine-tuning, depending on your model size and hardware, the training might take a while before starting. Please be patient !
***** Running training *****
  Num examples = 17,582
  Num Epochs = 1
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 4,396
  Number of trainable parameters = 3,344,863,232
Automatic ClearML logging enabled.
External ClearML Task has been connected.




Detected flash_attn version: 2.6.3
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


ClearML Monitor: Could not detect iteration reporting, falling back to iterations as seconds-from-start


Step,Training Loss,Validation Loss



***** Running Evaluation *****
  Num examples = 250
  Batch size = 4
  Num examples = 250
  Batch size = 4

***** Running Evaluation *****
  Num examples = 250
  Batch size = 4

***** Running Evaluation *****
  Num examples = 250
  Batch size = 4

***** Running Evaluation *****
  Num examples = 250
  Batch size = 4

***** Running Evaluation *****
  Num examples = 250
  Batch size = 4

***** Running Evaluation *****
  Num examples = 250
  Batch size = 4
Saving model checkpoint to /root/galore_adamw_nllb_8bit_layerwise_r1024_1e-5_3ep/checkpoint-4396
Configuration saved in /root/galore_adamw_nllb_8bit_layerwise_r1024_1e-5_3ep/checkpoint-4396/config.json
Configuration saved in /root/galore_adamw_nllb_8bit_layerwise_r1024_1e-5_3ep/checkpoint-4396/generation_config.json
The model is bigger than the maximum size per checkpoint (5GB) and is going to be split in 2 checkpoint shards. You can find where each parameters has been saved in the index located at /root/galore_adamw_nllb_8bit_layerwise

TrainOutput(global_step=4396, training_loss=1.125884014872446, metrics={'train_runtime': 11029.4398, 'train_samples_per_second': 1.594, 'train_steps_per_second': 0.399, 'total_flos': 3.0464370336281395e+17, 'train_loss': 1.125884014872446, 'epoch': 1.0})

In [20]:
import boto3
from botocore.exceptions import ClientError

s3 = boto3.client('s3',
    aws_access_key_id=aws_access_key,
    aws_secret_access_key=aws_secret_key,
)

def upload_file(file_name, bucket, object_name):
    if object_name is None:
        object_name = file_name
    try:
        s3.upload_file(file_name, bucket, Key=object_name)
    except ClientError as e:
        print(e)
        return False
    print("Success!")
    return True