In [1]:
# ! pip install -r requirements.txt

In [15]:
import tarfile


# Path (relative to the notebook's working directory) of the Llama recipes
# tarball that is extracted at the end of this cell.
LLAMA_RECIPES_TARBALL = "./llama_recipes.tar.gz"

def untar_llama_finetuning_recipe_tarball(tarball_path: str, target: str) -> None:
    """Extract the Llama fine-tuning recipe tarball into ``target``.

    The archive is opened in mode ``"r"`` so that ``tarfile`` detects the
    compression transparently. Extraction is restricted to ``target``: members
    whose paths would land outside it (absolute paths or ``..`` components)
    are refused instead of being written elsewhere on disk.

    Args:
        tarball_path: Path to the tar archive to extract.
        target: Directory the archive contents are extracted into.

    Raises:
        tarfile.TarError: If the archive is unreadable or a member would be
            extracted outside ``target``.
        OSError: If ``tarball_path`` cannot be opened.
    """
    import os  # local import so the function does not rely on a later cell

    with tarfile.open(tarball_path, "r") as llama_recipe_tar:
        if hasattr(tarfile, "data_filter"):
            # Extraction filters are available: let the stdlib reject unsafe
            # members (path traversal, absolute paths, links leaving target,
            # device files) and silence the "no filter" deprecation warning.
            llama_recipe_tar.extractall(target, filter="data")
            return

        # Older interpreters without extraction filters: validate member
        # names by hand before extracting. This only checks member paths; it
        # does not inspect symlink/hardlink targets.
        target_root = os.path.realpath(target)
        for member in llama_recipe_tar.getmembers():
            member_path = os.path.realpath(os.path.join(target_root, member.name))
            if os.path.commonpath([target_root, member_path]) != target_root:
                raise tarfile.ExtractError(
                    f"Refusing to extract {member.name!r} outside of {target!r}"
                )
        llama_recipe_tar.extractall(target)

# Unpack the recipes tarball into the current working directory.
untar_llama_finetuning_recipe_tarball(LLAMA_RECIPES_TARBALL, ".")

In [2]:
import fire

from llama_recipes.configs import train_config, fsdp_config
from llama_recipes.utils.config_utils import update_config

In [3]:
# I: Default training config
# Instantiating train_config with no arguments makes the cell display every
# field together with its default value.
train_config()

train_config(model_name='PATH/to/LLAMA/7B', enable_fsdp=False, low_cpu_fsdp=False, run_validation=True, batch_size_training=4, batching_strategy='packing', context_length=4096, gradient_accumulation_steps=1, gradient_clipping=False, gradient_clipping_threshold=1.0, num_epochs=3, num_workers_dataloader=1, lr=0.0001, weight_decay=0.0, gamma=0.85, seed=42, use_fp16=False, mixed_precision=True, val_batch_size=1, peft_method='lora', use_peft=False, output_dir='PATH/to/save/PEFT/model', freeze_layers=False, num_freeze_layers=1, quantization=False, one_gpu=False, save_model=True, dist_checkpoint_root_folder='PATH/to/save/FSDP/model', dist_checkpoint_folder='fine-tuned', save_optimizer=False, use_fast_kernels=False, use_wandb=False, save_metrics=False)

In [4]:
# I: Default FSDP config
# Instantiating fsdp_config with no arguments makes the cell display every
# field together with its default value.
fsdp_config()

fsdp_config(mixed_precision=True, use_fp16=False, sharding_strategy=<ShardingStrategy.FULL_SHARD: 1>, hsdp=False, sharding_group_size=0, replica_group_size=0, checkpoint_type=<StateDictType.SHARDED_STATE_DICT: 3>, fsdp_activation_checkpointing=True, fsdp_cpu_offload=False, pure_bf16=False, optimizer='AdamW')

In [None]:
/opt/conda/bin/python3.10 
transfer_learning.py 
--add_input_output_demarcation_key True -> SM
--chat_dataset False -> SM
--enable_fsdp True 
--epoch 1 
--instruction_tuned True -> SM
--int8_quantization False 
--learning_rate 0.0001 -> lr in train_config
--lora_alpha 32 -> PEFT
--lora_dropout 0.05 -> PEFT
--lora_r 8 -> PEFT
--max_input_length 2048 -> SM
--max_train_samples -1 -> SM
--max_val_samples -1 -> SM
--per_device_eval_batch_size 1 -> SM Finetuning using deepspeed
--per_device_train_batch_size 2 -> PEFT
--preprocessing_num_workers None -> SM
--seed 10 
--target_modules q_proj,v_proj -> PEFT
--train_data_split_seed 0 -> SM
--validation_split_ratio 0.2 -> SM

In [5]:
def main(**kwargs) -> None:
    """Echo the supplied overrides and apply them to the llama-recipes configs.

    Args:
        **kwargs: Override names and values. They are printed as a dict and
            then forwarded unchanged to ``update_config`` together with
            ``train_config`` and ``fsdp_config``.

    Returns:
        None.
    """
    # TODO: untar the llama recipes tarball here (not implemented yet).
    print(kwargs)
    # NOTE(review): train_config / fsdp_config are passed exactly as imported,
    # not as freshly created instances -- confirm that is what update_config
    # is expected to receive.
    config_targets = (train_config, fsdp_config)
    update_config(config_targets, **kwargs)
    # TODO: delete the untarred llama recipes folder here (not implemented yet).

# if __name__ == "__main__":
#     fire.Fire(main)

In [6]:
# Dry run of main() with the overrides intended for the CodeLlama-13b-Python-HF
# fine-tuning job; main() prints the kwargs and forwards them to update_config.
# NOTE(review): the train_config repr shown earlier has a field named `lr`, and
# no `learning_rate`, `training_dataset`, `validation_dataset` or
# `prompt_template` fields -- confirm how update_config treats these keys
# (they may be ignored rather than applied).
main(
    model_name="./models/CodeLlama-13b-Python-HF",
    dist_checkpoint_root_folder="./checkpoints/CodeLlama-13b-Python-HF",
    output_dir="./output/CodeLlama-13b-Python-HF",
    training_dataset="./dataset/train",
    validation_dataset="./dataset/validation",
    prompt_template="template.json",
    enable_fsdp=True,
    num_epochs=1,
    quantization=False, # int_8 Quantization
    learning_rate=0.001,
    seed=10,
)

{'model_name': './models/CodeLlama-13b-Python-HF', 'dist_checkpoint_root_folder': './checkpoints/CodeLlama-13b-Python-HF', 'output_dir': './output/CodeLlama-13b-Python-HF', 'training_dataset': './dataset/train', 'validation_dataset': './dataset/validation', 'prompt_template': 'template.json', 'enable_fsdp': True, 'num_epochs': 1, 'quantization': False, 'learning_rate': 0.001, 'seed': 10}


In [None]:
# Create folders called utils, config, and constants

In [7]:
import os

In [10]:
# Check whether a "llama_recipe" path exists in the working directory.
# NOTE(review): the tarball and the imported package are named "llama_recipes"
# (plural); confirm the singular name checked here is intended.
os.path.exists("llama_recipe")

False

In [36]:
%%sh

# Launch train.py on the local CodeLlama-13b-Python-HF checkpoint for one epoch
# with FSDP and PEFT enabled and int8 quantization disabled.
# The recorded output of this cell shows the run failing: the spawned torchrun
# job reports local_rank 2 exiting with code -9, and the cell ends with a
# CalledProcessError.
# NOTE(review): exit code -9 means the process was killed by SIGKILL, presumably
# the host running out of memory while four ranks load the 13B model -- confirm.
python train.py \
    --model_dir ./models/CodeLlama-13b-Python-HF \
    --training_dataset ./dataset/train \
    --validation_dataset ./dataset/validation \
    --prompt_template template.json \
    --enable_fsdp True \
    --fsdp_checkpoint_root_dir ./checkpoints/CodeLlama-13b-Python-HF \
    --num_epochs 1 \
    --int8_quantization False \
    --learning_rate 0.001 \
    --seed 10 \
    --use_peft True \
    --peft_output_dir ./output/CodeLlama-13b-Python-HF

INFO:root:Executing command:
INFO:root:['torchrun', '--nnodes', '1', '--nproc_per_node', '4', 'finetuning.py', '--num_gpus', '4', '--model_name', './models/CodeLlama-13b-Python-HF', '--batch_size_training', '4', '--batching_strategy', 'packing', '--context_length', '4096', '--gradient_accumulation_steps', '1', '--gradient_clipping', 'False', '--gradient_clipping_threshold', '1.0', '--num_epochs', '1', '--num_workers_dataloader', '1', '--lr', '0.001', '--weight_decay', '0.0', '--gamma', '0.85', '--seed', '10', '--freeze_layers', 'False', '--num_freeze_layers', '1', '--use_fast_kernels', 'False', '--save_metrics', 'False', '--run_validation', 'True', '--val_batch_size', '1', '--quantization', 'False', '--enable_fsdp', '--dist_checkpoint_root_folder', './checkpoints/CodeLlama-13b-Python-HF', '--low_cpu_fsdp', 'False', '--mixed_precision', 'True', '--use_fp16', 'False', '--pure_bf16', 'False', '--optimizer', 'AdamW', '--save_optimizer', 'False', '--use_peft', '--peft_method', 'lora', '--ou

--> Running with torch dist debug set to detail


INFO:root:Local rank is 2. Rank is 2. World Size is 4
INFO:root:Setting torch device = 2
INFO:root:Local rank is 3. Rank is 3. World Size is 4
INFO:root:Setting torch device = 3
INFO:root:Local rank is 1. Rank is 1. World Size is 4
INFO:root:Setting torch device = 1
INFO:root:Loading the pre-trained model and setup its configuration
INFO:root:Model Name: ./models/CodeLlama-13b-Python-HF
INFO:root:enable_fsdp is set to True and low_cpu_fsdp is set to False
INFO:root:Loading the pre-trained model and setup its configuration
INFO:root:Model Name: ./models/CodeLlama-13b-Python-HF
INFO:root:enable_fsdp is set to True and low_cpu_fsdp is set to False
INFO:root:Loading the pre-trained model and setup its configuration
INFO:root:Model Name: ./models/CodeLlama-13b-Python-HF
INFO:root:enable_fsdp is set to True and low_cpu_fsdp is set to False
[2024-03-31 06:17:35,549] torch.distributed.elastic.multiprocessing.api: [ERROR] failed (exitcode: -9) local_rank: 2 (pid: 15117) of binary: /home/ec2-use

Namespace(model_dir='./models/CodeLlama-13b-Python-HF', per_device_train_batch_size=4, batching_strategy='packing', context_length=4096, gradient_accumulation_steps=1, gradient_clipping=False, gradient_clipping_threshold=1.0, num_epochs=1, num_workers_dataloader=1, learning_rate=0.001, weight_decay=0.0, gamma=0.85, seed=10, int8_quantization=False, freeze_layers=False, num_freeze_layers=1, use_fast_kernels=False, save_metrics=False, run_validation=True, val_batch_size=1, enable_fsdp=True, fsdp_checkpoint_root_dir='./checkpoints/CodeLlama-13b-Python-HF', low_cpu_fsdp=False, mixed_precision=True, use_fp16=False, pure_bf16=False, optimizer='AdamW', save_optimizer=False, use_peft=True, peft_method='lora', peft_output_dir='./output/CodeLlama-13b-Python-HF', lora_r=8, lora_alpha=32, lora_dropout=0.05, target_modules='q_proj,v_proj')



    subprocess.run(command, shell=shell, check=True)
  File "/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/subprocess.py", line 526, in run
    raise CalledProcessError(retcode, process.args,
subprocess.CalledProcessError: Command '['torchrun', '--nnodes', '1', '--nproc_per_node', '4', 'finetuning.py', '--num_gpus', '4', '--model_name', './models/CodeLlama-13b-Python-HF', '--batch_size_training', '4', '--batching_strategy', 'packing', '--context_length', '4096', '--gradient_accumulation_steps', '1', '--gradient_clipping', 'False', '--gradient_clipping_threshold', '1.0', '--num_epochs', '1', '--num_workers_dataloader', '1', '--lr', '0.001', '--weight_decay', '0.0', '--gamma', '0.85', '--seed', '10', '--freeze_layers', 'False', '--num_freeze_layers', '1', '--use_fast_kernels', 'False', '--save_metrics', 'False', '--run_validation', 'True', '--val_batch_size', '1', '--quantization', 'False', '--enable_fsdp', '--dist_checkpoint_root_folder', './checkpoints/CodeLlama-13b-Python-

CalledProcessError: Command 'b'\npython train.py \\\n    --model_dir ./models/CodeLlama-13b-Python-HF \\\n    --training_dataset ./dataset/train \\\n    --validation_dataset ./dataset/validation \\\n    --prompt_template template.json \\\n    --enable_fsdp True \\\n    --fsdp_checkpoint_root_dir ./checkpoints/CodeLlama-13b-Python-HF \\\n    --num_epochs 1 \\\n    --int8_quantization False \\\n    --learning_rate 0.001 \\\n    --seed 10 \\\n    --use_peft True \\\n    --peft_output_dir ./output/CodeLlama-13b-Python-HF\n'' returned non-zero exit status 1.

In [None]:
# int_8 Quantization

# Retar the llama_finetuning folder

In [46]:
import torch

def get_num_gpus():
    """Return the number of CUDA devices torch can see, or 0 when CUDA is unavailable."""
    cuda_ready = torch.cuda.is_available()
    return torch.cuda.device_count() if cuda_ready else 0
    
# Display the GPU count seen by this kernel.
get_num_gpus()

4