In [1]:
from IPython.display import clear_output

!pip install accelerate -U
!pip install transformers -U
!pip install bitsandbytes deepspeed wandb peft
!pip install mpi4py
!pip install flash-attn --no-build-isolation

clear_output()

## **DATA SETUP**

In [2]:
import os

!git clone --branch maya_pretrain https://github.com/rsk2327/LLaVA.git

%load_ext autoreload
%autoreload 2

Cloning into 'LLaVA'...
remote: Enumerating objects: 2490, done.[K
remote: Counting objects: 100% (224/224), done.[K
remote: Compressing objects: 100% (124/124), done.[K
remote: Total 2490 (delta 145), reused 161 (delta 100), pack-reused 2266[K
Receiving objects: 100% (2490/2490), 13.80 MiB | 30.86 MiB/s, done.
Resolving deltas: 100% (1529/1529), done.


In [None]:
#### Downloading Test PALO JSON file
!wget https://huggingface.co/datasets/pilotj/PALO_SUBSET_MAYA/resolve/main/palo_multilingual_dataset_subset.json

!mv palo_multilingual_dataset_subset.json /content/LLaVA/playground/data/


#### Downloading PALO Validation subset
!wget https://huggingface.co/datasets/pilotj/PALO_SUBSET_MAYA/resolve/main/coco.zip

!mkdir -p /content/LLaVA/playground/data/PALO

!unzip /content/coco.zip

!mv /content/coco/ /content/LLaVA/playground/data/PALO/


#### Downloading the pretrained projector file
!wget https://huggingface.co/nahidalam/Maya/resolve/main/mm_projector.bin

!mkdir -p /content/LLaVA/checkpoints/maya/

!mv /content/mm_projector.bin /content/LLaVA/checkpoints/maya/

--2024-07-24 10:40:23--  https://huggingface.co/nahidalam/Maya/resolve/main/mm_projector.bin
Resolving huggingface.co (huggingface.co)... 54.230.18.95, 54.230.18.110, 54.230.18.85, ...
Connecting to huggingface.co (huggingface.co)|54.230.18.95|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://cdn-lfs-us-1.huggingface.co/repos/61/39/6139d62870886ce41aaa81928043ac513708f9716fecfe436f17531145fd6574/53c407282335fdff53de8f1cb54eda4d25a98cda1cb4729a90892669845713f3?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27mm_projector.bin%3B+filename%3D%22mm_projector.bin%22%3B&response-content-type=application%2Foctet-stream&Expires=1722076823&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcyMjA3NjgyM319LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmh1Z2dpbmdmYWNlLmNvL3JlcG9zLzYxLzM5LzYxMzlkNjI4NzA4ODZjZTQxYWFhODE5MjgwNDNhYzUxMzcwOGY5NzE2ZmVjZmU0MzZmMTc1MzExNDVmZDY1NzQvNTNjNDA3MjgyMzM1ZmRmZjUzZGU4ZjFjYjU0ZWRhN

In [10]:
import json

# Use a context manager so the file handle is closed as soon as the JSON is parsed.
with open('/content/LLaVA/playground/data/palo_multilingual_dataset_subset.json') as label_file:
    data = json.load(label_file)

In [7]:
def get_projector_file_path(projector_path, model_name = None, redownload = False, base_projector_folder = '/content'):
    """Return a local path to an ``mm_projector.bin`` file, downloading it if needed.

    Args:
        projector_path: Either an ``http``/``https`` URL of the projector file or
            a path to a projector file that is already on disk.
        model_name: Name of the sub-folder of ``base_projector_folder`` in which
            the download is cached. Required when ``projector_path`` is a URL.
        redownload: When True, fetch the file again even if a cached copy exists.
        base_projector_folder: Folder under which the per-model cache folders live.

    Returns:
        For a URL, the absolute POSIX path of
        ``<base_projector_folder>/<model_name>/mm_projector.bin``.
        For a local path, ``projector_path`` unchanged.

    Raises:
        ValueError: ``projector_path`` is a URL and ``model_name`` is None.
        FileNotFoundError: ``projector_path`` is a local path that does not exist.
        RuntimeError: the download failed or ``wget`` could not be started.
    """
    # Function-scope imports: the helper must not depend on which cells ran before it.
    import os
    import subprocess
    from pathlib import Path

    if not projector_path.startswith("http"):
        # Path to an on-system projector file.
        if os.path.exists(projector_path):
            return projector_path
        # Fail here instead of returning None and breaking much later in model setup.
        raise FileNotFoundError(f"Projector file not found: {projector_path}")

    # Link to an online projector file.
    if model_name is None:
        raise ValueError("If using an online link to the projector file, you need to provide a model name to differentiate the model")

    projector_folder = Path(base_projector_folder).joinpath(model_name)
    projector_file = projector_folder.joinpath('mm_projector.bin')

    if redownload or not projector_file.exists():
        projector_folder.mkdir(parents=True, exist_ok=True)

        # Download under a temporary name so an interrupted transfer is never
        # mistaken for a complete cached projector on the next call.
        partial_file = projector_folder.joinpath('mm_projector.bin.part')
        try:
            # Argument list, no shell: the URL and folder are never shell-interpreted.
            result = subprocess.run(["wget", "-O", partial_file.as_posix(), projector_path])
        except OSError as exc:
            raise RuntimeError(f"Could not run wget to download {projector_path}") from exc

        if result.returncode != 0 or not partial_file.exists():
            if partial_file.exists():
                partial_file.unlink()
            raise RuntimeError(f"Downloading {projector_path} failed (wget exit code {result.returncode})")

        os.replace(partial_file, projector_file)

    return projector_file.absolute().as_posix()


## **RUN INPUTS**

In [4]:
# URL (or local path) of the pretrained projector weights; resolved to a local file
# by get_projector_file_path before being handed to the finetune arguments.
PROJECTOR_FILE_PATH = 'https://huggingface.co/nahidalam/Maya/resolve/main/mm_projector.bin'

# Root of the LLaVA checkout; the data paths below are derived from it so that
# relocating the checkout only requires changing this one value.
LLAVA_DIRECTORY_PATH = '/content/LLaVA/'

MODEL_BASE = 'CohereForAI/aya-23-8B'

MODEL_PATH = 'nahidalam/Maya'

OUTPUT_DIR = '/content/checkpoints/cohere23/'

# Logs live inside the output directory.
LOGGING_DIR = os.path.join(OUTPUT_DIR, 'logs')

IMAGE_FOLDER = os.path.join(LLAVA_DIRECTORY_PATH, 'playground/data/')

# NOTE(review): the download cell fetches `palo_multilingual_dataset_subset.json`;
# this file name is not downloaded anywhere in this notebook — confirm it ships
# with the checkout, or point this at the subset file.
LABEL_FILE_PATH = os.path.join(IMAGE_FOLDER, 'palo_multilingual_dataset_1k_val2017.json')


## TODO
# Write tests that check the validity of these inputs (folder exists, file exists, etc.)

## **MODEL SETUP**

In [5]:
from IPython.display import clear_output
import torch
from pathlib import Path
from transformers import AutoTokenizer, AutoConfig

import sys
sys.path.insert(0,LLAVA_DIRECTORY_PATH)
sys.path.insert(0, Path(LLAVA_DIRECTORY_PATH).joinpath('playground/finetuning').absolute().as_posix())  # For importing the finetuning specific files

from transformers.models.cohere.tokenization_cohere_fast import CohereTokenizerFast
from llava.model.language_model.llava_cohere import LlavaCohereForCausalLM, LlavaCohereConfig
from llava.constants import IGNORE_INDEX, IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN

from finetune_args import *
from llava.model import *
from llava import conversation as conversation_lib
from dataset_utils import *


%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [8]:
#### Finetuning Arguments

# LoRA version. For full (non-LoRA) fine-tuning, call `get_finetune_args` with the
# same keyword arguments instead, e.g. model_name = 'lmsys/vicuna-7b-v1.5' and
# pretrain_mm_mlp_adapter = '/content/checkpoints/llava-v1.5-7b-pretrain/mm_projector.bin'.

# Resolve the projector weights to a local file (downloaded on first use).
projector_file = get_projector_file_path(PROJECTOR_FILE_PATH, model_name = 'Cohere23')

model_args, data_args, training_args = get_finetune_lora_args(
    model_name = MODEL_BASE,
    pretrain_mm_mlp_adapter = projector_file,
    data_path = LABEL_FILE_PATH,
    image_folder = IMAGE_FOLDER,
    output_dir = OUTPUT_DIR,
    logging_dir = LOGGING_DIR
)

# Overrides applied on top of the arguments returned above.
training_args.per_device_train_batch_size = 4
training_args.gradient_accumulation_steps = 4

# Settings read by the model-loading cell.
attn_implementation = "flash_attention_2"
bnb_model_from_pretrained_args = {}

local_rank = None




In [9]:
model_args.model_name_or_path

'CohereForAI/aya-23-8B'

In [None]:
# Load the base model named in model_args into the LLaVA-Cohere wrapper.
# bf16 weights are requested only when bf16 training is enabled; otherwise
# torch_dtype is left as None.
load_dtype = torch.bfloat16 if training_args.bf16 else None

model = LlavaCohereForCausalLM.from_pretrained(
    model_args.model_name_or_path,
    cache_dir=training_args.cache_dir,
    attn_implementation=attn_implementation,
    torch_dtype=load_dtype,
    **bnb_model_from_pretrained_args
)

# In case the model doesn't automatically get loaded to the GPU:
# model.to('cuda')

You are using a model of type cohere to instantiate a model of type llava_cohere. This is not supported for all configurations of models and can yield errors.
You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
# Turn off `use_cache` on the model config before training.
model.config.use_cache = False

if model_args.freeze_backbone:
    model.model.requires_grad_(False)

if training_args.gradient_checkpointing:
    if hasattr(model, "enable_input_require_grads"):
        model.enable_input_require_grads()
    else:
        # Fallback when the model lacks the helper: mark the output of the input
        # embeddings as requiring grad via a forward hook.
        def make_inputs_require_grad(module, input, output):
            output.requires_grad_(True)
        model.get_input_embeddings().register_forward_hook(make_inputs_require_grad)

if training_args.lora_enable:
    from peft import LoraConfig, get_peft_model
    lora_config = LoraConfig(
        r=training_args.lora_r,
        lora_alpha=training_args.lora_alpha,
        target_modules=find_all_linear_names(model),
        lora_dropout=training_args.lora_dropout,
        bias=training_args.lora_bias,
        task_type="CAUSAL_LM",
    )
    if training_args.bits == 16:
        # Cast the base weights to the training precision before adding the adapters.
        if training_args.bf16:
            model.to(torch.bfloat16)
        if training_args.fp16:
            model.to(torch.float16)
    # NOTE: from here on `model` is the PEFT-wrapped model, not the bare one.
    # TODO: consider a separate name to tell the peft and non-peft versions apart.
    model = get_peft_model(model, lora_config)

# Use the explicitly imported `AutoTokenizer`; the bare `transformers` module is
# not imported by name in this notebook, so `transformers.AutoTokenizer` only
# worked if a star import happened to leak it.
tokenizer = AutoTokenizer.from_pretrained(
    model_args.model_name_or_path,
    cache_dir=training_args.cache_dir,
    model_max_length=training_args.model_max_length,
    padding_side="right",
    use_fast=True,
)

## Treating the pad_token/unk_token issue
# tokenizer.pad_token = tokenizer.unk_token


# Pick the conversation template for the configured version, falling back to "vicuna_v1".
if model_args.version in conversation_lib.conv_templates:
    conversation_lib.default_conversation = conversation_lib.conv_templates[model_args.version]
else:
    conversation_lib.default_conversation = conversation_lib.conv_templates["vicuna_v1"]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# Set up the vision side of the model, copy the data/training options onto the
# model config, then build the dataset module and the trainer.
# The statements are order-dependent: the vision modules must be initialized
# before the vision tower can be fetched.
model.get_model().initialize_vision_modules(
            model_args=model_args,
            fsdp=training_args.fsdp
        )

# Move the vision tower to the training device, in bf16 when bf16 training is
# enabled and fp16 otherwise.
vision_tower = model.get_vision_tower()
vision_tower.to(dtype=torch.bfloat16 if training_args.bf16 else torch.float16, device=training_args.device)

# Expose the tower's image processor through data_args
# (presumably consumed by make_supervised_data_module — confirm).
data_args.image_processor = vision_tower.image_processor
data_args.is_multimodal = True

# Record data/tokenizer settings on the model config.
model.config.image_aspect_ratio = data_args.image_aspect_ratio
model.config.tokenizer_padding_side = tokenizer.padding_side
model.config.tokenizer_model_max_length = tokenizer.model_max_length

# Chained assignment: the model_args value is written to both training_args and the model config.
model.config.tune_mm_mlp_adapter = training_args.tune_mm_mlp_adapter = model_args.tune_mm_mlp_adapter

# Disabled: train only the projector when tune_mm_mlp_adapter is set.
# if model_args.tune_mm_mlp_adapter:
#     model.requires_grad_(False)
#     for p in model.get_model().mm_projector.parameters():
#         p.requires_grad = True

model.config.freeze_mm_mlp_adapter = training_args.freeze_mm_mlp_adapter
# Disabled: freeze the projector when freeze_mm_mlp_adapter is set.
# if training_args.freeze_mm_mlp_adapter:
#     for p in model.get_model().mm_projector.parameters():
#         p.requires_grad = False

# Disabled: projector dtype/device handling for 4/8-bit runs
# (`compute_dtype` is not defined anywhere in this notebook).
# if training_args.bits in [4, 8]:
#     model.get_model().mm_projector.to(dtype=compute_dtype, device=training_args.device)

# Image-token options: copied from model_args onto the model config, data_args and training_args.
model.config.mm_use_im_start_end = data_args.mm_use_im_start_end = model_args.mm_use_im_start_end
model.config.mm_projector_lr = training_args.mm_projector_lr
training_args.use_im_start_end = model_args.mm_use_im_start_end
model.config.mm_use_im_patch_token = model_args.mm_use_im_patch_token
model.initialize_vision_tokenizer(model_args, tokenizer=tokenizer)

# Disabled: per-module dtype fix-ups for 4/8-bit LoRA runs.
# if training_args.bits in [4, 8]:
#     from peft.tuners.lora import LoraLayer
#     for name, module in model.named_modules():
#         if isinstance(module, LoraLayer):
#             if training_args.bf16:
#                 module = module.to(torch.bfloat16)
#         if 'norm' in name:
#             module = module.to(torch.float32)
#         if 'lm_head' in name or 'embed_tokens' in name:
#             if hasattr(module, 'weight'):
#                 if training_args.bf16 and module.weight.dtype == torch.float32:
#                     module = module.to(torch.bfloat16)

# Build the dataset module; its entries are passed to the trainer as keyword arguments.
data_module = make_supervised_data_module(tokenizer=tokenizer,
                                              data_args=data_args)

trainer = LLaVATrainer(model=model,
                    tokenizer=tokenizer,
                    args=training_args,
                    **data_module)

In [None]:
# Resume when the output directory already holds a "checkpoint-*" entry,
# otherwise start training from scratch.
# `Path` is the name imported in the model-setup cell; the bare `pathlib` module
# is not imported explicitly anywhere in this notebook.
has_checkpoint = any(Path(training_args.output_dir).glob("checkpoint-*"))

if has_checkpoint:
    trainer.train(resume_from_checkpoint=True)
else:
    trainer.train()

  self.pid = os.fork()










  self.pid = os.fork()
Token indices sequence length is longer than the specified maximum sequence length for this model (3167 > 2048). Running this sequence through the model will result in indexing errors







OutOfMemoryError: CUDA out of memory. Tried to allocate 3.91 GiB. GPU 

# **Running with finetune_lora.sh script**

In [None]:
#### Downloading Test PALO JSON file
!wget https://huggingface.co/datasets/pilotj/PALO_SUBSET_MAYA/resolve/main/palo_multilingual_dataset_subset.json

!mv palo_multilingual_dataset_subset.json /content/LLaVA/playground/data/


#### Downloading PALO Validation subset
!wget https://huggingface.co/datasets/pilotj/PALO_SUBSET_MAYA/resolve/main/coco.zip

!mkdir -p /content/LLaVA/playground/data/PALO

!unzip /content/coco.zip

!mv /content/coco/ /content/LLaVA/playground/data/PALO/


#### Downloading the pretrained projector file
!wget https://huggingface.co/nahidalam/Maya/resolve/main/mm_projector.bin

!mkdir -p /content/LLaVA/checkpoints/maya/

!mv /content/mm_projector.bin /content/LLaVA/checkpoints/maya/

In [None]:
%%bash
# Run LoRA fine-tuning through the repository's shell script.
# `%%bash` makes the notebook execute this cell as a shell script.
set -e

# Fail early unless a Hugging Face token is already present in the kernel's
# environment; never paste the token into the notebook itself.
: "${HF_TOKEN:?HF_TOKEN must be set in the environment before running this cell}"
export HF_TOKEN

export PYTHONPATH=/content/LLaVA:$PYTHONPATH

# Enter the checkout first: the script paths below are relative to it.
cd /content/LLaVA/

# Make executable, and run, the same script (the LoRA one this section is about).
chmod +x scripts/cohere/finetune_lora.sh

scripts/cohere/finetune_lora.sh