In [None]:
from datasets import load_dataset
import torch

In [None]:
# Load the 'train' split of the dataset identified by the repo id below.
print("++++++Reading the Dataset++++++++++")
dataset = load_dataset("Anonymous/Final_idiom_all",split='train') # <== placeholder dataset id: replace it with your own dataset (the original note mentions a CSV file)

In [None]:
import os

from huggingface_hub import login

# Authenticate with the Hugging Face Hub.
# The access token is read from the HF_TOKEN environment variable instead of
# being hard-coded, so it cannot leak when the notebook is shared or committed.
# When HF_TOKEN is unset, token=None lets login() ask for the token instead.
login(token=os.environ.get("HF_TOKEN"))

In [None]:
# System prompt inserted as the "system" turn of every conversation built by format_data.
# NOTE(review): the wording has grammatical slips ("an polyglot", "who are having",
# "an native speaker"). It is left untouched because this exact text becomes part of
# the training data; if it is corrected, use the same corrected prompt at inference time.
system_message='''You are an polyglot, who are having exceptional linguistic and cultural domain knowledge. Also, you are an native speaker of hindi, bengali and thai.'''

In [None]:
def format_data(sample):
    """Build a three-turn chat conversation from one dataset row.

    Args:
        sample: Mapping that provides the keys ``"image"``, ``'Actual idiom'``
            and ``"Descriptive Meaning(Human Annotation)"``.

    Returns:
        A list ``[system, user, assistant]`` of message dicts. The system turn
        carries the module-level ``system_message``, the user turn carries the
        image followed by the idiom text, and the assistant turn carries the
        human-annotated meaning.
    """
    def _text(value):
        # Wrap a value as a text content part.
        return {"type": "text", "text": value}

    system_turn = {"role": "system", "content": [_text(system_message)]}

    user_turn = {
        "role": "user",
        "content": [
            {"type": "image", "image": sample["image"]},
            _text(sample['Actual idiom']),
        ],
    }

    assistant_turn = {
        "role": "assistant",
        "content": [_text(sample["Descriptive Meaning(Human Annotation)"])],
    }

    return [system_turn, user_turn, assistant_turn]

In [None]:
# Print the loaded dataset object as a quick sanity check.
print(dataset)

In [None]:
# print(dataset[3132])
# print(len(dataset))

In [None]:
# Partition the rows into three per-language subsets using hard-coded index ranges:
#   Hindi   -> rows 0..1276
#   Bengali -> rows 1277..1381 and rows 3133..3532
#   Thai    -> rows 1382..3132
# NOTE(review): assumes the dataset has at least 3533 rows stored in exactly this
# language order — TODO confirm against the dataset whenever it is replaced.
print("++++++Seperating the Dataset on Lingual Basis++++++++++")
dataset_hindi = dataset.select(range(0,1277))
dataset_thai = dataset.select(range(1382,3133))
# Bengali rows live in two separate index ranges, so the indices are concatenated.
bengali_indicies = list(range(1277,1382))+list(range(3133,3533))
dataset_bengali= dataset.select(bengali_indicies)

In [None]:
from datasets import Dataset,concatenate_datasets

In [None]:
def split_dataset(dataset1):
    """Split a dataset into roughly 70% train, 20% validation and 10% test.

    Args:
        dataset1: Object exposing ``train_test_split(test_size=..., seed=...)``
            that returns a mapping with ``'train'`` and ``'test'`` entries
            (for example a Hugging Face ``datasets.Dataset``).

    Returns:
        Tuple ``(train_dataset, val_dataset, test_dataset)``. Exact sizes depend
        on how ``train_test_split`` rounds fractional row counts.
    """
    # Step 1: hold out 30% of the rows; the remaining 70% is the training set.
    train_holdout = dataset1.train_test_split(test_size=0.3, seed=42)
    train_dataset = train_holdout['train']
    holdout_dataset = train_holdout['test']

    # Step 2: split the 30% hold-out into validation and test.
    # `test_size` is the fraction that ends up in the 'test' part, so 1/3 of the
    # hold-out (10% overall) becomes the test set and the remaining 2/3
    # (20% overall) becomes the validation set. Using test_size=2/3 here would
    # swap the two proportions.
    val_test = holdout_dataset.train_test_split(test_size=1/3, seed=42)
    val_dataset = val_test['train']    # 20% of the original rows
    test_dataset = val_test['test']    # 10% of the original rows

    return train_dataset, val_dataset, test_dataset

In [None]:
# Split each language subset separately (split_dataset returns train, val, test),
# so every language contributes to all three final splits.
print("++++++Splitting the Dataset and Merging++++++++++")
train_dataset_hindi, val_dataset_hindi, test_dataset_hindi = split_dataset(dataset_hindi)
train_dataset_thai, val_dataset_thai, test_dataset_thai = split_dataset(dataset_thai)
train_dataset_bengali, val_dataset_bengali, test_dataset_bengali = split_dataset(dataset_bengali)


# Merge the per-language pieces into one train, one validation and one test set.
# Concatenation order is Hindi, Thai, Bengali; the merged sets are not shuffled here.
train_dataset_final = concatenate_datasets([train_dataset_hindi,train_dataset_thai,train_dataset_bengali])
val_dataset_final = concatenate_datasets([val_dataset_hindi,val_dataset_thai,val_dataset_bengali])
test_dataset_final = concatenate_datasets([test_dataset_hindi,test_dataset_thai,test_dataset_bengali])

# Report the number of rows in the train / validation / test sets.
print(len(train_dataset_final),len(val_dataset_final),len(test_dataset_final))


# dataset.save_to_disk("test_dataset_final")

In [None]:
# Convert every row of each split into a chat conversation via format_data.
# The results are plain Python lists of message lists held in memory
# (nothing is serialised to JSON despite the banner text).
print("++++++Converting the Dataset to JSON format++++++++++")
train_dataset = [format_data(sample) for sample in train_dataset_final]
eval_dataset = [format_data(sample) for sample in val_dataset_final]
test_dataset = [format_data(sample) for sample in test_dataset_final]

In [None]:
# Inspect one formatted training conversation. The index 2000 is hard-coded and
# raises IndexError if the training list holds 2000 or fewer samples.
print(train_dataset[2000])

In [None]:
import torch
from transformers import Idefics3ForConditionalGeneration, AutoProcessor

# Hub id of the checkpoint that the model and processor are loaded from later on.
model_id = "HuggingFaceTB/SmolVLM-Instruct"
# Banner only: no model weights are loaded in this cell.
print("++++++Importing Models++++++++++")


In [None]:
print("++++++Cleaning up space++++++++++")
# gc and time are used by clear_memory below (garbage collection and pauses).
import gc
import time

def clear_memory():
    """Drop large training objects and report GPU memory usage.

    Removes the module-level names ``inputs``, ``model``, ``processor``,
    ``trainer``, ``peft_model`` and ``bnb_config`` when they exist and runs the
    garbage collector. When CUDA is available it then calls
    ``torch.cuda.empty_cache()`` / ``torch.cuda.synchronize()`` and prints the
    allocated and reserved GPU memory in GB.

    On a machine without CUDA the GPU steps are skipped instead of raising, so
    the helper can be called unconditionally.
    """
    # Remove the global references so the objects become collectable.
    # pop(..., None) is a no-op for names that were never defined.
    module_globals = globals()
    for name in ('inputs', 'model', 'processor', 'trainer', 'peft_model', 'bnb_config'):
        module_globals.pop(name, None)
    time.sleep(2)

    # Collect the objects that just lost their last reference.
    gc.collect()
    time.sleep(2)

    if not torch.cuda.is_available():
        # torch.cuda.synchronize() would raise without a CUDA device.
        print("CUDA is not available; skipping GPU cache cleanup.")
        return

    # Release cached GPU blocks and wait for pending GPU work to finish.
    torch.cuda.empty_cache()
    torch.cuda.synchronize()
    time.sleep(2)
    gc.collect()
    time.sleep(2)

    print(f"GPU allocated memory: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
    print(f"GPU reserved memory: {torch.cuda.memory_reserved() / 1024**3:.2f} GB")

# Free anything left over from a previous run before the model is loaded.
clear_memory()

In [None]:
print("++++++setting up BitsAndBytesConfig++++++++++")
from transformers import BitsAndBytesConfig

# 4-bit quantisation settings: NF4 quantisation type, double quantisation enabled,
# bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Load the quantised model and the matching processor from model_id.
model = Idefics3ForConditionalGeneration.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    quantization_config=bnb_config,
    # NOTE(review): underscore-prefixed keyword; the documented from_pretrained
    # argument is `attn_implementation` — confirm this spelling is honoured by the
    # installed transformers version. Flash-attention presumably also needs the
    # flash-attn package and a supported GPU — TODO confirm.
    _attn_implementation="flash_attention_2",
)
processor = AutoProcessor.from_pretrained(model_id)

In [None]:
from peft import LoraConfig, get_peft_model

# Configure LoRA: rank 8, alpha 8, 10% adapter dropout, DoRA enabled,
# Gaussian initialisation of the adapter weights.
peft_config = LoraConfig(
    r=8,
    lora_alpha=8,
    lora_dropout=0.1,
    # Module-name suffixes that receive adapters (attention and MLP projections).
    target_modules=['down_proj','o_proj','k_proj','q_proj','gate_proj','up_proj','v_proj'],
    use_dora=True,
    init_lora_weights="gaussian"
)

# Apply PEFT model adaptation.
# NOTE(review): the same peft_config is also handed to SFTTrainer together with
# `model`; confirm the adapters are not meant to be applied in only one of the
# two places.
peft_model = get_peft_model(model, peft_config)

# Print trainable parameters
peft_model.print_trainable_parameters()

In [None]:
from trl import SFTConfig

# Configure training arguments using SFTConfig.
# Per-device batch of 4 with 8 accumulation steps gives 32 samples per device
# for each optimizer step.
# NOTE(review): no evaluation strategy is set here although an eval_dataset is
# passed to the trainer — confirm whether evaluation is expected to run.
training_args = SFTConfig(
    output_dir="SmolVLM_Idiom_VL",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=8,
    warmup_steps=50,
    learning_rate=2e-4,
    weight_decay=0.01,
    logging_steps=10,
    save_strategy="steps",
    save_steps=20,  # checkpoint every 20 steps
    # save_total_limit=1,  # disabled: presumably every checkpoint is kept — verify disk usage
    optim="adamw_torch_fused",
    bf16=True,
    push_to_hub=True,
    report_to="wandb",
    remove_unused_columns=False,
    gradient_checkpointing=True,
    # Empty text field plus skip_prepare_dataset: presumably the custom collate_fn
    # does all tokenisation, so the trainer must not preprocess the data — TODO confirm.
    dataset_text_field="",
    dataset_kwargs={"skip_prepare_dataset": True},
)

In [None]:
print("++++++connecting to wanb++++++++++")
import wandb

# Start a Weights & Biases run; the training arguments are attached as the run config.
wandb.init(
    project="SmolVLM_Idiom_VL",  # change this
    name="SmolVLM_Idiom_VL",  # change this
    config=training_args,
)

In [None]:
# Look up the id of the "<image>" special token: find its position among the
# tokenizer's additional special tokens and read the id at that position.
# list.index raises ValueError if "<image>" is not among them.
image_token_id = processor.tokenizer.additional_special_tokens_ids[
            processor.tokenizer.additional_special_tokens.index("<image>")]

def collate_fn(examples):
    """Collate formatted conversations into one processor batch with labels.

    Args:
        examples: Sequence of conversations; each conversation's second message
            must hold the image as the first entry of its ``'content'`` list.

    Returns:
        The batch returned by the module-level ``processor`` with an added
        ``"labels"`` entry: a copy of ``input_ids`` in which padding tokens and
        image tokens are replaced by -100.
    """
    def _as_rgb(picture):
        # Only convert when the image is not already in RGB mode.
        return picture if picture.mode == 'RGB' else picture.convert('RGB')

    # Render each conversation to text with the chat template.
    prompts = []
    for conversation in examples:
        prompts.append(processor.apply_chat_template(conversation, tokenize=False))

    # One single-image list per conversation.
    pictures = [
        [_as_rgb(conversation[1]['content'][0]['image'])]
        for conversation in examples
    ]

    batch = processor(text=prompts, images=pictures, return_tensors="pt", padding=True)

    # Labels start as a copy of the inputs; padding and image positions are then
    # set to -100.
    targets = batch["input_ids"].clone()
    targets[targets == processor.tokenizer.pad_token_id] = -100
    targets[targets == image_token_id] = -100
    batch["labels"] = targets

    return batch

In [None]:
from trl import SFTTrainer

# Build the supervised fine-tuning trainer from the loaded model, the training
# arguments, the formatted train/eval conversations and the custom collator.
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=collate_fn,  # renders chat text, processes images, builds labels
    peft_config=peft_config,
    processing_class=processor.tokenizer,
)

In [None]:
# Run the fine-tuning loop with the settings in training_args.
print("++++++Starting the training++++++++++")
trainer.train()

In [None]:
# Save the trained model into the trainer's output directory ("SmolVLM_Idiom_VL").
print("++++++Saving the Model++++++++++")
trainer.save_model(training_args.output_dir)


In [None]:
# Training is finished: drop the model/trainer globals and free memory.
clear_memory()
