In [None]:
!pip3 install -q -U datasets accelerate
!pip3 install -q -U tensorflow sentencepiece tf-keras
!pip3 install -q -U bitsandbytes==0.42.0
!pip3 install -q -U peft==0.8.2
!pip3 install -q -U trl==0.7.10
!pip3 install -q -U accelerate==0.27.1
!pip3 install -q -U datasets==2.17.0
!pip3 install -q -U transformers==4.38.1

## Prepare the data

In [None]:
import pandas as pd

# Load the source table from a CSV file located relative to the working
# directory. The preparation cell reads its "UnDiacWord", "Definition" and
# "examples" columns.
data = pd.read_csv("examples_cohere_embeds.csv")

In [None]:
import ast

import datasets

# Fixed seed so the train/test split is identical on every run.
SPLIT_SEED = 42

data_dict = data.to_dict(orient="records")
data_list = []

# Build one {"text", "label"} record per CSV row:
#   text  = "<definition>, <first example>"
#   label = the word itself
for row in data_dict:
    word = row["UnDiacWord"]
    definition = row["Definition"]
    example = row["examples"]

    if example:
        # The cell is parsed as a Python literal and only its first item is
        # kept. ast.literal_eval accepts literals only, so text coming from
        # the CSV can never execute code (unlike eval()).
        try:
            example = ast.literal_eval(example)[0]
        except (ValueError, SyntaxError, TypeError, IndexError, KeyError):
            # Missing (NaN), malformed, or empty entries: skip the row.
            continue

    data_list.append({"text": f"{definition}, {example}", "label": f"{word}"})

# Convert the records to a Hugging Face dataset and hold out 10% for testing.
dataset = datasets.Dataset.from_list(data_list)
dataset = dataset.train_test_split(test_size=0.1, seed=SPLIT_SEED)

## Fine-tuning AYA

In [None]:
import os

import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, BitsAndBytesConfig

model_id = "CohereForAI/aya-101"

# Load the weights in 4-bit NF4 and run the matmuls in bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16
)

# Read the Hugging Face token once. When HF_TOKEN is not set this is None and
# `from_pretrained` falls back to its default credential lookup instead of the
# cell failing with a KeyError.
hf_token = os.environ.get("HF_TOKEN")

tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_token)
model = AutoModelForSeq2SeqLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map={"": 0},  # put the whole model on device 0
    token=hf_token,
)

In [None]:
# Set WANDB_DISABLED for this process before the trainer is created.
# Presumably this keeps the Hugging Face Trainer from starting a
# Weights & Biases run; the trainer below reports to TensorBoard instead.
os.environ["WANDB_DISABLED"] = "true"

In [None]:
# Tokenize the "text" column of `dataset` in batches.
# NOTE(review): `gemma_data` is not referenced again in the visible notebook,
# and its name does not match the model loaded above. It looks like a leftover
# from an earlier experiment; confirm before removing.
gemma_data = dataset.map(lambda samples: tokenizer(samples["text"]), batched=True)

In [None]:
# Truncation limits (in tokens) for the source text and for the target word.
max_input_length = 200
max_target_length = 100


def preprocess_function(examples):
    """Tokenize one batch of records into model inputs plus labels.

    Args:
        examples: a batch mapping with "text" (source strings) and "label"
            (target strings) entries.

    Returns:
        The tokenizer output for the "text" entries, with an extra "labels"
        entry holding the token ids of the "label" entries.
    """
    encoded = tokenizer(
        text=examples["text"], max_length=max_input_length, truncation=True
    )
    target_ids = tokenizer(
        examples["label"], max_length=max_target_length, truncation=True
    )["input_ids"]
    encoded["labels"] = target_ids
    return encoded


# Run the preprocessing over `dataset` in batches and drop the raw string
# columns, leaving only what the tokenizer and the "labels" step produced.
tokenized_dataset = dataset.map(
    function=preprocess_function,
    batched=True,
    remove_columns=["label", "text"],
)

In [None]:
# Show the tokenized dataset object as the cell output (sanity check).
tokenized_dataset

In [None]:
from peft import LoraConfig, TaskType, get_peft_model, prepare_model_for_kbit_training

# LoRA adapter configuration: rank-4 adapters on the modules named "q" and
# "v", no bias training, sequence-to-sequence task type.
lora_config = LoraConfig(
    r=4,
    lora_alpha=16,
    target_modules=["q", "v"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM,
)

# Prepare the quantized model for training. The model above is loaded in
# 4-bit, so use the k-bit helper; `prepare_model_for_int8_training` is a
# deprecated alias for it.
model = prepare_model_for_kbit_training(model)

# Wrap the model with the LoRA adapter and report how many parameters train.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

In [None]:
from transformers import DataCollatorForSeq2Seq

# Label positions filled with this id are padding and are skipped by the loss.
label_pad_token_id = -100

# Batch collator: pads each batch, using `label_pad_token_id` for the labels
# and rounding padded lengths up to a multiple of 8.
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=label_pad_token_id,
    pad_to_multiple_of=8,
)

In [None]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

output_dir = "lora-aya"

# Training configuration: 3 epochs at a comparatively high learning rate,
# automatic batch-size search, TensorBoard logging every 500 steps, and no
# checkpoints written during training.
training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    num_train_epochs=3,
    learning_rate=1e-3,
    auto_find_batch_size=True,
    save_strategy="no",
    report_to="tensorboard",
    logging_strategy="steps",
    logging_steps=500,
    logging_dir=f"{output_dir}/logs",
)

# Trainer over the training split only; no evaluation dataset is passed.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=tokenized_dataset["train"],
    data_collator=data_collator,
)

# Disabled to silence warnings while training; turn it back on for inference.
model.config.use_cache = False

In [None]:
# Run the fine-tuning loop configured above. With save_strategy="no" the
# trainer is not asked to write checkpoints, so the trained weights live only
# in the in-memory `model` afterwards.
trainer.train()

In [None]:
text = "زمن طويل غير محدود.	"
device = "cuda:0"

# Switch the model to inference settings: eval mode turns off the LoRA
# dropout that is active during training, and the generation cache that was
# disabled for training is re-enabled.
model.eval()
model.config.use_cache = True

inputs = tokenizer(text, return_tensors="pt").to(device)

# Generate at most 10 new tokens and print them without special tokens.
outputs = model.generate(**inputs, max_new_tokens=10)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

In [None]:
# Preview the first rows of the raw table loaded from the CSV.
data.head()