In [15]:
from peft import prepare_model_for_kbit_training
from peft import LoraConfig , get_peft_model
from datasets import load_dataset , DatasetDict
import transformers

In [33]:
# from transformers import M2M100ForConditionalGeneration
# from tokenization_small100 import SMALL100Tokenizer

# hi_text = "i went to the officers cabin"

# model = M2M100ForConditionalGeneration.from_pretrained("alirezamsh/small100")
# tokenizer = SMALL100Tokenizer.from_pretrained("alirezamsh/small100")

# # translate English to Gujarati
# tokenizer.tgt_lang = "gu"
# encoded_hi = tokenizer(hi_text, return_tensors="pt")
# generated_tokens = model.generate(**encoded_hi)
# tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)

In [16]:
from transformers import M2M100ForConditionalGeneration
from tokenization_small100 import SMALL100Tokenizer

class M2M100Wrapper(M2M100ForConditionalGeneration):
    """M2M100 model whose ``forward`` tolerates a ``num_items_in_batch`` keyword.

    The keyword is discarded before delegating to the parent ``forward``;
    every other positional and keyword argument is passed through unchanged.
    """

    def forward(self , *args , **kwargs):
        """Run the parent forward pass without the ``num_items_in_batch`` keyword."""
        forwarded = {
            key: value
            for key, value in kwargs.items()
            if key != 'num_items_in_batch'
        }
        return super().forward(*args, **forwarded)

# Model and tokenizer are loaded from the same checkpoint; the tokenizer's
# target language is set to Gujarati ("gu").
# NOTE(review): M2M100Wrapper is defined above, but the plain
# M2M100ForConditionalGeneration class is what gets instantiated here --
# confirm which one is intended.
checkpoint = "alirezamsh/small100"
model = M2M100ForConditionalGeneration.from_pretrained(checkpoint)
tokenizer = SMALL100Tokenizer.from_pretrained(checkpoint, tgt_lang="gu")

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'M2M100Tokenizer'. 
The class this function is called from is 'SMALL100Tokenizer'.


In [17]:
# Put the model in training mode and make the embedding outputs require
# gradients before handing it to PEFT's preparation helper.
model.train()
model.enable_input_require_grads()
# NOTE(review): the model above is loaded without any quantization arguments;
# confirm that the k-bit preparation helper is actually needed here.
model = prepare_model_for_kbit_training(model)

In [18]:
# LoRA adapter settings: rank-16 adapters (alpha 32, dropout 0.05) attached
# only to the modules named "q_proj"; bias terms are left out of training.
config = LoraConfig(
    task_type="SEQ_2_SEQ_LM",
    target_modules=["q_proj"],
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

# Wrap the model with the adapters, then report how many parameters train.
model = get_peft_model(model, config)
model.print_trainable_parameters()

trainable params: 589,824 || all params: 333,325,312 || trainable%: 0.17695145815988916


In [19]:
# Load the training split of the English-Gujarati parallel corpus.
data = load_dataset("Hemanth-thunder/english-to-gujarati-mt", split="train")

In [20]:
# Keep a seeded random 40% sample of the corpus.
subset_size = int(0.4 * len(data))
small_data = data.shuffle(seed=42).select(range(subset_size))

In [21]:
# Hold out 10% of the sampled pairs as a test split; the seed is fixed at 42.
data_splitted = small_data.train_test_split(test_size=0.1 , seed=42)

In [24]:
# Re-pack the train/test splits into a DatasetDict (rebinds `data`).
data = DatasetDict(
    {split: data_splitted[split] for split in ('train', 'test')}
)

In [25]:
def tokenize_function(examples):
    """Tokenize a batch of English -> Gujarati sentence pairs.

    Args:
        examples: Batch dict supplied by ``Dataset.map(batched=True)``. The
            "en" column is used as source text and the "gu" column as target
            text.

    Returns:
        The tokenizer's batch encoding for the source text, with the tokenized
        target text attached via ``text_target``. Sequences are truncated to
        at most 128 tokens and are NOT padded.
    """
    # Over-long sequences are truncated from the left-hand side.
    tokenizer.truncation_side = "left"
    # Deliberately no padding="max_length": static padding would write real
    # pad-token ids into the labels, and the loss would then be computed on
    # padding positions. The DataCollatorForSeq2Seq used for batching pads
    # each batch dynamically and fills label padding with -100, which the
    # loss ignores.
    return tokenizer(
        examples["en"],
        text_target=examples["gu"],
        truncation=True,
        max_length=128,
    )

tokenized_data = data.map(tokenize_function , batched=True)

Map:   0%|          | 0/1104404 [00:00<?, ? examples/s]

Map:   0%|          | 0/122712 [00:00<?, ? examples/s]

In [26]:
# The tokenizer already defines its own pad token, so it is left untouched:
# aliasing pad_token to eos_token would make padding and end-of-sentence ids
# indistinguishable for any code that masks padding by token id.
# Label padding uses -100 so padded target positions are ignored by the loss.
data_collator = transformers.DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=-100,
)

In [27]:
# Core optimisation hyperparameters.
lr, batch_size, num_epochs = 2e-4, 4, 10

training_args = transformers.TrainingArguments(
    # Where checkpoints are written; no external experiment tracker.
    output_dir="guj",
    report_to="none",
    # Optimiser and schedule.
    optim="paged_adamw_8bit",
    learning_rate=lr,
    weight_decay=0.01,
    warmup_steps=2,
    num_train_epochs=num_epochs,
    # Batching: 4 samples per device per step, accumulated over 4 steps.
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=4,
    # Mixed-precision training.
    fp16=True,
    # Log, evaluate and checkpoint once per epoch; reload the best checkpoint
    # when training finishes.
    logging_strategy="epoch",
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

In [None]:
# Build the trainer from the LoRA-wrapped model, the tokenized splits and the
# seq2seq collator.
trainer = transformers.Trainer(
    model=model,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
    args=training_args,
    data_collator=data_collator,
)

model.train()
# The cache flag is switched off for the duration of training only.
model.config.use_cache = False
try:
    trainer.train()
finally:
    # Restore the flag even if training raises or is interrupted, so later
    # cells never see a model left with use_cache disabled.
    model.config.use_cache = True