In [4]:
from datasets import load_dataset
from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments, DataCollatorForSeq2Seq
import pandas as pd
import torch

In [2]:

# Build a parallel-corpus CSV from two line-aligned text files:
# line i of data/source.txt is paired with line i of data/target.txt.
with open('data/source.txt', 'r', encoding='utf-8') as f:
    source_sentences = f.readlines()

with open('data/target.txt', 'r', encoding='utf-8') as f:
    target_sentences = f.readlines()

# The files are aligned by line number, so a length mismatch means the
# pairs below would be silently misaligned.
assert len(source_sentences) == len(target_sentences), "Source and target files must have the same number of lines."

# Strip surrounding whitespace, then drop every pair where either side is
# empty. An empty string is written to CSV as an empty field, which is read
# back as a missing value (None) rather than '' and would break the string
# concatenation done during preprocessing.
sentence_pairs = [
    (src.strip(), tgt.strip())
    for src, tgt in zip(source_sentences, target_sentences)
]
sentence_pairs = [(src, tgt) for src, tgt in sentence_pairs if src and tgt]
assert sentence_pairs, "No non-empty sentence pairs found in the source/target files."

# Create a DataFrame with one row per usable sentence pair
df = pd.DataFrame(sentence_pairs, columns=['source', 'target'])

# Save to CSV (no index column, so the file has exactly 'source' and 'target')
df.to_csv('data/data.csv', index=False)

In [5]:
# Load the sentence-pair CSV as a DatasetDict with a single 'train' split.
dataset = load_dataset('csv', data_files={'train': 'data/data.csv'})

model_name = 'jbochi/madlad400-3b-mt'
tokenizer = T5Tokenizer.from_pretrained(model_name)
# Load the weights in the default float32. Mixed-precision training
# (fp16=True in the training arguments) keeps float32 master weights and
# casts on the fly; if the parameters themselves are float16, the gradient
# scaler fails with "Attempting to unscale FP16 gradients".
model = T5ForConditionalGeneration.from_pretrained(model_name)

In [9]:
# Translation direction. The target-language tag "<2xx>" is prepended to
# every source sentence to tell the model which language to produce.
source_lang, target_lang = "en", "sw"
task_prefix = "<2" + target_lang + ">"

In [10]:
def preprocess_function(examples):
    """Tokenize a batch of sentence pairs for seq2seq fine-tuning.

    Args:
        examples: A batch from ``dataset.map(..., batched=True)`` — a mapping
            with a ``'source'`` list and a ``'target'`` list of strings.

    Returns:
        The tokenizer output for the prefixed source sentences
        (``input_ids``, ``attention_mask``) plus a ``labels`` entry holding
        the token ids of the target sentences.

    Sequences are truncated to 512 tokens but deliberately NOT padded here.
    Padding is left to ``DataCollatorForSeq2Seq``, which pads each batch to
    its longest sequence and fills label padding with -100 so the loss
    ignores it. Padding labels to ``max_length`` with the real pad token id
    would make the model train on (and be scored against) padding.
    """
    # The model card's usage example writes the language tag and the text
    # separated by a single space ("<2xx> text"), so mirror that format.
    inputs = [f"{task_prefix} {src}" for src in examples['source']]
    targets = examples['target']

    # `text_target=` tokenizes the targets as labels in the same call; it
    # replaces the deprecated `tokenizer.as_target_tokenizer()` context.
    model_inputs = tokenizer(
        inputs,
        text_target=targets,
        max_length=512,
        truncation=True,
    )
    return model_inputs

In [11]:
# Tokenize every split in batches; each call to preprocess_function
# receives a dict of column lists rather than a single row.
tokenized_dataset = dataset.map(function=preprocess_function, batched=True)

Map: 100%|██████████| 10/10 [00:00<00:00, 232.61 examples/s]


In [12]:
# Hold out 10% of the tokenized pairs for evaluation. The seed is fixed so
# that Restart & Run All always produces the same train/eval partition and
# evaluation numbers stay comparable between runs.
split_dataset = tokenized_dataset['train'].train_test_split(test_size=0.1, seed=42)
train_dataset = split_dataset['train']
eval_dataset = split_dataset['test']

In [13]:
# Fine-tuning hyperparameters, grouped by concern and then unpacked into
# TrainingArguments.
training_config = dict(
    # Where checkpoints and logs go
    output_dir="./madlad400-finetuned",
    save_total_limit=2,
    logging_dir='./logs',
    logging_steps=10,
    # Optimisation schedule
    num_train_epochs=3,
    learning_rate=5e-5,
    weight_decay=0.01,
    # Batch sizes — adjust based on GPU memory
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    # Evaluate on the held-out split at the end of every epoch
    evaluation_strategy="epoch",
    # Use mixed precision if supported
    fp16=True,
)
training_args = TrainingArguments(**training_config)



In [14]:

# Collator that assembles seq2seq batches (inputs and labels) for the model.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# Wire the model, hyperparameters, data splits and batching together.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

In [15]:
# Run the fine-tuning loop configured on `trainer`; the returned training
# summary is displayed as the cell output.
trainer.train()

  0%|          | 0/9 [00:00<?, ?it/s]

KeyboardInterrupt: 