In [23]:
import pandas as pd
from transformers import MarianTokenizer, MarianMTModel, Seq2SeqTrainer, Seq2SeqTrainingArguments
from datasets import Dataset

In [21]:
pip install sentencepiece transformers datasets



In [24]:
data_file = "translation_dataset.csv"
df = pd.read_csv(data_file)

In [6]:
print(df.head())

              english                        french
0               Hello                       Bonjour
1        How are you?               Comment ça va ?
2  What is your name?   Comment vous appelez-vous ?
3   I am happy today.  Je suis heureux aujourd'hui.
4    The sky is blue.             Le ciel est bleu.


In [27]:
source_language = "english"
target_language = "french"

In [26]:
model_name = "Helsinki-NLP/opus-mt-en-fr"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)



In [28]:
def preprocess_function(examples):
    # Extract input and target sentences correctly from the 'examples' dictionary
    inputs = examples['english']  # 'english' column
    targets = examples['french']  # 'french' column

    # Tokenize both the inputs and the targets
    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding="max_length")
    labels = tokenizer(targets, max_length=128, truncation=True, padding="max_length")

    # Add the labels to the model inputs
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [29]:
tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=["english", "french"])

Map:   0%|          | 0/69 [00:00<?, ? examples/s]

In [31]:
print(tokenized_dataset[0])

{'input_ids': [10537, 0, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513], 'attention_mask': [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [32]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir="./results",          # Output directory
    evaluation_strategy="epoch",     # Evaluation strategy
    learning_rate=2e-5,              # Learning rate
    per_device_train_batch_size=8,   # Batch size for training
    per_device_eval_batch_size=8,    # Batch size for evaluation
    num_train_epochs=3,              # Number of epochs
    weight_decay=0.01,               # Weight decay
    logging_dir='./logs',            # Directory for storing logs
    logging_steps=10,
)

trainer = Seq2SeqTrainer(
    model=model,                         # The model to train
    args=training_args,                  # Training arguments
    train_dataset=tokenized_dataset,     # Training dataset
    eval_dataset=tokenized_dataset,      # Evaluation dataset
)

# Step 8: Start Training (this may take time)
trainer.train()



Epoch,Training Loss,Validation Loss
1,No log,0.465213
2,3.256000,0.358544
3,0.393100,0.335495




TrainOutput(global_step=27, training_loss=1.4439582648100677, metrics={'train_runtime': 432.521, 'train_samples_per_second': 0.479, 'train_steps_per_second': 0.062, 'total_flos': 7016961540096.0, 'train_loss': 1.4439582648100677, 'epoch': 3.0})

In [33]:
# Save the fine-tuned model
model.save_pretrained("fine_tuned_translation_model")
tokenizer.save_pretrained("fine_tuned_translation_model")

('fine_tuned_translation_model/tokenizer_config.json',
 'fine_tuned_translation_model/special_tokens_map.json',
 'fine_tuned_translation_model/vocab.json',
 'fine_tuned_translation_model/source.spm',
 'fine_tuned_translation_model/target.spm',
 'fine_tuned_translation_model/added_tokens.json')

In [34]:
test_sentences = ["Hello", "How are you?", "What is your name?"]
inputs = tokenizer(test_sentences, return_tensors="pt", padding=True, truncation=True)
outputs = model.generate(**inputs)
translations = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]

print("\nTranslations:")
for sentence, translation in zip(test_sentences, translations):
    print(f"{sentence} -> {translation}")


Translations:
Hello -> Bonjour.
How are you? -> Comment ça va ?
What is your name? -> Comment vous appelez-vous ?
