In [1]:
from transformers import MBart50Tokenizer, MBartForConditionalGeneration

# Load the mBART-50 many-to-many translation model and its matching tokenizer.
model_name = "facebook/mbart-large-50-many-to-many-mmt"
tokenizer = MBart50Tokenizer.from_pretrained(model_name)
model = MBartForConditionalGeneration.from_pretrained(model_name)

# Translation settings, gathered here so they are easy to tune.
# NOTE: "te_IN" is the mBART-50 code for Telugu (Tamil would be "ta_IN").
SRC_LANG = "en_XX"
TGT_LANG = "te_IN"
MAX_NEW_TOKENS = 200  # upper bound on generated tokens per sentence

# Input data in the source language (English)
source_sentences = [
    "Hello, how are you?",
    "What is your name?",
    "I love coding!"
]

# Parallel corpus, rebuilt from scratch every time this cell runs.
# Each entry of translated_corpus is the *list* returned by batch_decode
# (one string per input sequence, so a one-element list here); downstream
# cells read the sentence with translation[0].
source_corpus = []
translated_corpus = []

# Loop-invariant setup: the source language and the target-language token id
# are the same for every sentence, so resolve them once before the loop.
tokenizer.src_lang = SRC_LANG
target_lang_id = tokenizer.lang_code_to_id[TGT_LANG]

# Translate each source sentence to Telugu
for sentence in source_sentences:

    # Tokenization (single sentence -> batch of size 1, PyTorch tensors)
    encoded_input = tokenizer(sentence, return_tensors="pt")

    # Forcing the first generated token to the target language code is how
    # the many-to-many mBART-50 checkpoint is told which language to emit.
    generated_tokens = model.generate(
        **encoded_input,
        forced_bos_token_id=target_lang_id,
        max_new_tokens=MAX_NEW_TOKENS,
    )

    # Detokenization (drops language-code / EOS special tokens)
    translated_sentence = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)

    # Append source and translated sentences to the respective lists
    source_corpus.append(sentence)
    translated_corpus.append(translated_sentence)



In [2]:
# Print the parallel corpus, one English/Telugu pair at a time.
# Each item of translated_corpus is the list returned by batch_decode, so the
# sentence itself is element 0. The target language is Telugu ("te_IN"), so
# the label says Telugu rather than Tamil.
for source, translation in zip(source_corpus, translated_corpus):
    print(f"Source (English): {source}")
    print(f"Translation (Telugu): {translation[0]}")




Source (English): Hello, how are you?
Translation (Tamil): హలో, ఎలా ఉన్నావు?
Source (English): What is your name?
Translation (Tamil): మీ పేరు ఏమిటి?
Source (English): I love coding!
Translation (Tamil): నేను ప్రోగ్రామ్ ప్రేమ!


In [3]:
# Persist the parallel corpus as two line-aligned UTF-8 text files:
# line i of the English file corresponds to line i of the Telugu file.
output_file1 = "english_augmented_corpus.txt"
with open(output_file1, "w", encoding="utf-8") as english_out:
    # One source sentence per line.
    english_out.writelines(sentence + "\n" for sentence in source_corpus)

output_file2 = "telugu_augmented_corpus.txt"
with open(output_file2, "w", encoding="utf-8") as telugu_out:
    # Each corpus entry is a list (the batch_decode result); write its first
    # element. Passing the list itself to write() raises
    # "TypeError: write() argument must be str, not list".
    telugu_out.writelines(decoded[0] + "\n" for decoded in translated_corpus)

TypeError: write() argument must be str, not list