<a href="https://colab.research.google.com/github/shinsuikyo/cumberlands/blob/main/NLPWeek5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

https://huggingface.co/google-t5/t5-large
https://huggingface.co/Helsinki-NLP/opus-mt-en-fr

In [45]:
!pip install transformers torch sentencepiece sacremoses




In [46]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, MarianMTModel, MarianTokenizer
import difflib
from difflib import SequenceMatcher


In [47]:
# Load the model and tokenizer
model_names = {
    "T5": "google-t5/t5-large",
    "MarianMT": "Helsinki-NLP/opus-mt-en-fr"
}

In [48]:
english_text = "Look, I didn’t want to be a half-blood. If you’re reading this because you think you might be one, my advice is: close this book right now. Believe whatever lie your mom or dad told you about your birth, and try to lead a normal life. Being a half-blood is dangerous. It’s scary. Most of the time, it gets you killed in painful, nasty ways. If you’re a normal kid, reading this because you think it’s fiction, great. Read on. I envy you for being able to believe that none of this ever happened. But if you recognize yourself in these pages..."

In [49]:
def translate_t5(text):
    tokenizer = T5Tokenizer.from_pretrained(model_names["T5"])
    model = T5ForConditionalGeneration.from_pretrained(model_names["T5"])
    input_text = f"translate English to French: {english_text}"
    inputs = tokenizer(input_text, return_tensors="pt")
    translated_tokens = model.generate(**inputs)
    return tokenizer.decode(translated_tokens[0], skip_special_tokens=True)


In [50]:
def translate_marian(text):
    tokenizer = MarianTokenizer.from_pretrained(model_names["MarianMT"])
    model = MarianMTModel.from_pretrained(model_names["MarianMT"])
    text = ">>fr<< " + text
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    translated_tokens = model.generate(**inputs)
    return tokenizer.decode(translated_tokens[0], skip_special_tokens=True)


In [51]:
t5 = translate_t5(english_text)


In [52]:
marian = translate_marian(english_text)


tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/778k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.34M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/301M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

In [53]:
# Compute similarity
def compute_similarity(text1, text2):
    return SequenceMatcher(None, text1, text2).ratio()



In [54]:
similarity_score = compute_similarity(t5, marian)


In [55]:
# Compute differences
diff = list(difflib.ndiff(t5.split(), marian.split()))
diff_output = '\n'.join(diff)


In [56]:
# Write to output file
output_filename = "translation_comparison.txt"
with open(output_filename, "w", encoding="utf-8") as file:
    file.write("Comparison of T5 and MarianMT translation models:\n\n")
    file.write(f"T5 Translation:\n{t5}\n\n")
    file.write(f"MarianMT Translation:\n{marian}\n\n")
    file.write(f"Similarity Score: {similarity_score:.2f}\n\n")
    file.write("Differences:\n")
    file.write(diff_output)

print(f"Comparison results saved to {output_filename}")


Comparison results saved to translation_comparison.txt


In [57]:
similarity_score

0.13312693498452013