# LLM Portion: Machine Translation with NLLB-200

# **Please create a new environment first before installing libraries and dependencies**

# Install Libraries and Dependencies

In [12]:
%pip install torch transformers deep-translator

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


## Import Libraries

In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from deep_translator import GoogleTranslator
import re

## Load NLLB Model

In [2]:
# Load NLLB-200 for MT
mt_model_name = "facebook/nllb-200-distilled-600M"
mt_tokenizer = AutoTokenizer.from_pretrained(mt_model_name, src_lang="zho_Hans")
mt_model = AutoModelForSeq2SeqLM.from_pretrained(mt_model_name).to("cuda")
print("NLLB-200 model loaded and moved to GPU.")

NLLB-200 model loaded and moved to GPU.


## Define Pipeline Functions

In [9]:
def clean_text(text):
    """Clean the input text by removing excessive repetitions and normalizing spaces."""
    text = re.sub(r'(\\b\\w+\\b)(?:\\s+\\1)+', r'\\1', text)  # Remove repeated words
    text = re.sub(r'\\s+', ' ', text).strip()  # Normalize spaces
    return text

def translate_text(text, target_lang="eng_Latn", use_nllb=True):
    """Translate text using either NLLB-200 or GoogleTranslator."""
    text = clean_text(text)
    print(f"Cleaned Input: {text}")
    
    if use_nllb:
        try:
            print(f"Input text: {text}")
            
            # Prepend target language to input (NLLB convention)
            input_text = f"[{target_lang}] {text}"
            inputs = mt_tokenizer(input_text, return_tensors="pt", padding=True, truncation=True, max_length=512).to("cuda")
            print(f"Tokenized input: {inputs}")
            
            # Generate translation
            translated_ids = mt_model.generate(
                **inputs,
                max_length=512,
                num_beams=5,
                no_repeat_ngram_size=2
            )
            
            # Decode translation
            translated_text = mt_tokenizer.decode(translated_ids[0], skip_special_tokens=True)
            print(f"NLLB Translated text: {translated_text}")
            return translated_text
        except Exception as e:
            print(f"NLLB Translation Error: {str(e)}")
            return f"NLLB Translation Error: {str(e)}"
    else:
        try:
            translated_text = GoogleTranslator(source="zh-CN", target="en").translate(text)
            print(f"Google Translated text: {translated_text}")
            return translated_text
        except Exception as e:
            print(f"Google Translation Error: {str(e)}")
            return f"Google Translation Error: {str(e)}"

## Test the Translation Pipeline with Sample 1

In [10]:
# Test with the sample transcribed text
sample_text = "你好泡泡你好你好说你好好的好的说你好跟着好的做实验So easy自然科学重量实验欢迎来到实验 先准备两桶水记得把水装到一样的高度哦再拿一个苹果一个硬币一个硬币 跟着我放放看哪边的水看起来比较多呢用学习单记录优优独播剧场——YoYo Television Series Exclusive 把蘋果和硬幣拿出來換其他東西吧看看我的香蕉跟芭樂哪邊的水比較多 是芭乐你的实验结果如何呢用学习单记录下来 请不吝点赞 订阅 转发 打赏支持明镜与点点栏目"

print("\nTesting Sample 1:")
print(f"Original: {sample_text}")

# Test with NLLB
nllb_translated = translate_text(sample_text, target_lang="eng_Latn", use_nllb=True)
print("\nNLLB Final Translation:", nllb_translated)

# Test with Google Translate for comparison
google_translated = translate_text(sample_text, target_lang="eng_Latn", use_nllb=False)
print("\nGoogle Final Translation:", google_translated)


Testing Sample 1:
Original: 你好泡泡你好你好说你好好的好的说你好跟着好的做实验So easy自然科学重量实验欢迎来到实验 先准备两桶水记得把水装到一样的高度哦再拿一个苹果一个硬币一个硬币 跟着我放放看哪边的水看起来比较多呢用学习单记录优优独播剧场——YoYo Television Series Exclusive 把蘋果和硬幣拿出來換其他東西吧看看我的香蕉跟芭樂哪邊的水比較多 是芭乐你的实验结果如何呢用学习单记录下来 请不吝点赞 订阅 转发 打赏支持明镜与点点栏目
Cleaned Input: 你好泡泡你好你好说你好好的好的说你好跟着好的做实验So easy自然科学重量实验欢迎来到实验 先准备两桶水记得把水装到一样的高度哦再拿一个苹果一个硬币一个硬币 跟着我放放看哪边的水看起来比较多呢用学习单记录优优独播剧场——YoYo Television Series Exclusive 把蘋果和硬幣拿出來換其他東西吧看看我的香蕉跟芭樂哪邊的水比較多 是芭乐你的实验结果如何呢用学习单记录下来 请不吝点赞 订阅 转发 打赏支持明镜与点点栏目
Input text: 你好泡泡你好你好说你好好的好的说你好跟着好的做实验So easy自然科学重量实验欢迎来到实验 先准备两桶水记得把水装到一样的高度哦再拿一个苹果一个硬币一个硬币 跟着我放放看哪边的水看起来比较多呢用学习单记录优优独播剧场——YoYo Television Series Exclusive 把蘋果和硬幣拿出來換其他東西吧看看我的香蕉跟芭樂哪邊的水比較多 是芭乐你的实验结果如何呢用学习单记录下来 请不吝点赞 订阅 转发 打赏支持明镜与点点栏目
Tokenized input: {'input_ids': tensor([[256200,    709, 256047,  10109,  12695, 249725, 254339, 254339, 249188,
         249725, 249188, 249725, 250102, 249188, 249725,  96339,  96339, 250102,
         249188, 249725, 252229, 250347,  96339, 250566, 250842, 252507,  30743,
       

## Test the Translation Pipeline with Additional Samples

In [11]:
# Additional test samples for robustness
test_samples = [
    "你好！这是一个简单的测试句子。",
    "今天的天气很好，我们去公园散步吧。",
    "科学实验很有趣，我们来试试看吧！So interesting!"
]

for i, sample in enumerate(test_samples, 2):
    print(f"\nTesting Sample {i}:")
    print(f"Original: {sample}")
    
    # Test with NLLB
    nllb_translated = translate_text(sample, target_lang="eng_Latn", use_nllb=True)
    print("\nNLLB Final Translation:", nllb_translated)
    
    # Test with Google Translate for comparison
    google_translated = translate_text(sample, target_lang="eng_Latn", use_nllb=False)
    print("\nGoogle Final Translation:", google_translated)


Testing Sample 2:
Original: 你好！这是一个简单的测试句子。
Cleaned Input: 你好！这是一个简单的测试句子。
Input text: 你好！这是一个简单的测试句子。
Tokenized input: {'input_ids': tensor([[256200,    709, 256047,  10109,  12695, 249725, 248203,  74640,  14257,
         253502, 252255, 248506, 252484, 252409, 252314, 249316, 253935,      2]],
       device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
       device='cuda:0')}
NLLB Translated text: [ ] Hello! This is a simple test sentence.

NLLB Final Translation: [ ] Hello! This is a simple test sentence.
Cleaned Input: 你好！这是一个简单的测试句子。
Google Translated text: Hello! This is a simple test sentence.

Google Final Translation: Hello! This is a simple test sentence.

Testing Sample 3:
Original: 今天的天气很好，我们去公园散步吧。
Cleaned Input: 今天的天气很好，我们去公园散步吧。
Input text: 今天的天气很好，我们去公园散步吧。
Tokenized input: {'input_ids': tensor([[256200,    709, 256047,  10109,   7213, 249910, 248506, 249910, 252663,
         251563, 249725, 248079,  14994, 249884, 2498