In [1]:
from collections import Counter

from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Both the tokenizer and the seq2seq model are loaded from the same
# pretrained checkpoint, so the identifier is spelled out only once.
checkpoint = "facebook/nllb-200-distilled-600M"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

# Lookup table: language code reported by the detector (key) -> language code
# understood by the NLLB-200 model (value). Detected languages missing from
# this table are handled by the caller's fallback.
lang_code_mapping = {
    # English
    "en": "eng_Latn",
    # Hindi
    "hi": "hin_Deva",
    # Gujarati
    "gu": "guj_Gujr",
    # Marathi
    "mr": "mar_Deva",
    # Telugu
    "te": "tel_Telu",
    # Urdu -- also used for Hyderabadi (Dakhini), which is written in Urdu script
    "ur": "urd_Arab",
    # Extend with further detector-code -> NLLB-code pairs as needed.
}

def predominant_language(sentence):
    """Return the NLLB-200 code of the language most words of *sentence* are in.

    The sentence is split on whitespace and ``detect`` is run on every word
    separately. The language reported for the largest number of words wins;
    on a tie the language that was seen first wins.

    Args:
        sentence: Text to analyse; it may mix several languages.

    Returns:
        The ``lang_code_mapping`` entry for the winning language,
        ``"eng_Latn"`` when the winning language has no entry in the mapping,
        or the literal string ``"Unknown"`` when no word could be classified
        (for example an empty or whitespace-only sentence).
    """
    language_counts = Counter()

    for word in sentence.split():
        try:
            lang = detect(word)
        except LangDetectException:
            # Detection is best-effort per word: tokens the detector cannot
            # classify are skipped. Only the detector's own error is caught,
            # so KeyboardInterrupt and genuine bugs are no longer swallowed.
            continue
        language_counts[lang] += 1

    if not language_counts:
        return "Unknown"

    # Counter preserves insertion order, so ties resolve to the first-seen
    # language.
    predominant = max(language_counts, key=language_counts.get)

    # Fall back to English for detected languages that have no mapping entry.
    return lang_code_mapping.get(predominant, "eng_Latn")

# Sample inputs -- keep exactly one assignment active.
# sentence = "मुझे hungry लगी है"  # Devanagari (Hindi) mixed with English
# sentence = "mera name hemanshi"
# sentence = "Kal maine ek amazing movie dekhi, it was full of action and drama!"
sentence = "माझं मन खुश है, पण अजून काम बाकी है!"

# Step 1: pick the language most of the words belong to.
# NOTE(review): predominant_language can return "Unknown", which is passed to
# convert_tokens_to_ids below without any guard -- confirm that is intended.
detected_lang_code = predominant_language(sentence)

# Step 2: encode the sentence as PyTorch tensors.
inputs = tokenizer(sentence, return_tensors="pt")

# Step 3: generate, using the detected language's token id as the forced
# first token so the output is in that language; at most 30 tokens are produced.
target_lang_id = tokenizer.convert_tokens_to_ids(detected_lang_code)
translated_tokens = model.generate(
    **inputs,
    forced_bos_token_id=target_lang_id,
    max_length=30,
)

# Step 4: turn the generated ids back into text and report the outcome.
result = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)
print(detected_lang_code)
print(f"Translated text: {result[0]}")


  from .autonotebook import tqdm as notebook_tqdm


mar_Deva
Translated text: माझे मन आनंदी आहे, पण अजून काम बाकी आहे!
