This notebook uses Facebook's NLLB model, via the HuggingFace Transformers library, to machine-translate Sorani, Kashmiri, and Sindhi text into English.

(Source: https://github.com/sinaahmadi/ScriptNormalization)

In [None]:
!pip install transformers
!pip install --quiet bitsandbytes
!pip install --quiet --upgrade accelerate

In [None]:
import gc
import torch

# Start from a clean slate: run a garbage-collection pass, then ask PyTorch to
# release the unused GPU memory it is holding in its cache.
gc.collect()
torch.cuda.empty_cache()

# Shell out to `nvidia-smi` so its report is shown in the cell output.
!nvidia-smi

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

# Load the tokenizer and the seq2seq model from the same NLLB checkpoint,
# then move the model onto the GPU.
nllb_checkpoint = "facebook/nllb-200-distilled-600M"
tokenizer = AutoTokenizer.from_pretrained(nllb_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(nllb_checkpoint)
model = model.cuda()
# Trailing expression: the notebook displays the device the model sits on.
model.device

In [None]:
# The three translators share the model, tokenizer, target language
# (`eng_Latn`) and generation settings; they differ only in the
# source-language code, so the common arguments are collected once.
_shared_pipeline_args = dict(
    model=model,
    tokenizer=tokenizer,
    tgt_lang='eng_Latn',
    max_length=400,
    device=model.device,
    num_beams=3,
    early_stopping=True,
)
translator_ckb = pipeline('translation', src_lang='ckb_Arab', **_shared_pipeline_args)
translator_kas = pipeline('translation', src_lang='kas_Arab', **_shared_pipeline_args)
translator_snd = pipeline('translation', src_lang='snd_Arab', **_shared_pipeline_args)


In [None]:
# Translate every devtest file of every dataset folder and save the result
# next to the input as `devtest_<n>.translated.<t>`.
#
# Each dataset folder is paired explicitly with the translator for its source
# language, so the choice is made once per dataset instead of being re-derived
# from the folder name inside the innermost loop.
translator_for_dataset = {
  "Sorani-Arabic": translator_ckb,
  "Sorani-Persian": translator_ckb,
  "Kashmiri-Urdu": translator_kas,
  "Sindhi-Urdu": translator_snd,
}

for f_name, translator in translator_for_dataset.items():
  for n in ["20", "40", "60", "80", "100"]:
    for t in ["src", "normalized.src", "trg"]:
      print(f_name, n, t)
      # Read the file, one input segment per line. The encoding is passed
      # explicitly because the default depends on the platform locale and the
      # inputs are Arabic-script text (assumes the data files are UTF-8 --
      # confirm against the dataset).
      with open("%s/devtest_%s.%s"%(f_name, n, t), "r", encoding="utf-8") as f:
        text = f.read().splitlines()
      print("Input length: ", len(text))
      # Translate all lines and keep only the translated strings.
      trans_text = [i["translation_text"] for i in translator(text)]
      # Printed next to the input length so a line-count mismatch is visible.
      print(len(trans_text))
      # Save the translation, one segment per line (no trailing newline).
      # UTF-8 is forced here too, since the output may contain non-ASCII text.
      with open("%s/devtest_%s.translated.%s"%(f_name, n, t), "w", encoding="utf-8") as f:
        f.write("\n".join(trans_text))