Develop a Machine Translation system to translate public information content between English
and any Indian language.

In [1]:
!pip install sentencepiece torch indic-nlp-library
!pip install git+https://github.com/VarunGumma/IndicTransToolkit.git --quiet

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [2]:
!pip install transformers==4.44.0 --quiet

In [3]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from IndicTransToolkit import IndicProcessor

In [None]:
from huggingface_hub import login

# Replace with your actual token from https://huggingface.co/settings/tokens
login(token="REPLACE_TOKEN")

In [5]:
model_name = "ai4bharat/indictrans2-en-indic-1B"

tokenizer = AutoTokenizer.from_pretrained(
    model_name, trust_remote_code=True
)
model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name, trust_remote_code=True
)

# Initialize the IndicProcessor for preprocessing
ip = IndicProcessor(inference=True)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


IndicTransForConditionalGeneration(
  (model): IndicTransModel(
    (encoder): IndicTransEncoder(
      (embed_tokens): Embedding(32322, 1024, padding_idx=1)
      (embed_positions): IndicTransSinusoidalPositionalEmbedding()
      (layers): ModuleList(
        (0-17): 18 x IndicTransEncoderLayer(
          (self_attn): IndicTransAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1024, out_features=8192, bias=True)
          (fc2): Linear(in_features=8192, out_features=1024, bias=True)
          (final_layer_norm): LayerNorm((1024,), eps=1e-05,

In [6]:
def translate_english_to_marathi(text):
    src_lang = "eng_Latn"
    tgt_lang = "mar_Deva"
    
    # Preprocess the input text
    input_sentences = [text]
    batch = ip.preprocess_batch(input_sentences, src_lang=src_lang, tgt_lang=tgt_lang)
    
    inputs = tokenizer(
        batch,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=256
    ).to(device)

    with torch.no_grad():
        output_tokens = model.generate(
            **inputs,
            max_length=256,
            num_beams=5,
            early_stopping=True,
            num_return_sequences=1
        )

    # Decode the output
    with tokenizer.as_target_tokenizer():
        output_text = tokenizer.batch_decode(
            output_tokens,
            skip_special_tokens=True
        )
    
    # Postprocess the output
    output_text = ip.postprocess_batch(output_text, lang=tgt_lang)
    
    return output_text[0]

In [7]:
english_text = """
Wearing a helmet while riding a two-wheeler is mandatory for safety.
Follow traffic rules to reduce accidents.
"""

marathi_translation = translate_english_to_marathi(english_text)

print("English Text:\n", english_text)
print("\nMarathi Translation:\n", marathi_translation)

English Text:
 
Wearing a helmet while riding a two-wheeler is mandatory for safety.
Follow traffic rules to reduce accidents.


Marathi Translation:
 दुचाकी चालवताना हेल्मेट घालणे सुरक्षेसाठी अनिवार्य आहे. अपघात कमी करण्यासाठी वाहतुकीच्या नियमांचे पालन करा.


