## Setup

Please run the cells below to install the necessary dependencies.


In [None]:
%%capture
!git clone https://github.com/AI4Bharat/IndicTrans2.git

In [None]:
%%capture
%cd /content/IndicTrans2/huggingface_interface

In [None]:

%%capture
!python3 -m pip install nltk sacremoses pandas regex mock transformers>=4.33.2 mosestokenizer
!python3 -c "import nltk; nltk.download('punkt')"
!python3 -m pip install bitsandbytes scipy accelerate datasets
!python3 -m pip install sentencepiece

!git clone https://github.com/VarunGumma/IndicTransToolkit.git
%cd IndicTransToolkit
!python3 -m pip install --editable ./
%cd ..

**IMPORTANT : Restart your run-time first and then run the cells below.**

## Inference


In [None]:
import torch
from transformers import AutoModelForSeq2SeqLM, BitsAndBytesConfig, AutoTokenizer
from IndicTransToolkit.processor import IndicProcessor

BATCH_SIZE = 4
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
quantization = None

In [None]:
def initialize_model_and_tokenizer(ckpt_dir, quantization):
    if quantization == "4-bit":
        qconfig = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_compute_dtype=torch.bfloat16,
        )
    elif quantization == "8-bit":
        qconfig = BitsAndBytesConfig(
            load_in_8bit=True,
            bnb_8bit_use_double_quant=True,
            bnb_8bit_compute_dtype=torch.bfloat16,
        )
    else:
        qconfig = None

    tokenizer = AutoTokenizer.from_pretrained(ckpt_dir, trust_remote_code=True)
    model = AutoModelForSeq2SeqLM.from_pretrained(
        ckpt_dir,
        trust_remote_code=True,
        low_cpu_mem_usage=True,
        quantization_config=qconfig,
    )

    if qconfig == None:
        model = model.to(DEVICE)
        if DEVICE == "cuda":
            model.half()

    model.eval()

    return tokenizer, model


def batch_translate(input_sentences, src_lang, tgt_lang, model, tokenizer, ip):
    translations = []
    for i in range(0, len(input_sentences), BATCH_SIZE):
        batch = input_sentences[i : i + BATCH_SIZE]

        # Preprocess the batch and extract entity mappings
        batch = ip.preprocess_batch(batch, src_lang=src_lang, tgt_lang=tgt_lang)

        # Tokenize the batch and generate input encodings
        inputs = tokenizer(
            batch,
            truncation=True,
            padding="longest",
            return_tensors="pt",
            return_attention_mask=True,
        ).to(DEVICE)

        # Generate translations using the model
        with torch.no_grad():
            generated_tokens = model.generate(
                **inputs,
                use_cache=True,
                min_length=0,
                max_length=256,
                num_beams=5,
                num_return_sequences=1,
            )

        # Decode the generated tokens into text
        generated_tokens = tokenizer.batch_decode(
            generated_tokens,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=True,
        )

        # Postprocess the translations, including entity replacement
        translations += ip.postprocess_batch(generated_tokens, lang=tgt_lang)

        del inputs
        torch.cuda.empty_cache()

    return translations

In [None]:
##---------------------------------------------------------------------------------------------------------
import os
import pandas as pd
import torch
import numpy as np
from sklearn.model_selection import train_test_split
from datasets import Dataset
def prepare_data(input_csv='eng-dogri.csv', output_dir="data"):
    """
    Prepare data from CSV or use existing files
    """
    if input_csv and os.path.exists(input_csv):
        # Load dataset from CSV
        df = pd.read_csv(input_csv)

        # Ensure correct column names
        if "eng_Latn" not in df.columns or "doi_Deva" not in df.columns:
            raise ValueError("Dataset must contain 'eng_Latn' and 'doi_Deva' columns.")

        # Clean data
        df = df.dropna(subset=["eng_Latn", "doi_Deva"])
        df["eng_Latn"] = df["eng_Latn"].str.strip()
        df["doi_Deva"] = df["doi_Deva"].str.strip()

        # Split dataset
        train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42)
        dev_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

        # Create output directory
        os.makedirs(output_dir, exist_ok=True)

        # Save data to files
        with open(f"{output_dir}/train.en", "w", encoding="utf-8") as f:
            f.write("\n".join(train_df["eng_Latn"].tolist()))

        with open(f"{output_dir}/train.doi", "w", encoding="utf-8") as f:
            f.write("\n".join(train_df["doi_Deva"].tolist()))

        with open(f"{output_dir}/dev.en", "w", encoding="utf-8") as f:
            f.write("\n".join(dev_df["eng_Latn"].tolist()))

        with open(f"{output_dir}/dev.doi", "w", encoding="utf-8") as f:
            f.write("\n".join(dev_df["doi_Deva"].tolist()))

        with open(f"{output_dir}/test.en", "w", encoding="utf-8") as f:
            f.write("\n".join(test_df["eng_Latn"].tolist()))

        with open(f"{output_dir}/test.doi", "w", encoding="utf-8") as f:
            f.write("\n".join(test_df["doi_Deva"].tolist()))

        print(f" Dataset processed and saved in '{output_dir}/' directory")
        print(f"   Train: {len(train_df)} samples")
        print(f"   Dev: {len(dev_df)} samples")
        print(f"   Test: {len(test_df)} samples")

        # Match the file extensions actually used
        for split in ["train", "dev", "test"]:
            src_file = f"{output_dir}/{split}.en"
            tgt_file = f"{output_dir}/{split}.doi"

            if not os.path.exists(src_file) or not os.path.exists(tgt_file):
                if split == "test":
                    continue
                raise FileNotFoundError(f"Required data files not found: {src_file} or {tgt_file}")

    return output_dir

def load_dataset_from_files(data_dir, split):
    """Load dataset from text files"""
    src_path = os.path.join(data_dir, f"{split}.{SRC_LANG}")
    tgt_path = os.path.join(data_dir, f"{split}.{TGT_LANG}")

    if not os.path.exists(src_path) or not os.path.exists(tgt_path):
        raise FileNotFoundError(f"Data files not found: {src_path} or {tgt_path}")

    with open(src_path, encoding="utf-8") as src_file, open(tgt_path, encoding="utf-8") as tgt_file:
        src_lines = [line.strip() for line in src_file.readlines()]
        tgt_lines = [line.strip() for line in tgt_file.readlines()]

    if len(src_lines) != len(tgt_lines):
        raise ValueError(f"Mismatch in source and target file lengths for {split}")

    print(f"Loaded {len(src_lines)} examples for {split}")

    return Dataset.from_dict({
        "translation": [
            {"en": src, "doi": tgt}
            for src, tgt in zip(src_lines, tgt_lines)
        ]
    })

### English to Indic Example


In [None]:
en_indic_ckpt_dir = "ai4bharat/indictrans2-en-indic-1B"
en_indic_tokenizer, en_indic_model = initialize_model_and_tokenizer(en_indic_ckpt_dir, quantization)
ip = IndicProcessor(inference=True)

src_lang, tgt_lang = "eng_Latn", "doi_Deva"

while True:
    print("Enter sentence (or QUIT to exit):")
    text = input().strip()
    if text == "QUIT":
        break
    translation = batch_translate(
        [text],
        src_lang,
        tgt_lang,
        en_indic_model,
        en_indic_tokenizer,
        ip
    )[0]
    print("Translated text is : ")
    print(translation)
    print("------------------------------------------------------------------")
del en_indic_tokenizer, en_indic_model


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenization_indictrans.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/ai4bharat/indictrans2-en-indic-1B:
- tokenization_indictrans.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


dict.SRC.json: 0.00B [00:00, ?B/s]

dict.TGT.json: 0.00B [00:00, ?B/s]

model.SRC:   0%|          | 0.00/759k [00:00<?, ?B/s]

model.TGT:   0%|          | 0.00/3.26M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/96.0 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

configuration_indictrans.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/ai4bharat/indictrans2-en-indic-1B:
- configuration_indictrans.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_indictrans.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/ai4bharat/indictrans2-en-indic-1B:
- modeling_indictrans.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors:   0%|          | 0.00/4.46G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/163 [00:00<?, ?B/s]

Enter sentence (or QUIT to exit):
How are u doing
Translated text is : 
तुस किʼयां करा'रदे ओ?
------------------------------------------------------------------
Enter sentence (or QUIT to exit):
what is your name
Translated text is : 
तुंʼदा नांऽ केह् ऐ
------------------------------------------------------------------
Enter sentence (or QUIT to exit):
I am happy
Translated text is : 
में खुश आं।
------------------------------------------------------------------
Enter sentence (or QUIT to exit):
Weather is so good today
Translated text is : 
अज्ज मौसम बड़ा चंगा ऐ
------------------------------------------------------------------
Enter sentence (or QUIT to exit):
QUIT


In [27]:
## Bleu Score
## Dataset taken from https://huggingface.co/datasets/facebook/flores
## Reference kashmiri sentences for the predicted sentences are present in kas_Deva.devtest

In [None]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
eng_path = "/content/drive/My Drive/Machine Translation/eng_Latn.devtest"
doi_path = "/content/drive/My Drive/Machine Translation/kas_Deva.devtest"


In [None]:
with open(eng_path, "r", encoding="utf-8") as f:
    english_sentences = [line.strip() for line in f.readlines()]


In [29]:
ckpt_dir = "ai4bharat/indictrans2-en-indic-1B"
tokenizer, model = initialize_model_and_tokenizer(ckpt_dir, quantization=None)
ip = IndicProcessor(inference=True)

src_lang = "eng_Latn"
tgt_lang = "doi_Deva"

# Run batch translation
predicted_sentences = []
BATCH_SIZE = 4

for i in range(0, len(english_sentences), BATCH_SIZE):
    batch = english_sentences[i:i+BATCH_SIZE]
    translations = batch_translate(batch, src_lang, tgt_lang, model, tokenizer, ip)
    predicted_sentences.extend(translations)


In [16]:
pred_path = "/content/drive/My Drive/Machine Translation/model_predictions.txt"

with open(pred_path, "w", encoding="utf-8") as f:
    for sent in predicted_sentences:
        f.write(sent.strip() + "\n")


In [17]:
ref_path  = "/content/drive/My Drive/Machine Translation/kas_Deva.devtest"
pred_path = "/content/drive/My Drive/Machine Translation/model_predictions.txt"

In [19]:
import nltk
nltk.download('punkt')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [28]:
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
from nltk.tokenize import word_tokenize
import nltk

# ✅ Download correct tokenizer
nltk.download('punkt')

# 📥 Load predictions and references
with open(pred_path, "r", encoding="utf-8") as f:
    predicted_lines = [line.strip() for line in f]

with open(ref_path, "r", encoding="utf-8") as f:
    reference_lines = [line.strip() for line in f]

# ✅ Ensure both files have the same number of lines
assert len(predicted_lines) == len(reference_lines), \
    f"Mismatch: {len(predicted_lines)} predictions vs {len(reference_lines)} references"

# 🔠 Tokenize both lists
candidates = [pred.strip().split() for pred in predicted_lines]
references = [[ref.strip().split()] for ref in reference_lines]

# 📊 Compute BLEU score
smoothing = SmoothingFunction().method4
bleu_score = corpus_bleu(references, candidates, smoothing_function=smoothing) * 100 * 100

print(f"🔵 BLEU Score: {bleu_score:.2f}")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


🔵 BLEU Score: 15.02


In [None]:
## GRADIO PART

In [None]:
%%capture
!pip install gradio
import gradio as gr


In [None]:
def initialize_model_and_tokenizer(ckpt_dir, quantization):
    if quantization == "4-bit":
        qconfig = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_compute_dtype=torch.bfloat16,
        )
    elif quantization == "8-bit":
        qconfig = BitsAndBytesConfig(
            load_in_8bit=True,
            bnb_8bit_use_double_quant=True,
            bnb_8bit_compute_dtype=torch.bfloat16,
        )
    else:
        qconfig = None

    tokenizer = AutoTokenizer.from_pretrained(ckpt_dir, trust_remote_code=True)
    model = AutoModelForSeq2SeqLM.from_pretrained(
        ckpt_dir,
        trust_remote_code=True,
        low_cpu_mem_usage=True,
        quantization_config=qconfig,
    )

    if qconfig is None:
        model = model.to(DEVICE)
        if DEVICE == "cuda":
            model.half()

    model.eval()
    return tokenizer, model

# Batch translation
def batch_translate(input_sentences, src_lang, tgt_lang, model, tokenizer, ip):
    translations = []
    for i in range(0, len(input_sentences), BATCH_SIZE):
        batch = input_sentences[i : i + BATCH_SIZE]
        batch = ip.preprocess_batch(batch, src_lang=src_lang, tgt_lang=tgt_lang)
        inputs = tokenizer(
            batch,
            truncation=True,
            padding="longest",
            return_tensors="pt",
            return_attention_mask=True,
        ).to(DEVICE)

        with torch.no_grad():
            generated_tokens = model.generate(
                **inputs,
                use_cache=True,
                min_length=0,
                max_length=256,
                num_beams=5,
                num_return_sequences=1,
            )

        generated_tokens = tokenizer.batch_decode(
            generated_tokens,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=True,
        )
        translations += ip.postprocess_batch(generated_tokens, lang=tgt_lang)
        del inputs
        torch.cuda.empty_cache()
    return translations

# Load model
ckpt_dir = "ai4bharat/indictrans2-en-indic-1B"
tokenizer, model = initialize_model_and_tokenizer(ckpt_dir, quantization)
ip = IndicProcessor(inference=True)

# Gradio translation function
def translate_text(text):
    if not text.strip():
        return "⚠️ Please enter some text."
    translation = batch_translate([text], src_lang, tgt_lang, model, tokenizer, ip)
    return translation[0]

# Description and Examples
description = """
<div style='margin-top: 10px; margin-bottom: 30px; font-size: 16px;'>
🔤 Translate <strong>English</strong> sentences into <strong>Kashmiri </strong> in one click!<br>
Just type your sentence below and hit <strong>Translate 🛫</strong>.
</div>
"""

examples = [
    ["Hello, how are you?"],
    ["What is your name?"],
    ["I am learning machine translation."],
    ["Can you help me?"],
    ["This is a great project!"]
]

# Gradio interface
demo = gr.Interface(
    fn=translate_text,
    inputs=gr.Textbox(
        lines=3,
        placeholder="Type your English sentence here...",
        label="Enter English text"
    ),
    outputs=gr.Textbox(
        label="Kashmiri Translation"
    ),
    examples=examples,
    title="<h1 style='margin-bottom: 15px;'>🌐 English ➡️ Kashmiri Translator</h1>",
    description=description,
    theme="soft",
    allow_flagging="never"
)

# Launch with share link
demo.launch(share=True)



Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://eeed49bcd51bc75921.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [None]:
def translate_input(text):
    if not text.strip():
        return "Please enter some text."
    translated = batch_translate([text], "eng_Latn", "doi_Deva", model, tokenizer, ip)
    return translated[0]

gr.Interface(
    fn=translate_input,
    inputs=gr.Textbox(lines=4, label="Enter English Text"),
    outputs=gr.Textbox(label="Translated Dogri Text"),
    title="English to Dogri Translator",
    description="Enter a sentence in English and get the translation in Dogri using IndicTrans2 model.",
    theme="default",
).launch()

It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://cf861581810a330e6a.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




### Indic to English Example

In [None]:
# indic_en_ckpt_dir = "ai4bharat/indictrans2-indic-en-1B"  # ai4bharat/indictrans2-indic-en-dist-200M
# indic_en_tokenizer, indic_en_model = initialize_model_and_tokenizer(indic_en_ckpt_dir, quantization)

# ip = IndicProcessor(inference=True)

# hi_sents = [
#     "जब मैं छोटा था, मैं हर रोज़ पार्क जाता था।",
#     "उसके पास बहुत सारी पुरानी किताबें हैं, जिन्हें उसने अपने दादा-परदादा से विरासत में पाया।",
#     "मुझे समझ में नहीं आ रहा कि मैं अपनी समस्या का समाधान कैसे ढूंढूं।",
#     "वह बहुत मेहनती और समझदार है, इसलिए उसे सभी अच्छे मार्क्स मिले।",
#     "हमने पिछले सप्ताह एक नई फिल्म देखी जो कि बहुत प्रेरणादायक थी।",
#     "अगर तुम मुझे उस समय पास मिलते, तो हम बाहर खाना खाने चलते।",
#     "वह अपनी दीदी के साथ बाजार गयी थी ताकि वह नई साड़ी खरीद सके।",
#     "राज ने मुझसे कहा कि वह अगले महीने अपनी नानी के घर जा रहा है।",
#     "सभी बच्चे पार्टी में मज़ा कर रहे थे और खूब सारी मिठाइयाँ खा रहे थे।",
#     "मेरे मित्र ने मुझे उसके जन्मदिन की पार्टी में बुलाया है, और मैं उसे एक तोहफा दूंगा।",
# ]
# src_lang, tgt_lang = "hin_Deva", "eng_Latn"
# en_translations = batch_translate(hi_sents, src_lang, tgt_lang, indic_en_model, indic_en_tokenizer, ip)


# print(f"\n{src_lang} - {tgt_lang}")
# for input_sentence, translation in zip(hi_sents, en_translations):
#     print(f"{src_lang}: {input_sentence}")
#     print(f"{tgt_lang}: {translation}")

# # flush the models to free the GPU memory
# del indic_en_tokenizer, indic_en_model

tokenizer_config.json:   0%|          | 0.00/1.10k [00:00<?, ?B/s]

tokenization_indictrans.py:   0%|          | 0.00/8.04k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/ai4bharat/indictrans2-indic-en-1B:
- tokenization_indictrans.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


dict.SRC.json:   0%|          | 0.00/3.39M [00:00<?, ?B/s]

dict.TGT.json:   0%|          | 0.00/645k [00:00<?, ?B/s]

model.SRC:   0%|          | 0.00/3.26M [00:00<?, ?B/s]

model.TGT:   0%|          | 0.00/759k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/96.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.37k [00:00<?, ?B/s]

configuration_indictrans.py:   0%|          | 0.00/14.2k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/ai4bharat/indictrans2-indic-en-1B:
- configuration_indictrans.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_indictrans.py:   0%|          | 0.00/79.8k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/ai4bharat/indictrans2-indic-en-1B:
- modeling_indictrans.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors:   0%|          | 0.00/4.09G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/163 [00:00<?, ?B/s]


hin_Deva - eng_Latn
hin_Deva: जब मैं छोटा था, मैं हर रोज़ पार्क जाता था।
eng_Latn: When I was young, I used to go to the park every day.
hin_Deva: उसके पास बहुत सारी पुरानी किताबें हैं, जिन्हें उसने अपने दादा-परदादा से विरासत में पाया।
eng_Latn: She has a lot of old books, which she inherited from her grandparents.
hin_Deva: मुझे समझ में नहीं आ रहा कि मैं अपनी समस्या का समाधान कैसे ढूंढूं।
eng_Latn: I don't know how to find a solution to my problem.
hin_Deva: वह बहुत मेहनती और समझदार है, इसलिए उसे सभी अच्छे मार्क्स मिले।
eng_Latn: He is very hardworking and understanding, so he got all the good marks.
hin_Deva: हमने पिछले सप्ताह एक नई फिल्म देखी जो कि बहुत प्रेरणादायक थी।
eng_Latn: We saw a new movie last week that was very inspiring.
hin_Deva: अगर तुम मुझे उस समय पास मिलते, तो हम बाहर खाना खाने चलते।
eng_Latn: If you'd given me a pass at that time, we'd have gone out to eat.
hin_Deva: वह अपनी दीदी के साथ बाजार गयी थी ताकि वह नई साड़ी खरीद सके।
eng_Latn: She had gone to the market wit

### Indic to Indic Example


In [None]:
# indic_indic_ckpt_dir = "ai4bharat/indictrans2-indic-indic-1B"  # ai4bharat/indictrans2-indic-indic-dist-320M
# indic_indic_tokenizer, indic_indic_model = initialize_model_and_tokenizer(indic_indic_ckpt_dir, quantization)

# ip = IndicProcessor(inference=True)

# hi_sents = [
#     "जब मैं छोटा था, मैं हर रोज़ पार्क जाता था।",
#     "उसके पास बहुत सारी पुरानी किताबें हैं, जिन्हें उसने अपने दादा-परदादा से विरासत में पाया।",
#     "मुझे समझ में नहीं आ रहा कि मैं अपनी समस्या का समाधान कैसे ढूंढूं।",
#     "वह बहुत मेहनती और समझदार है, इसलिए उसे सभी अच्छे मार्क्स मिले।",
#     "हमने पिछले सप्ताह एक नई फिल्म देखी जो कि बहुत प्रेरणादायक थी।",
#     "अगर तुम मुझे उस समय पास मिलते, तो हम बाहर खाना खाने चलते।",
#     "वह अपनी दीदी के साथ बाजार गयी थी ताकि वह नई साड़ी खरीद सके।",
#     "राज ने मुझसे कहा कि वह अगले महीने अपनी नानी के घर जा रहा है।",
#     "सभी बच्चे पार्टी में मज़ा कर रहे थे और खूब सारी मिठाइयाँ खा रहे थे।",
#     "मेरे मित्र ने मुझे उसके जन्मदिन की पार्टी में बुलाया है, और मैं उसे एक तोहफा दूंगा।",
# ]
# src_lang, tgt_lang = "hin_Deva", "mar_Deva"
# mr_translations = batch_translate(hi_sents, src_lang, tgt_lang, indic_indic_model, indic_indic_tokenizer, ip)

# print(f"\n{src_lang} - {tgt_lang}")
# for input_sentence, translation in zip(hi_sents, mr_translations):
#     print(f"{src_lang}: {input_sentence}")
#     print(f"{tgt_lang}: {translation}")

# # flush the models to free the GPU memory
# del indic_indic_tokenizer, indic_indic_model

tokenizer_config.json:   0%|          | 0.00/1.10k [00:00<?, ?B/s]

tokenization_indictrans.py:   0%|          | 0.00/8.04k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/ai4bharat/indictrans2-indic-indic-1B:
- tokenization_indictrans.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


dict.SRC.json:   0%|          | 0.00/3.39M [00:00<?, ?B/s]

dict.TGT.json:   0%|          | 0.00/3.39M [00:00<?, ?B/s]

model.SRC:   0%|          | 0.00/3.26M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/96.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

configuration_indictrans.py:   0%|          | 0.00/14.2k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/ai4bharat/indictrans2-indic-indic-1B:
- configuration_indictrans.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_indictrans.py:   0%|          | 0.00/79.8k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/ai4bharat/indictrans2-indic-indic-1B:
- modeling_indictrans.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors:   0%|          | 0.00/4.83G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/168 [00:00<?, ?B/s]


hin_Deva - mar_Deva
hin_Deva: जब मैं छोटा था, मैं हर रोज़ पार्क जाता था।
mar_Deva: मी लहान होतो तेव्हा मी दररोज उद्यानाला जायचे.
hin_Deva: उसके पास बहुत सारी पुरानी किताबें हैं, जिन्हें उसने अपने दादा-परदादा से विरासत में पाया।
mar_Deva: तिच्याकडे बरीच जुनी पुस्तके आहेत, जी तिला तिच्या आजोबांकडून वारशाने मिळाली आहेत.
hin_Deva: मुझे समझ में नहीं आ रहा कि मैं अपनी समस्या का समाधान कैसे ढूंढूं।
mar_Deva: माझ्या समस्येवर तोडगा कसा काढायचा हे मला समजत नाही.
hin_Deva: वह बहुत मेहनती और समझदार है, इसलिए उसे सभी अच्छे मार्क्स मिले।
mar_Deva: तो खूप मेहनती आणि बुद्धिमान आहे, त्यामुळे त्याला सर्व चांगले गुण मिळाले.
hin_Deva: हमने पिछले सप्ताह एक नई फिल्म देखी जो कि बहुत प्रेरणादायक थी।
mar_Deva: आम्ही गेल्या आठवड्यात एक नवीन चित्रपट पाहिला जो खूप प्रेरणादायी होता.
hin_Deva: अगर तुम मुझे उस समय पास मिलते, तो हम बाहर खाना खाने चलते।
mar_Deva: जर तुम्हाला त्या वेळी मला पास मिळाला तर आम्ही बाहेर जेवायला जाऊ.
hin_Deva: वह अपनी दीदी के साथ बाजार गयी थी ताकि वह नई साड़ी खरीद सके।
mar_Deva: ती तिच्या ब