<a href="https://colab.research.google.com/github/saarang2003/Machine-Tranlator/blob/main/%20Machine%20Language%20Translator%20.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%%capture
!git clone https://github.com/AI4Bharat/IndicTrans2.git


In [None]:
%%capture
%cd /content/IndicTrans2/huggingface_interface

In [None]:
%%capture
!python3 -m pip install nltk sacremoses pandas regex mock transformers>=4.33.2 mosestokenizer
!python3 -c "import nltk; nltk.download('punkt')"
!python3 -m pip install bitsandbytes scipy accelerate datasets
!python3 -m pip install sentencepiece

!git clone https://github.com/VarunGumma/IndicTransToolkit.git
%cd IndicTransToolkit
!python3 -m pip install --editable ./
%cd ..

In [None]:
%%writefile app.py
import streamlit as st
import torch
from transformers import AutoModelForSeq2SeqLM, BitsAndBytesConfig, AutoTokenizer
from IndicTransToolkit import IndicProcessor

# Configuration
BATCH_SIZE = 4
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
quantization = None  # Options: None, "4-bit", or "8-bit"

# Function to initialize model and tokenizer
def initialize_model_and_tokenizer(ckpt_dir, quantization):
    if quantization == "4-bit":
        qconfig = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_compute_dtype=torch.bfloat16,
        )
    elif quantization == "8-bit":
        qconfig = BitsAndBytesConfig(
            load_in_8bit=True,
            bnb_8bit_use_double_quant=True,
            bnb_8bit_compute_dtype=torch.bfloat16,
        )
    else:
        qconfig = None

    tokenizer = AutoTokenizer.from_pretrained(ckpt_dir, trust_remote_code=True)
    model = AutoModelForSeq2SeqLM.from_pretrained(
        ckpt_dir,
        trust_remote_code=True,
        low_cpu_mem_usage=True,
        quantization_config=qconfig,
    )

    if qconfig is None:
        model = model.to(DEVICE)
        if DEVICE == "cuda":
            model.half()

    model.eval()
    return tokenizer, model

def batch_translate(input_sentences, src_lang, tgt_lang, model, tokenizer, ip):
    translations = []
    for i in range(0, len(input_sentences), BATCH_SIZE):
        batch = input_sentences[i : i + BATCH_SIZE]
        batch = ip.preprocess_batch(batch, src_lang=src_lang, tgt_lang=tgt_lang)
        inputs = tokenizer(
            batch,
            truncation=True,
            padding="longest",
            return_tensors="pt",
            return_attention_mask=True,
        ).to(DEVICE)

        with torch.no_grad():
            generated_tokens = model.generate(
                **inputs,
                use_cache=True,
                min_length=0,
                max_length=256,
                num_beams=5,
                num_return_sequences=1,
            )

        with tokenizer.as_target_tokenizer():
            decoded = tokenizer.batch_decode(
                generated_tokens.detach().cpu().tolist(),
                skip_special_tokens=True,
                clean_up_tokenization_spaces=True,
            )

        translations += ip.postprocess_batch(decoded, lang=tgt_lang)
        del inputs
        if DEVICE == "cuda":
            torch.cuda.empty_cache()
    return translations

@st.cache_resource(show_spinner=False)
def load_model():
    ckpt_dir = "ai4bharat/indictrans2-en-indic-1B"
    tokenizer, model = initialize_model_and_tokenizer(ckpt_dir, quantization)
    ip = IndicProcessor(inference=True)
    return tokenizer, model, ip

def main():
    st.title("IndicTrans2 English-to-Indic Translation")
    st.write("Enter English sentences and get translations in your desired Indic language.")

    st.sidebar.header("Translation Settings")
    src_lang = st.sidebar.selectbox("Source Language", options=["eng_Latn"], index=0)
    target_options = {
        "Assamese": "asm_Beng",
        "Bengali": "ben_Beng",
        "Gujarati": "guj_Gujr",
        "Hindi": "hin_Deva",
        "Kannada": "kan_Knda",
        "Malayalam": "mal_Mlym",
        "Marathi": "mar_Deva",
        "Nepali": "npi_Deva",
        "Oriya": "ori_Orya",
        "Punjabi": "pan_Guru",
        "Tamil": "tam_Taml",
        "Telugu": "tel_Telu",
        "Urdu": "urd_Arab",
    }
    tgt_lang_name = st.sidebar.selectbox("Target Language", list(target_options.keys()))
    tgt_lang = target_options[tgt_lang_name]

    input_text = st.text_area("Enter English sentences (one per line)", height=200)
    if st.button("Translate"):
        if not input_text.strip():
            st.warning("Please enter some text to translate.")
        else:
            input_sentences = [line.strip() for line in input_text.split("\n") if line.strip()]
            with st.spinner("Loading model and translating..."):
                tokenizer, model, ip = load_model()
                translations = batch_translate(input_sentences, src_lang, tgt_lang, model, tokenizer, ip)
            st.subheader("Translations")
            for src, tgt in zip(input_sentences, translations):
                st.markdown(f"**English:** {src}")
                st.markdown(f"**{tgt_lang_name}:** {tgt}")
                st.write("---")

if __name__ == "__main__":
    main()


Writing app.py


In [None]:
!streamlit run app.py &>/dev/null&


In [None]:
!pip install streamlit pyngrok

Collecting pyngrok
  Downloading pyngrok-7.2.3-py3-none-any.whl.metadata (8.7 kB)
Downloading pyngrok-7.2.3-py3-none-any.whl (23 kB)
Installing collected packages: pyngrok
Successfully installed pyngrok-7.2.3


In [None]:
!pip install --upgrade pyngrok




In [None]:
ngrok.kill()

In [None]:
from pyngrok import ngrok

# Set your ngrok authtoken (replace with your actual token)
ngrok.set_auth_token("2saHaLzNEOiPkOpNnWLBRe0P6KZ_efXapqC6vMMNp9iKRrR5")

# Optionally, kill any existing tunnels
# ngrok.kill()

# Create a tunnel to the Streamlit port (8501 by default)
public_url = ngrok.connect(8501, "http", bind_tls=False)
print(f"Streamlit app is available at: {public_url}")


Streamlit app is available at: NgrokTunnel: "http://1c27-35-196-201-177.ngrok-free.app" -> "http://localhost:8501"


In [None]:
%%bash --bg
streamlit run app.py


In [None]:
!streamlit run app.py &>/dev/null&


In [1]:
import pandas as pd

In [2]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [4]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Attention
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy


In [5]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Example input lists
# hindi_sentences = [...]  # Already cleaned and aligned
# english_sentences = [...]  # Already cleaned and aligned

# Tokenizers
hin_tokenizer = Tokenizer(filters='', oov_token='<OOV>')
eng_tokenizer = Tokenizer(filters='', oov_token='<OOV>')

hin_tokenizer.fit_on_texts(hindi_sentences)
eng_tokenizer.fit_on_texts(english_sentences)

# Convert sentences to sequences
hindi_sequences = hin_tokenizer.texts_to_sequences(hindi_sentences)
english_sequences = eng_tokenizer.texts_to_sequences(english_sentences)

# ✅ Remove empty sequences (avoid pad_sequences error)
filtered_pairs = [(hin, eng) for hin, eng in zip(hindi_sequences, english_sequences) if len(hin) > 0 and len(eng) > 1]

hindi_sequences = [pair[0] for pair in filtered_pairs]
english_sequences = [pair[1] for pair in filtered_pairs]

# Recalculate max lengths
max_len_hin = max(len(seq) for seq in hindi_sequences)
max_len_eng = max(len(seq) for seq in english_sequences)

# Pad encoder input
encoder_input_data = pad_sequences(hindi_sequences, maxlen=max_len_hin, padding='post')

# Prepare decoder input and target
decoder_input_sequences = [seq[:-1] for seq in english_sequences]
decoder_target_sequences = [seq[1:] for seq in english_sequences]

decoder_input_data = pad_sequences(decoder_input_sequences, maxlen=max_len_eng-1, padding='post')
decoder_target_data = pad_sequences(decoder_target_sequences, maxlen=max_len_eng-1, padding='post')


NameError: name 'hindi_sentences' is not defined

In [7]:
import sentencepiece as spm

sp_eng = spm.SentencePieceProcessor()
sp_eng.load('eng_spm.model')

sp_hin = spm.SentencePieceProcessor()
sp_hin.load('hin_spm.model')  # or hin_spm.model if you used SentencePiece for Hindi too


True

In [11]:
# Read your train.eng and train.hin files as lists of sentences
with open('train.eng', 'r', encoding='utf-8') as f:
    train_eng_text = f.readlines()

with open('train.hin', 'r', encoding='utf-8') as f:
    train_hin_text = f.readlines()



In [12]:
train_eng_ids = [sp_eng.encode_as_ids(sentence.strip()) for sentence in train_eng_text]
train_hin_ids = [sp_hin.encode_as_ids(sentence.strip()) for sentence in train_hin_text]


In [13]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

max_len_eng = max(len(seq) for seq in train_eng_ids)
max_len_hin = max(len(seq) for seq in train_hin_ids)

encoder_input_data = pad_sequences(train_hin_ids, maxlen=max_len_hin, padding='post')
decoder_input_data = pad_sequences(train_eng_ids, maxlen=max_len_eng, padding='post')


In [14]:
decoder_target_data = np.array([seq[1:] + [0] for seq in decoder_input_data])
decoder_target_data = pad_sequences(decoder_target_data, maxlen=max_len_eng, padding='post')
decoder_target_data = np.expand_dims(decoder_target_data, -1)


In [15]:
hin_vocab_size = sp_hin.get_piece_size()
eng_vocab_size = sp_eng.get_piece_size()


In [19]:
import sentencepiece as spm

spm.SentencePieceTrainer.train(
    input='train.eng',
    model_prefix='eng_spm',
    vocab_size=8000,   # You can set 4000/5000/8000 as needed
    model_type='bpe',  # or 'unigram'
    character_coverage=1.0  # For English, 1.0 is okay
)


In [20]:
spm.SentencePieceTrainer.train(
    input='train.hin',
    model_prefix='hin_spm',
    vocab_size=8000,
    model_type='bpe',
    character_coverage=0.9995  # For Hindi/Devanagari, slightly less
)


In [21]:
sp_eng = spm.SentencePieceProcessor()
sp_eng.load("eng_spm.model")


True

In [22]:
sp_eng = spm.SentencePieceProcessor()
sp_eng.load("hin_spm.model")


True

In [23]:
# Read lines from train files
with open('train.eng', 'r', encoding='utf-8') as f:
    train_eng_text = f.readlines()

with open('train.hin', 'r', encoding='utf-8') as f:
    train_hin_text = f.readlines()


In [24]:
# Encode text into token IDs
train_eng_ids = [sp_eng.encode_as_ids(sentence.strip()) for sentence in train_eng_text]
train_hin_ids = [sp_hin.encode_as_ids(sentence.strip()) for sentence in train_hin_text]


In [25]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

max_len_eng = max(len(seq) for seq in train_eng_ids)
max_len_hin = max(len(seq) for seq in train_hin_ids)

train_eng_padded = pad_sequences(train_eng_ids, maxlen=max_len_eng, padding='post')
train_hin_padded = pad_sequences(train_hin_ids, maxlen=max_len_hin, padding='post')


In [26]:
import numpy as np

decoder_input = [seq[:-1] for seq in train_hin_ids]
decoder_output = [seq[1:] for seq in train_hin_ids]

decoder_input_padded = pad_sequences(decoder_input, maxlen=max_len_hin-1, padding='post')
decoder_output_padded = pad_sequences(decoder_output, maxlen=max_len_hin-1, padding='post')

encoder_input = np.array(train_eng_padded)
decoder_input = np.array(decoder_input_padded)
decoder_output = np.array(decoder_output_padded)


In [27]:
import tensorflow as tf
import sentencepiece as spm
import numpy as np

# ----------- STEP 1: Load SentencePiece Tokenizers ------------
sp_eng = spm.SentencePieceProcessor()
sp_eng.load("eng_spm.model")

sp_hin = spm.SentencePieceProcessor()
sp_hin.load("hin_spm.model")


True

In [28]:
# ----------- STEP 2: Load and Encode Training Data ------------
with open("train.eng", encoding="utf-8") as f:
    train_eng = [sp_eng.encode_as_ids(line.strip()) for line in f.readlines()]

with open("train.hin", encoding="utf-8") as f:
    train_hin = [sp_hin.encode_as_ids(line.strip()) for line in f.readlines()]

# Add special tokens for BOS and EOS
BOS_ID = sp_hin.bos_id() if sp_hin.bos_id() != -1 else 1
EOS_ID = sp_hin.eos_id() if sp_hin.eos_id() != -1 else 2

encoder_input = tf.keras.preprocessing.sequence.pad_sequences(train_eng, padding='post')
decoder_input = tf.keras.preprocessing.sequence.pad_sequences([[BOS_ID]+s for s in train_hin], padding='post')
decoder_target = tf.keras.preprocessing.sequence.pad_sequences([s+[EOS_ID] for s in train_hin], padding='post')

# ----------- STEP 3: Define Hyperparameters ------------
VOCAB_SIZE_EN = sp_eng.get_piece_size()
VOCAB_SIZE_HI = sp_hin.get_piece_size()
EMBEDDING_DIM = 256
UNITS = 512


In [29]:
# ----------- STEP 4: Build Encoder ------------
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, enc_units):
        super(Encoder, self).__init__()
        self.enc_units = enc_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.lstm = tf.keras.layers.LSTM(enc_units, return_sequences=True, return_state=True)

    def call(self, x):
        x = self.embedding(x)
        output, state_h, state_c = self.lstm(x)
        return output, state_h, state_c


In [30]:
class Attention(tf.keras.layers.Layer):
    def __init__(self, units):
        super(Attention, self).__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, enc_output, dec_hidden):
        # dec_hidden shape: (batch, hidden)
        dec_hidden_with_time_axis = tf.expand_dims(dec_hidden, 1)
        score = self.V(tf.nn.tanh(self.W1(enc_output) + self.W2(dec_hidden_with_time_axis)))
        attention_weights = tf.nn.softmax(score, axis=1)
        context_vector = attention_weights * enc_output
        context_vector = tf.reduce_sum(context_vector, axis=1)
        return context_vector, attention_weights


In [31]:
# ----------- STEP 6: Build Decoder ------------
class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, dec_units):
        super(Decoder, self).__init__()
        self.dec_units = dec_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.lstm = tf.keras.layers.LSTM(dec_units, return_sequences=True, return_state=True)
        self.fc = tf.keras.layers.Dense(vocab_size)
        self.attention = Attention(dec_units)

    def call(self, x, enc_output, hidden):
        context_vector, attention_weights = self.attention(enc_output, hidden)
        x = self.embedding(x)
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)
        output, state_h, state_c = self.lstm(x)
        output = tf.reshape(output, (-1, output.shape[2]))
        x = self.fc(output)
        return x, state_h, state_c, attention_weights


In [32]:
# ----------- STEP 7: Instantiate and Compile ------------
encoder = Encoder(VOCAB_SIZE_EN, EMBEDDING_DIM, UNITS)
decoder = Decoder(VOCAB_SIZE_HI, EMBEDDING_DIM, UNITS)

# ----------- STEP 8: Define Loss & Optimizer ------------
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

def loss_function(real, pred):
    mask = tf.math.not_equal(real, 0)
    loss_ = loss_object(real, pred)
    mask = tf.cast(mask, dtype=loss_.dtype)
    return tf.reduce_mean(loss_ * mask)

optimizer = tf.keras.optimizers.Adam()

In [43]:
class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
        super(Decoder, self).__init__()
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(self.dec_units,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_initializer='glorot_uniform')
        self.fc = tf.keras.layers.Dense(vocab_size)

        self.attention = BahdanauAttention(self.dec_units)

    def call(self, x, hidden, enc_output):
        context_vector, attention_weights = self.attention(hidden, enc_output)
        x = self.embedding(x)
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)
        output, state = self.gru(x)
        output = tf.reshape(output, (-1, output.shape[2]))
        x = self.fc(output)
        return x, state, attention_weights


In [44]:
# ----------- STEP 9: Training Step ------------
@tf.function
def train_step(enc_inp, dec_inp, dec_target):
    with tf.GradientTape() as tape:
        enc_output, enc_hidden, enc_cell = encoder(enc_inp)
        dec_hidden, dec_cell = enc_hidden, enc_cell
        dec_input = tf.expand_dims(dec_inp[:, 0], 1)

        loss = 0
        for t in range(1, dec_target.shape[1]):
            predictions, dec_hidden, dec_cell, _ = decoder(dec_input, enc_output, dec_hidden)
            loss += loss_function(dec_target[:, t], predictions)
            dec_input = tf.expand_dims(dec_inp[:, t], 1)

    batch_loss = loss / int(dec_target.shape[1])
    variables = encoder.trainable_variables + decoder.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))
    return batch_loss

In [45]:
# ----------- STEP 10: Train Model ------------
BATCH_SIZE = 32
dataset = tf.data.Dataset.from_tensor_slices((encoder_input, decoder_input, decoder_target))
dataset = dataset.shuffle(len(encoder_input)).batch(BATCH_SIZE, drop_remainder=True)

EPOCHS = 10
for epoch in range(EPOCHS):
    total_loss = 0
    for (batch, (enc_inp, dec_inp, dec_target)) in enumerate(dataset):
        batch_loss = train_step(enc_inp, dec_inp, dec_target)
        total_loss += batch_loss
    print(f"Epoch {epoch+1} Loss {total_loss.numpy() / (batch+1):.4f}")


Epoch 1 Loss 1.8573
Epoch 2 Loss 1.8447
Epoch 3 Loss 1.8313
Epoch 4 Loss 1.8180
Epoch 5 Loss 1.8044
Epoch 6 Loss 1.7909
Epoch 7 Loss 1.7783
Epoch 8 Loss 1.7652
Epoch 9 Loss 1.7506
Epoch 10 Loss 1.7365


In [46]:
def translate(sentence):
    sentence = preprocess_sentence(sentence)
    inputs = [sp_eng.encode_as_ids(sentence)]
    max_length_input = 40
    units = 256
    inputs = tf.keras.preprocessing.sequence.pad_sequences(inputs, maxlen=max_length_input, padding='post')
    inputs = tf.convert_to_tensor(inputs)

    result = ''

    hidden = [tf.zeros((1, units))]
    enc_out, enc_hidden = encoder(inputs, hidden)

    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([target_tokenizer.word_index['<start>']], 0)

    for t in range(max_length_output):
        predictions, dec_hidden, attention_weights = decoder(dec_input, dec_hidden, enc_out)

        predicted_id = tf.argmax(predictions[0]).numpy()
        result += target_tokenizer.index_word.get(predicted_id, '') + ' '

        if target_tokenizer.index_word.get(predicted_id, '') == '<end>':
            break

        dec_input = tf.expand_dims([predicted_id], 0)

    return result


In [48]:
print(translate("I love Indian food."))


TypeError: too many positional arguments

In [37]:
import re

def preprocess_sentence(sentence):
    sentence = sentence.lower().strip()
    # Add space between words and punctuation
    sentence = re.sub(r"([?.!,¿])", r" \1 ", sentence)
    sentence = re.sub(r'[" "]+', " ", sentence)
    sentence = re.sub(r"[^a-zA-Z?.!,¿]+", " ", sentence)
    sentence = sentence.strip()
    return sentence


In [38]:
print(translate("I love Indian food."))


NameError: name 'max_length_input' is not defined