In [2]:
import pandas as pd

splits = {'train': 'data/train-00000-of-00001.parquet', 'validation': 'data/validation-00000-of-00001.parquet', 'test': 'data/test-00000-of-00001.parquet'}
data = pd.read_parquet("hf://datasets/cfilt/iitb-english-hindi/" + splits["train"])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [3]:
print(data)

                                               translation
0        {'en': 'Give your application an accessibility...
1        {'en': 'Accerciser Accessibility Explorer', 'h...
2        {'en': 'The default plugin layout for the bott...
3        {'en': 'The default plugin layout for the top ...
4        {'en': 'A list of plugins that are disabled by...
...                                                    ...
1659078  {'en': 'The Prime Minister, Shri Narendra Modi...
1659079  {'en': 'In a tweet, the Prime Minister said, c...
1659080  {'en': 'I also congratulate all those who took...
1659081  {'en': 'The NDA family will work together for ...
1659082  {'en': 'I assure all possible support from the...

[1659083 rows x 1 columns]


In [4]:
for i in range(10):
    print((data.iloc[i]['translation']['en']))
    print((data.iloc[i]['translation']['hi']))
    print('\n')

Give your application an accessibility workout
अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें


Accerciser Accessibility Explorer
एक्सेर्साइसर पहुंचनीयता अन्वेषक


The default plugin layout for the bottom panel
निचले पटल के लिए डिफोल्ट प्लग-इन खाका


The default plugin layout for the top panel
ऊपरी पटल के लिए डिफोल्ट प्लग-इन खाका


A list of plugins that are disabled by default
उन प्लग-इनों की सूची जिन्हें डिफोल्ट रूप से निष्क्रिय किया गया है


Highlight duration
अवधि को हाइलाइट रकें


The duration of the highlight box when selecting accessible nodes
पहुंचनीय आसंधि (नोड) को चुनते समय हाइलाइट बक्से की अवधि


Highlight border color
सीमांत (बोर्डर) के रंग को हाइलाइट करें


The color and opacity of the highlight border.
हाइलाइट किए गए सीमांत का रंग और अपारदर्शिता। 


Highlight fill color
भराई के रंग को हाइलाइट करें




In [11]:
import pandas as pd

# Sample DataFrame setup (Replace this with your actual data loading)

df = pd.DataFrame(data)

#  Since Data is Very Large We are taking only first 10,000 rows to perform
df=df[:10000]

df['english_sentence'] = df['translation'].apply(lambda x: x.get('en', ''))
df['hindi_sentence'] = df['translation'].apply(lambda x: x.get('hi', ''))

df = df.drop(columns=['translation'])

df.to_csv('LimitedHindiToEngilsh.csv', index=False)

print(df)
print("Dataset successfully saved to 'LimitedHindiToEngilsh.csv'")


                                    english_sentence  \
0     Give your application an accessibility workout   
1                  Accerciser Accessibility Explorer   
2     The default plugin layout for the bottom panel   
3        The default plugin layout for the top panel   
4     A list of plugins that are disabled by default   
...                                              ...   
9995                                      Properties   
9996                                         Signals   
9997                           Author Email Address:   
9998                                        License:   
9999                                  Add to Project   

                                         hindi_sentence  
0       अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें  
1                       एक्सेर्साइसर पहुंचनीयता अन्वेषक  
2                 निचले पटल के लिए डिफोल्ट प्लग-इन खाका  
3                  ऊपरी पटल के लिए डिफोल्ट प्लग-इन खाका  
4     उन प्लग-इनों की सूची जिन्हें डि

In [12]:
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences



english_sentences = df['english_sentence'].tolist()
hindi_sentences = df['hindi_sentence'].tolist()

def tokenize_and_pad(sentences, max_len):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(sentences)
    sequences = tokenizer.texts_to_sequences(sentences)
    padded_sequences = pad_sequences(sequences, maxlen=max_len, padding='post')
    return tokenizer, padded_sequences

max_eng_len = max(len(seq) for seq in english_sentences)
max_hin_len = max(len(seq) for seq in hindi_sentences)

eng_tokenizer, eng_sequences = tokenize_and_pad(english_sentences, max_eng_len)
hin_tokenizer, hin_sequences = tokenize_and_pad(hindi_sentences, max_hin_len)

X_train, X_test, y_train, y_test = train_test_split(eng_sequences, hin_sequences, test_size=0.2, random_state=42)


In [14]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, RepeatVector, TimeDistributed, Dropout, Bidirectional
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, LearningRateScheduler

embedding_dim = 128  # previously -> 256
hidden_units = 128
vocab_size_eng = len(eng_tokenizer.word_index) + 1
vocab_size_hin = len(hin_tokenizer.word_index) + 1

model = Sequential([
    Embedding(vocab_size_eng, embedding_dim, input_length=max_eng_len),
    Bidirectional(LSTM(hidden_units, return_sequences=False, dropout=0.3)),
    RepeatVector(max_hin_len),
    Bidirectional(LSTM(hidden_units, return_sequences=True, dropout=0.3)),
    TimeDistributed(Dense(vocab_size_hin, activation='softmax'))
])

model.compile(optimizer=Adam(learning_rate=1e-4), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

def scheduler(epoch, lr):
    if epoch % 5 == 0 and epoch != 0:
        lr = lr * 0.9
    return lr

lr_scheduler = LearningRateScheduler(scheduler)

model.fit(
    X_train,
    y_train.reshape(*y_train.shape, 1),
    epochs=5,   #previously -> 20
    batch_size=200,#previously -> 64
    validation_data=(X_test, y_test.reshape(*y_test.shape, 1)),
    callbacks=[early_stopping, lr_scheduler]
)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7bb82448e3e0>

In [18]:
# Evaluate the model on the test dataset
test_loss, test_accuracy = model.evaluate(X_test, y_test.reshape(*y_test.shape, 1))

print(f'Test Loss: {test_loss}')
print(f'Test Accuracy: {test_accuracy}')


Test Loss: 0.1413617730140686
Test Accuracy: 0.9884709119796753


In [22]:
# Make the prediction
predicted_sequence = model.predict(new_sentence_padded)
predicted_indices = np.argmax(predicted_sequence, axis=-1)

# Print the predicted token indices
print(f'Predicted Indices: {predicted_indices[0]}')


Predicted Indices: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0]


In [23]:
# Print the Hindi words corresponding to the predicted indices
translated_sentence = ' '.join([hin_tokenizer.index_word.get(idx, '[UNK]') for idx in predicted_indices[0]])
print(f'Translated Hindi Sentence: {translated_sentence}')


Translated Hindi Sentence: [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [

In [24]:
# Tokenize and pad the new English sentence correctly
new_sentence = "The quick brown fox jumps over the lazy dog"
new_sentence_tokenized = eng_tokenizer.texts_to_sequences([new_sentence])

# Make sure padding length matches the maximum input length
new_sentence_padded = pad_sequences(new_sentence_tokenized, maxlen=max_eng_len, padding='post')

print(f'Padded Input Sentence: {new_sentence_padded}')


Padded Input Sentence: [[   1  511 1200    1  848    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0

In [25]:
# Check the first few token-to-word mappings
for i in range(1, 10):
    print(f"Token {i}: {hin_tokenizer.index_word.get(i, '[UNK]')}")


Token 1: का
Token 2: को
Token 3: के
Token 4: करें
Token 5: में
Token 6: 2
Token 7: एक
Token 8: a
Token 9: की
