In [10]:
import numpy as np
import pandas as pd
from datasets import load_dataset
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import seaborn as sns

import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense

In [11]:
dataset_en_hi = load_dataset("open_subtitles", "en-hi")
dataset_da_ru = load_dataset("open_subtitles", "da-ru")
data_en_hi = dataset_en_hi["train"]
data_da_ru = dataset_da_ru["train"]

In [12]:
data_en_hi[0:2]

{'id': ['0', '1'],
 'meta': [{'year': 1948,
   'imdbId': 40522,
   'subtitleId': {'en': 4180294, 'hi': 4239106},
   'sentenceIds': {'en': [1], 'hi': [1]}},
  {'year': 1948,
   'imdbId': 40522,
   'subtitleId': {'en': 4180294, 'hi': 4239106},
   'sentenceIds': {'en': [2], 'hi': [2]}}],
 'translation': [{'en': 'THE BICYCLE THIEF', 'hi': 'साइकिल चोर'},
  {'en': 'Ricci?', 'hi': 'रिच्ची?'}]}

In [13]:


# Extract English and Hindi sentences
english_sentences = [item['en'] for item in data_en_hi['translation']]
hindi_sentences = [item['hi'] for item in data_en_hi['translation']]

# Tokenize English sentences
en_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
en_tokenizer.fit_on_texts(english_sentences)
en_vocab_size = len(en_tokenizer.word_index) + 1
max_en_seq_length = max([len(sentence.split()) for sentence in english_sentences])
en_input_sequences = en_tokenizer.texts_to_sequences(english_sentences)
padded_en_input_sequences = tf.keras.preprocessing.sequence.pad_sequences(en_input_sequences, padding='post', maxlen=max_en_seq_length)

# Tokenize Hindi sentences
hi_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
hi_tokenizer.fit_on_texts(hindi_sentences)
hi_vocab_size = len(hi_tokenizer.word_index) + 1
max_hi_seq_length = max([len(sentence.split()) for sentence in hindi_sentences])
hi_input_sequences = hi_tokenizer.texts_to_sequences(hindi_sentences)
padded_hi_input_sequences = tf.keras.preprocessing.sequence.pad_sequences(hi_input_sequences, padding='post', maxlen=max_hi_seq_length)

In [14]:

# Define encoder-decoder model
embedding_dim = 256

# Encoder
encoder_inputs = Input(shape=(max_en_seq_length,))
encoder_embedding = tf.keras.layers.Embedding(en_vocab_size, embedding_dim)(encoder_inputs)
encoder_lstm = LSTM(embedding_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]

# Decoder
decoder_inputs = Input(shape=(max_hi_seq_length,))
decoder_embedding = tf.keras.layers.Embedding(hi_vocab_size, embedding_dim)(decoder_inputs)
decoder_lstm = LSTM(embedding_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = Dense(hi_vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

In [15]:

# Model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Compile model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train model
model.fit([padded_en_input_sequences, padded_hi_input_sequences], np.expand_dims(padded_hi_input_sequences, -1), batch_size=64, epochs=3)

# Save model
model.save('translation_model.h5')


Epoch 1/3
[1m1454/1454[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4654s[0m 3s/step - accuracy: 0.8945 - loss: 1.2579
Epoch 2/3
[1m1454/1454[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5806s[0m 4s/step - accuracy: 0.9642 - loss: 0.3142
Epoch 3/3
[1m1454/1454[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3675s[0m 3s/step - accuracy: 0.9805 - loss: 0.1782


