<a href="https://colab.research.google.com/github/saishshinde15/NLP/blob/main/Translation_EncoderDecoderipynb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
# Load the parallel English-Hindi corpus from the CSV file.
dataset = pd.read_csv('/content/Dataset_English_Hindi.csv')

In [3]:
# Preview the loaded frame.
dataset

Unnamed: 0,English,Hindi
0,Help!,बचाओ!
1,Jump.,उछलो.
2,Jump.,कूदो.
3,Jump.,छलांग.
4,Hello!,नमस्ते।
...,...,...
130471,Examples of art deco construction can be found...,आर्ट डेको शैली के निर्माण मैरीन ड्राइव और ओवल ...
130472,and put it in our cheeks.,और अपने गालों में डाल लेते हैं।
130473,"As for the other derivatives of sulphur , the ...","जहां तक गंधक के अन्य उत्पादों का प्रश्न है , द..."
130474,its complicated functioning is defined thus in...,Zरचना-प्रकिया को उसने एक पहेली में यों बांधा है .


In [4]:
# Count the missing values in each column.
dataset.isnull().sum()

English      2
Hindi      312
dtype: int64

In [5]:
# Replace missing values (NaN) with empty strings.
# Rows are kept, so a sentence pair may end up with an empty source or target.
dataset = dataset.fillna('')


In [6]:
# Re-count missing values after the fill.
dataset.isnull().sum()

English    0
Hindi      0
dtype: int64

In [7]:
# Source sentences (English column) and target sentences (Hindi column).
X = dataset['English']
y = dataset['Hindi']

In [8]:
# Inspect the source sentences.
X

0                                                     Help!
1                                                     Jump.
2                                                     Jump.
3                                                     Jump.
4                                                    Hello!
                                ...                        
130471    Examples of art deco construction can be found...
130472                            and put it in our cheeks.
130473    As for the other derivatives of sulphur , the ...
130474    its complicated functioning is defined thus in...
130475    They've just won four government contracts to ...
Name: English, Length: 130476, dtype: object

In [9]:
# Inspect the target sentences.
y

0                                                     बचाओ!
1                                                     उछलो.
2                                                     कूदो.
3                                                    छलांग.
4                                                   नमस्ते।
                                ...                        
130471    आर्ट डेको शैली के निर्माण मैरीन ड्राइव और ओवल ...
130472                      और अपने गालों में डाल लेते हैं।
130473    जहां तक गंधक के अन्य उत्पादों का प्रश्न है , द...
130474    Zरचना-प्रकिया को उसने एक पहेली में यों बांधा है .
130475    हाल ही में उन्हें सरकारी ठेका मिला है करीब सौ ...
Name: Hindi, Length: 130476, dtype: object

In [10]:
# Tokenize input and target sequences
# Wrap every target sentence in explicit <start>/<end> markers: inference
# (generate_translation) seeds the decoder with '<start>' and stops on '<end>',
# so both must be part of the Hindi vocabulary the model is trained on.
# Built from dataset['Hindi'] rather than from y, so re-running this cell does
# not stack a second pair of markers.
y = '<start> ' + dataset['Hindi'] + ' <end>'

# The Tokenizer's default filter set removes '<' and '>', which would turn the
# markers into the plain words 'start'/'end'. This is the default set minus
# those two characters.
hindi_filters = '!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n'

tokenizer_english = Tokenizer(num_words=5000, oov_token='<OOV>')
tokenizer_hindi = Tokenizer(num_words=5000, oov_token='<OOV>', filters=hindi_filters)

In [11]:
# Build each vocabulary from its own side of the corpus.
tokenizer_english.fit_on_texts(X)
tokenizer_hindi.fit_on_texts(y)

In [12]:
# Convert every sentence into a list of integer token ids.
# (These hold the full corpus; the train/validation split happens later.)
X_train = tokenizer_english.texts_to_sequences(X)
y_train = tokenizer_hindi.texts_to_sequences(y)

In [13]:
# Pad sequences to a maximum length
max_len = 100  # Set your desired maximum sequence length
# truncating='post' keeps the beginning of sentences longer than max_len.
# The pad_sequences default ('pre') would instead cut tokens from the front,
# i.e. drop the first words the decoder is supposed to start from.
X_train = pad_sequences(X_train, maxlen=max_len, padding='post', truncating='post')
y_train = pad_sequences(y_train, maxlen=max_len, padding='post', truncating='post')

In [14]:
# Define the model architecture
def _vocab_size(tokenizer):
    """Return how many distinct token ids `tokenizer` can emit (padding id 0 included).

    When `num_words` is set, `texts_to_sequences` only produces ids below it, so
    sizing layers by the full `word_index` would allocate embedding rows and
    softmax outputs that can never be used.
    """
    full_size = len(tokenizer.word_index) + 1  # +1 for the padding id 0
    if tokenizer.num_words:
        return min(tokenizer.num_words, full_size)
    return full_size


english_vocab_size = _vocab_size(tokenizer_english)
hindi_vocab_size = _vocab_size(tokenizer_hindi)

# Encoder: embeds the source tokens and returns its per-step outputs plus the
# final hidden/cell state. `input_length` is omitted: the Input layer already
# fixes the length and the argument is deprecated in recent Keras releases.
encoder_input = tf.keras.layers.Input(shape=(max_len,))
encoder_embedding = tf.keras.layers.Embedding(english_vocab_size, 128)(encoder_input)
encoder_lstm = tf.keras.layers.LSTM(128, return_sequences=True, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)

# Decoder: variable-length input, initialised with the encoder's final state.
decoder_input = tf.keras.layers.Input(shape=(None,))
decoder_embedding = tf.keras.layers.Embedding(hindi_vocab_size, 128)(decoder_input)
decoder_lstm = tf.keras.layers.LSTM(128, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=[state_h, state_c])

# Per-step probability distribution over the Hindi token ids.
decoder_dense = tf.keras.layers.Dense(hindi_vocab_size, activation='softmax')
output = decoder_dense(decoder_outputs)

In [15]:
# Training model: takes [encoder tokens, decoder input tokens] and returns the
# softmax output of the decoder's dense layer.
model = tf.keras.models.Model([encoder_input, decoder_input], output)
# Sparse loss: the targets are integer token ids, not one-hot vectors.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

In [17]:
# Split the data into train and validation sets
# NOTE: X_train / y_train are overwritten with the 80% training portion, so
# re-running this cell on its own splits the already-split arrays again.
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [None]:
# Teacher forcing: the decoder is fed the target without its last position
# (y[:, :-1]) and is trained to predict the target shifted by one (y[:, 1:]).
history = model.fit([X_train, y_train[:, :-1]], y_train[:, 1:], epochs=1, batch_size=32, validation_data=([X_val, y_val[:, :-1]], y_val[:, 1:])) # increase epochs as needed (1 is used here to save time and compute)

 250/3262 [=>............................] - ETA: 9:16:26 - loss: 1.0255

In [None]:
# Define inference models for encoder and decoder
# Encoder model: source tokens -> (per-step outputs, final hidden state, final cell state).
encoder_model = tf.keras.models.Model(encoder_input, [encoder_outputs, state_h, state_c])

# The decoder state is supplied from outside at inference time so decoding can
# proceed one step at a time; 128 matches the LSTM width used for training.
decoder_state_input_h = tf.keras.layers.Input(shape=(128,))
decoder_state_input_c = tf.keras.layers.Input(shape=(128,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

# Re-apply the trained decoder LSTM to the same embedded decoder input, but
# starting from the externally supplied state instead of the encoder state.
decoder_outputs, state_h_dec, state_c_dec = decoder_lstm(
    decoder_embedding, initial_state=decoder_states_inputs)

decoder_states = [state_h_dec, state_c_dec]
# Same dense layer as in training, so the trained weights are reused.
decoder_outputs = decoder_dense(decoder_outputs)

# Decoder model: (tokens, h, c) -> (token probabilities, new h, new c).
decoder_model = tf.keras.models.Model(
    [decoder_input] + decoder_states_inputs,
    [decoder_outputs] + decoder_states
)

# Function to generate translations from input sequences
def generate_translation(input_text):
    """Greedily decode a translation for a single English sentence.

    The sentence is tokenized and padded like the training data, encoded once
    with ``encoder_model``, and then decoded one token at a time with
    ``decoder_model``, always picking the most probable next token.

    Args:
        input_text: The English sentence to translate.

    Returns:
        The decoded tokens joined by single spaces ('' if decoding stops
        immediately).

    Raises:
        KeyError: If the Hindi tokenizer has no start marker, i.e. the target
            sentences were not wrapped in start/end markers before fitting.
    """
    input_seq = tokenizer_english.texts_to_sequences([input_text])
    input_seq = pad_sequences(input_seq, maxlen=max_len, padding='post')

    # The markers are stored as '<start>'/'<end>' only if the tokenizer's
    # filters keep '<' and '>'; with the default filters they become
    # 'start'/'end'. Accept either spelling, but use a matching pair.
    word_index = tokenizer_hindi.word_index
    if '<start>' in word_index:
        start_token, end_token = '<start>', '<end>'
    elif 'start' in word_index:
        start_token, end_token = 'start', 'end'
    else:
        raise KeyError(
            "'<start>' is not in the Hindi vocabulary: wrap every target "
            "sentence in '<start> ... <end>' before fitting tokenizer_hindi "
            "and training the model."
        )

    _, h, c = encoder_model.predict(input_seq, verbose=0)
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = word_index[start_token]

    decoded_tokens = []
    # Hard cap so a model that never emits the end marker cannot loop forever.
    while len(decoded_tokens) < max_len:
        output_tokens, h, c = decoder_model.predict([target_seq, h, c], verbose=0)
        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))
        # Id 0 is the padding id and has no index_word entry; treat it (and any
        # other unmapped id) as end of sentence instead of raising KeyError.
        sampled_token = tokenizer_hindi.index_word.get(sampled_token_index)
        if sampled_token is None or sampled_token == end_token:
            break
        decoded_tokens.append(sampled_token)

        # Feed the token just produced back in as the next decoder input.
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index

    return ' '.join(decoded_tokens)


In [None]:
# Example usage: translate one sentence with the trained inference models
# and print the decoded text.
input_text = "Hello!"
translated_text = generate_translation(input_text)
print(translated_text)

# Using Hugging Face Model

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch

# Load the dataset
dataset = pd.read_csv('/content/Dataset_English_Hindi.csv')

# The CSV has empty cells, which pandas reads as NaN (float); the tokenizer only
# accepts strings. A pair with a missing side carries no translation signal, so
# drop it instead of filling it.
dataset = dataset.dropna(subset=['English', 'Hindi'])

# Assuming your dataset has 'English' and 'Hindi' columns for input and target texts
X = dataset['English'].astype(str)
y = dataset['Hindi'].astype(str)

# Split the data into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the T5 tokenizer and model
# NOTE(review): the original T5 vocabulary was built for English/German/French/
# Romanian text, so Devanagari is likely to tokenize to <unk>. A multilingual
# checkpoint (e.g. mT5) is probably needed for usable Hindi output - TODO confirm.
tokenizer = T5Tokenizer.from_pretrained('google-t5/t5-base')
model = T5ForConditionalGeneration.from_pretrained('google-t5/t5-base')

# Tokenize the input and target sequences
X_train_tokens = tokenizer.batch_encode_plus(X_train.tolist(), max_length=128, padding='max_length', truncation=True, return_tensors='pt')
y_train_tokens = tokenizer.batch_encode_plus(y_train.tolist(), max_length=128, padding='max_length', truncation=True, return_tensors='pt')

# Tokenize validation data
X_val_tokens = tokenizer.batch_encode_plus(X_val.tolist(), max_length=128, padding='max_length', truncation=True, return_tensors='pt')
y_val_tokens = tokenizer.batch_encode_plus(y_val.tolist(), max_length=128, padding='max_length', truncation=True, return_tensors='pt')

# Build the label tensors: padding positions are set to -100, the index the
# model's cross-entropy loss ignores. Otherwise most of the loss would come from
# predicting <pad> after the end of each (short) target sentence.
y_train_labels = y_train_tokens['input_ids'].clone()
y_train_labels[y_train_labels == tokenizer.pad_token_id] = -100
y_val_labels = y_val_tokens['input_ids'].clone()
y_val_labels[y_val_labels == tokenizer.pad_token_id] = -100

# Define training parameters
train_dataset = torch.utils.data.TensorDataset(X_train_tokens['input_ids'], X_train_tokens['attention_mask'], y_train_labels, y_train_tokens['attention_mask'])
val_dataset = torch.utils.data.TensorDataset(X_val_tokens['input_ids'], X_val_tokens['attention_mask'], y_val_labels, y_val_tokens['attention_mask'])

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=8, shuffle=False)

# Set up optimizer and scheduler
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
# StepLR(step_size=1, gamma=0.9) multiplies the learning rate by 0.9 on every
# scheduler.step() call, so it is stepped once per epoch below.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.9)

# Training loop
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

num_epochs = 3  # Adjust the number of epochs as needed
for epoch in range(num_epochs):
    model.train()
    for batch in train_loader:
        optimizer.zero_grad()
        # The fourth tensor (target attention mask) is not needed by the model.
        input_ids, attention_mask, labels, _ = [b.to(device) for b in batch]
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

    # Decay once per epoch. Stepping per batch (thousands of steps per epoch)
    # would shrink the learning rate to effectively zero almost immediately.
    scheduler.step()

    model.eval()
    with torch.no_grad():
        val_losses = []
        for batch in val_loader:
            input_ids, attention_mask, labels, _ = [b.to(device) for b in batch]
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            val_losses.append(outputs.loss.item())

    print(f"Epoch {epoch+1}/{num_epochs}, Validation Loss: {sum(val_losses)/len(val_losses)}")


# Using an attention mechanism (Bahdanau attention)

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

# Load the dataset
# NOTE(review): the other sections read '/content/Dataset_English_Hindi.csv';
# confirm that 'English.csv' is the intended file here.
dataset = pd.read_csv('English.csv')

# Empty CSV cells are read as NaN (float). The Keras Tokenizer expects strings,
# so replace missing values with empty strings before fitting.
dataset = dataset.fillna('')

# Assuming your dataset has 'English' and 'Hindi' columns for input and target texts
X = dataset['English']
y = dataset['Hindi']

# Tokenize input and target sequences
tokenizer_english = Tokenizer(num_words=5000, oov_token='<OOV>')
tokenizer_hindi = Tokenizer(num_words=5000, oov_token='<OOV>')

tokenizer_english.fit_on_texts(X)
tokenizer_hindi.fit_on_texts(y)

# Integer token ids for the full corpus (the train/validation split happens later).
X_train = tokenizer_english.texts_to_sequences(X)
y_train = tokenizer_hindi.texts_to_sequences(y)

# Pad sequences to a maximum length
max_len = 100  # Set your desired maximum sequence length
X_train = pad_sequences(X_train, maxlen=max_len, padding='post')
y_train = pad_sequences(y_train, maxlen=max_len, padding='post')

# Define the model architecture
class BahdanauAttention(tf.keras.layers.Layer):
    """Additive (Bahdanau-style) attention over a sequence of encoder outputs.

    ``call`` accepts either a single query vector per example, shaped
    (batch, hidden), or a sequence of queries, shaped (batch, query_len, hidden).
    The second form lets every decoder time step attend over the encoder
    outputs in one call.
    """

    def __init__(self, units, **kwargs):
        super(BahdanauAttention, self).__init__(**kwargs)
        self.W1 = tf.keras.layers.Dense(units)  # projects the values
        self.W2 = tf.keras.layers.Dense(units)  # projects the query
        self.V = tf.keras.layers.Dense(1)       # reduces each (query, value) pair to one score

    def call(self, query, values):
        """Return ``(context_vector, attention_weights)``.

        Args:
            query: (batch, hidden) or (batch, query_len, hidden).
            values: (batch, value_len, hidden).

        Returns:
            For a 2-D query: context (batch, hidden) and weights
            (batch, value_len, 1). For a 3-D query: context
            (batch, query_len, hidden) and weights
            (batch, query_len, value_len, 1).
        """
        single_query = len(query.shape) == 2
        if single_query:
            query = tf.expand_dims(query, 1)  # (batch, 1, hidden)

        # Broadcast every query position against every value position.
        projected_query = tf.expand_dims(self.W2(query), 2)    # (batch, query_len, 1, units)
        projected_values = tf.expand_dims(self.W1(values), 1)  # (batch, 1, value_len, units)
        score = self.V(tf.nn.tanh(projected_values + projected_query))  # (batch, query_len, value_len, 1)

        # Normalise over the value (encoder time) axis.
        attention_weights = tf.nn.softmax(score, axis=2)
        # Weighted sum of the values, written as a batched matmul to avoid
        # materialising a (batch, query_len, value_len, hidden) tensor.
        context_vector = tf.matmul(tf.squeeze(attention_weights, axis=-1), values)  # (batch, query_len, hidden)

        if single_query:
            context_vector = tf.squeeze(context_vector, axis=1)
            attention_weights = tf.squeeze(attention_weights, axis=1)
        return context_vector, attention_weights

# With num_words set, the tokenizers only emit ids below num_words, so the
# embedding and softmax layers do not need one slot per word_index entry.
vocab_size_en = min(tokenizer_english.num_words or len(tokenizer_english.word_index) + 1,
                    len(tokenizer_english.word_index) + 1)
vocab_size_hi = min(tokenizer_hindi.num_words or len(tokenizer_hindi.word_index) + 1,
                    len(tokenizer_hindi.word_index) + 1)
embedding_dim = 256
units = 512

encoder_input = tf.keras.layers.Input(shape=(max_len,))
encoder_embedding = tf.keras.layers.Embedding(vocab_size_en, embedding_dim)(encoder_input)
encoder_lstm = tf.keras.layers.LSTM(units, return_sequences=True, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)

# Variable-length decoder input: training feeds max_len - 1 tokens
# (y[:, :-1]), so a fixed length of max_len would not match the data.
decoder_input = tf.keras.layers.Input(shape=(None,))
decoder_embedding = tf.keras.layers.Embedding(vocab_size_hi, embedding_dim)(decoder_input)
decoder_lstm = tf.keras.layers.LSTM(units, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=[state_h, state_c])

# Every decoder step queries the encoder outputs, giving one context vector
# per target position: (batch, target_len, units).
attention_layer = BahdanauAttention(units)
context_vector, attention_weights = attention_layer(decoder_outputs, encoder_outputs)

# Context and decoder output share the time axis, so they can be joined on the
# feature axis. A Keras layer is used instead of a raw tf op on symbolic tensors.
decoder_concat_input = tf.keras.layers.Concatenate(axis=-1)([context_vector, decoder_outputs])
decoder_dense = tf.keras.layers.Dense(vocab_size_hi, activation='softmax')
decoder_outputs = decoder_dense(decoder_concat_input)

model = tf.keras.models.Model([encoder_input, decoder_input], decoder_outputs)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Split the data into train and validation sets
# NOTE: X_train / y_train are overwritten with the training portion, so
# re-running only this part splits the already-split arrays again.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Train the model
# Teacher forcing: decoder input is y[:, :-1], the prediction target is y[:, 1:].
history = model.fit([X_train, y_train[:, :-1]], y_train[:, 1:], epochs=50, batch_size=64, validation_data=([X_val, y_val[:, :-1]], y_val[:, 1:]))


## Use the code below in place of the `encoder_lstm` and `decoder_lstm` definitions above to use a bidirectional encoder


In [None]:
# Encoder with Bidirectional LSTM
# The wrapper returns the per-step outputs followed by the forward and backward
# LSTM states (h and c for each direction).
encoder_lstm = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(units, return_sequences=True, return_state=True))
encoder_outputs, forward_h, forward_c, backward_h, backward_c = encoder_lstm(encoder_embedding)
# Merge both directions into a single state of width 2 * units.
state_h = tf.keras.layers.Concatenate()([forward_h, backward_h])
state_c = tf.keras.layers.Concatenate()([forward_c, backward_c])

# Decoder: a unidirectional LSTM of width 2 * units.
# - Its state size matches the concatenated encoder state. A Bidirectional
#   decoder would need four initial states of width `units`, not two of 2 * units.
# - A decoder must not read the target right-to-left: under teacher forcing the
#   backward pass would see the very tokens it is asked to predict.
decoder_lstm = tf.keras.layers.LSTM(2 * units, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=[state_h, state_c])
