In [202]:
import pandas as pd 
import numpy as np 
import tensorflow as tf


In [203]:
# Load the Hindi/English parallel corpus. The CSV carries a saved index column,
# which shows up as "Unnamed: 0" next to the `hindi` and `english` columns.
df = pd.read_csv("hindi_english_parallel.csv")

In [204]:
# Preview the first rows to confirm the column layout of the loaded frame.
df.head()

Unnamed: 0,hindi,english
0,अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें,Give your application an accessibility workout
1,एक्सेर्साइसर पहुंचनीयता अन्वेषक,Accerciser Accessibility Explorer
2,निचले पटल के लिए डिफोल्ट प्लग-इन खाका,The default plugin layout for the bottom panel
3,ऊपरी पटल के लिए डिफोल्ट प्लग-इन खाका,The default plugin layout for the top panel
4,उन प्लग-इनों की सूची जिन्हें डिफोल्ट रूप से नि...,A list of plugins that are disabled by default


In [205]:
# Keep only the first 100,000 sentence pairs. The explicit .copy() makes this an
# independent frame, so the column assignments in the following cell do not write
# into a slice of the original frame (which triggers pandas' SettingWithCopyWarning).
df = df.iloc[0:100000].copy()

In [206]:
def _wrap_with_markers(sentence):
    """Surround one sentence with the start/end-of-sequence markers."""
    return '[sos] ' + sentence + ' [eos]'


# Cast each language column to str, then wrap every sentence in the markers.
for _column in ('hindi', 'english'):
    df[_column] = df[_column].astype(str).apply(_wrap_with_markers)

In [207]:
# Wrap the marked-up English sentences in a tf.data pipeline (one string tensor per row).
text_dataset_en = tf.data.Dataset.from_tensor_slices(df['english'])

# Spot-check the first five elements; .numpy() turns each string tensor into bytes.
first_five_en = text_dataset_en.take(5)
for sentence in first_five_en:
    print(sentence.numpy())



b'[sos] Give your application an accessibility workout [eos]'
b'[sos] Accerciser Accessibility Explorer [eos]'
b'[sos] The default plugin layout for the bottom panel [eos]'
b'[sos] The default plugin layout for the top panel [eos]'
b'[sos] A list of plugins that are disabled by default [eos]'


In [208]:
# Settings for the English text vectorizer.
max_features = 5000  # upper bound on the vocabulary size
max_len = 20  # number of token ids emitted per sentence

# English tokenizer: lower-cases, strips punctuation and maps each word to an
# integer id, producing a fixed-length row of `max_len` ids per sentence.
vectorize_layer_en = tf.keras.layers.TextVectorization(
    standardize="lower_and_strip_punctuation",
    max_tokens=max_features,
    output_sequence_length=max_len,
    output_mode='int',
    encoding='utf-8',
)

# Learn the vocabulary from the English corpus.
vectorize_layer_en.adapt(text_dataset_en)



In [209]:
# Materialise the learned vocabulary as an array so token ids can index straight
# into it, then report how many tokens it holds.
en_vocab = np.array(vectorize_layer_en.get_vocabulary())
print(len(en_vocab))

5000


In [210]:
# Vectorize every English sentence into a row of `max_len` integer token ids;
# this tensor is what gets passed to Model_Trans further down.
X_train = vectorize_layer_en(df['english'])

In [211]:
# Inspect rows 1-4 of the vectorized English sentences (token ids, right-padded with 0).
X_train.numpy()[1:5]

array([[   2, 1521,  994, 2458,    3,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0],
       [   2,    4,  105,   75,  376,   11,    4,  654,  874,    3,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0],
       [   2,    4,  105,   75,  376,   11,    4,  401,  874,    3,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0],
       [   2,    7,   39,    8,  212,   62,   48,  741,   59,  105,    3,
           0,    0,    0,    0,    0,    0,    0,    0,    0]],
      dtype=int64)

In [212]:
# Sanity check: map the token ids of sentence 1 back to vocabulary strings.
# Id 0 (the padding) decodes to the empty string.
en_vocab[X_train.numpy()[1]]

array(['sos', 'accerciser', 'accessibility', 'explorer', 'eos', '', '',
       '', '', '', '', '', '', '', '', '', '', '', '', ''], dtype='<U20')

In [213]:
# Settings for the Hindi text vectorizer (same values as the English one).
max_features = 5000  # upper bound on the vocabulary size
max_len = 20  # number of token ids emitted per sentence

# Wrap the marked-up Hindi sentences in a tf.data pipeline (one string tensor per row).
text_dataset_hindi = tf.data.Dataset.from_tensor_slices(df['hindi'])

# Hindi tokenizer. `standardize` is the layer's default value, written out
# explicitly so the configuration reads the same as the English layer.
vectorize_layer_hindi = tf.keras.layers.TextVectorization(
    standardize="lower_and_strip_punctuation",
    max_tokens=max_features,
    output_sequence_length=max_len,
    output_mode='int',
    encoding='utf-8',
)

# Learn the vocabulary from the Hindi corpus.
vectorize_layer_hindi.adapt(text_dataset_hindi)

# Keep the vocabulary as an array for id -> token lookups and report its size.
hindi_vocab = np.array(vectorize_layer_hindi.get_vocabulary())
print(len(hindi_vocab))

5000


In [217]:
# Vectorize every Hindi sentence into a row of `max_len` integer token ids.
Y_train = vectorize_layer_hindi(df['hindi'])

# Sanity check: decode sentence 1 back to vocabulary strings. A bare
# `Y_train.numpy()[1:5]` used to precede this line, but a notebook cell only
# displays its last expression, so that value was computed and thrown away.
hindi_vocab[Y_train.numpy()[1]]

array(['sos', 'एक्सेर्साइसर', 'पहुंचनीयता', 'अन्वेषक', 'eos', '', '', '',
       '', '', '', '', '', '', '', '', '', '', '', ''], dtype='<U26')

In [218]:
# Peek at the first five Hindi sentences, now wrapped in the [sos]/[eos] markers.
df['hindi'].head(5)

0    [sos] अपने अनुप्रयोग को पहुंचनीयता व्यायाम का ...
1          [sos] एक्सेर्साइसर पहुंचनीयता अन्वेषक [eos]
2    [sos] निचले पटल के लिए डिफोल्ट प्लग-इन खाका [eos]
3     [sos] ऊपरी पटल के लिए डिफोल्ट प्लग-इन खाका [eos]
4    [sos] उन प्लग-इनों की सूची जिन्हें डिफोल्ट रूप...
Name: hindi, dtype: object

## MODEL

In [238]:
from tensorflow.keras.layers import Input,Bidirectional, Concatenate, Dot, LSTM, multiply, RepeatVector, Dense, Embedding, Activation, Dot

In [243]:
# Model hyper-parameters.
Tx, Ty = 20, 20  # input sequence length / number of decoder output steps
embed_dim = 256  # width of the token embedding
vocab_size = 5000  # max vocab size
n_a = 64  # units in the pre-attention LSTM (used inside a Bidirectional wrapper)
n_s = 128  # units in the post-attention LSTM

# Post-attention LSTM and the softmax output projection. Both are created once at
# module level and reused at every decoder step inside Model_Trans.
# (Original note: please do not modify this global variable.)
post_activation_LSTM_cell = LSTM(units=n_s, return_state=True)
output_layer = Dense(units=vocab_size, activation='softmax')

# Attention sub-layers are built ONCE here and reused by every call to
# one_step_act, so all decoder steps share a single set of attention weights.
# Creating them inside the function would give each step its own untrained
# Dense layers and would register the name 'attention_weights' repeatedly.
attention_repeat = RepeatVector(Tx)
attention_concat = Concatenate(axis=-1)
attention_dense_hidden = Dense(10, activation="tanh")
attention_dense_energy = Dense(1, activation="relu")
# Softmax must normalise over axis 1 (the Tx time steps). `energies` has a last
# axis of size 1, so a softmax over axis -1 would turn every weight into 1.0.
# (keras.layers.Activation also has no `axis` argument.)
attention_softmax = tf.keras.layers.Softmax(axis=1, name='attention_weights')
attention_dot = Dot(axes=1)


def one_step_act(a, s_prev):
    """Run one step of attention and return the context vector.

    Args:
        a: Encoder hidden states, one per input time step. As called from
            Model_Trans this is the Bidirectional LSTM output, i.e. shape
            (m, Tx, 2 * n_a).
        s_prev: Previous hidden state of the post-attention LSTM, shape (m, n_s).

    Returns:
        The context vector: the attention-weighted sum of `a` over the Tx axis,
        shape (m, 1, 2 * n_a), ready to be fed to the post-attention LSTM.
    """
    # Copy s_prev across all Tx positions so it can be paired with every a<t'>.
    s_prev = attention_repeat(s_prev)        # (m, Tx, n_s)
    concat = attention_concat([a, s_prev])   # (m, Tx, 2*n_a + n_s)

    # Small fully-connected network that scores each input position.
    e = attention_dense_hidden(concat)       # (m, Tx, 10)
    energies = attention_dense_energy(e)     # (m, Tx, 1)

    # Normalise the scores over the time axis to get the attention weights.
    alphas = attention_softmax(energies)     # (m, Tx, 1)

    # Weighted sum of the encoder states: contract the Tx axis of both tensors.
    context = attention_dot([alphas, a])     # (m, 1, 2*n_a)

    return context


def Model_Trans(input):
    """Build the attention-based encoder/decoder translation model.

    Args:
        input: A batch of vectorized source sentences (e.g. ``X_train``) or any
            object exposing ``.shape``. It is used only to check that the
            sequence length matches ``Tx``; the model itself is built on a
            symbolic ``Input`` so it can later be fit on arbitrary batches.

    Returns:
        A Keras model taking ``[tokens (m, Tx), s0 (m, n_s), c0 (m, n_s)]`` and
        returning a list of ``Ty`` tensors, each a softmax distribution of shape
        ``(m, vocab_size)``.

    Raises:
        ValueError: If the last dimension of ``input`` is known and is not ``Tx``.
    """
    # Validate the example batch against the sequence length the graph is wired for.
    example_shape = getattr(input, "shape", None)
    if example_shape is not None and len(example_shape) > 0:
        seq_len = example_shape[-1]
        if seq_len is not None and seq_len != Tx:
            raise ValueError(
                f"Expected sequences of length Tx={Tx}, got length {seq_len}"
            )

    # Symbolic inputs: token ids plus the initial hidden/cell state of the decoder.
    X_in = Input(shape=(Tx,), dtype='int64', name='X')
    s0 = Input(shape=(n_s,), name='s0')  # initial hidden state
    c0 = Input(shape=(n_s,), name='c0')  # initial cell state
    s, c = s0, c0

    outputs = []

    # Encoder: embed the token ids, then run the pre-attention Bi-LSTM to get one
    # hidden state per input position. This must be bound to `a`, the name the
    # decoder loop passes to the attention step.
    X = Embedding(input_dim=vocab_size, output_dim=embed_dim)(X_in)
    a = Bidirectional(LSTM(units=n_a, return_sequences=True))(X)

    # Decoder: Ty steps of attention -> post-attention LSTM -> softmax.
    for _step in range(Ty):
        # Context vector for this step from the encoder states and previous state.
        context = one_step_act(a, s)

        # Advance the shared post-attention LSTM, carrying s (hidden) and c (cell).
        # The sequence is passed positionally so the call does not depend on the
        # name of the layer's first call argument.
        _, s, c = post_activation_LSTM_cell(context, initial_state=[s, c])

        # Project the hidden state onto the vocabulary and collect the prediction.
        out = output_layer(s)
        outputs.append(out)

    # Model inputs must be the Input placeholders, not intermediate tensors.
    model = tf.keras.Model(inputs=[X_in, s0, c0], outputs=outputs)

    return model


In [244]:
# Build the translation model, passing the vectorized English sentences as `input`.
model = Model_Trans(X_train)