In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import tensorflow as tf
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from keras.models import Sequential
import re, string, time
from sklearn.model_selection import train_test_split

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/hindienglish-corpora/Hindi_English_Truncated_Corpus.csv


In [3]:
raw = pd.read_csv(r'/kaggle/input/hindienglish-corpora/Hindi_English_Truncated_Corpus.csv')
corpus = raw[raw['source']=='ted']
corpus

Unnamed: 0,source,english_sentence,hindi_sentence
0,ted,politicians do not have permission to do what ...,"राजनीतिज्ञों के पास जो कार्य करना चाहिए, वह कर..."
1,ted,"I'd like to tell you about one such child,",मई आपको ऐसे ही एक बच्चे के बारे में बताना चाहू...
3,ted,what we really mean is that they're bad at not...,हम ये नहीं कहना चाहते कि वो ध्यान नहीं दे पाते
7,ted,"And who are we to say, even, that they are wrong",और हम होते कौन हैं यह कहने भी वाले कि वे गलत हैं
13,ted,So there is some sort of justice,तो वहाँ न्याय है
...,...,...,...
127595,ted,is if we want that to become our reality -,अगर हम चाहते हैं कि यह वास्तविकता बने -
127597,ted,Africa has not done bad.,अफ़्रीका ने कुछ गलती नहीं की है।
127598,ted,Thank you.,धन्यवाद |
127603,ted,and put it in our cheeks.,और अपने गालों में डाल लेते हैं।


In [4]:
def preprocessing(df, language):
    df[language] = df[language].apply(lambda x:x.lower().strip())
    df[language] = df[language].apply(lambda x:re.sub(r"([?.!,])", r" \1 ", x))
    if language == 'english_sentence':
        df[language] = df[language].apply(lambda x: re.sub(r"[^a-zA-Z?.!,]+", " ", x))
    df[language] = df[language].apply(lambda x:'<start> '+x+' <end>')
    return df[language]
corpus['english_sentence'] = preprocessing(corpus, 'english_sentence')
corpus['hindi_sentence'] = preprocessing(corpus, 'hindi_sentence')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value inste

In [5]:
dictionary=set()
for sentence in corpus['english_sentence'].to_list():
    for word in sentence.split():
        dictionary.add(word)

In [6]:
def tokenize_seq(corpus_ln):
    ln_tokenizer = Tokenizer()
    ln_tokenizer.fit_on_texts(corpus_ln)
    tensor = ln_tokenizer.texts_to_sequences(corpus_ln)
    tensor = pad_sequences(tensor, padding='post')
    return tensor, ln_tokenizer
en_tensor, en_tokenizer = tokenize_seq(corpus['english_sentence'])
hin_tensor, hin_tokenizer = tokenize_seq(corpus['hindi_sentence'])

In [7]:
input_train, input_val, target_train, target_val = train_test_split(en_tensor, hin_tensor, test_size=0.2)
print(input_train.shape, 'English Training Shape')
print(input_val.shape, 'English Validation Shape')
print(target_train.shape, 'Indie Training Shape')
print(target_val.shape, 'Indie Validation Shape')

(31904, 23) English Training Shape
(7977, 23) English Validation Shape
(31904, 32) Indie Training Shape
(7977, 32) Indie Validation Shape


In [8]:
batch_size=64
buffer_size=len(input_train)
steps_per_batch=len(input_train)//64
embed_dim=256
units=1024
en_word_len = len(en_tokenizer.word_index)+1
hin_word_len = len(hin_tokenizer.word_index)+1
dataset=tf.data.Dataset.from_tensor_slices((input_train, target_train)).shuffle(buffer_size)
dataset=dataset.batch(batch_size, drop_remainder=True)
val_dataset = tf.data.Dataset.from_tensor_slices((input_val, target_val)).batch(batch_size, drop_remainder=True)

In [9]:
exp_input, exp_target = next(iter(dataset))
print(exp_input.shape)
print(exp_target.shape)

(64, 23)
(64, 32)


In [10]:
class Encode(tf.keras.Model):
    def __init__(self, units, batch, vocab_size, embed_dim):
        super().__init__()
        self.units = units
        self.batch = batch
        self.embedding = tf.keras.layers.Embedding(vocab_size, embed_dim)
        self.gru = tf.keras.layers.GRU(self.units, return_sequences=True, return_state=True, 
                                       recurrent_initializer='glorot_uniform')
    def call(self, x, hidden):
        x = self.embedding(x)
        output, state = self.gru(x, initial_state = hidden)
        return output, state
    def initialization(self):
        return tf.zeros((self.batch, self.units))

In [11]:
encoder = Encode(units, batch_size, en_word_len, embed_dim)
exp_hidden = encoder.initialization()
exp_input_tensor, exp_hidden = encoder(exp_input, exp_hidden)
print(exp_input_tensor.shape)
print(exp_hidden.shape)

(64, 23, 1024)
(64, 1024)


In [12]:
class Attention(tf.keras.layers.Layer):
    def __init__(self, units):
        super().__init__()
        self.w1 = tf.keras.layers.Dense(units)
        self.w2 = tf.keras.layers.Dense(units)
        self.v = tf.keras.layers.Dense(1)
    def call(self, encoded_input, state):
        state = tf.expand_dims(state, 1)
        score = self.v(tf.nn.tanh(self.w1(encoded_input)+self.w2(state)))
        attention_weights = tf.nn.softmax(score, axis=1)
        context_vector = attention_weights*encoded_input
        context_vector = tf.reduce_sum(context_vector, axis=1)
        return context_vector, attention_weights

In [13]:
weights_func = Attention(units)
vector, weights = weights_func(exp_input_tensor, exp_hidden)
print(vector.shape)
print(weights.shape)

(64, 1024)
(64, 23, 1)


In [14]:
class Decoder(tf.keras.Model):
    def __init__(self, units, batch, vocab_size, embed_dim):
        super().__init__()
        self.units = units
        self.batch = batch
        self.embedding = tf.keras.layers.Embedding(vocab_size, embed_dim)
        self.gru = tf.keras.layers.GRU(self.units, return_sequences=True, return_state=True
                                       , recurrent_initializer='glorot_uniform')
        self.dense = tf.keras.layers.Dense(vocab_size)
        self.attention = Attention(self.units)
    def call(self, x, encoded_input, state):
        context_vector, weights = self.attention(encoded_input, state)
        x = self.embedding(x)
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)
        output, state = self.gru(x)
        output = tf.reshape(output, (-1, output.shape[2]))
        output = self.dense(output)
        return output, state, weights

In [15]:
decoder = Decoder(units, batch_size, hin_word_len, embed_dim)
output, state, weights = decoder(tf.random.uniform((batch_size, 1)), exp_input_tensor, exp_hidden)
print(output.shape)
print(state.shape)
print(weights.shape)

(64, 22088)
(64, 1024)
(64, 23, 1)


In [16]:
optimizer = tf.keras.optimizers.Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')
def loss_function(value, output):
    mask = tf.math.logical_not(tf.math.equal(value, 0))
    loss = loss_object(value, output)
    mask = tf.cast(mask, dtype = loss.dtype)
    loss*=mask
    return tf.reduce_mean(loss)

In [17]:
@tf.function
def train_step(input, target, hidden):
    loss=0
    with tf.GradientTape() as tape:
        enc_output, enc_hidden = encoder(input, hidden)
        dec_input = tf.expand_dims([hin_tokenizer.word_index['start']]*batch_size, 1)
        
        for i in range(1, target.shape[1]):
            dec_output, dec_state, _ = decoder(dec_input, enc_output, hidden)
            loss += loss_function(target[:, i], dec_output)
            dec_input = tf.expand_dims(target[:, i], 1)
        
    batch_loss = loss / int(target.shape[1])
    var = encoder.trainable_variables+decoder.trainable_variables
    grad = tape.gradient(loss, var)
    optimizer.apply_gradients(zip(grad, var))
    return batch_loss

In [18]:
epochs = 15
for epoch in range(epochs):
    start = time.time()
    hidden = encoder.initialization()
    total_loss = 0
    for (batch, (inputs, targets)) in enumerate(dataset.take(steps_per_batch)):
        batch_loss = train_step(inputs, targets, hidden)
        total_loss+=batch_loss
        
        if batch % 100 ==0:
            print('Epoch {} Batch{} Loss {:.4f}'.format(epoch+1, batch, batch_loss.numpy()))
        
    print('Epoch {} Loss {:.4f}'.format(epoch+1, total_loss/steps_per_batch))
    print('Total Computation Time Per Epoch: {} sec\n'.format(time.time()-start))

Epoch 1 Batch0 Loss 2.7986


KeyboardInterrupt: 

In [21]:
def evaluate(corpus):
    attention = np.zeros(input_val.shape[1], input_train.shape[1])
    inputs = [en_tokenizer.word_index[i] for i in corpus.split(' ')]
    inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs],
                                                         maxlen=en_word_len,
                                                         padding='post')
    inputs = tf.convert_to_tensor(inputs)

    result = ''

    hidden = [tf.zeros((1, units))]
    enc_out, enc_hidden = encoder(inputs, hidden)
    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([hin_tokenizer.word_index['<start>']], 0)
    for t in range(max_length_targ):
        predictions, dec_hidden, attention_weights = decoder(dec_input,dec_hidden,enc_out)
    attention_weights = tf.reshape(attention_weights, (-1, ))
    attention_plot[t] = attention_weights.numpy()

    predicted_id = tf.argmax(predictions[0]).numpy()

    result += targ_lang.index_word[predicted_id] + ' '

    if hin_tokenizer.index_word[predicted_id] == '<end>':
        return result, sentence, attention_plot

    # the predicted ID is fed back into the model
    dec_input = tf.expand_dims([predicted_id], 0)
    return result, sentence, attention_plot 

In [22]:
def translate(corpus):
    result, corpus, attention_plot = evaluate(corpus)
    print('Input: %s' % (sentence))
    print('Predicted translation: {}'.format(result))
    attention_plot = attention_plot[:len(result.split(' ')), :len(sentence.split(' '))]

In [23]:
raw[raw['source']=='tides']

Unnamed: 0,source,english_sentence,hindi_sentence
5,tides,The then Governor of Kashmir resisted transfer...,कश्मीर के तत्कालीन गवर्नर ने इस हस्तांतरण का व...
9,tides,You may want your child to go to a school that...,हो सकता है कि आप चाहते हों कि आप का नऋर्नमेनटे...
10,tides,Please ensure that you use the appropriate form .,कृपया यह सुनिश्चित कर लें कि आप सही फॉर्म का प...
14,tides,The first two were found unreliable and the pr...,पहले दो को अविश्वसनीय मानकर बाकी पांच मुखबिरों...
15,tides,They had justified their educational policy of...,कम संख़्या वाले उच्च एवं मध्यम श्रेणी के लोगों...
...,...,...,...
127599,tides,About 85 per cent of our tea production was di...,हमारे चाय उत्पादन का 85 प्रतिशत सीधे ब्रिटेन क...
127600,tides,Certain other tests may be necessary for speci...,विशेष स्थितियों में कुछ अन्य परीक्षणों की भी ज...
127601,tides,It is now widening its boundaries and it may i...,अब यह अपनी सीमाओं को फैला रहा है और हो सकता है...
127604,tides,"As for the other derivatives of sulphur , the ...","जहां तक गंधक के अन्य उत्पादों का प्रश्न है , द..."
