<a href="https://colab.research.google.com/github/rajashekar/colab/blob/main/Transliterate.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np

In [3]:
%cd /content/drive/MyDrive/Colab/Transliterate/

/content/drive/MyDrive/Colab/Transliterate


In [4]:
%ls

players_with_hindi_names.json


In [5]:
df = pd.read_json('players_with_hindi_names.json')

In [6]:
df.head()

Unnamed: 0,id,hindi_name,hindi_long_name,english_name,gender,date_of_birth,country_id,slug
0,90143,आर गुरबाज़,रहमानउल्लाह गुरबाज़,Rahmanullah Gurbaz,M,"{'year': 2001, 'month': 11, 'date': 28}",40,rahmanullah-gurbaz
1,54018,एम रिज़वान,मोहम्मद रिज़वान,Mohammad Rizwan,M,"{'year': 1992, 'month': 6, 'date': 1}",7,mohammad-rizwan
2,47686,एस मक़सूद,सोहेब मक़सूद,Sohaib Maqsood,M,"{'year': 1987, 'month': 4, 'date': 15}",7,sohaib-maqsood
3,53649,आर आर रुसो,राइली रुसो,"Rossouw, RR",M,"{'year': 1989, 'month': 10, 'date': 9}",3,rilee-rossouw
4,72393,एस हेटमायर,शिमरॉन हेटमायर,"Hetmyer, SO",M,"{'year': 1996, 'month': 12, 'date': 26}",4,shimron-hetmyer


In [7]:
input_list =  df['hindi_long_name'].tolist()
target_list =  df['english_name'].tolist()

In [8]:
print(input_list[0])
print(target_list[0])

रहमानउल्लाह गुरबाज़
Rahmanullah Gurbaz


In [79]:
def isEnglish(s):
  try:
      s.encode(encoding='utf-8').decode('ascii')
  except UnicodeDecodeError:
      return False
  else:
      return True


# Build Vocabulary
input_vocab = set()
target_vocab = set()
for input_word, target_word in zip(input_list, target_list):
  # there are some target words in hindi
  if isEnglish(target_word): 
    # there are 3 ways names are given
    # 1. Firstname lastname  Ex ('रहमानउल्लाह गुरबाज़', 'Rahmanullah Gurbaz')
    # 2. Lastname, Firstname Initials  Ex ('राइली रुसो', 'Rossouw, RR')
    # 3. Lastname. Firstname Initials  Ex ('राइली रुसो', 'Rossouw.RR')
    # if comma exists then discard initials
    if "," in target_word or "." in target_word:
      hln = input_word.split()[1]
      if "," in target_word:
        eln = target_word.lower().split(',')[0]
      else:
        eln = target_word.lower().split('.')[0]
      input_vocab.update(set(hln))
      target_vocab.update(set(eln))
    else:
      input_names = input_word.split()
      target_names = target_word.split()
      # For example - 'लिटन कुमार दास' != Liton Das, remove middle name
      if len(input_names) > len(target_names):
        del input_names[1] # delete middle name
      for idx in range(len(input_names)):
        hn = input_names[idx]
        en = target_names[idx].lower()
        input_vocab.update(set(hn))
        target_vocab.update(set(en))

In [80]:
print(f"Total hindi characters {len(input_vocab)}")
print(f"Total english characters {len(target_vocab)}")

Total hindi characters 53
Total english characters 29


In [81]:
input_vocab = sorted(list(input_vocab))
target_vocab = sorted(list(target_vocab))

In [82]:
input_token_index = dict([(char, i) for i, char in enumerate(input_vocab)])
target_token_index = dict([(char, i) for i, char in enumerate(target_vocab)])

In [83]:
num_encoder_tokens = len(input_vocab)
num_decoder_tokens = len(target_vocab)

In [84]:
max_encoder_seq_length = max([len(txt) for txt in input_list])
max_decoder_seq_length = max([len(txt) for txt in target_list])

In [85]:
print('Number of samples:', len(input_list))
print('Number of unique input tokens:', num_encoder_tokens)
print('Number of unique output tokens:', num_decoder_tokens)
print('Max sequence length for inputs:', max_encoder_seq_length)
print('Max sequence length for outputs:', max_decoder_seq_length)

Number of samples: 2512
Number of unique input tokens: 53
Number of unique output tokens: 29
Max sequence length for inputs: 20
Max sequence length for outputs: 26


In [86]:
encoder_input_data = np.zeros((len(input_list), max_encoder_seq_length, num_encoder_tokens), dtype='float32')
decoder_input_data = np.zeros((len(input_list), max_decoder_seq_length, num_decoder_tokens), dtype='float32')
decoder_target_data = np.zeros( (len(input_list), max_decoder_seq_length, num_decoder_tokens), dtype='float32')

In [87]:
print(f"encoder input shape {encoder_input_data.shape}")
print(f"decoder input shape {decoder_input_data.shape}")
print(f"decoder target shape {decoder_target_data.shape}")

encoder input shape (2512, 20, 53)
decoder input shape (2512, 26, 29)
decoder target shape (2512, 26, 29)


In [88]:
def populate_data(i, input_text, target_text):
  for t, char in enumerate(input_text):
      encoder_input_data[i, t, input_token_index[char]] = 1.
  for t, char in enumerate(target_text):
      # decoder_target_data is ahead of decoder_input_data by one timestep
      decoder_input_data[i, t, target_token_index[char]] = 1.
      if t > 0:
          # decoder_target_data will be ahead by one timestep
          # and will not include the start character.
          decoder_target_data[i, t - 1, target_token_index[char]] = 1.

for i, (input_text, target_text) in enumerate(zip(input_list, target_list)):
  if isEnglish(target_text): 
    if "," in target_text or "." in target_text:
      hln = input_word.split()[1]
      if "," in target_word:
        eln = target_word.lower().split(',')[0]
      else:
        eln = target_word.lower().split('.')[0]
      populate_data(i, hln, eln)
    else:
      input_names = input_word.split()
      target_names = target_word.split()
      # For example - 'लिटन कुमार दास' != Liton Das, remove middle name
      if len(input_names) > len(target_names):
        del input_names[1] # delete middle name
      for idx in range(len(input_names)):
        hn = input_names[idx]
        en = target_names[idx].lower()
        populate_data(i, hn, en)

In [89]:
from keras.models import Model
from keras.layers import Input, LSTM, Dense

# Define an input sequence and process it.
encoder_inputs = Input(shape=(None, num_encoder_tokens))

In [90]:
latent_dim = 256  # Latent dimensionality of the encoding space.

encoder = LSTM(latent_dim, return_state=True)

In [91]:
encoder_outputs, state_h, state_c = encoder(encoder_inputs)

In [92]:
# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]

In [93]:
# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = Input(shape=(None, num_decoder_tokens))

In [94]:
# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the
# return states in the training model, but we will use them in inference.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)

In [95]:
decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)

In [96]:
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

In [97]:
# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [98]:
# Training
batch_size = 64  # Batch size for training.
epochs = 10  # Number of epochs to train for.

model.compile(optimizer='rmsprop', loss='categorical_crossentropy')
model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
          batch_size=batch_size, epochs=epochs, validation_split=0.2)
# Save model
model.save('s2s.h5')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [99]:
%ls -ltr

total 5665
-rw------- 1 root root  813709 Sep 14 22:38 players_with_hindi_names.json
-rw------- 1 root root 4986208 Sep 15 00:40 s2s.h5


In [100]:
!du -sh s2s.h5

4.8M	s2s.h5


In [101]:
input_data = np.zeros((1, max_encoder_seq_length, num_encoder_tokens), dtype='float32')
decoder_data = np.zeros((1, max_decoder_seq_length, num_decoder_tokens), dtype='float32')
test_input = 'राजशेखर'
for t, char in enumerate(test_input):
  input_data[0, t, input_token_index[char]] = 1.

In [102]:
input_data.shape

(1, 20, 53)

In [103]:
decoder_data.shape

(1, 26, 29)

In [104]:
prediction = model.predict([input_data, decoder_data])

In [105]:
prediction.shape

(1, 26, 29)

In [106]:
target_vocab[np.argmax(prediction[0, 0])]

'o'

In [107]:
t_txt = list()
for i in range(prediction.shape[1]):
  t_txt.append(target_vocab[np.argmax(prediction[0, i])])

In [108]:
''.join(t_txt)

'ohammad  raaaaaaaaaaaaaaaa'