In [1]:
from tensorflow.keras.models import Model
from tensorflow.keras import models,preprocessing
from tensorflow.keras.utils import plot_model,to_categorical
from tensorflow.keras.layers import Input,LSTM,Dense,Embedding
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import pickle


In [2]:

# Containers filled by the data-loading cell: the source/target strings (kept in
# matching order) and the set of distinct characters seen on each side.
input_texts, target_texts = [], []
input_characters, target_characters = set(), set()

In [3]:
# Read the tab-separated transliteration pairs, one "source<TAB>target" pair per line.
with open('transliteration_dataset.txt', 'r', encoding='utf-8') as f:
    rows = f.read().split('\n')

# Only the first 30823 rows of the file are used.
for row in rows[0:30823]:
    if not row:
        # A blank line (e.g. the empty string after a trailing newline) cannot be
        # unpacked into two fields; skip it instead of failing with ValueError.
        continue
    input_text, target_text = row.split('\t')
    # '\t' is the start marker and '\n' the end marker: decode_sequence seeds the
    # decoder with '\t' and stops once '\n' is produced.
    target_text = '\t' + target_text + '\n'
    # Lowercase each string once and reuse the result for both the corpus lists
    # and the character vocabularies.
    input_lower, target_lower = input_text.lower(), target_text.lower()
    input_texts.append(input_lower)
    target_texts.append(target_lower)
    input_characters.update(input_lower)
    target_characters.update(target_lower)
      

In [4]:
# Turn the character sets into sorted lists so each character has a fixed position.
input_characters = sorted(input_characters)
target_characters = sorted(target_characters)

In [5]:

# Vocabulary sizes; these are the feature widths of the encoder and decoder inputs.
num_en_chars, num_dec_chars = len(input_characters), len(target_characters)

In [6]:

# Length (in characters) of the longest source and target string; shorter
# sequences are padded up to these lengths when the corpus is vectorised.
max_input_length = max(map(len, input_texts))
max_target_length = max(map(len, target_texts))

In [7]:
def bagofcharacters(input_texts,target_texts):
    """One-hot encode paired source/target strings for seq2seq training.

    Every character is turned into a binary row by a character-level
    ``CountVectorizer`` fitted on the module-level ``input_characters`` (source
    side) or ``target_characters`` (target side). Sequences are padded up to the
    module-level ``max_input_length`` / ``max_target_length``.

    Args:
        input_texts: iterable of source strings.
        target_texts: iterable of target strings, paired with ``input_texts`` by
            position.

    Returns:
        Tuple ``(en_in_data, dec_in_data, dec_tr_data)`` of float32 NumPy arrays:
        the encoder input, the decoder input, and the decoder target. The decoder
        target is built from the target string with its first character dropped.
    """
    # Fit one vectorizer per vocabulary, once. The vocabularies do not change
    # between samples, so re-fitting inside the loop was pure overhead. No
    # ``tokenizer`` is passed because it is not used when analyzer='char'.
    input_cv = CountVectorizer(binary=True, stop_words=None, analyzer='char').fit(input_characters)
    target_cv = CountVectorizer(binary=True, stop_words=None, analyzer='char').fit(target_characters)

    # Padding rows: a one-hot vector at index 0 on the encoder side and at index 2
    # on the decoder side.
    # NOTE(review): index 2 is presumably the space character in the sorted target
    # vocabulary ('\t', '\n', ' ', ...) -- TODO confirm. Requires at least three
    # target characters.
    pad_en = [1] + [0] * (len(input_characters) - 1)
    pad_dec = [0] * len(target_characters)
    pad_dec[2] = 1

    en_in_data, dec_in_data, dec_tr_data = [], [], []
    for input_t, target_t in zip(input_texts, target_texts):
        en_rows = input_cv.transform(list(input_t)).toarray().tolist()
        dec_rows = target_cv.transform(list(target_t)).toarray().tolist()
        # Decoder target: same characters as the decoder input minus the first one.
        tr_rows = target_cv.transform(list(target_t)[1:]).toarray().tolist()

        # A non-positive repeat count yields an empty list, so sequences that are
        # already at (or beyond) the maximum length are left unpadded.
        en_rows.extend([pad_en] * (max_input_length - len(input_t)))
        dec_rows.extend([pad_dec] * (max_target_length - len(target_t)))
        # One extra padding row compensates for the dropped first character.
        tr_rows.extend([pad_dec] * (max_target_length - len(target_t) + 1))

        en_in_data.append(en_rows)
        dec_in_data.append(dec_rows)
        dec_tr_data.append(tr_rows)

    en_in_data = np.array(en_in_data, dtype="float32")
    dec_in_data = np.array(dec_in_data, dtype="float32")
    dec_tr_data = np.array(dec_tr_data, dtype="float32")

    return en_in_data, dec_in_data, dec_tr_data

In [8]:

# --- Encoder: one-hot source characters -> final LSTM hidden/cell state ---------
en_inputs = Input(shape=(None, num_en_chars))
encoder = LSTM(256, return_state=True, return_sequences=True)
# Call the encoder layer exactly once; repeating the call on the same input only
# adds redundant inbound nodes to the layer.
en_outputs, state_h, state_c = encoder(en_inputs)
en_states = [state_h, state_c]

# --- Decoder: one-hot target characters, initialised with the encoder states ----
dec_inputs = Input(shape=(None, num_dec_chars))
# NOTE(review): this embedding output is never passed to the decoder LSTM or to
# the Model below, so it is not part of the trained network. Kept only so the
# `decoder_embedding` name stays defined -- consider deleting it.
decoder_embedding = Embedding(num_dec_chars, 256, mask_zero=True)(dec_inputs)
dec_lstm = LSTM(256, return_sequences=True, return_state=True)
dec_outputs, _, _ = dec_lstm(dec_inputs, initial_state=en_states)
# Per-timestep probability distribution over the target vocabulary.
dec_dense = Dense(num_dec_chars, activation="softmax")
dec_outputs = dec_dense(dec_outputs)

model = Model([en_inputs, dec_inputs], dec_outputs)

# Persist the vocabulary metadata that the inference helpers read back.
# The context manager guarantees the file is flushed and closed.
with open("training_data_transliteration.pkl", "wb") as metadata_file:
    pickle.dump({'input_characters':input_characters,'target_characters':target_characters,
                 'max_input_length':max_input_length,'max_target_length':max_target_length,
                 'num_en_chars':num_en_chars,'num_dec_chars':num_dec_chars}, metadata_file)


In [None]:
# Vectorise the whole corpus into padded one-hot arrays:
# encoder input, decoder input and decoder target.
en_in_data, dec_in_data, dec_tr_data = bagofcharacters(input_texts, target_texts)



In [25]:

# Configure the optimiser, loss and reported metric.
model.compile(
    loss="categorical_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)

# Fit on the (encoder input, decoder input) pair against the decoder target;
# 20% of the samples are held out for validation.
history = model.fit(
    x=[en_in_data, dec_in_data],
    y=dec_tr_data,
    epochs=5,
    batch_size=1,
    validation_split=0.2,
)

# Save the trained network; decode_sequence loads it back from this path.
model.save("s2s_transliteration")

model.summary()

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 5/5




Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, None, 27)]   0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, None, 68)]   0                                            
__________________________________________________________________________________________________
lstm (LSTM)                     [(None, None, 256),  290816      input_1[0][0]                    
__________________________________________________________________________________________________
lstm_1 (LSTM)                   [(None, None, 256),  332800      input_2[0][0]                    
                                                                 lstm[2][1]                   

In [None]:
 def decode_sequence(input_seq):
     """Greedily decode one encoded source sequence into a target string.

     Loads the vocabulary metadata from ``training_data_transliteration.pkl`` and
     the trained network from ``s2s_transliteration``, splits the network into an
     encoder model and a single-step decoder model, then emits one character per
     step (arg-max of the decoder output), starting from the tab start marker.

     Args:
         input_seq: encoder input handed directly to ``predict``; presumably the
             array returned by ``bagofcharacter`` -- TODO confirm.

     Returns:
         The decoded string. Decoding stops right after a newline is emitted (the
         newline is part of the result) or once the string is longer than
         ``max_target_length``.
     """
     # NOTE(security): pickle.load can execute arbitrary code; only load files
     # written by this notebook. The context manager closes the file handle.
     with open("training_data_transliteration.pkl", "rb") as metadata_file:
         datafile = pickle.load(metadata_file)
     target_characters = datafile['target_characters']
     max_target_length = datafile['max_target_length']
     # Take the vocabulary size from the pickle rather than from a notebook
     # global, so the function also works in a kernel where training never ran.
     num_dec_chars = datafile['num_dec_chars']

     # NOTE: the network is reloaded and the inference models rebuilt on every call.
     model = models.load_model("s2s_transliteration")

     # Layer indices assume the order: two inputs, encoder LSTM, decoder LSTM, dense.
     # Encoder inference model: source sequence -> [hidden state, cell state].
     enc_outputs, state_h_enc, state_c_enc = model.layers[2].output
     en_model = Model(model.input[0], [state_h_enc, state_c_enc])

     # Decoder inference model: (current character, states) -> (probabilities, new states).
     dec_state_input_h = Input(shape=(256,), name="input_6")
     dec_state_input_c = Input(shape=(256,), name="input_5")
     dec_states_inputs = [dec_state_input_h, dec_state_input_c]
     dec_lstm = model.layers[3]
     dec_outputs, state_h_dec, state_c_dec = dec_lstm(model.input[1], initial_state=dec_states_inputs)
     dec_states = [state_h_dec, state_c_dec]
     dec_dense = model.layers[4]
     dec_outputs = dec_dense(dec_outputs)
     dec_model = Model([model.input[1]] + dec_states_inputs, [dec_outputs] + dec_states)

     reverse_target_char_index = dict(enumerate(target_characters))
     states_value = en_model.predict(input_seq)

     # Seed the decoder with the one-hot encoding of the tab start marker. No
     # ``tokenizer`` is passed because it is not used when analyzer='char'.
     co = CountVectorizer(binary=True, stop_words=None, analyzer='char').fit(target_characters)
     target_seq = np.array([co.transform(list("\t")).toarray().tolist()], dtype="float32")

     stop_condition = False
     decoded_sentence = ""
     while not stop_condition:
         output_chars, h, c = dec_model.predict([target_seq] + states_value)
         # Greedy choice: most probable character at the last timestep.
         char_index = np.argmax(output_chars[0, -1, :])
         text_char = reverse_target_char_index[char_index]
         decoded_sentence += text_char
         if text_char == "\n" or len(decoded_sentence) > max_target_length:
             stop_condition = True
         # Feed the character just produced, and the updated states, into the next step.
         target_seq = np.zeros((1, 1, num_dec_chars))
         target_seq[0, 0, char_index] = 1.0
         states_value = [h, c]
     return decoded_sentence

In [None]:
def bagofcharacter(input_t):
    """One-hot encode a single source string as encoder input.

    The vocabulary and maximum length are read from
    ``training_data_transliteration.pkl``.

    Args:
        input_t: the source string to encode, one row per character.

    Returns:
        float32 NumPy array holding a single sequence (leading axis of size 1).
        Strings shorter than ``max_input_length`` are padded up to that length;
        longer strings are returned unpadded and untruncated.
    """
    # NOTE(security): pickle.load can execute arbitrary code; only load files
    # written by this notebook. The context manager closes the file handle.
    with open("training_data_transliteration.pkl", "rb") as metadata_file:
        datafile = pickle.load(metadata_file)
    input_characters = datafile['input_characters']
    max_input_length = datafile['max_input_length']

    # No ``tokenizer`` is passed because it is not used when analyzer='char'.
    cv_inp = CountVectorizer(binary=True, stop_words=None, analyzer='char').fit(input_characters)
    rows = cv_inp.transform(list(input_t)).toarray().tolist()

    # Padding row: one-hot vector at index 0 of the input vocabulary.
    pad_en = [1] + [0] * (len(input_characters) - 1)
    # A non-positive repeat count yields an empty list, i.e. no padding.
    rows.extend([pad_en] * (max_input_length - len(input_t)))

    return np.array([rows], dtype="float32")

In [None]:
def listToString(s):
    """Concatenate the strings in ``s`` into one string.

    Args:
        s: iterable of strings (a plain string works too and is returned unchanged
            in content).

    Returns:
        All elements joined in order with no separator; ``""`` for an empty iterable.

    Raises:
        TypeError: if an element is not a string.
    """
    # str.join builds the result in a single pass instead of repeated concatenation.
    return "".join(s)

In [None]:
def entry():
    """Prompt for an English sentence and print its word-by-word transliteration.

    The sentence is split on single spaces; each word is lowercased, encoded with
    ``bagofcharacter`` and decoded with ``decode_sequence``. The running result is
    printed after every word and once more at the end. Returns ``None``.
    """
    sentence = input( 'Enter eng sentence : ' )
    output_texts = ""

    for word in sentence.split(' '):
        if not word:
            # Leading, trailing or repeated spaces produce empty tokens; skip them
            # rather than sending a bare "." through the model.
            continue
        # NOTE(review): the "." suffix presumably mirrors how the training inputs
        # end -- TODO confirm against the dataset.
        en_in_data = bagofcharacter(word.lower() + ".")
        output_texts += " " + decode_sequence(en_in_data)
        print(output_texts)

    print(output_texts)

In [None]:
# Module-level string; entry() accumulates into its own local variable of the same name.
output_texts = ""

# Start the interactive prompt.
entry()