In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import keras
import tensorflow as tf
from keras.layers import SimpleRNN,LSTM,GRU,Embedding,Dense,Dropout,Input
from tensorflow.keras.optimizers import Adam,Nadam
from keras import Model
# Switch on eager execution through the TF1 compatibility API; this runs before
# any model is loaded or called further down in the notebook.
tf.compat.v1.enable_eager_execution()

In [2]:
# Mount Google Drive at /content/drive; the dataset and saved-model paths used
# later in this notebook all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [39]:
def tok_map(data):
    """Build character vocabularies and maximum word lengths for both languages.

    Every target word is wrapped as ``"\\t" + word + "\\n"`` before it is
    measured or scanned, so the tab and newline markers are part of the target
    vocabulary and are counted in the target maximum length.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with an ``'en'`` (source) and a ``'hi'`` (target) column of strings.

    Returns
    -------
    tuple
        ``(source_tokens, s_tok_map, s_max_len, tar_tokens, t_tok_map, t_max_len)``
        where the ``*_tokens`` are sorted lists of distinct characters, the
        ``*_tok_map`` dicts map each character to its 1-based position in that
        list (with ``" "`` forced to index 0), and the ``*_max_len`` values are
        the longest word lengths.
    """
    source = data['en'].values
    target = '\t' + data['hi'].values + '\n'

    s_max_len = max([len(word) for word in source])
    t_max_len = max([len(word) for word in target])

    # Distinct characters, sorted so that the index assignment is deterministic.
    source_tokens = sorted({symbol for word in source for symbol in word})
    s_tok_map = {symbol: position for position, symbol in enumerate(source_tokens, start=1)}
    # The space character always gets index 0, overriding any index it received above.
    s_tok_map[" "] = 0

    tar_tokens = sorted({symbol for word in target for symbol in word})
    t_tok_map = {symbol: position for position, symbol in enumerate(tar_tokens, start=1)}
    t_tok_map[" "] = 0

    return source_tokens, s_tok_map, s_max_len, tar_tokens, t_tok_map, t_max_len

def dataLoad(path):
    """Read a tab-separated lexicon file and return its Hindi/English word pairs.

    The file is parsed without a header row; its three columns are named
    ``'hi'``, ``'en'`` and ``''``, and the third column is discarded.

    Parameters
    ----------
    path : str
        Location of the tab-separated file.

    Returns
    -------
    pandas.DataFrame
        Columns ``['hi', 'en']``, with every row that is missing either value
        removed. The original row index is kept (it is not reset).
    """
    # Open explicitly as UTF-8: the file holds Devanagari text, and relying on
    # the platform default encoding breaks on systems where it is not UTF-8.
    with open(path, encoding="utf-8") as dataFile:
        dataset = pd.read_csv(dataFile,sep='\t',header=None,names=["hi","en",""],skip_blank_lines=True,index_col=None)
    # NOTE(review): with pandas' default NA parsing, words spelled like "nan",
    # "null" or "NA" are presumably read as missing and dropped here — confirm
    # that losing such rows is intended.
    dataset = dataset.dropna(subset=['hi', 'en'])
    return dataset[['hi', 'en']]

def dataProcess(data):
    """Encode word pairs as padded index arrays for the encoder and decoder.

    Reads the module-level vocabulary objects returned by ``tok_map``
    (``s_tok_map``, ``t_tok_map``, ``tar_tokens``, ``s_max_len``, ``t_max_len``).
    Each target word is wrapped as ``"\\t" + word + "\\n"`` before encoding.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with an ``'en'`` (source) and a ``'hi'`` (target) column of strings.

    Returns
    -------
    enc_inp : numpy.ndarray, float32, shape (n_words, s_max_len)
        Source character indices, padded with the index of ``" "``.
    dec_inp : numpy.ndarray, float32, shape (n_words, t_max_len)
        Target character indices (including the leading tab), padded likewise.
    dec_tar : numpy.ndarray, int, shape (n_words, t_max_len, len(tar_tokens) + 1)
        One-hot targets: ``dec_inp`` shifted one step to the left, with every
        remaining timestep marked as the padding character.

    Raises
    ------
    KeyError
        If a word contains a character that is absent from the vocabulary.
    IndexError
        If a word is longer than ``s_max_len`` / ``t_max_len``.
    """
    src,tar = data['en'].values, data['hi'].values
    tar = "\t" + tar + "\n"

    source_pad = s_tok_map[" "]
    target_pad = t_tok_map[" "]

    enc_inp = np.zeros((len(src), s_max_len), dtype="float32")
    dec_inp = np.zeros((len(tar), t_max_len), dtype="float32")
    dec_tar = np.zeros((len(tar), t_max_len, len(tar_tokens)+1), dtype="int")

    for i, (sw, tw) in enumerate(zip(src, tar)):
        for j, ch in enumerate(tw):
            dec_inp[i, j] = t_tok_map[ch]
            if j > 0:
                # The target at step j-1 is the input character at step j.
                dec_tar[i, j-1, t_tok_map[ch]] = 1
        dec_inp[i, len(tw):] = target_pad
        # The shifted targets only fill steps 0 .. len(tw)-2, so padding has to
        # start at len(tw)-1. Starting at len(tw), as before, left that step
        # without any one-hot label.
        dec_tar[i, len(tw)-1:, target_pad] = 1

        for j, ch in enumerate(sw):
            enc_inp[i, j] = s_tok_map[ch]
        enc_inp[i, len(sw):] = source_pad

    return enc_inp, dec_inp, dec_tar

In [40]:
#Loading the Datasets from the Drive ----------------------------------------------------------------------
# Directory that holds the three lexicon splits; change it here only.
LEXICON_DIR = "/content/drive/MyDrive/dakshina_dataset_v1.0/hi/lexicons"

train = dataLoad(f"{LEXICON_DIR}/hi.translit.sampled.train.tsv")
# Vocabularies and maximum word lengths are derived from the training split only.
source_tokens, s_tok_map, s_max_len, tar_tokens, t_tok_map, t_max_len = tok_map(train)
dev = dataLoad(f"{LEXICON_DIR}/hi.translit.sampled.dev.tsv")
test = dataLoad(f"{LEXICON_DIR}/hi.translit.sampled.test.tsv")

In [41]:
# Preparing the test data set for evaluating the model: encoder inputs,
# decoder inputs and one-hot decoder targets.
test_encoder_input, test_decoder_input, test_decoder_target= dataProcess(test)

In [42]:
# Invert the character -> index vocabularies so that predicted indices can be
# turned back into characters. Comprehensions keep the loop variables local, so
# the builtin `chr` is no longer shadowed at module level.
enmstok = [(index, char) for char, index in s_tok_map.items()]
revSource_map = dict(enmstok)

enmttok = [(index, char) for char, index in t_tok_map.items()]
revTarg_map = dict(enmttok)

In [8]:
# Loading the saved best configuration model ( Encoder and decoder )-----------------------------
# `enc` and `dec` are read as module-level globals by beam_search and TransLit,
# so this cell has to run before either of those functions is called.
enc = keras.models.load_model('/content/drive/MyDrive/dakshina_dataset_v1.0/best_enc.h5')
dec = keras.models.load_model('/content/drive/MyDrive/dakshina_dataset_v1.0/best_dec.h5')



In [10]:
def beam_search(input, BeamWidth, decl, cell="LSTM"):
    """Decode every row of ``input`` with beam search.

    Uses the module-level ``enc`` and ``dec`` models and the globals
    ``t_tok_map`` and ``t_max_len``. Prints a message when it starts and when
    it finishes.

    Parameters
    ----------
    input : array
        Batch passed to ``enc.predict``; ``input.shape[0]`` is the number of
        words decoded.
    BeamWidth : int
        Number of hypotheses kept per word after every step, and number of
        continuations taken from each decoder output.
    decl : int
        How many times the encoder state(s) are repeated to build the decoder's
        initial state list.
    cell : str
        ``"LSTM"`` treats the encoder output as a pair (elements 0 and 1 are
        used); any other value treats it as a single state.

    Returns
    -------
    list
        For every input row, a list of ``BeamWidth`` sequences of target token
        indices. The start marker is not included. After the search loop the
        sequences are ordered by accumulated negative log score divided by
        sequence length, smallest first.
    """
    print("BeamSearch Starting")
    # Encode the whole batch once.
    stateList = enc.predict(input)
    # Single-token decoder input per row, initialised to the start marker (tab).
    tar_seq = np.zeros((input.shape[0],1))
    tar_seq[:,0] = t_tok_map["\t"]
    # Repeat the encoder state(s) `decl` times to form the decoder's initial states.
    if cell != "LSTM":
      states = []
      for i in range(decl):
          states += [stateList]
    else:
      states = []
      for i in range(decl):
          states += [stateList[0],stateList[1]]

    # First decoder step: output[0] is the per-token score array, the rest are the new states.
    output = dec.predict([tar_seq]+states)
    states = output[1:]
    # Stack the states and move the batch axis first so stateTrArr[i] holds all states of row i.
    stateTrArr = np.asarray(states).transpose([1,0,2])

    # Sorting is ascending, so the last BeamWidth columns are the largest scores
    # and their token indices.
    # NOTE(review): the -log below assumes output[0] holds probabilities — confirm.
    points = np.sort(output[0][:,-1,:],axis=-1)[:,-BeamWidth:]
    beamChar = np.argsort(output[0][:,-1,:],axis=-1)[:,-BeamWidth:]

    inps0 = input.shape[0]
    # seqs[i] is the beam of row i; each hypothesis is a tuple
    # (token list, accumulated -log score, decoder states, finished flag).
    seqs = []
    for i in range(inps0):
      temp = [([chr],-np.log(pt),stateTrArr[i],0) for chr,pt in zip(beamChar[i],points[i])]
      seqs.append(temp)


    # One token was produced above, so at most t_max_len-1 further steps are taken.
    for _ in range(t_max_len-1):
        si_ze = input.shape[0]
        # probable[i] collects the candidate hypotheses of row i for this step.
        probable = []
        for _ in range(si_ze):
            probable.append([])
        for j in range(BeamWidth):
            # Feed the last token and stored states of the j-th hypothesis of every row.
            tar_seq[:,0] = [seqs[i][j][0][-1] for i in range(si_ze)]
            states = list(np.asarray([seqs[i][j][2] for i in range(si_ze)]).transpose([1,0,2]))
            output = dec.predict([tar_seq]+states,batch_size=64)

            stateTrArr = np.asarray(output[1:]).transpose([1,0,2])

            points = np.sort(output[0][:,-1,:],axis=-1)[:,-BeamWidth:]
            beamChar = np.argsort(output[0][:,-1,:],axis=-1)[:,-BeamWidth:]

            for i in range(input.shape[0]):
                # A hypothesis counts as finished when its last token is the end
                # marker (newline) or its flag is 1. The flag is only ever written
                # as 0 in this function, so in practice the end-marker test decides.
                if ( not((seqs[i][j][0][-1] == t_tok_map["\n"]) or (seqs[i][j][3]==1)) ):
                  check = 0
                else:
                  check = 1

                if check != 0:
                    # Finished hypotheses are carried over unchanged.
                    probable[i] += [seqs[i][j]]
                else:
                    # Unfinished hypotheses are extended by each of the BeamWidth best next tokens.
                    probable[i] += [(seqs[i][j][0]+[beamChar[i,rep]], seqs[i][j][1]-np.log(points[i,rep]), stateTrArr[i],check) for rep in range(BeamWidth)]


        # Rank candidates by length-normalised score (smaller is better) ...
        for i in range(si_ze):
            probable[i] = sorted(probable[i],key = lambda var:var[1]/len(var[0]))

        # ... and keep the best BeamWidth of them as the new beam.
        for i in range(si_ze):
            seqs[i] = probable[i][:BeamWidth]

    # Return only the token lists of the surviving hypotheses.
    si_ze = input.shape[0]
    result = []
    for i in range(si_ze):
      result.append([])
    for i in range(si_ze):
        for j in range(BeamWidth):
            result[i].append(seqs[i][j][0])

    print("BeamSearch Complete")
    return result

In [43]:
def TransLit(input, decl, cell="LSTM"):
    """Greedily decode every row of ``input`` one target character at a time.

    Uses the module-level ``enc`` and ``dec`` models and the globals
    ``t_tok_map`` and ``t_max_len``.

    Parameters
    ----------
    input : array
        Batch passed to ``enc.predict``; ``input.shape[0]`` is the number of
        words decoded.
    decl : int
        How many times the encoder state(s) are repeated to build the decoder's
        initial state list.
    cell : str, default "LSTM"
        ``"LSTM"`` treats the encoder output as a pair (elements 0 and 1 are
        used); any other value treats it as a single state.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(input.shape[0], t_max_len)`` holding the
        arg-max token index chosen at each step. Decoding always runs for all
        ``t_max_len`` steps; it does not stop at the end marker.
    """
    n_words = input.shape[0]
    # Single-token decoder input per row, initialised to the start marker (tab).
    tar_seq = np.zeros((n_words, 1))
    tar_seq[:, 0] = t_tok_map["\t"]
    predicted = np.zeros((n_words, t_max_len))

    stateList = enc.predict(input)
    if cell != "LSTM":
        states = [stateList] * decl
    else:
        states = [stateList[0], stateList[1]] * decl

    # Step over the width of `predicted`. The step count was previously
    # recomputed from the global `train` frame on every call, which could
    # disagree with `t_max_len` and index past the end of the array.
    for idx in range(predicted.shape[1]):
        output = dec.predict([tar_seq] + states, batch_size=64)
        states = output[1:]
        predicted[:, idx] = np.argmax(output[0][:, -1, :], axis=1)
        # The chosen token becomes the decoder input of the next step.
        tar_seq[:, 0] = predicted[:, idx]

    return predicted

In [12]:
def getWords(predArr):
    """Turn beam-search output into rows of readable words.

    Reads the module-level ``test_encoder_input``, ``test_decoder_input``,
    ``revSource_map`` and ``revTarg_map``. Row ``k`` of ``predArr`` is paired
    with row ``k`` of the test arrays.

    Parameters
    ----------
    predArr : sequence
        For every test word, a sequence of candidate token-index sequences.

    Returns
    -------
    resB : list of list of str
        ``[source word, reference word, candidate 0, candidate 1, ...]`` per row.
    resI : list of list of str
        ``[source word, reference word, candidate 0]`` per row.
    """
    def _decode(indices, index_to_char, stop_char):
        # Concatenate characters until the first occurrence of stop_char.
        pieces = []
        for index in indices:
            symbol = index_to_char[index]
            if symbol == stop_char:
                break
            pieces.append(symbol)
        return "".join(pieces)

    resB, resI = [], []
    for row, candidates in enumerate(predArr):
        # The source word ends at the first padding character.
        english = _decode(test_encoder_input[row], revSource_map, " ")
        # Skip the leading start marker of the decoder input, stop at the end marker.
        reference = _decode(test_decoder_input[row, 1:], revTarg_map, "\n")

        best = _decode(candidates[0], revTarg_map, "\n")
        resI.append([english, reference, best])

        decoded = [_decode(candidate, revTarg_map, "\n") for candidate in candidates]
        resB.append([english, reference] + decoded)
    return resB, resI

In [48]:
# Predicting the input words using beam search 

pred_beam = beam_search(test_encoder_input, BeamWidth=5, decl=2, cell="LSTM")

BeamSearch Starting
BeamSearch Complete


In [45]:
# Predicting the input words using inference prediction

pred = TransLit(test_encoder_input, decl=2, cell="LSTM")

In [46]:
# Print five randomly chosen test words with their greedy prediction and reference.
for _ in range(5):
    idx = np.random.choice(test_encoder_input.shape[0])

    # Index sequences of the sampled word
    sword = test_encoder_input[idx]
    pword = pred[idx]
    rword = test_decoder_input[idx]

    # Source word: stop at the first padding character so the padding is not
    # printed (the source vocabulary has no "\n" to stop on).
    original = ""
    for c in sword:
        if revSource_map[c] == " ":
            break
        original += revSource_map[c]
    print("English word:",original)

    # Predicted word: stop before the end marker so it is not printed.
    decoded = ""
    for c in pword:
        if revTarg_map[c] == "\n":
            break
        decoded += revTarg_map[c]
    print("Predicted word:", decoded)

    # Reference word: skip the leading start marker and stop before the end marker.
    real = ""
    for c in rword[1:]:
        if revTarg_map[c] == "\n":
            break
        real += revTarg_map[c]

    print("Hindi original:",real)
    print("----x----x----x----x----")

English word: phaaph              
Predicted word: फाफ

Hindi original: फाफ

----x----x----x----x----
English word: vidyaen             
Predicted word: विद्याएं

Hindi original: विद्याएं

----x----x----x----x----
English word: narayangadh         
Predicted word: नारायणगढ़

Hindi original: नारायणगढ़

----x----x----x----x----
English word: udar                
Predicted word: उदार

Hindi original: उदर

----x----x----x----x----
English word: duwaidhaayein       
Predicted word: दुवाहिदाएं

Hindi original: दुविधाएं

----x----x----x----x----


In [47]:
# Word-level exact-match accuracy of the greedy predictions: a word counts as
# correct when every predicted token up to and including the end marker equals
# the target token at the same position.
target_ids = np.argmax(test_decoder_target, axis=-1)  # one arg-max over the whole array instead of one per character
end_token = t_tok_map["\n"]

acc = 0
for pred_row, target_row in zip(pred, target_ids):
    is_match = True
    for pred_tok, target_tok in zip(pred_row, target_row):
        if pred_tok != target_tok:
            is_match = False
            break
        if pred_tok == end_token:
            break
    if is_match:
        acc += 1

# Guard against an empty prediction set instead of dividing by zero.
accuracy = acc/len(pred) if len(pred) else 0.0
print(f"Accuracy on the Test Data = {accuracy} => {round(accuracy*100,2)}%")

Accuracy on the Test Data = 0.3756108396268325 => 37.56%


In [49]:
# ansB: source word, reference word and every beam candidate per test word;
# ansI: source word, reference word and only the first beam candidate.
ansB, ansI = getWords(pred_beam)

In [50]:
# Each row of ansB is [English, Hindi_Real, *beam candidates]; derive the number
# of prediction columns from the rows instead of hard-coding the beam width
# (5 is kept only as the fallback for an empty result).
n_pred_cols = len(ansB[0]) - 2 if ansB else 5
df = pd.DataFrame(ansB,columns=['English','Hindi_Real']+[f'Pred_{i}' for i in range(n_pred_cols)])
df.sample(n=5)

Unnamed: 0,English,Hindi_Real,Pred_0,Pred_1,Pred_2,Pred_3,Pred_4
2070,nayak,नायक,नायक,नायक़,नयक,नयाक,न्यक
3300,mohani,मोहानी,मोहनी,मोहानी,मोहानि,मौहनी,मोहनि
599,asfalt,एस्फाल्ट,असफाल्ट,अस्फाल्ट,असफालत,असफॉल्ट,असफलत
354,aavritti,आवृत्ति,आवृतिति,आयृतिति,आविर्तिति,आवृति,आवृत्ति
100,anukulata,अनुकूलता,अनुकूलता,अनुकुलता,अनुुकूलता,आनुकूलता,अनुखूलता


In [51]:
# ansI comes from getWords(pred_beam), so 'Hindi_pred' is the first beam-search
# candidate of each word, not the output of TransLit.
df1 = pd.DataFrame(ansI,columns=['English','Hindi_Real','Hindi_pred'])
df1.sample(n=5)

Unnamed: 0,English,Hindi_Real,Hindi_pred
895,claris,क्लेरिस,क्लारिस
2996,bhula,भूला,भुला
788,kaalpi,कालपी,कालपी
2126,neechaaee,नीचाई,नीचाई
1299,zarda,जर्दा,जर्दा


In [52]:
# Save the source / reference / first-candidate table; the path is relative, so
# the file is written to the current working directory.
df1.to_csv('predictions_vanilla.csv')