In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import keras
import tensorflow as tf
from keras.layers import SimpleRNN,LSTM,GRU,Embedding,Dense,Dropout,Input
from tensorflow.keras.optimizers import Adam,Nadam
from keras import Model
tf.compat.v1.enable_eager_execution()

In [3]:
# Colab-only: mount Google Drive at /content/drive so that the dataset paths
# under /content/drive/MyDrive used by the loading cell resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
def tok_map(data):
    """Build character vocabularies for the source and target columns.

    The 'en' column is used as the source and the 'hi' column as the target.
    Every target word is wrapped with a leading tab (start marker) and a
    trailing newline (end marker) before its characters are collected.

    Returns a 6-tuple:
        source_tokens -- sorted list of distinct source characters
        s_tok_map     -- source char -> index (1-based; the space char is 0)
        s_max_len     -- length of the longest source word
        tar_tokens    -- sorted list of distinct target characters
        t_tok_map     -- target char -> index (1-based; the space char is 0)
        t_max_len     -- length of the longest wrapped target word

    Raises ValueError (from max) when the frame has no rows.
    """
    source = data['en'].values
    target = '\t' + data['hi'].values + '\n'

    s_max_len = max([len(word) for word in source])
    t_max_len = max([len(word) for word in target])

    # Source vocabulary: indices start at 1 so that 0 is free for padding.
    source_tokens = sorted({symbol for word in source for symbol in word})
    s_tok_map = {symbol: idx for idx, symbol in enumerate(source_tokens, start=1)}
    s_tok_map[" "] = 0

    # Target vocabulary, including the tab/newline start and end markers.
    tar_tokens = sorted({symbol for word in target for symbol in word})
    t_tok_map = {symbol: idx for idx, symbol in enumerate(tar_tokens, start=1)}
    t_tok_map[" "] = 0

    return source_tokens, s_tok_map, s_max_len, tar_tokens, t_tok_map, t_max_len

def dataLoad(path):
    """Load a tab-separated lexicon file into a two-column DataFrame.

    The file is read without a header; its first column is named 'hi', the
    second 'en', and a third unnamed column is read and then discarded
    (presumably the per-pair count in the Dakshina lexicon files -- confirm).

    Parameters:
        path -- location of the .tsv file.

    Returns:
        DataFrame with columns ['hi', 'en'], keeping only rows where both
        values are present. The original row index is preserved.

    NOTE(review): pandas' default NA parsing turns literal cells such as
    "nan", "null" or "NA" into missing values, so words spelled that way are
    dropped by the filter below. Confirm this is intended before changing it,
    since it alters the dataset size.
    """
    # The file contains Devanagari text; decode explicitly as UTF-8 instead of
    # relying on the platform's default encoding.
    with open(path, encoding="utf-8") as dataFile:
        dataset = pd.read_csv(
            dataFile,
            sep='\t',
            header=None,
            names=["hi","en",""],
            skip_blank_lines=True,
            index_col=None,
        )
    # Keep only complete pairs, then drop the unused third column.
    return dataset.dropna(subset=['hi','en'])[['hi','en']]

def dataProcess(data):
    """Encode a DataFrame of word pairs into padded index/one-hot arrays.

    Relies on the notebook-level globals s_tok_map, t_tok_map, s_max_len,
    t_max_len and tar_tokens (produced by tok_map on the training data).

    Parameters:
        data -- DataFrame with an 'en' (source) and a 'hi' (target) column.

    Returns (enc_inp, dec_inp, dec_tar):
        enc_inp -- float32 array (rows, s_max_len) of source character ids
        dec_inp -- float32 array (rows, t_max_len) of target character ids,
                   where each target word is wrapped in tab ... newline
        dec_tar -- int array (rows, t_max_len, len(tar_tokens)+1); one-hot of
                   dec_inp shifted one step to the left, padded with the
                   space id

    Raises:
        KeyError   -- a character is missing from the corresponding map.
        IndexError -- a word is longer than s_max_len / t_max_len.
    """
    src,tar = data['en'].values, data['hi'].values
    tar = "\t" + tar + "\n"

    s_pad = s_tok_map[" "]
    t_pad = t_tok_map[" "]

    enc_inp = np.zeros((len(src),s_max_len), dtype="float32")
    dec_inp = np.zeros((len(tar),t_max_len), dtype="float32")
    dec_tar = np.zeros((len(tar), t_max_len, len(tar_tokens)+1), dtype="int")

    for i,(sw,tw) in enumerate(zip(src,tar)):
        for j,ch in enumerate(sw):
            enc_inp[i,j] = s_tok_map[ch]
        # Pad from the word's own length rather than a leaked loop index, so
        # an empty word cannot pick up a stale position from a previous row.
        enc_inp[i,len(sw):] = s_pad

        for j,ch in enumerate(tw):
            dec_inp[i,j] = t_tok_map[ch]
            if j>0:
                # Target at step j-1 is the decoder input at step j.
                dec_tar[i,j-1,t_tok_map[ch]] = 1
        dec_inp[i,len(tw):] = t_pad
        # Targets exist for steps 0..len(tw)-2; everything after is padding.
        dec_tar[i,len(tw)-1:,t_pad] = 1

    return enc_inp, dec_inp, dec_tar

In [7]:
# Load the Hindi train/dev/test transliteration lexicons from the mounted Drive.
# The vocabularies and maximum word lengths are built from the training split
# only; dataProcess reads them as notebook-level globals.
train = dataLoad("/content/drive/MyDrive/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.train.tsv")
source_tokens, s_tok_map, s_max_len, tar_tokens, t_tok_map, t_max_len = tok_map(train)
dev = dataLoad("/content/drive/MyDrive/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.dev.tsv")
test = dataLoad("/content/drive/MyDrive/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.test.tsv")

In [8]:
# Prepare the test data set for evaluating the model: encoder input ids,
# decoder input ids, and one-hot decoder targets.
test_encoder_input, test_decoder_input, test_decoder_target= dataProcess(test)

In [9]:
# Reverse lookups (index -> character) for decoding model inputs/outputs back
# into text. Built with comprehensions so that no loop variable (previously
# `chr`, which shadowed the builtin, and `i`) leaks into the notebook namespace.
enmstok = [(idx, symbol) for symbol, idx in s_tok_map.items()]
revSource_map = dict(enmstok)

enmttok = [(idx, symbol) for symbol, idx in t_tok_map.items()]
revTarg_map = dict(enmttok)

In [10]:
# Load the saved best-configuration inference models (encoder and decoder).
# The .h5 paths are relative to the current working directory.
# `enc` and `dec` are read as globals by beam_search and TransLit.
enc = keras.models.load_model('best_enc.h5')
dec = keras.models.load_model('best_dec.h5')



In [11]:
def beam_search(input, BeamWidth, decl, cell="LSTM"):
    """Decode a batch of encoded source words with beam search.

    Uses the notebook-level globals enc, dec, t_tok_map and t_max_len.

    Parameters:
        input     -- encoder input array; input.shape[0] is used as the batch size.
        BeamWidth -- number of hypotheses kept per input word.
        decl      -- how many times the encoder state(s) are repeated to form the
                     decoder's initial state list (presumably the number of
                     decoder layers -- confirm against the model definition).
        cell      -- "LSTM" treats the encoder output as a pair of states
                     (stateList[0], stateList[1]); any other value passes the
                     encoder output through as a single state.

    Returns:
        A list with one entry per input word; each entry is a list of
        BeamWidth token-id lists. After the search loop has run, hypotheses
        are ordered by ascending length-normalised negative log-probability
        (best first). Sequences are not trimmed at the end-of-word token.
    """
    stateList = enc.predict(input)
    # One-step decoder input, initialised with the start-of-word (tab) token.
    tar_seq = np.zeros((input.shape[0],1))
    tar_seq[:,0] = t_tok_map["\t"]
    if cell != "LSTM":
      states = []
      for i in range(decl):
          states += [stateList]
    else:
      states = []
      for i in range(decl):
          states += [stateList[0],stateList[1]]
  
    # First decoder step: output[0] holds the token distribution, output[1:] the new states.
    output = dec.predict([tar_seq]+states)
    states = output[1:]
    # Reorder to (batch, n_states, units) so each word's states can be stored per hypothesis.
    stateTrArr = np.asarray(states).transpose([1,0,2])
    
    # Top-BeamWidth token ids and their probabilities (ascending order within the slice).
    beamChar = np.argsort(output[0][:,-1,:],axis=-1)[:,-BeamWidth:]
    points = np.sort(output[0][:,-1,:],axis=-1)[:,-BeamWidth:]
    inps0 = input.shape[0]
    seqs = []
    for i in range(inps0):
      # Hypothesis tuple: (token ids, cumulative -log prob, decoder states, finished flag).
      temp = [([chr],-np.log(pt),stateTrArr[i],0) for chr,pt in zip(beamChar[i],points[i])]
      seqs.append(temp)
    
    
    for _ in range(t_max_len-1):
        probable = [[] for _ in range(input.shape[0])]
        for j in range(BeamWidth):
            # Advance the j-th hypothesis of every word in one batched decoder call.
            tar_seq[:,0] = [seqs[i][j][0][-1] for i in range(input.shape[0])]
            states = list(np.asarray([seqs[i][j][2] for i in range(input.shape[0])]).transpose([1,0,2]))
            output = dec.predict([tar_seq]+states,batch_size=32)
            beamChar = np.argsort(output[0][:,-1,:],axis=-1)[:,-BeamWidth:]
            points = np.sort(output[0][:,-1,:],axis=-1)[:,-BeamWidth:]
            
            stateTrArr = np.asarray(output[1:]).transpose([1,0,2])
            
            for i in range(input.shape[0]):
                # A hypothesis is finished once its last token is the end-of-word "\n".
                # The stored flag is always written as 0, so in practice only the
                # token comparison triggers.
                if ( (seqs[i][j][0][-1] == t_tok_map["\n"]) or (seqs[i][j][3]==1) ):
                  check = 1
                else:
                  check = 0

                if check == 0:
                    # Expand with each of the top-BeamWidth continuations.
                    probable[i] += [(seqs[i][j][0]+[beamChar[i,rep]], seqs[i][j][1]-np.log(points[i,rep]), stateTrArr[i],check) for rep in range(BeamWidth)]
                else:
                    # Finished hypotheses are carried over unchanged.
                    probable[i] += [seqs[i][j]]
                    
        for i in range(input.shape[0]):
            # Rank by length-normalised score (lower is better) and prune to the beam width.
            probable[i] = sorted(probable[i],key = lambda tup:tup[1]/len(tup[0]))
            seqs[i] = probable[i][:BeamWidth]
        
    # Keep only the token-id lists of the surviving hypotheses.
    res = [list() for i in range(input.shape[0])]
    for i in range(input.shape[0]):
        for j in range(BeamWidth):
            res[i].append(seqs[i][j][0])
        
    return res

In [12]:
def TransLit(input, decl, cell):
    """Greedy (argmax) decoding of a batch of encoded source words.

    Uses the notebook-level globals enc, dec, t_tok_map and t_max_len.

    Parameters:
        input -- encoder input array; input.shape[0] is used as the batch size.
        decl  -- how many times the encoder state(s) are repeated to build the
                 decoder's initial state list.
        cell  -- "LSTM" uses the first two encoder outputs as a state pair;
                 any other value uses the encoder output as a single state.

    Returns:
        Float array of shape (batch, t_max_len) holding the predicted token id
        at every step. Decoding always runs for t_max_len steps; there is no
        early stop at the end-of-word token.
    """
    batch = input.shape[0]

    # Decoder is fed one token per step, starting from the start-of-word tab.
    tar_seq = np.zeros((batch, 1))
    tar_seq[:, 0] = t_tok_map["\t"]
    predicted = np.zeros((batch, t_max_len))

    stateList = enc.predict(input)
    if cell == "LSTM":
        per_layer = [stateList[0], stateList[1]]
    else:
        per_layer = [stateList]
    states = per_layer * decl

    for step in range(t_max_len):
        output = dec.predict([tar_seq] + states, batch_size=64)
        best_ids = np.argmax(output[0][:, -1, :], axis=1)
        predicted[:, step] = best_ids
        # Feed the chosen token and the updated states into the next step.
        tar_seq[:, 0] = predicted[:, step]
        states = output[1:]

    return predicted

In [13]:
def getWords(predArr):
    """Turn beam-search output into rows of readable strings.

    Uses the notebook-level globals test_encoder_input, test_decoder_input,
    revSource_map and revTarg_map; row i of predArr is paired with row i of
    the test arrays.

    Parameters:
        predArr -- per-word list of candidate token-id sequences.

    Returns (resB, resI):
        resB -- rows of [source word, reference word, every candidate...]
        resI -- rows of [source word, reference word, first candidate]
    """
    def _decode(token_ids, reverse_map, stop_symbol):
        # Map ids to characters, stopping just before the first stop symbol.
        pieces = ""
        for token in token_ids:
            if reverse_map[token] == stop_symbol:
                break
            pieces += reverse_map[token]
        return pieces

    resB, resI = [], []
    for row, candidates in enumerate(predArr):
        # Source words end at the first padding (space) id.
        english = _decode(test_encoder_input[row], revSource_map, " ")
        # Skip the leading start token; the reference ends at the newline token.
        reference = _decode(test_decoder_input[row, 1:], revTarg_map, "\n")

        top_candidate = _decode(candidates[0], revTarg_map, "\n")
        resI.append([english, reference, top_candidate])

        every_candidate = [_decode(seq, revTarg_map, "\n") for seq in candidates]
        resB.append([english, reference] + every_candidate)
    return resB, resI

In [14]:
# Predict transliterations for the test words using beam search
# (BeamWidth=5, decl=2, LSTM cell).

pred_beam = beam_search(test_encoder_input,5,2,cell="LSTM")

In [15]:
# Predict transliterations for the test words using greedy (argmax) decoding
# (decl=2, LSTM cell).

pred = TransLit(test_encoder_input,2,cell="LSTM")

In [16]:
# Show five randomly chosen test words with their greedy prediction and the
# reference transliteration. The sample is not seeded, so it differs per run.
for sample_no in range(5):
    idx = np.random.choice(test_encoder_input.shape[0])

    # Encoded rows for the chosen index.
    sword = test_encoder_input[idx]
    pword = pred[idx]
    rword = test_decoder_input[idx]

    # Source words are padded with the space id (there is no "\n" in the source
    # vocabulary), so stop at the first pad instead of printing the padding.
    original = ""
    for c in sword:
        if revSource_map[c] == " ":
            break
        original += revSource_map[c]
    print("English word:",original)

    # Stop before the end-of-word token so no stray newline is printed.
    decoded = ""
    for c in pword:
        if revTarg_map[c] == "\n":
            break
        decoded += revTarg_map[c]
    print("Predicted word:", decoded)

    # Skip the leading start-of-word token, then stop before the end token.
    real = ""
    for c in rword[1:]:
        if revTarg_map[c] == "\n":
            break
        real += revTarg_map[c]
    print("Hindi original:",real)
    print("----x----x----x----x----")

English word: prativaad           
Predicted word: प्रतिवाद

Hindi original: प्रतिवाद

----x----x----x----x----
English word: registered          
Predicted word: रजिस्टेडर

Hindi original: रजिस्टर्ड

----x----x----x----x----
English word: tadipar             
Predicted word: तादिपार

Hindi original: तड़ीपार

----x----x----x----x----
English word: mahajan             
Predicted word: महाजन

Hindi original: महाजन

----x----x----x----x----
English word: raadhe              
Predicted word: राढे

Hindi original: राधे

----x----x----x----x----


In [17]:
# Word-level accuracy of the greedy predictions: a word counts as correct when
# every predicted token matches the target up to and including end-of-word.
eos_id = t_tok_map["\n"]
# Collapse the one-hot targets to token ids once instead of once per character.
target_ids = np.argmax(test_decoder_target, axis=-1)

acc = 0
for predicted_ids, expected_ids in zip(pred, target_ids):
    correct = 1
    for predicted_id, expected_id in zip(predicted_ids, expected_ids):
        if predicted_id != expected_id:
            correct = 0
            break
        if predicted_id == eos_id:
            # Everything after the end-of-word token is padding; ignore it.
            break

    if correct==1:
        acc+=1

accuracy = acc/len(pred)
print(f"Accuracy on the Test Data = {accuracy} => {round(accuracy*100,2)}%")

Accuracy on the Test Data = 0.4004608396268325 => 40.05%


In [18]:
# Decode the beam-search output to strings: ansB holds every beam candidate per
# word, ansI only the first candidate.
ansB, ansI = getWords(pred_beam)

In [19]:
# Table of all beam candidates. The five Pred_* columns must match the beam
# width (5) passed to beam_search above.
df = pd.DataFrame(ansB,columns=['English','Hindi_Real']+[f'Pred_{i}' for i in range(5)])
df.sample(n=5)

Unnamed: 0,English,Hindi_Real,Pred_0,Pred_1,Pred_2,Pred_3,Pred_4
2644,francisco,फ्रांसिस्को,फ्रेंसिस्को,फ्रांसिस्को,फ्रैंसिस्को,फ्रेंसिक्सो,फ्रेंसिस्कों
1643,dhoondhata,ढूंढता,ढूंढाता,धूंढाता,धूंधता,ढूंधाता,ढूंधता
2508,priyamani,प्रियामणि,प्रियमानी,प्रियमणि,प्रिमाणी,प्रियमणी,प्रियमाणी
3785,vinasht,विनष्ट,विनाश्ट,विनाश्त,विनास्त,विनाष्ट,विनाश
753,kahaniyon,कहानियों,कहानियों,कहनियों,कहंशियों,कहंतियों,कहकियों


In [20]:
# Table with only the first beam candidate per word (from ansI).
df1 = pd.DataFrame(ansI,columns=['English','Hindi_Real','Hindi_pred'])
df1.sample(n=5)

Unnamed: 0,English,Hindi_Real,Hindi_pred
1342,jatiyon,जातियों,जातियों
1058,ghataya,घटाया,घटाया
1355,jin,जिन,जीन
2430,paim,पैम,पेम
551,addi,एड्डी,अड्डी


In [21]:
# Save the first-candidate predictions to the current working directory
# (the DataFrame index is written as the first, unnamed column).
df1.to_csv('predictions_vanilla.csv')