In [19]:
import pandas as pd
import numpy as np
import string
from string import digits
import matplotlib.pyplot as plt
%matplotlib inline
import re
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense
from tensorflow.keras.models import Model
import os
os.environ['TF_XLA_FLAGS'] = '--tf_xla_enable_xla_devices'

In [2]:
data=pd.read_table('./dataset/french.txt',names=['lang','trans','useless'])
data.drop('useless',axis=1,inplace=True)
data.shape

(190206, 2)

In [3]:
data1=data
data1.head()

Unnamed: 0,lang,trans
0,Go.,Va !
1,Go.,Marche.
2,Go.,Bouge !
3,Hi.,Salut !
4,Hi.,Salut.


In [4]:
data=data1
spchrs=set(string.punctuation)
lowerCase=lambda x:x.lower()
quotes=lambda x:re.sub("'",'',x)
specialCh=lambda x:''.join(ch for ch in x if ch not in spchrs)
rmDigits=lambda x:x.translate(str.maketrans('','',digits))
spaces=lambda x:x.strip()
unwanted=lambda x:re.sub(" +"," ",x)
startEnd=lambda x:'START_'+x+'_END'
data.lang=data.lang.apply(lowerCase)
data.lang=data.lang.apply(quotes)
data.lang=data.lang.apply(specialCh)
data.lang=data.lang.apply(rmDigits)
data.lang=data.lang.apply(spaces)
data.lang=data.lang.apply(unwanted)
data.trans=data.trans.apply(lowerCase)
data.trans=data.trans.apply(quotes)
data.trans=data.trans.apply(specialCh)
data.trans=data.trans.apply(rmDigits)
data.trans=data.trans.apply(spaces)
data.trans=data.trans.apply(unwanted)
data.trans=data.trans.apply(startEnd)

In [5]:
data.sample(10)

Unnamed: 0,lang,trans
13393,help us please,START_aideznous sil vous plaît_END
26150,tom will obey you,START_tom tobéira_END
12136,we volunteered,START_nous nous portâmes volontaires_END
66472,you dont even know how,START_tu ne sais même pas comment_END
2532,were boys,START_nous sommes des garçons_END
152202,he responded to her offer with a laugh,START_il a répondu à sa proposition par des ri...
104799,dont you want to come inside,START_ne veuxtu pas venir à lintérieur_END
158241,you dont have to go unless you want to,START_vous nêtes pas obligés dy aller à moins ...
89011,why are we studying french,START_pourquoi étudionsnous le français_END
35065,i like each of them,START_jaime chacun dentre eux_END


In [6]:
langVocab=set()
for line in data.lang:
    for word in line.split():
        langVocab.add(word)
transVocab=set()
for line in data.trans:
    for word in line.split():
        transVocab.add(word)


In [7]:
maxSrcLen=0
for line in data.lang:
    maxSrcLen=max(maxSrcLen,len(line.split()))
print(maxSrcLen)

47


In [8]:
maxTarLen=0
for line in data.trans:
    maxTarLen=max(maxTarLen,len(line.split()))
print(maxTarLen)

55


In [9]:
inputWords=sorted(list(langVocab))
targetWords=sorted(list(transVocab))
lenOfEncoderTokens=len(langVocab)
lenOfDecoderTokens=len(transVocab)
print(lenOfDecoderTokens,lenOfEncoderTokens)

43869 15359


In [10]:
lenOfDecoderTokens+=1
lenOfDecoderTokens

43870

In [11]:
tarTokenInd,inpRevIndMap,tarRevIndMap,inpTokenInd={},{},{},{}
for i,word in enumerate(inputWords):
    inpTokenInd[word]=i+1
    inpRevIndMap[i]=word
for i,word in enumerate(targetWords):
    tarTokenInd[word]=i+1
    tarRevIndMap[i]=word

In [12]:
data=shuffle(data)
data.head()

Unnamed: 0,lang,trans
181837,i wonder how many people in australia can spea...,START_je me demande combien de personnes en au...
138449,tom told the children many stories,START_tom raconta beaucoup dhistoires aux enfa...
189263,when was the last time you used benzodiazepine...,START_quand avezvous pris pour la dernière foi...
50444,thanks for the cookie,START_merci pour le cookie_END
50041,nothing lasts forever,START_rien ne dure pour toujours_END


In [13]:
x,y=data.lang,data.trans
x_train,x_test,y_train,y_test=train=train_test_split(x,y,test_size=0.1)

In [14]:
def encode(x=x_train,y=y_train,size=128):
    while True:
        for i1 in range(0,len(x),size):
            encInp=np.zeros((size,maxSrcLen),dtype='float32')
            decInp=np.zeros((size,maxTarLen),dtype='float32')
            decTar=np.zeros((size,maxTarLen,lenOfDecoderTokens),dtype='float32')
            for i2,(inpText,tarText) in enumerate(zip(x[i1:i1+size],y[i1:i1+size])):
                for i3,word in enumerate(inpText.split()):
                    encInp[i2,i3]=inpTokenInd[word]
                tarTextSplit=tarText.split()
                for i3,word in enumerate(tarTextSplit):
                    if i3<len(tarTextSplit)-1:
                        decInp[i2,i3]=tarTokenInd[word]=tarTokenInd[word]
                    if i3>0:
                        decTar[i2,i3-1,tarTokenInd[word]]=1
            yield([encInp,decInp],decTar)

In [20]:
dims=50
encInp=Input(shape=(None,))
encEmb=Embedding(lenOfEncoderTokens,dims,mask_zero=True)(encInp)
encLSTM=LSTM(dims,return_state=True)
encOut,stateH,stateC=encLSTM(encEmb)
encStates=[stateH,stateC]

In [21]:
decInp=Input(shape=(None,))
decEmbLayer=Embedding(lenOfDecoderTokens,dims,mask_zero=True)
decEmb=decEmbLayer(decInp)
decLSTM=LSTM(dims,return_sequences=True,return_state=True)
decOut,_,_=decLSTM(decEmb,initial_state=encStates)
decDense=Dense(lenOfDecoderTokens,activation='softmax')
decOut=decDense(decOut)
model=Model([encInp,decInp],decOut)

In [22]:
model.compile(optimizer='rmsprop',loss='categorical_crossentropy',metrics=['acc'])

In [23]:
trainLen=len(x_train)
testLen=len(x_test)
size=128
epochs=50

In [24]:
model.fit(encode(x_train,y_train,size),steps_per_epoch = trainLen//size,
                    epochs=epochs,
                    validation_data = encode(x_test, y_test,size),
                    validation_steps = testLen//size)

KeyError: 'he'

In [None]:
model.save_weights('nmt_weights.h5')