In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf

In [2]:
from nltk.corpus import stopwords

In [9]:
stop = stopwords.words('english')

In [11]:
data = pd.read_csv('fin.txt', sep='\t', header=None)

In [13]:
data.head()

Unnamed: 0,0,1,2
0,Go.,Mene.,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
1,Hi.,Moro!,CC-BY 2.0 (France) Attribution: tatoeba.org #5...
2,Hi.,Terve.,CC-BY 2.0 (France) Attribution: tatoeba.org #5...
3,Run!,Juokse!,CC-BY 2.0 (France) Attribution: tatoeba.org #9...
4,Run!,Juoskaa!,CC-BY 2.0 (France) Attribution: tatoeba.org #9...


In [14]:
data.columns = ['en','fin','waste']

In [16]:
data.head()

Unnamed: 0,en,fin,waste
0,Go.,Mene.,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
1,Hi.,Moro!,CC-BY 2.0 (France) Attribution: tatoeba.org #5...
2,Hi.,Terve.,CC-BY 2.0 (France) Attribution: tatoeba.org #5...
3,Run!,Juokse!,CC-BY 2.0 (France) Attribution: tatoeba.org #9...
4,Run!,Juoskaa!,CC-BY 2.0 (France) Attribution: tatoeba.org #9...


In [17]:
data.drop('waste', axis=1, inplace=True)

In [18]:
data.head()

Unnamed: 0,en,fin
0,Go.,Mene.
1,Hi.,Moro!
2,Hi.,Terve.
3,Run!,Juokse!
4,Run!,Juoskaa!


In [19]:
len(data)

60508

In [20]:
data['en'] = data['en'].str.lower()
data['en'] = data['en'].str.replace('[^\w\s]','')

In [21]:
data['fin'] = data['fin'].str.lower()
data['fin'] = data['fin'].str.replace('[^\w\s]','')

In [23]:
data.head()

Unnamed: 0,en,fin
0,go,mene
1,hi,moro
2,hi,terve
3,run,juokse
4,run,juoskaa


In [24]:
data['fin'] = '__start__' + data['fin'] + '__end__'

In [25]:
data.head()

Unnamed: 0,en,fin
0,go,__start__mene__end__
1,hi,__start__moro__end__
2,hi,__start__terve__end__
3,run,__start__juokse__end__
4,run,__start__juoskaa__end__


In [26]:
max_features1 = 10000
maxlen1 = 20

max_features2 = 10000
maxlen2 = 20

In [27]:
tok1 = tf.keras.preprocessing.text.Tokenizer(num_words=max_features1)
tok1.fit_on_texts(list(data['en']))
en = tok1.texts_to_sequences(list(data['en']))
en = tf.keras.preprocessing.sequence.pad_sequences(en, maxlen=maxlen1)

tok2 = tf.keras.preprocessing.text.Tokenizer(num_words=max_features2)
tok2.fit_on_texts(list(data['fin']))
fin = tok2.texts_to_sequences(list(data['fin']))
fin = tf.keras.preprocessing.sequence.pad_sequences(fin, maxlen=maxlen2)

In [29]:
len(tok1.word_index)

9425

In [30]:
len(tok2.word_index)

30495

In [31]:
decoder_input_data = fin[:, :-1]
decoder_target_data = fin[:, 1:]
encoder_input_data = en

In [34]:
encoder_input_data.shape

(60508, 20)

In [35]:
vocab_size_encoder = len(tok1.word_index) + 1
vocab_size_decoder = len(tok2.word_index) + 1

In [36]:
latent_dim = 40

In [42]:
encoder_inputs = tf.keras.Input(shape=(None,))
emb_1 = tf.keras.layers.Embedding(vocab_size_encoder, latent_dim, mask_zero=False)(encoder_inputs)
bn_1 = tf.keras.layers.BatchNormalization()(emb_1)
encoder_outputs, state_h, state_c = tf.keras.layers.LSTM(latent_dim, return_state=True)(bn_1)

decoder_inputs = tf.keras.Input(shape=(None,))
emb_2 = tf.keras.layers.Embedding(vocab_size_decoder, latent_dim, mask_zero=False)(decoder_inputs)
bn_2 = tf.keras.layers.BatchNormalization()(emb_2)
decoder_lstm = tf.keras.layers.LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, state_h, state_c = decoder_lstm(bn_2)
dense = tf.keras.layers.Dense(vocab_size_decoder, activation='softmax')
decoder_output = dense(decoder_outputs)

In [43]:
model = tf.keras.Model([encoder_inputs, decoder_inputs], decoder_output)

In [44]:
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_4 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, None, 40)     1219840     input_4[0][0]                    
__________________________________________________________________________________________________
batch_normalization_3 (BatchNor (None, None, 40)     160         embedding_3[0][0]                
__________________________________________________________________________________________________
lstm_3 (LSTM)                   [(None, None, 40), ( 12960       batch_normalization_3[0][0]      
____________________________________________________________________________________________

In [45]:
model.compile(optimizer='rmsprop', loss=tf.keras.losses.sparse_categorical_crossentropy, metrics=['accuracy'])

In [None]:
model.fit([encoder_input_data, decoder_input_data], decoder_target_data, batch_size=2048)

Train on 60508 samples
