In [1]:
from __future__ import print_function, division
from builtins import range, input
# Note: you may need to update your version of future
# sudo pip install -U future

import io
import os, sys

from keras.models import Model
from keras.layers import Input, LSTM, GRU, Dense, Embedding
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
import numpy as np
import matplotlib.pyplot as plt

import keras.backend as K
# When the backend reports at least one GPU, rebind LSTM and GRU to their
# cuDNN-backed counterparts so the rest of the notebook picks them up.
if K.tensorflow_backend._get_available_gpus():
    from keras.layers import CuDNNLSTM as LSTM, CuDNNGRU as GRU

Using TensorFlow backend.


In [2]:

# ---- training configuration ----
BATCH_SIZE = 64        # examples per training batch
EPOCHS = 40            # number of epochs to train for
LATENT_DIM = 256       # dimensionality of the encoding (latent) space
NUM_SAMPLES = 10000    # number of samples to train on
MAX_NUM_WORDS = 20000  # presumably the vocabulary cap -- confirm where it is used
EMBEDDING_DIM = 100    # presumably the word-embedding width -- confirm where it is used

# ---- corpus containers, filled when the data is loaded ----
#   input_texts          sentence in the original language
#   target_texts         sentence in the target language
#   target_texts_inputs  target-language sentence offset by 1
input_texts, target_texts, target_texts_inputs = [], [], []

In [7]:

# load in the data
# download the data at: http://www.manythings.org/anki/
#
# Each line of spa.txt is tab-separated: the input sentence, its translation,
# and (as the file is read here) any further columns, which are ignored.

# Start from empty lists so that re-running this cell rebuilds the corpus
# instead of appending a second copy on top of the first.
input_texts = []  # sentence in original language
target_texts = []  # sentence in target language
target_texts_inputs = []  # sentence in target language offset by 1

# Decode explicitly as UTF-8: with the platform default codec the accented
# Spanish characters can be mis-decoded into unrelated glyphs. io.open accepts
# `encoding` on both Python 2 and 3, and the `with` block closes the file.
with io.open('spa.txt', encoding='utf-8') as f:
    for t, line in enumerate(f, 1):
        # only read a limited number of lines (skipped lines count too)
        if t > NUM_SAMPLES:
            break

        # input and target are separated by tab; split once, after stripping
        # trailing whitespace, and skip lines that lack a translation column
        fields = line.rstrip().split('\t')
        if len(fields) < 2:
            continue

        # split up the input and translation
        input_text, translation = fields[0], fields[1]

        # make the target input and output
        # recall we'll be using teacher forcing
        target_text = translation + ' <eos>'
        target_text_input = '<sos> ' + translation

        input_texts.append(input_text)
        target_texts.append(target_text)
        target_texts_inputs.append(target_text_input)

print("num samples:", len(input_texts))

Go.	Ve.	CC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #4986655 (cueyayotl)

Go.	Vete.	CC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #4986656 (cueyayotl)

Go.	Vaya.	CC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #4986657 (cueyayotl)

Go.	Váyase.	CC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #6586271 (arh)

Hi.	Hola.	CC-BY 2.0 (France) Attribution: tatoeba.org #538123 (CM) & #431975 (Leono)

Run!	¡Corre!	CC-BY 2.0 (France) Attribution: tatoeba.org #906328 (papabear) & #1685404 (Elenitigormiti)

Run!	¡Corran!	CC-BY 2.0 (France) Attribution: tatoeba.org #906328 (papabear) & #5213896 (cueyayotl)

Run!	¡Corra!	CC-BY 2.0 (France) Attribution: tatoeba.org #906328 (papabear) & #8005613 (Seael)

Run!	¡Corred!	CC-BY 2.0 (France) Attribution: tatoeba.org #906328 (papabear) & #8005615 (Seael)

Run.	Corred.	CC-BY 2.0 (France) Attribution: tatoeba.org #4008918 (JSakuragi) & #6681472 (arh)

Who?	¿Quién?	CC-BY 2.0 (France) Attribution: tatoeba.org 


I want this.	Quiero este.	CC-BY 2.0 (France) Attribution: tatoeba.org #2011272 (CK) & #1829237 (Shishir)

I want this.	Quiero esto.	CC-BY 2.0 (France) Attribution: tatoeba.org #2011272 (CK) & #6863585 (arh)

I was awake.	Estaba despierto.	CC-BY 2.0 (France) Attribution: tatoeba.org #2247458 (CK) & #5871364 (albrusgher)

I was bored.	Estaba aburrido.	CC-BY 2.0 (France) Attribution: tatoeba.org #2247463 (CK) & #6957966 (albrusgher)

I was dizzy.	Estaba mareado.	CC-BY 2.0 (France) Attribution: tatoeba.org #1671743 (CK) & #5048513 (don_ramon)

I was dizzy.	Estaba mareada.	CC-BY 2.0 (France) Attribution: tatoeba.org #1671743 (CK) & #5058723 (don_ramon)

I was drunk.	Estaba borracho.	CC-BY 2.0 (France) Attribution: tatoeba.org #2247479 (CK) & #4371730 (pchamorro)

I was fired.	Estaba despedido.	CC-BY 2.0 (France) Attribution: tatoeba.org #1140531 (CK) & #1321943 (Shishir)

I was fired.	Me despidieron.	CC-BY 2.0 (France) Attribution: tatoeba.org #1140531 (CK) & #1321944 (Shishir)

I was luck

I didn't stop.	Yo no me detuve.	CC-BY 2.0 (France) Attribution: tatoeba.org #2245703 (CK) & #4590240 (Francisco_M)

I didn't vote.	Yo no voté	CC-BY 2.0 (France) Attribution: tatoeba.org #2245705 (CK) & #4590242 (Francisco_M)

I didn't wait.	No esperé.	CC-BY 2.0 (France) Attribution: tatoeba.org #5840498 (CK) & #5840566 (arh)

I didn't walk.	Yo no caminé.	CC-BY 2.0 (France) Attribution: tatoeba.org #2245706 (CK) & #4590243 (Francisco_M)

I do like Tom.	Sí, me gusta Tom.	CC-BY 2.0 (France) Attribution: tatoeba.org #2315133 (CK) & #4590245 (Francisco_M)

I do like you.	Sí, tú me gustas.	CC-BY 2.0 (France) Attribution: tatoeba.org #2315134 (CK) & #4590248 (Francisco_M)

I do love you.	Yo sí te amo.	CC-BY 2.0 (France) Attribution: tatoeba.org #71454 (CK) & #1246286 (marcelostockle)

I do love you.	Yo de veras te amo.	CC-BY 2.0 (France) Attribution: tatoeba.org #71454 (CK) & #1246288 (marcelostockle)

I do remember.	Sí, yo recuerdo.	CC-BY 2.0 (France) Attribution: tatoeba.org #2245711 (CK) &

I like walking.	Me gusta caminar.	CC-BY 2.0 (France) Attribution: tatoeba.org #3818760 (CK) & #727833 (Fanki)

I like working.	Me gusta trabajar.	CC-BY 2.0 (France) Attribution: tatoeba.org #2245946 (CK) & #1189211 (Shishir)

I like writing.	Me gusta escribir.	CC-BY 2.0 (France) Attribution: tatoeba.org #5851859 (CK) & #5208342 (Elyte)

I like writing.	Me gusta la escritura.	CC-BY 2.0 (France) Attribution: tatoeba.org #5851859 (CK) & #6600591 (arh)

I live upstate.	Vivo en el norte.	CC-BY 2.0 (France) Attribution: tatoeba.org #2245949 (CK) & #6607890 (arh)

I live upstate.	Vivo hacia el norte.	CC-BY 2.0 (France) Attribution: tatoeba.org #2245949 (CK) & #6607891 (arh)

I love Harvard.	Me encanta Harvard.	CC-BY 2.0 (France) Attribution: tatoeba.org #4500181 (CK) & #6694347 (arh)

I love Harvard.	Adoro Harvard.	CC-BY 2.0 (France) Attribution: tatoeba.org #4500181 (CK) & #6694348 (arh)

I love chicken.	Me encanta el pollo.	CC-BY 2.0 (France) Attribution: tatoeba.org #620111 (ulyssemc1) & #


Don't get drunk.	No te emborraches.	CC-BY 2.0 (France) Attribution: tatoeba.org #2245333 (CK) & #2848534 (teskmon)

Don't interfere.	No interfieras.	CC-BY 2.0 (France) Attribution: tatoeba.org #2187190 (CK) & #2187399 (hayastan)

Don't interrupt.	No interrumpas.	CC-BY 2.0 (France) Attribution: tatoeba.org #2187191 (CK) & #2187395 (hayastan)

Don't lie to me.	No me mientas.	CC-BY 2.0 (France) Attribution: tatoeba.org #452256 (enwilson) & #580984 (Shishir)

Don't lie to us.	No nos mientas.	CC-BY 2.0 (France) Attribution: tatoeba.org #2250428 (hayastan) & #2372687 (hayastan)

Don't look back.	No mires atrás.	CC-BY 2.0 (France) Attribution: tatoeba.org #240167 (CK) & #538637 (Shishir)

Don't look down.	No mires abajo.	CC-BY 2.0 (France) Attribution: tatoeba.org #1840597 (Amastan) & #3515658 (Aether)

Don't overdo it.	No exageres.	CC-BY 2.0 (France) Attribution: tatoeba.org #30192 (CK) & #934551 (Shishir)

Don't play dead.	No te hagas el muerto.	CC-BY 2.0 (France) Attribution: tatoeba.org 

What did you do?	¿Qué hiciste?	CC-BY 2.0 (France) Attribution: tatoeba.org #1495903 (CK) & #738720 (chinopinyin)

What do they do?	¿Qué hacen ellos?	CC-BY 2.0 (France) Attribution: tatoeba.org #2648892 (CK) & #2787464 (marcelostockle)

What is missing?	¿Qué falta?	CC-BY 2.0 (France) Attribution: tatoeba.org #25227 (CK) & #519499 (Shishir)

What killed Tom?	¿Qué mató a Tom?	CC-BY 2.0 (France) Attribution: tatoeba.org #1886723 (CK) & #2378423 (Shishir)

What time is it?	¿Qué hora es?	CC-BY 2.0 (France) Attribution: tatoeba.org #428402 (laofmoonster) & #2830 (Shishir)

What was stolen?	¿Qué fue robado?	CC-BY 2.0 (France) Attribution: tatoeba.org #24678 (CK) & #1284325 (marcelostockle)

What year is it?	¿Qué año es?	CC-BY 2.0 (France) Attribution: tatoeba.org #682508 (Source_VOA) & #1284326 (marcelostockle)

What year is it?	¿En qué año estamos?	CC-BY 2.0 (France) Attribution: tatoeba.org #682508 (Source_VOA) & #1294965 (hayastan)

What's Tom like?	¿Cómo es Tom?	CC-BY 2.0 (France) Attribut