## Get data from URL and save

In [1]:
# download dataset
def get_data(link):
    """Download a zip archive, extract it into ``data/`` and return the data file's name.

    The archive is streamed to a temporary ``temp.zip`` in the working
    directory and unpacked into ``data/``. The temporary archive is always
    removed, and the extracted ``_about.txt`` is removed when present.

    Args:
        link: URL of the zip archive to download.

    Returns:
        Name of the last archive member that is not ``_about.txt``
        (relative to ``data/``).

    Raises:
        requests.HTTPError: if the server answers with an error status.
        ValueError: if the archive has no member other than ``_about.txt``.
    """
    from zipfile import ZipFile
    import requests
    import os

    output = 'temp.zip'
    filename = None

    response = requests.get(link, stream=True)
    try:
        # Fail early on 4xx/5xx instead of saving an error page as a "zip".
        response.raise_for_status()

        with open(output, "wb") as handle:
            for chunk in response.iter_content(chunk_size=512):
                if chunk:  # skip keep-alive chunks
                    handle.write(chunk)

        with ZipFile(output, "r") as zip_file:
            for name in zip_file.namelist():
                if name != '_about.txt':
                    filename = name
            if filename is None:
                raise ValueError('archive contains no data file besides _about.txt')
            zip_file.extractall("data")
    finally:
        response.close()
        # Never leave the temporary archive behind, even when something failed.
        if os.path.exists(output):
            os.remove(output)

    # Only remove the about file if the archive actually shipped one.
    if os.path.exists('data/_about.txt'):
        os.remove('data/_about.txt')
    return filename

In [2]:
# Download and unpack the Spanish-English archive; get_data returns the name of
# the extracted file, which is prefixed here with the extraction directory.
path = 'data/' + get_data('http://www.manythings.org/anki/spa-eng.zip')

In [3]:
path

'data/spa.txt'

## Clean data

In [4]:
def clean_lines(line):
    """Normalise one sentence into a list of lowercase, ASCII-only alphabetic words.

    The text is NFD-decomposed and every non-ASCII code point is dropped
    (which strips accents), then split on whitespace. Each word is
    lowercased, has ASCII punctuation removed, and is kept only when what
    remains is purely alphabetic.

    Args:
        line: the raw sentence as a ``str``.

    Returns:
        List of cleaned words, in their original order.
    """
    import string
    from unicodedata import normalize

    # Translation table that deletes every ASCII punctuation character.
    punctuation_table = {ord(symbol): None for symbol in string.punctuation}

    ascii_text = normalize('NFD', line).encode('ascii', 'ignore').decode('UTF-8')

    tokens = []
    for raw_word in ascii_text.split():
        token = raw_word.lower().translate(punctuation_table)
        # Drops empty leftovers and anything containing digits or other non-letters.
        if token.isalpha():
            tokens.append(token)

    return tokens

In [5]:
def clean_data(filepath):
    """Load a tab-separated parallel corpus and return the cleaned sentence rows.

    Every line of the file is split on tabs; each field is normalised with
    ``clean_lines`` and re-joined with single spaces. A row is dropped when
    its first field equals the first field of the previously kept row, or
    when that field contains the substring ``' tom'`` or ``'tom '``.

    Args:
        filepath: path to the corpus text file (read as UTF-8).

    Returns:
        numpy array with one row per kept line and one column per
        tab-separated field.
    """
    import numpy as np

    # Decode explicitly as UTF-8: relying on the platform default encoding
    # garbles accented characters on non-UTF-8 locales. The context manager
    # also guarantees the handle is closed if reading fails.
    with open(filepath, mode='rt', encoding='utf-8') as file:
        text = file.read()

    lines = text.strip().split('\n')

    all_cleaned = [
        [' '.join(clean_lines(sentence)) for sentence in line.split('\t')]
        for line in lines
    ]

    no_duplicates = []
    previous_line = ''

    for line in all_cleaned:
        # Only consecutive repeats of the first field are skipped.
        # NOTE(review): the "tom" checks are plain substring tests, so words such
        # as "tomorrow" or "bottom " match as well - confirm this is intended.
        if (line[0] != previous_line) and (' tom' not in line[0]) and ('tom ' not in line[0]):
            no_duplicates.append(line)
            previous_line = line[0]

    return np.array(no_duplicates)

In [6]:
# NOTE: this rebinds the name `clean_data` from the function to its result, so the
# function is no longer callable afterwards; re-run the cell that defines it
# before running this cell a second time.
clean_data = clean_data(path)

In [7]:
clean_data[1600:1610]

array([['i like chess', 'me gusta el ajedrez'],
       ['i like fruit', 'me gusta la fruta'],
       ['i like honey', 'me gusta la miel'],
       ['i like opera', 'me gusta la opera'],
       ['i like pizza', 'me gusta la pizza'],
       ['i like sushi', 'me gusta el sushi'],
       ['i like these', 'me gusta eso'],
       ['i like women', 'me gustan las mujeres'],
       ['i live alone', 'yo vivo solo'],
       ['i love books', 'me encantan los libros']], dtype='<U275')

In [8]:
clean_data.shape

(83783, 2)

## Split into train and test sets

In [9]:
def split_data(dataset, lines_number=10000, train_size=9000, seed=None):
    """Shuffle the first rows of ``dataset``, split them into train/test and save them.

    The first ``lines_number`` rows are copied, shuffled, and split at
    ``train_size``. The shuffled subset and both parts are pickled to
    ``data/dataset.pkl``, ``data/train.pkl`` and ``data/test.pkl``.

    Args:
        dataset: 2-D numpy array (it is indexed as ``dataset[:n, :]``).
        lines_number: number of leading rows to keep before shuffling.
        train_size: number of shuffled rows that go to the training part;
            the remaining rows form the test part.
        seed: optional integer seed for a reproducible shuffle. When ``None``
            the global numpy random state is used.

    Returns:
        Tuple ``(new_set, train, test)`` of numpy arrays.
    """
    import os
    import numpy as np
    try:
        import joblib
    except ImportError:
        # Older scikit-learn releases only ship joblib as a vendored module.
        from sklearn.externals import joblib

    # Copy first: basic slicing returns a view, and shuffling a view in place
    # would silently reorder the caller's array too.
    new_set = dataset[:lines_number, :].copy()

    rng = np.random if seed is None else np.random.RandomState(seed)
    rng.shuffle(new_set)
    train, test = new_set[:train_size], new_set[train_size:]

    os.makedirs('data', exist_ok=True)
    joblib.dump(new_set, 'data/dataset.pkl')
    joblib.dump(train, 'data/train.pkl')
    joblib.dump(test, 'data/test.pkl')

    return new_set, train, test

In [10]:
# Shuffle a subset of the cleaned pairs, split it into train/test parts and
# persist all three arrays under data/.
dataset, train, test = split_data(clean_data)

In [11]:
len(train)

9000

## Tokenize data

In [12]:
from keras.preprocessing.text import Tokenizer

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [13]:
# One tokenizer per language, each fitted on its own column of the dataset
# (column 0 = English, column 1 = target language).
eng_tokenizer, targ_tokenizer = Tokenizer(), Tokenizer()
eng_tokenizer.fit_on_texts(dataset[:, 0])
targ_tokenizer.fit_on_texts(dataset[:, 1])

# Longest sentence per language, measured in whitespace-separated words.
eng_length = max(map(lambda text: len(text.split()), dataset[:, 0]))
targ_length = max(map(lambda text: len(text.split()), dataset[:, 1]))

# Vocabulary size is the number of distinct words plus one extra slot
# (the padded sequences built later use 0 as the filler value).
eng_vocab_size = len(eng_tokenizer.word_counts) + 1
targ_vocab_size = len(targ_tokenizer.word_counts) + 1

print(f'a number of unique words in english dataset - {len(eng_tokenizer.word_counts)}')
print(f'max sentence size english dataset - {eng_length}')
print(f'a number of unique words in target dataset - {len(targ_tokenizer.word_counts)}')
print(f'max sentence size target dataset - {targ_length}')

a number of unique words in english dataset - 2957
max sentence size english dataset - 5
a number of unique words in target dataset - 4649
max sentence size target dataset - 9


### Encoding

In [14]:
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

In [15]:
# Map every English training sentence to its list of integer word indices.
trainX = eng_tokenizer.texts_to_sequences(train[:,0])

In [16]:
trainX[:10]

[[2, 14, 8, 69],
 [74, 176],
 [1, 41, 118, 126],
 [48, 23, 70],
 [4, 80, 228, 54, 766],
 [1, 1612, 589],
 [672, 29, 53],
 [285, 1131],
 [76, 9, 402, 16],
 [89, 1613, 4, 673]]

In [17]:
# Pad each sequence at the end (padding='post') so that all of them have length eng_length.
trainX = pad_sequences(trainX, maxlen=eng_length, padding='post')

In [18]:
trainX

array([[   2,   14,    8,   69,    0],
       [  74,  176,    0,    0,    0],
       [   1,   41,  118,  126,    0],
       ...,
       [   6, 1418,    0,    0,    0],
       [  74,  301,    0,    0,    0],
       [  18,  660,   30,    0,    0]], dtype=int32)

In [19]:
# Integer-encode the target-language training sentences and pad them at the
# end so that every sequence has length targ_length.
trainY = pad_sequences(
    targ_tokenizer.texts_to_sequences(train[:, 1]),
    maxlen=targ_length,
    padding='post',
)

In [20]:
trainY[:5]

array([[  24,   33,  845,    0,    0,    0,    0,    0,    0],
       [  58,    6,  207,    0,    0,    0,    0,    0,    0],
       [  44,   52,  102,    0,    0,    0,    0,    0,    0],
       [1424,    8,   40,    0,    0,    0,    0,    0,    0],
       [   5,   65,   34, 1046,   21,    9,  846,    0,    0]],
      dtype=int32)

In [21]:
# One-hot encode every target token over the target vocabulary.
# NOTE: this overwrites the integer-encoded trainY, so the cell is not
# idempotent - re-run the previous encoding cell before running it again.
trainY = to_categorical(trainY, num_classes=targ_vocab_size)

In [22]:
trainY[6]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [23]:
# Encode the English test sentences the same way as trainX:
# integer word indices, padded at the end to eng_length.
testX = pad_sequences(
    eng_tokenizer.texts_to_sequences(test[:, 0]),
    maxlen=eng_length,
    padding='post',
)
testX

array([[   1,   94,    2,  753,    0],
       [   2,   14,    4,  338,    0],
       [  74,  438,    0,    0,    0],
       ...,
       [  18, 2956,   30,    0,    0],
       [  14,    2,  161,  927,    0],
       [   7,  163, 2957,    0,    0]], dtype=int32)

In [24]:
# Encode the target-language test sentences the same way as trainY:
# integer word indices -> end-padding to targ_length -> one-hot vectors.
testY = to_categorical(
    pad_sequences(
        targ_tokenizer.texts_to_sequences(test[:, 1]),
        maxlen=targ_length,
        padding='post',
    ),
    num_classes=targ_vocab_size,
)
testY[6]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.]], dtype=float32)

## Define model

In [25]:
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import RepeatVector
from keras.layers import TimeDistributed
from keras.callbacks import ModelCheckpoint

In [26]:
# Encoder-decoder stack, listed in forward order:
#   Embedding + LSTM     - read the padded English sequence into a single vector
#   RepeatVector         - repeat that vector once per target time step
#   LSTM (sequences)     - produce one hidden state per target time step
#   TimeDistributed      - softmax over the target vocabulary at every step
model = Sequential([
    Embedding(eng_vocab_size, 256, input_length=eng_length, mask_zero=True),
    LSTM(256),
    RepeatVector(targ_length),
    LSTM(256, return_sequences=True),
    TimeDistributed(Dense(targ_vocab_size, activation='softmax')),
])

In [27]:
# categorical_crossentropy pairs with the one-hot targets produced by to_categorical.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [28]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 5, 256)            757248    
_________________________________________________________________
lstm_1 (LSTM)                (None, 256)               525312    
_________________________________________________________________
repeat_vector_1 (RepeatVecto (None, 9, 256)            0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 9, 256)            525312    
_________________________________________________________________
time_distributed_1 (TimeDist (None, 9, 4650)           1195050   
Total params: 3,002,922
Trainable params: 3,002,922
Non-trainable params: 0
_________________________________________________________________


In [29]:
# File name handed to the ModelCheckpoint callback below.
model_name = 'model_31_07.h5'

In [30]:
# Save the model to model_name only when the monitored validation loss reaches a new minimum.
checkpoint = ModelCheckpoint(model_name, monitor='val_loss', verbose=1, save_best_only=True, mode='min')


In [None]:
# NOTE(review): the test split is passed as validation data, so the checkpoint
# callback picks the "best" model using the test set; consider a separate
# validation split if an unbiased test score is needed.
model.fit(trainX, trainY, epochs=30, batch_size=64, validation_data=(testX, testY), callbacks=[checkpoint], verbose=2)


Train on 9000 samples, validate on 1000 samples
Epoch 1/30
 - 47s - loss: 3.3508 - acc: 0.6528 - val_loss: 2.5709 - val_acc: 0.6576

Epoch 00001: val_loss improved from inf to 2.57095, saving model to model_31_07.h5
Epoch 2/30
 - 46s - loss: 2.4067 - acc: 0.6688 - val_loss: 2.4617 - val_acc: 0.6669

Epoch 00002: val_loss improved from 2.57095 to 2.46174, saving model to model_31_07.h5
Epoch 3/30
 - 46s - loss: 2.3065 - acc: 0.6719 - val_loss: 2.4165 - val_acc: 0.6702

Epoch 00003: val_loss improved from 2.46174 to 2.41653, saving model to model_31_07.h5
Epoch 4/30
 - 47s - loss: 2.2511 - acc: 0.6727 - val_loss: 2.3971 - val_acc: 0.6701

Epoch 00004: val_loss improved from 2.41653 to 2.39706, saving model to model_31_07.h5
Epoch 5/30
 - 48s - loss: 2.2017 - acc: 0.6750 - val_loss: 2.3693 - val_acc: 0.6728

Epoch 00005: val_loss improved from 2.39706 to 2.36930, saving model to model_31_07.h5
Epoch 6/30
 - 48s - loss: 2.1567 - acc: 0.6781 - val_loss: 2.3473 - val_acc: 0.6769

Epoch 00006