## Get data from URL and save

In [1]:
# download dataset
def get_data(link):
    
    from zipfile import ZipFile
    import requests
    import os
    
    output = 'temp.zip'

    response = requests.get(link, stream=True)
    handle = open(output, "wb")
    for chunk in response.iter_content(chunk_size=512):
        if chunk:
            handle.write(chunk)
    handle.close()


    with ZipFile(output,"r") as zip_file:
        for name in zip_file.namelist():
            if name != '_about.txt':
                filename = name
        zip_file.extractall("data")
    
    os.remove('temp.zip')
    os.remove('data/_about.txt')
    return filename

In [2]:
path = 'data/' + get_data('http://www.manythings.org/anki/spa-eng.zip')

In [3]:
path

'data/spa.txt'

## Clean data

In [4]:
def clean_lines(line):
    
    from unicodedata import normalize
    import string
    
    remove_punct_map = dict.fromkeys(map(ord, string.punctuation)) # thank you Reed!
    
    line = normalize('NFD', line).encode('ascii', 'ignore')
    line = line.decode('UTF-8')
    line = line.split()
    line = [word.lower() for word in line]
#     line = [word.translate(remove_punct_map) for word in line]
    line = [word for word in line if word.isalpha()]
    
    return(line)

In [5]:
def clean_data(filepath):
    
    import numpy as np
    
    import re
    
    file = open(filepath, mode='rt')
    text = file.read()
    file.close()
    
    lines = text.strip().split('\n')
    pairs = [line.split('\t') for line in  lines]
    
    all_cleaned = []
    

    
    for pair in pairs:
        cleaned_pair = []
        for sentence in pair:
            clean_sentence = clean_lines(sentence)
            cleaned_pair.append(' '.join(clean_sentence))
            
        all_cleaned.append(cleaned_pair)
    
    no_duplicates = []
    previous_eng = ''
    previous_targ = ''
    
    for line in all_cleaned:
        if (line[0] != previous_eng) and (line[1] != previous_targ) and (len(line[1]) != 1) and (len(line[0]) != 1) and (line[0] != '') and (line[1] != '') and (' tom' not in line[0]) and ('tom ' not in line[0]) and (' tom ' not in line[0]):
            no_duplicates.append(line)
            previous_eng = line[0]
            previous_targ = line[1]
          
        
    return np.array(no_duplicates)

In [6]:
clean_data = clean_data(path)

In [7]:
clean_data[0:10]

array([['go', 've ahora'],
       ['got', 'lo'],
       ['he', 'el'],
       ['hop', 'metete'],
       ['no', 'no puede'],
       ['we', 'lo'],
       ['why', 'por que'],
       ['ask', 'preguntale a'],
       ['be', 'mantente en'],
       ['get', 'agarra a']], dtype='<U249')

In [8]:
clean_data.shape

(80158, 2)

## Split in test and train

In [9]:
def split_data(dataset):
    import numpy as np
    from sklearn.externals import joblib
    
    lines_number = 10000
    new_set = dataset[:lines_number, :]
    
    np.random.shuffle(new_set)
    train, test = new_set[:9000], new_set[9000:]
    
    

    joblib.dump(new_set, 'data/dataset.pkl')
    joblib.dump(train, 'data/train.pkl')
    joblib.dump(test, 'data/test.pkl')
    
    return new_set, train, test

In [10]:
dataset, train, test = split_data(clean_data)

In [11]:
len(train)

9000

In [12]:
train[0:5]

array([['i know all about', 'lo se todo sobre el'],
       ['boston is', 'boston es'],
       ['somebody is', 'alguien esta'],
       ['who was', 'quien era'],
       ['i cannot pray', 'no puedo rezar']], dtype='<U249')

## Tokenize data

In [13]:
from keras.preprocessing.text import Tokenizer

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [14]:
eng_tokenizer = Tokenizer()
eng_tokenizer.fit_on_texts(dataset[:,0])
eng_length = max(len(line.split()) for line in dataset[:,0])
eng_vocab_size = len(eng_tokenizer.word_counts) + 1
targ_tokenizer = Tokenizer()
targ_tokenizer.fit_on_texts(dataset[:,1])
targ_length = max(len(line.split()) for line in dataset[:,1])
targ_vocab_size = len(targ_tokenizer.word_counts) + 1
print(f'a number of unique words in english dataset - {len(eng_tokenizer.word_counts)}')
print(f'max sentence size english dataset - {eng_length}')
print(f'a number of unique words in target dataset - {len(targ_tokenizer.word_counts)}')
print(f'max sentence size target dataset - {targ_length}')

a number of unique words in english dataset - 1740
max sentence size english dataset - 6
a number of unique words in target dataset - 2712
max sentence size target dataset - 10


### Encoding

In [15]:
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

In [16]:
trainX = eng_tokenizer.texts_to_sequences(train[:,0])

In [17]:
trainX[:10]

[[1, 29, 43, 90],
 [271, 2],
 [292, 2],
 [33, 12],
 [1, 228, 541],
 [34, 15, 18],
 [5, 2, 319, 3],
 [2, 6, 542],
 [127],
 [10, 248, 3]]

In [18]:
trainX = pad_sequences(trainX, maxlen=eng_length, padding='post')

In [19]:
trainX

array([[  1,  29,  43,  90,   0,   0],
       [271,   2,   0,   0,   0,   0],
       [292,   2,   0,   0,   0,   0],
       ...,
       [ 24,  59, 177,   0,   0,   0],
       [  1, 154,   0,   0,   0,   0],
       [ 51,   0,   0,   0,   0,   0]], dtype=int32)

In [20]:
trainY = targ_tokenizer.texts_to_sequences(train[:,1])
trainY = pad_sequences(trainY, maxlen=targ_length, padding='post')

In [21]:
trainY[:5]

array([[  10,   12,   63,  256,    1,    0,    0,    0,    0,    0],
       [ 426,    2,    0,    0,    0,    0,    0,    0,    0,    0],
       [ 133,   11,    0,    0,    0,    0,    0,    0,    0,    0],
       [  27,   46,    0,    0,    0,    0,    0,    0,    0,    0],
       [   3,   21, 1249,    0,    0,    0,    0,    0,    0,    0]],
      dtype=int32)

In [22]:
trainY = to_categorical(trainY, num_classes=targ_vocab_size)

In [23]:
trainY[6]

array([[0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [24]:
testX = eng_tokenizer.texts_to_sequences(test[:,0])
testX = pad_sequences(testX, maxlen=eng_length, padding='post')
testX

array([[  14,    0,    0,    0,    0,    0],
       [ 226,    0,    0,    0,    0,    0],
       [   6,    0,    0,    0,    0,    0],
       ...,
       [   1,   25,  304,    0,    0,    0],
       [   1,   12, 1740,   20,    0,    0],
       [  23,  305,    0,    0,    0,    0]], dtype=int32)

In [25]:
testY = targ_tokenizer.texts_to_sequences(test[:,1])
testY = pad_sequences(testY, maxlen=targ_length, padding='post')
testY = to_categorical(testY, num_classes=targ_vocab_size)
testY[6]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.]], dtype=float32)

## Define model

In [26]:
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import RepeatVector
from keras.layers import TimeDistributed
from keras.callbacks import ModelCheckpoint

In [27]:
model = Sequential()
model.add(Embedding(eng_vocab_size, 256, input_length=eng_length, mask_zero=True))
model.add(LSTM(256))
model.add(RepeatVector(targ_length))
model.add(LSTM(256, return_sequences=True))
model.add(TimeDistributed(Dense(targ_vocab_size, activation='softmax')))

In [28]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [29]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 6, 256)            445696    
_________________________________________________________________
lstm_1 (LSTM)                (None, 256)               525312    
_________________________________________________________________
repeat_vector_1 (RepeatVecto (None, 10, 256)           0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 10, 256)           525312    
_________________________________________________________________
time_distributed_1 (TimeDist (None, 10, 2713)          697241    
Total params: 2,193,561
Trainable params: 2,193,561
Non-trainable params: 0
_________________________________________________________________


In [30]:
model_name = 'model_31_07_2.h5'

In [31]:
checkpoint = ModelCheckpoint(model_name, monitor='val_loss', verbose=1, save_best_only=True, mode='min')


In [32]:
model.fit(trainX, trainY, epochs=30, batch_size=64, validation_data=(testX, testY), callbacks=[checkpoint], verbose=2)


Train on 9000 samples, validate on 1000 samples
Epoch 1/30
 - 34s - loss: 2.4525 - acc: 0.7431 - val_loss: 1.7279 - val_acc: 0.7470

Epoch 00001: val_loss improved from inf to 1.72792, saving model to model_31_07_2.h5
Epoch 2/30
 - 38s - loss: 1.6187 - acc: 0.7553 - val_loss: 1.6323 - val_acc: 0.7541

Epoch 00002: val_loss improved from 1.72792 to 1.63233, saving model to model_31_07_2.h5
Epoch 3/30
 - 40s - loss: 1.5591 - acc: 0.7577 - val_loss: 1.6147 - val_acc: 0.7586

Epoch 00003: val_loss improved from 1.63233 to 1.61473, saving model to model_31_07_2.h5
Epoch 4/30
 - 37s - loss: 1.5258 - acc: 0.7602 - val_loss: 1.5974 - val_acc: 0.7613

Epoch 00004: val_loss improved from 1.61473 to 1.59744, saving model to model_31_07_2.h5
Epoch 5/30
 - 43s - loss: 1.4952 - acc: 0.7617 - val_loss: 1.5807 - val_acc: 0.7628

Epoch 00005: val_loss improved from 1.59744 to 1.58067, saving model to model_31_07_2.h5
Epoch 6/30
 - 43s - loss: 1.4594 - acc: 0.7637 - val_loss: 1.5440 - val_acc: 0.7653

E

<keras.callbacks.History at 0x109fcc7f0>