## Get data from URL and save

In [1]:
# download dataset
def get_data(link):
    """Download a zip archive, extract it into ``data/`` and return the data file name.

    The archive is streamed to a temporary ``temp.zip`` in the working
    directory, extracted into ``data/`` and then deleted. The ``_about.txt``
    member, if the archive has one, is removed from ``data/`` after extraction.

    Parameters
    ----------
    link : str
        URL of the zip archive.

    Returns
    -------
    str
        Name of the last archive member that is not ``_about.txt``
        (relative to ``data/``).

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    ValueError
        If the archive contains no member other than ``_about.txt``.
    """
    from zipfile import ZipFile
    import requests
    import os

    output = 'temp.zip'
    target_dir = 'data'
    about_name = '_about.txt'

    try:
        # Stream the body to disk; fail early on HTTP errors so an error page
        # is never written out and mistaken for a zip archive.
        with requests.get(link, stream=True, timeout=60) as response:
            response.raise_for_status()
            with open(output, "wb") as handle:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        handle.write(chunk)

        with ZipFile(output, "r") as zip_file:
            filename = None
            for name in zip_file.namelist():
                if name != about_name:
                    filename = name
            if filename is None:
                raise ValueError(f'no data file found in archive downloaded from {link}')
            zip_file.extractall(target_dir)
    finally:
        # Remove the temporary archive even when download or extraction failed.
        if os.path.exists(output):
            os.remove(output)

    # The about file is optional; only delete it when it was actually extracted.
    about_path = os.path.join(target_dir, about_name)
    if os.path.exists(about_path):
        os.remove(about_path)
    return filename

In [2]:
# Download and extract the archive; `path` points at the extracted file under data/.
path = 'data/' + get_data('http://www.manythings.org/anki/spa-eng.zip')

In [3]:
path

'data/spa.txt'

## Clean data

In [10]:
def clean_lines(line):
    """Normalize one sentence and return its cleaned words as a list.

    Steps: decompose accented characters (NFD) and drop everything that is
    not ASCII, split on whitespace, lower-case each word, delete ASCII
    punctuation, and keep only the words that are purely alphabetic.
    """
    from unicodedata import normalize
    import string

    # Translation table mapping every ASCII punctuation code point to None (delete).
    strip_punct = {ord(symbol): None for symbol in string.punctuation}

    ascii_text = normalize('NFD', line).encode('ascii', 'ignore').decode('UTF-8')

    words = []
    for raw_word in ascii_text.split():
        word = raw_word.lower().translate(strip_punct)
        # Drops tokens that became empty or that contain digits.
        if word.isalpha():
            words.append(word)

    return words

In [11]:
def clean_data(filepath):
    """Read a tab-separated sentence file and return the cleaned sentences.

    Every line of the file is split on tabs and each field is normalized
    with ``clean_lines`` and re-joined with single spaces. A row is kept
    only if its first field differs from the first field of the previously
    kept row and does not contain the substrings ``' tom'`` or ``'tom '``.

    Parameters
    ----------
    filepath : str
        Path of the text file to read (decoded as UTF-8).

    Returns
    -------
    numpy.ndarray
        Array built from the kept rows, one row per line of the file.
    """
    import numpy as np

    # Explicit encoding: the platform default is not guaranteed to be UTF-8,
    # and the context manager closes the file even if reading fails.
    with open(filepath, mode='rt', encoding='utf-8') as file:
        text = file.read()

    lines = text.strip().split('\n')

    all_cleaned = [
        [' '.join(clean_lines(sentence)) for sentence in line.split('\t')]
        for line in lines
    ]

    no_duplicates = []
    previous_line = ''

    for line in all_cleaned:
        first = line[0]
        # NOTE(review): this is a substring test, so it also drops sentences
        # containing words such as "tomorrow", and keeps a sentence that is
        # exactly "tom". Left unchanged to preserve the resulting dataset.
        if (first != previous_line) and (' tom' not in first) and ('tom ' not in first):
            no_duplicates.append(line)
            previous_line = first

    return np.array(no_duplicates)

In [12]:
# NOTE(review): this rebinds the name `clean_data` from the function to the
# array it returns, so re-running this cell without first re-running the
# definition cell fails (an array is not callable). Consider a distinct
# name for the result.
clean_data = clean_data(path)

In [13]:
clean_data[1600:1610]

array([['i like chess', 'me gusta el ajedrez'],
       ['i like fruit', 'me gusta la fruta'],
       ['i like honey', 'me gusta la miel'],
       ['i like opera', 'me gusta la opera'],
       ['i like pizza', 'me gusta la pizza'],
       ['i like sushi', 'me gusta el sushi'],
       ['i like these', 'me gusta eso'],
       ['i like women', 'me gustan las mujeres'],
       ['i live alone', 'yo vivo solo'],
       ['i love books', 'me encantan los libros']], dtype='<U275')

In [14]:
clean_data.shape

(83783, 2)

## Split into train and test sets

In [18]:
def split_data(dataset, lines_number=10000, train_size=9000, seed=None, save_dir='data'):
    """Shuffle the first rows of ``dataset`` and split them into train and test.

    Parameters
    ----------
    dataset : numpy.ndarray
        2-D array; only its first ``lines_number`` rows are used.
    lines_number : int, optional
        Number of leading rows to keep (default 10000).
    train_size : int, optional
        Number of shuffled rows that go to the training part; the remaining
        rows form the test part (default 9000).
    seed : int or None, optional
        Seed for a dedicated random generator. ``None`` (default) shuffles
        with NumPy's global random state.
    save_dir : str or None, optional
        Directory that receives ``dataset.pkl``, ``train.pkl`` and
        ``test.pkl`` (default ``'data'``). ``None`` skips saving.

    Returns
    -------
    tuple of numpy.ndarray
        ``(new_set, train, test)``: the shuffled subset and its two slices.
    """
    import numpy as np

    # Copy the slice: shuffling a view would reorder the caller's array in place.
    new_set = dataset[:lines_number, :].copy()

    if seed is None:
        np.random.shuffle(new_set)
    else:
        np.random.RandomState(seed).shuffle(new_set)

    train, test = new_set[:train_size], new_set[train_size:]

    if save_dir is not None:
        import os
        # `sklearn.externals.joblib` no longer exists in current scikit-learn;
        # prefer the standalone package and fall back for old installations.
        try:
            import joblib
        except ImportError:
            from sklearn.externals import joblib

        os.makedirs(save_dir, exist_ok=True)
        joblib.dump(new_set, os.path.join(save_dir, 'dataset.pkl'))
        joblib.dump(train, os.path.join(save_dir, 'train.pkl'))
        joblib.dump(test, os.path.join(save_dir, 'test.pkl'))

    return new_set, train, test

In [19]:
# `dataset` is the shuffled subset; `train` and `test` are its two slices.
dataset, train, test = split_data(clean_data)

In [45]:
len(train)

9000

## Tokenize data

In [27]:
from keras.preprocessing.text import Tokenizer

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [49]:
def _fit_tokenizer(sentences):
    """Fit a Tokenizer on `sentences`.

    Returns the tokenizer, the word count of the longest sentence
    (whitespace-separated) and the vocabulary size (distinct words + 1).
    """
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(sentences)
    longest = max(len(sentence.split()) for sentence in sentences)
    vocab_size = len(tokenizer.word_counts) + 1
    return tokenizer, longest, vocab_size


# Column 0 holds the English sentences, column 1 the target-language ones.
eng_tokenizer, eng_length, eng_vocab_size = _fit_tokenizer(dataset[:,0])
targ_tokenizer, targ_length, targ_vocab_size = _fit_tokenizer(dataset[:,1])

print(f'a number of unique words in english dataset - {len(eng_tokenizer.word_counts)}')
print(f'max sentence size english dataset - {eng_length}')
print(f'a number of unique words in target dataset - {len(targ_tokenizer.word_counts)}')
print(f'max sentence size target dataset - {targ_length}')

a number of unique words in english dataset - 2957
max sentence size english dataset - 5
a number of unique words in target dataset - 4649
max sentence size target dataset - 9


### Encoding

In [65]:
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

In [68]:
# Convert each English training sentence into a list of integer word indices.
trainX = eng_tokenizer.texts_to_sequences(train[:,0])

In [70]:
trainX[:10]

[[104, 1612],
 [26, 1131],
 [672, 270, 211],
 [6, 1132],
 [107, 402],
 [1, 403, 1613],
 [90, 1, 23, 69, 43],
 [1, 62, 54, 4, 404],
 [11, 66, 168],
 [18, 100, 228]]

In [66]:
# Pad every sequence at the end (padding='post') up to eng_length entries.
# NOTE(review): rebinds `trainX`, so this cell expects the unpadded lists as input.
trainX = pad_sequences(trainX, maxlen=eng_length, padding='post')

In [67]:
trainX

array([[ 104, 1612,    0,    0,    0],
       [  26, 1131,    0,    0,    0],
       [ 672,  270,  211,    0,    0],
       ...,
       [ 182,    5,  187,    0,    0],
       [   1,  837,   17,  468,    0],
       [ 103,   37,    1,   43,    0]], dtype=int32)

In [109]:
# Target-side training sentences: integer word indices, padded at the end to targ_length.
trainY = targ_tokenizer.texts_to_sequences(train[:,1])
trainY = pad_sequences(trainY, maxlen=targ_length, padding='post')

In [110]:
trainY[:5]

array([[   2,  123, 2191,    0,    0,    0,    0,    0,    0],
       [  40, 2192,    0,    0,    0,    0,    0,    0,    0],
       [1046, 1047, 1048,    0,    0,    0,    0,    0,    0],
       [  19, 1424,    0,    0,    0,    0,    0,    0,    0],
       [   1,  400,    0,    0,    0,    0,    0,    0,    0]],
      dtype=int32)

In [100]:
# Expand every word index into a vector of targ_vocab_size classes.
# NOTE(review): rebinds `trainY`; this cell expects the padded integer array as input.
trainY = to_categorical(trainY, num_classes=targ_vocab_size)

In [123]:
trainY[6]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [114]:
# Encode the English test sentences the same way as trainX:
# integer word indices, padded at the end to eng_length.
testX = eng_tokenizer.texts_to_sequences(test[:,0])
testX = pad_sequences(testX, maxlen=eng_length, padding='post')
testX

array([[   7,   51,  127,    5,    0],
       [  74, 1331,    0,    0,    0],
       [   3,    7, 2803,    0,    0],
       ...,
       [ 105,   62,  104,    0,    0],
       [   7,  249,  475,    0,    0],
       [  11,    4,  751,    0,    0]], dtype=int32)

In [122]:
# Encode the target-side test sentences: integer word indices, padded at the
# end to targ_length, then expanded to vectors of targ_vocab_size classes.
testY = targ_tokenizer.texts_to_sequences(test[:,1])
testY = pad_sequences(testY, maxlen=targ_length, padding='post')
testY = to_categorical(testY, num_classes=targ_vocab_size)
testY[6]

array([[0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.]], dtype=float32)

## Define model

In [125]:
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import RepeatVector
from keras.layers import TimeDistributed
from keras.callbacks import ModelCheckpoint

In [126]:
# Layer stack: embed the English word indices (index 0 is masked), encode the
# sentence with an LSTM, repeat the encoding once per target position, decode
# with a second LSTM and predict a softmax over the target vocabulary per position.
model = Sequential([
    Embedding(eng_vocab_size, 256, input_length=eng_length, mask_zero=True),
    LSTM(256),
    RepeatVector(targ_length),
    LSTM(256, return_sequences=True),
    TimeDistributed(Dense(targ_vocab_size, activation='softmax')),
])

In [127]:
# Categorical cross-entropy matches the to_categorical-encoded targets (trainY / testY).
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [128]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 5, 256)            757248    
_________________________________________________________________
lstm_1 (LSTM)                (None, 256)               525312    
_________________________________________________________________
repeat_vector_1 (RepeatVecto (None, 9, 256)            0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 9, 256)            525312    
_________________________________________________________________
time_distributed_1 (TimeDist (None, 9, 4650)           1195050   
Total params: 3,002,922
Trainable params: 3,002,922
Non-trainable params: 0
_________________________________________________________________


In [129]:
# File name for the checkpointed model.
# NOTE(review): the training cell below saves to 'model1.h5' instead, so this name is not used there.
modelname = 'model_31_07.h5'

In [None]:
# Save the model to `modelname` only when the monitored val_loss reaches a new minimum.
# NOTE(review): the next cell rebinds `checkpoint`, so this instance is unused when both cells run.
checkpoint = ModelCheckpoint(modelname, monitor='val_loss', verbose=1, save_best_only=True, mode='min')


In [None]:
# NOTE(review): `checkpoint` is redefined here with a different file name than
# `modelname` above; the best model (lowest val_loss) is written to 'model1.h5'.
filename = 'model1.h5'
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
# The test split is passed as validation data, so checkpoint selection is
# driven by the same rows that are held out as `test`.
model.fit(trainX, trainY, epochs=20, batch_size=64, validation_data=(testX, testY), callbacks=[checkpoint], verbose=2)
