## Download the data from a URL and save it

In [1]:
# download dataset
def get_data(link):
    """Download a zip archive, unpack it into ``data/`` and return the data file's name.

    The archive is streamed to a temporary ``temp.zip`` in the working directory,
    extracted into the ``data`` directory, and the temporary zip plus the bundled
    ``_about.txt`` (if present) are removed afterwards.

    Parameters
    ----------
    link : str
        URL of the zip archive to download.

    Returns
    -------
    str
        Name of the last archive member that is not ``_about.txt`` (relative to
        the ``data`` directory).

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    ValueError
        If the archive contains no member other than ``_about.txt``.
    """
    from zipfile import ZipFile
    import requests
    import os

    output = 'temp.zip'
    about_name = '_about.txt'
    filename = None

    try:
        # Stream the body so the whole archive never has to sit in memory. The
        # context managers guarantee the connection and the file are closed even
        # if the download fails half-way.
        with requests.get(link, stream=True, timeout=30) as response:
            # Fail early on 4xx/5xx instead of saving an error page as a "zip".
            response.raise_for_status()
            with open(output, "wb") as handle:
                for chunk in response.iter_content(chunk_size=512):
                    if chunk:
                        handle.write(chunk)

        with ZipFile(output, "r") as zip_file:
            for name in zip_file.namelist():
                if name != about_name:
                    filename = name
            zip_file.extractall("data")
    finally:
        # Never leave the temporary archive behind, even on failure.
        if os.path.exists(output):
            os.remove(output)

    # The licence/readme file is not needed; tolerate archives that lack it.
    about_path = os.path.join('data', about_name)
    if os.path.exists(about_path):
        os.remove(about_path)

    if filename is None:
        raise ValueError('archive at %r contains no data file' % (link,))
    return filename

In [2]:
# Download the Spanish-English sentence pairs. get_data returns the extracted file's
# name, which is prefixed with the extraction directory to form a usable path.
path = 'data/' + get_data('http://www.manythings.org/anki/spa-eng.zip')

In [3]:
path

'data/spa.txt'

## Clean data

In [4]:
def clean_lines(line):
    """Normalise one sentence into a list of lowercase, ASCII-only, alphabetic words.

    Accented characters are reduced to their base letter, punctuation is removed,
    and any token that is not purely alphabetic afterwards (numbers, empty
    strings, mixed alphanumerics) is dropped.

    Parameters
    ----------
    line : str
        Raw sentence.

    Returns
    -------
    list of str
        Cleaned words in their original order.
    """
    import string
    from unicodedata import normalize

    # Translation table that deletes every ASCII punctuation character.
    strip_punct = {ord(symbol): None for symbol in string.punctuation}

    # NFD splits accented letters into base letter + combining mark; encoding to
    # ASCII with 'ignore' then discards the marks and any other non-ASCII symbol.
    ascii_text = normalize('NFD', line).encode('ascii', 'ignore').decode('UTF-8')

    words = []
    for raw_word in ascii_text.split():
        word = raw_word.lower().translate(strip_punct)
        if word.isalpha():
            words.append(word)
    return words

In [5]:
def clean_data(filepath):
    """Load a tab-separated parallel corpus, clean it and drop unwanted rows.

    Every field of every line is normalised with ``clean_lines`` and re-joined
    with single spaces. A row is kept only if all of the following hold:

    * its first field differs from the first field of the last kept row and its
      second field differs from the second field of the last kept row;
    * neither of the first two fields is empty or a single character;
    * the first field does not contain ``' tom'`` or ``'tom '``.

    Rows with fewer than two fields are skipped.

    Parameters
    ----------
    filepath : str
        Path to the text file (one tab-separated sentence pair per line).

    Returns
    -------
    numpy.ndarray
        Array of the kept, cleaned rows.
    """
    import numpy as np

    # Read as UTF-8 explicitly: the corpus contains accented characters and the
    # platform default encoding is not UTF-8 everywhere. `with` closes the file
    # even if reading fails.
    with open(filepath, mode='rt', encoding='utf-8') as file:
        text = file.read()

    lines = text.strip().split('\n')
    pairs = [line.split('\t') for line in lines]

    all_cleaned = [
        [' '.join(clean_lines(sentence)) for sentence in pair]
        for pair in pairs
    ]

    no_duplicates = []
    previous_eng = ''
    previous_targ = ''

    for line in all_cleaned:
        # Malformed rows without a source/target pair cannot be used.
        if len(line) < 2:
            continue

        eng, targ = line[0], line[1]

        is_new = (eng != previous_eng) and (targ != previous_targ)
        long_enough = len(eng) > 1 and len(targ) > 1
        # ' tom ' is already covered by the ' tom' check, so two tests suffice.
        mentions_tom = (' tom' in eng) or ('tom ' in eng)

        if is_new and long_enough and not mentions_tom:
            no_duplicates.append(line)
            previous_eng = eng
            previous_targ = targ

    return np.array(no_duplicates)

In [6]:
# NOTE(review): this rebinds the name `clean_data` from the function to the array it
# returns, so re-running this cell on the same kernel tries to call an ndarray and
# fails. A distinct variable name would avoid that, but later cells read
# `clean_data` as the array.
clean_data = clean_data(path)

In [45]:
clean_data[100:110]

array([['im hurt bad', 'estoy gravemente herido'],
       ['try to focus', 'intenta centrarte'],
       ['this is not true', 'esto no es verdad'],
       ['i was not drunk', 'no estaba borracho'],
       ['toms awake', 'tom esta despierto'],
       ['come up here', 'sube aqui'],
       ['now its my turn', 'ahora es mi turno'],
       ['hes a bad liar', 'el es un mal mentiroso'],
       ['tell the truth', 'deci la verdad'],
       ['how unfortunate', 'que desafortunado']], dtype='<U275')

In [8]:
clean_data.shape

(83627, 2)

## Split into train and test sets

In [9]:
def split_data(dataset, model_name, lines_number=50000, train_size=45000, seed=None):
    """Take the first rows of ``dataset``, shuffle them and split into train/test.

    The input array is left untouched: the selected rows are copied before being
    shuffled. The directory ``data/<model_name>`` is created if it does not exist
    so later cells can store artifacts there.

    Parameters
    ----------
    dataset : numpy.ndarray
        2-D array of sentence pairs (one pair per row).
    model_name : str
        Name of the sub-directory of ``data/`` reserved for this model.
    lines_number : int, optional
        Number of leading rows of ``dataset`` to use (default 50000).
    train_size : int, optional
        Number of shuffled rows assigned to the training set; the remainder
        forms the test set (default 45000).
    seed : int or None, optional
        Seed for a reproducible shuffle. ``None`` (default) uses NumPy's global
        random state.

    Returns
    -------
    tuple of numpy.ndarray
        ``(new_set, train, test)`` where ``new_set`` is the shuffled subset and
        ``train`` / ``test`` are consecutive slices of it.
    """
    import numpy as np
    import os

    # Copy the slice: shuffling a basic-slice view would reorder the caller's
    # array in place and make the split depend on how often the cell was run.
    new_set = np.array(dataset[:lines_number, :])

    rng = np.random if seed is None else np.random.RandomState(seed)
    rng.shuffle(new_set)
    train, test = new_set[:train_size], new_set[train_size:]

    directory = ('data/' + model_name)
    os.makedirs(directory, exist_ok=True)

    return new_set, train, test

In [69]:
# IMPORTANT: choose a unique model name for every run.
# All artifacts of a run (datasets, tokenizers, encoded arrays, checkpoints) are
# written under data/<new_model_name>/, so reusing a name overwrites the files of
# the previous run.
new_model_name = 'model_50K'

In [70]:
# `dataset` is the shuffled subset used for fitting the tokenizers; `train` and
# `test` are the two parts of that subset.
# NOTE(review): no random seed is set in the visible cells, so the split differs
# between runs.
dataset, train, test = split_data(clean_data, new_model_name)

In [94]:
# Save the raw (text) train and test splits so the model can be evaluated later
# on exactly the same sentences.
# NOTE(review): in the visible notebook `joblib` is only imported at module level in
# a later cell ("Tokenize data"); on a fresh top-to-bottom run this cell would
# raise NameError unless that import is moved up.
joblib.dump(train, ('data/' + new_model_name + '/train.pkl'))
joblib.dump(test, ('data/' + new_model_name + '/test.pkl'))

['data/model_01081_32_128/test.pkl']

In [72]:
len(train)

9000

In [73]:
test[500:510]

array([['shes turning red', 'ella se sonroja'],
       ['i liked it', 'me gusto'],
       ['i was impressed', 'estaba impresionada'],
       ['youve done it', 'lo has hecho'],
       ['my father is in', 'mi padre esta en casa'],
       ['i was about to go', 'estaba por irme'],
       ['what a phony', 'que farsante'],
       ['that helped', 'eso ayudo'],
       ['thats not my job', 'ese no es mi trabajo'],
       ['he has a point', 'en eso tiene razon']], dtype='<U275')

## Tokenize data

In [14]:
from keras.preprocessing.text import Tokenizer
from sklearn.externals import joblib

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [51]:
# Fit one word-level tokenizer per language on the whole subset (column 0 is
# English, column 1 is the target language).
eng_tokenizer, targ_tokenizer = Tokenizer(), Tokenizer()
eng_tokenizer.fit_on_texts(dataset[:, 0])
targ_tokenizer.fit_on_texts(dataset[:, 1])

# Longest sentence (in words) per language; used later as the padding length.
eng_length = max(map(lambda sentence: len(sentence.split()), dataset[:, 0]))
targ_length = max(map(lambda sentence: len(sentence.split()), dataset[:, 1]))

# +1 because index 0 is never assigned to a word and is used for padding.
eng_vocab_size = 1 + len(eng_tokenizer.word_counts)
targ_vocab_size = 1 + len(targ_tokenizer.word_counts)

print(f'a number of unique words in english dataset - {len(eng_tokenizer.word_counts)}')
print(f'max sentence size english dataset - {eng_length}')
print(f'a number of unique words in target dataset - {len(targ_tokenizer.word_counts)}')
print(f'max sentence size target dataset - {targ_length}')

a number of unique words in english dataset - 2962
max sentence size english dataset - 5
a number of unique words in target dataset - 4661
max sentence size target dataset - 9


In [95]:
# Save both tokenizers and the maximum sentence lengths: they are needed to encode
# new input and to decode predictions with exactly the same word indices and
# padding that were used for training. (The vocabulary sizes are not saved here.)
joblib.dump(eng_tokenizer, ('data/' + new_model_name + '/eng_tokenizer.pkl'))
joblib.dump(eng_length, ('data/' + new_model_name + '/eng_length.pkl'))
joblib.dump(targ_tokenizer, ('data/' + new_model_name + '/targ_tokenizer.pkl'))
joblib.dump(targ_length, ('data/' + new_model_name + '/targ_length.pkl'))

['data/model_01081_32_128/targ_length.pkl']

### Encoding

In [17]:
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

In [76]:
# Replace every English training sentence by the list of its word indices.
trainX = eng_tokenizer.texts_to_sequences(train[:,0])

In [77]:
trainX[:10]

[[1, 385, 128],
 [8, 83, 3, 129],
 [32, 4, 2860],
 [11, 4, 526],
 [6, 28, 296],
 [48, 98, 152],
 [11, 58, 1, 41],
 [22, 1046],
 [90, 4, 737],
 [32, 13, 250]]

In [78]:
# Right-pad every index sequence with zeros up to the longest English sentence so
# the training input becomes one rectangular integer array.
trainX = pad_sequences(trainX, maxlen=eng_length, padding='post')

In [79]:
trainX

array([[   1,  385,  128,    0,    0],
       [   8,   83,    3,  129,    0],
       [  32,    4, 2860,    0,    0],
       ...,
       [  13,  561, 1395,    0,    0],
       [   1,   25,  203,    0,    0],
       [   1,   27,    8, 2115,    0]], dtype=int32)

In [80]:
# Persist the padded English training input next to the other artifacts of this model.
joblib.dump(trainX, ('data/' + new_model_name + '/trainX.pkl'))

['data/model_01081_32_128/trainX.pkl']

In [81]:
# Target side of the training set: word indices, zero-padded on the right up to
# the longest target sentence.
trainY = pad_sequences(
    targ_tokenizer.texts_to_sequences(train[:, 1]),
    maxlen=targ_length,
    padding='post',
)

In [82]:
trainY[:5]

array([[ 922,  103,    0,    0,    0,    0,    0,    0,    0],
       [   2,  155,    6,  129,    0,    0,    0,    0,    0],
       [  33,    5, 4469,    0,    0,    0,    0,    0,    0],
       [  66,    1,    5,  431,    0,    0,    0,    0,    0],
       [  19,   24,  358,    0,    0,    0,    0,    0,    0]],
      dtype=int32)

In [83]:
# One-hot encode every target token over the target vocabulary, matching the
# categorical cross-entropy loss the model is compiled with.
# NOTE(review): this overwrites `trainY` with an array of a different shape, so the
# cell must not be re-run without first re-running the padding cell above. The
# result has one float per (sentence, timestep, vocabulary entry); for large
# splits integer targets with a sparse loss may be needed - confirm memory budget.
trainY = to_categorical(trainY, num_classes=targ_vocab_size)

In [84]:
trainY[6]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [85]:
# Encode the English test sentences exactly like the training input:
# word indices, zero-padded on the right to the longest English sentence.
testX = pad_sequences(
    eng_tokenizer.texts_to_sequences(test[:, 0]),
    maxlen=eng_length,
    padding='post',
)
testX

array([[  59,  239,  249,    0,    0],
       [   1,   41,  124,  368,    0],
       [  52,    3,  192,    0,    0],
       ...,
       [ 108,   21,   13,  642,    0],
       [  13, 1497,   14,  126,    0],
       [2335,   53,    0,    0,    0]], dtype=int32)

In [86]:
# Persist the padded English test input next to the other artifacts of this model.
joblib.dump(testX, ('data/' + new_model_name + '/testX.pkl'))

['data/model_01081_32_128/testX.pkl']

In [87]:
# Encode the target test sentences exactly like the training targets:
# word indices -> right zero-padding -> one-hot over the target vocabulary.
testY = to_categorical(
    pad_sequences(
        targ_tokenizer.texts_to_sequences(test[:, 1]),
        maxlen=targ_length,
        padding='post',
    ),
    num_classes=targ_vocab_size,
)
testY[6]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.]], dtype=float32)

## Define a model

In [28]:
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import RepeatVector
from keras.layers import TimeDistributed
from keras.callbacks import ModelCheckpoint
from keras.layers import Bidirectional

In [96]:
# Encoder-decoder translation network:
#   * Embedding + LSTM encode the padded English sentence into one 256-d vector
#     (mask_zero=True makes the encoder skip the zero padding);
#   * RepeatVector feeds that vector to the decoder once per target timestep;
#   * a bidirectional LSTM decoder emits one state per timestep;
#   * a per-timestep softmax picks a word from the target vocabulary.
model = Sequential([
    Embedding(eng_vocab_size, 256, input_length=eng_length, mask_zero=True),
    LSTM(256),
    RepeatVector(targ_length),
    Bidirectional(LSTM(256, return_sequences=True)),
    TimeDistributed(Dense(targ_vocab_size, activation='softmax')),
])

In [97]:
# Categorical cross-entropy matches the one-hot encoded targets.
# NOTE(review): the targets are zero-padded, so the per-timestep accuracy presumably
# also counts padding positions - interpret `acc` with care (TODO confirm).
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [98]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 5, 256)            758528    
_________________________________________________________________
lstm_5 (LSTM)                (None, 256)               525312    
_________________________________________________________________
repeat_vector_3 (RepeatVecto (None, 9, 256)            0         
_________________________________________________________________
bidirectional_3 (Bidirection (None, 9, 512)            1050624   
_________________________________________________________________
time_distributed_3 (TimeDist (None, 9, 4662)           2391606   
Total params: 4,726,070
Trainable params: 4,726,070
Non-trainable params: 0
_________________________________________________________________


In [91]:
# Checkpoint file for this run: data/<new_model_name>/<new_model_name>.h5
model_name = 'data/' + new_model_name + '/' + new_model_name + '.h5'

In [92]:
# Write the model to `model_name` only when the validation loss reaches a new
# minimum, so the file always holds the best epoch seen so far.
checkpoint = ModelCheckpoint(model_name, monitor='val_loss', verbose=1, save_best_only=True, mode='min')


In [93]:
# Train for up to 30 epochs; the checkpoint callback saves the best epoch.
# NOTE(review): the test split doubles as the validation set that drives checkpoint
# selection, so scores later computed on `test` will be optimistically biased;
# a separate validation split would avoid that.
model.fit(trainX, trainY, epochs=30, batch_size=64, validation_data=(testX, testY), callbacks=[checkpoint], verbose=2)


Train on 9000 samples, validate on 1000 samples
Epoch 1/30
 - 47s - loss: 2.9965 - acc: 0.6593 - val_loss: 2.4787 - val_acc: 0.6586

Epoch 00001: val_loss improved from inf to 2.47871, saving model to data/model_01081_32_128/model_01081_32_128.h5
Epoch 2/30
 - 48s - loss: 2.3515 - acc: 0.6700 - val_loss: 2.4437 - val_acc: 0.6636

Epoch 00002: val_loss improved from 2.47871 to 2.44374, saving model to data/model_01081_32_128/model_01081_32_128.h5
Epoch 3/30
 - 48s - loss: 2.2819 - acc: 0.6737 - val_loss: 2.4211 - val_acc: 0.6689

Epoch 00003: val_loss improved from 2.44374 to 2.42113, saving model to data/model_01081_32_128/model_01081_32_128.h5
Epoch 4/30
 - 47s - loss: 2.1971 - acc: 0.6785 - val_loss: 2.3582 - val_acc: 0.6738

Epoch 00004: val_loss improved from 2.42113 to 2.35819, saving model to data/model_01081_32_128/model_01081_32_128.h5
Epoch 5/30
 - 48s - loss: 2.0891 - acc: 0.6868 - val_loss: 2.2992 - val_acc: 0.6829

Epoch 00005: val_loss improved from 2.35819 to 2.29917, sav

KeyboardInterrupt: 