## Idea original modificada:
http://philipperemy.github.io/keras-stateful-lstm/

Interestante:

https://machinelearningmastery.com/use-different-batch-sizes-training-predicting-python-keras/

## Generamos una secuencia artificial para ejemplificar el problema
- Cada muestra es una secuencia binaria de 20 bits. 
- Para X: El primer bit es es 1 con una probabilidad de 0.5, el resto son todos ceros
- Para y: La etiqueta es 1 si el primer bit es 1

In [6]:
import numpy as np
N_samples = 1200
N_bits = 20
from numpy.random import choice
one_indexes = choice(a=N_samples, size=int(N_samples / 2), replace=False)
X = np.zeros((N_samples, N_bits))
X[one_indexes, 0] = 1  # very long term memory.
y = X[:,0]

In [7]:
N_train = 1000
X_train = X[:N_train]
y_train = y[:N_train]
X_validation = X[N_train:]
y_validation = y[N_train:]

In [8]:
idx = 1
print('Entrada: ',X_train[idx])
print('Salida:', y_train[idx])

Entrada:  [ 1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.]
Salida: 1.0


In [9]:
print('Training')
print(X_train.shape)
print(y_train.shape)
print('Validation')
print(X_validation.shape)
print(y_validation.shape)

Training
(1000, 20)
(1000,)
Validation
(200, 20)
(200,)


In [10]:
def prepare_sequences(x_train, y_train, window_length):
    windows = []
    windows_y = []
    for i, sequence in enumerate(x_train):
        len_seq = len(sequence)
        for window_start in range(0, len_seq - window_length + 1):
            window_end = window_start + window_length
            window = sequence[window_start:window_end]
            windows.append(window)
            windows_y.append(y_train[i])
    return np.array(windows), np.array(windows_y)

## Que pasa si partimos la secuencia para entrenar?

### Probar:
- window_length = 10, 15, 20
- batch_size = 1, 50, 1000

In [22]:
window_length = 20
X_train_split_1, y_train_split_1 = prepare_sequences(X_train, y_train, window_length)
X_validation_split_1, y_validation_split_1 = prepare_sequences(X_validation, y_validation, window_length)

In [23]:
# Empieza el nuevo vector en (N_bits - window_length + 1)
next_vect_delta = N_bits - window_length + 1
print(X_train_split_1[next_vect_delta])
# Los y's son todos unos si el primer valor del vector fue 1
print(y_train_split_1[next_vect_delta])
print(y_train_split_1[next_vect_delta+1])
print(X_train_split_1.shape)

[ 1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.]
1.0
1.0
(1000, 20)


In [25]:
from keras.layers import SimpleRNN, Dense, LSTM
from keras.models import Sequential
batch_size = 50

model = Sequential()
model.add(LSTM(10, input_shape=(window_length, 1), return_sequences=False, stateful=False))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X_train_split_1.reshape(-1,window_length,1), y_train_split_1, batch_size=batch_size, epochs=15,
          validation_data=(X_validation_split_1.reshape(-1,window_length,1), y_validation_split_1), shuffle=False)
score, acc = model.evaluate(X_validation_split_1.reshape(-1,window_length,1), y_validation_split_1, batch_size=1, verbose=1)
print(score, acc)

Train on 1000 samples, validate on 200 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


- Como cambia la velocidad con los distintos tamaños de batch?
- Estudiar tiempos de convergencia
- Converge para todos los casos de longitud de la secuencia?

## Usando Batch_size de 1, stateful y window_lenght = 1

In [26]:
window_length = 1
X_train_split_2, y_train_split_2 = prepare_sequences(X_train, y_train, window_length)
X_validation_split_2, y_validation_split_2 = prepare_sequences(X_validation, y_validation, window_length)

In [32]:
batch_size = 1

model = Sequential()
model.add(LSTM(10, batch_input_shape=(1, 1, 1), return_sequences=False, stateful=True))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])


model.fit(X_train_split_2.reshape(-1,window_length,1), y_train_split_2, batch_size=batch_size, epochs=1,
          validation_data=(X_validation_split_2.reshape(-1,window_length,1), y_validation_split_2), shuffle=False)
score, acc = model.evaluate(X_validation_split_2.reshape(-1,window_length,1), y_validation_split_2, batch_size=1, verbose=1)
print(score, acc)

Train on 20000 samples, validate on 4000 samples
Epoch 1/1
0.667631969534 0.70575


Que pasa en este caso? Por que no llega a mejorar la accuracy?

## Reseteo despues de cada secuencia

In [31]:
# N_bits es 20, la longitud de la secuencia
i = 0
X_validation_split_2[i*N_bits:(i+1)*N_bits]

array([[ 1.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.]])

In [33]:
batch_size = 1

model = Sequential()
model.add(LSTM(10, batch_input_shape=(1, 1, 1), return_sequences=False, stateful=True))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

for i in range(X_train_split_2.shape[0]):
    print(i)
    model.fit(X_train_split_2[i*N_bits:(i+1)*N_bits].reshape(-1,window_length,1), 
              y_train_split_2[i*N_bits:(i+1)*N_bits], batch_size=batch_size, epochs=1, shuffle=False, verbose=1)
    model.reset_states()
score, acc = model.evaluate(X_validation_split_2.reshape(-1,window_length,1), y_validation_split_2, batch_size=1, verbose=1)
print(score, acc)

0
Epoch 1/1
1
Epoch 1/1
2
Epoch 1/1
3
Epoch 1/1
4
Epoch 1/1
5
Epoch 1/1
6
Epoch 1/1
7
Epoch 1/1
8
Epoch 1/1
9
Epoch 1/1
10
Epoch 1/1
11
Epoch 1/1
12
Epoch 1/1
13
Epoch 1/1
14
Epoch 1/1
15
Epoch 1/1
16
Epoch 1/1
17
Epoch 1/1
18
Epoch 1/1
19
Epoch 1/1
20
Epoch 1/1
21
Epoch 1/1
22
Epoch 1/1
23
Epoch 1/1
24
Epoch 1/1
25
Epoch 1/1
26
Epoch 1/1
27
Epoch 1/1
28
Epoch 1/1
29
Epoch 1/1
30
Epoch 1/1
31
Epoch 1/1
32
Epoch 1/1
33
Epoch 1/1
34
Epoch 1/1
35
Epoch 1/1
36
Epoch 1/1
37
Epoch 1/1
38
Epoch 1/1
39
Epoch 1/1
40
Epoch 1/1
41
Epoch 1/1
42
Epoch 1/1
43
Epoch 1/1
44
Epoch 1/1
45
Epoch 1/1
46
Epoch 1/1
47
Epoch 1/1
48
Epoch 1/1
49
Epoch 1/1
50
Epoch 1/1
51
Epoch 1/1
52
Epoch 1/1
53
Epoch 1/1
54
Epoch 1/1
55
Epoch 1/1
56
Epoch 1/1
57
Epoch 1/1
58
Epoch 1/1
59
Epoch 1/1
60
Epoch 1/1
61
Epoch 1/1
62
Epoch 1/1
63
Epoch 1/1
64
Epoch 1/1
65
Epoch 1/1
66
Epoch 1/1
67
Epoch 1/1
68
Epoch 1/1
69
Epoch 1/1
70
Epoch 1/1
71
Epoch 1/1
72
Epoch 1/1
73
Epoch 1/1
74
Epoch 1/1
75
Epoch 1/1
76
Epoch 1/1
77
Epoch 

AttributeError: 'ProgbarLogger' object has no attribute 'log_values'

## Batch size = 1000 y reshape del dataset

In [34]:
batch_size = 1000
X_train_split, y_train_split = prepare_sequences(X_train, y_train, 1)
N = len(X_train_split)
frac = N/batch_size
print(N, frac)
X_train_split_reshaped = X_train_split.reshape(batch_size,int(frac)).T.reshape(N,1,1)
y_train_split_reshaped = y_train_split.reshape(batch_size,int(frac)).T.reshape(N)
print(X_train_split.shape)
print(X_train_split_reshaped.shape)

20000 20.0
(20000, 1)
(20000, 1, 1)


In [35]:
from keras.layers import SimpleRNN, Dense, LSTM
from keras.models import Sequential
model = Sequential()
model.add(LSTM(10, batch_input_shape=(batch_size, 1, 1), return_sequences=False, stateful=True))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [36]:
for i in range(N_bits):
    model.fit(X_train_split_reshaped, y_train_split_reshaped, epochs=1, batch_size=batch_size, verbose=1, shuffle=False)
    model.reset_states()

Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1


## Para poder evaluar tengo que recargar el modelo con batch_size=200 para que sea eficiente

In [37]:
X_val_split, y_val_split = prepare_sequences(X_validation, y_validation, 1)
X_val_split_reshaped = X_val_split.reshape(200,20).T.reshape(4000,1,1)
y_val_split_reshaped = y_val_split.reshape(200,20).T.reshape(4000)

In [38]:
new_model = Sequential()
new_model.add(LSTM(10, batch_input_shape=(200, 1, 1), return_sequences=False, stateful=True))
new_model.add(Dense(1, activation='sigmoid'))
new_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [39]:
old_weights = model.get_weights()
new_model.set_weights(old_weights)

In [40]:
new_model.evaluate(X_val_split_reshaped, y_val_split_reshaped, batch_size=200, verbose=1)

 200/4000 [>.............................] - ETA: 10s

[0.10059057124890387, 1.0]

# Batch size = 100

Hay que tomar de a 100 del total de 1000, hacer un reshape a (100,20) y luego trasponer

In [56]:
X_train_split, y_train_split = prepare_sequences(X_train, y_train, 1)

In [57]:
# Asi es como veria los datos el modelo de training
X_train_split.reshape(1000,20)[:100].T #.flatten()

array([[ 1.,  1.,  1., ...,  1.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [58]:
X_train_split.shape

(20000, 1)

In [59]:
X_train_split_reshaped = np.zeros((X_train_split.shape[0]))

In [60]:
batch_size = 100
model = Sequential()
model.add(LSTM(10, batch_input_shape=(batch_size, 1, 1), return_sequences=False, stateful=True))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [61]:
n_batches = int(len(X_train_split_reshaped)/batch_size)
print(n_batches)
for k in range(20):
    i = k%10
    X_to_train = (X_train_split.reshape(1000,20)[i*batch_size:(i+1)*batch_size].T.flatten()).reshape(2000,1,1)
    y_to_train = y_train_split.reshape(1000,20)[i*batch_size:(i+1)*batch_size].T.flatten()
    model.fit(X_to_train, y_to_train, epochs=1, batch_size=batch_size, verbose=1, shuffle=False)
    model.reset_states()

200
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1


In [63]:
X_val_split, y_val_split = prepare_sequences(X_validation, y_validation, 1)
X_val_split_reshaped = X_val_split.reshape(200,20).T.reshape(4000,1,1)
y_val_split_reshaped = y_val_split.reshape(200,20).T.reshape(4000)

new_model = Sequential()
new_model.add(LSTM(10, batch_input_shape=(200, 1, 1), return_sequences=False, stateful=True))
new_model.add(Dense(1, activation='sigmoid'))
new_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

old_weights = model.get_weights()
new_model.set_weights(old_weights)

new_model.evaluate(X_val_split_reshaped, y_val_split_reshaped, batch_size=200, verbose=1)

 200/4000 [>.............................] - ETA: 12s

[0.096351571194827554, 1.0]