### Read from HDF5 files

In [1]:
%matplotlib notebook
import sys, os
module_path = os.path.abspath(os.path.join('../..'))
if module_path not in sys.path:
    sys.path.append(module_path)
from cookiebox_forYoussef.src.h5todatasets import *
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from tensorflow.keras import layers, utils
import glob

# Glob pattern selecting the HDF5 files to load.
FPATH = '/home/ynashed/workspace/data/cookiebox/*.h5'
# glob.glob returns matches in arbitrary (filesystem) order; sorting keeps the
# sample order stable so the seeded train/test split below is reproducible.
data_files = sorted(glob.glob(FPATH))
if not data_files:
    # Fail early with a clear message instead of a cryptic encoder error later.
    raise FileNotFoundError('No HDF5 files match ' + FPATH)

X = []  # one 'hist' dataset per image group
Y = []  # one list of phase ids per image group
for fname in data_files:
    # The context manager closes the file even if a read or phase2id raises.
    with h5py.File(fname, 'r') as f:
        # Keep only the groups whose name starts with "img<digits>".
        imkeys = [key for key in f.keys() if re.match(r'^img\d+', key)]
        for imkey in imkeys:
            group = f[imkey]
            attrs = group.attrs
            carrier = attrs['carrier']
            ephases = attrs['ephases']
            # [()] reads the whole dataset into memory.
            h = group['hist'][()]
            # One id per pulse, computed from the carrier-shifted phase.
            phases = [phase2id(carrier + ephases[i]) for i in range(attrs['npulses'])]
            X.append(h)
            Y.append(phases)

X = np.array(X)
# Classification target: the number of pulses in each image (length of its
# phase list), shaped (n_samples, 1) as OneHotEncoder expects 2-D input.
targets = np.array([len(phase_ids) for phase_ids in Y])[..., np.newaxis]
# NOTE(review): `sparse` was renamed `sparse_output` in scikit-learn 1.2; the
# original keyword is kept for the (older) environment this notebook targets.
y = OneHotEncoder(sparse=False).fit_transform(targets)
print(X.shape, y.shape)

# Hold out 20% of the samples for the final evaluation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=32)
X_train = X_train.astype(np.float32)
X_test = X_test.astype(np.float32)

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


(20000, 2048, 16) (20000, 4)


### Visualize an example

In [2]:
# Draw one training sample at random and show it as an image.
dindex = np.random.choice(np.arange(len(X_train)))
data = X_train[dindex]

print(data.dtype)

# A single axes; aspect='auto' stretches the image to fill it instead of
# forcing square pixels.
fig, ax = plt.subplots()
img = ax.imshow(data, aspect='auto')
fig.colorbar(img, ax=ax)

# One-hot label of the sample shown above.
print(y_train[dindex])

float32


<IPython.core.display.Javascript object>

[0. 1. 0. 0.]


### Let's try a baseline stacked LSTM model

In [5]:
from tensorflow.keras import Model, Input
from tensorflow.keras.layers import Bidirectional, Dense, CuDNNLSTM
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard

# Problem dimensions, read from the training arrays.
nclasses = y_train.shape[1]
nenergies = X_train.shape[1]
nangles = X_train.shape[2]
# Only needed if the (currently disabled) Embedding layer below is re-enabled.
nvocab = X_train.max()

# Fixed-size float input of shape (nenergies, nangles); the recurrent layers
# treat axis 1 (nenergies) as the sequence axis and axis 2 as the features.
inputs = Input(shape=(nenergies, nangles))
x = inputs
# Optional embedding of integer inputs into 3-dimensional vectors (disabled).
# x = layers.Embedding(nvocab, 3, input_length=nenergies)(x)
# Two stacked CuDNNLSTM layers. They are NOT wrapped in Bidirectional, so the
# model is unidirectional; the first returns the full sequence for the second.
x = CuDNNLSTM(16, return_sequences=True)(x)
x = CuDNNLSTM(16)(x)
# Softmax classifier over the one-hot classes.
# x = Dense(128, activation="relu")(x)
outputs = Dense(nclasses, activation="softmax")(x)
model = Model(inputs, outputs)
model.summary()

# Stop when val_loss has not improved for 10 epochs; log to ./logs for TensorBoard.
my_callbacks = [EarlyStopping(monitor='val_loss', min_delta=0,
                              patience=10, verbose=0, mode='auto',
                              baseline=None, restore_best_weights=True),
                TensorBoard(log_dir='./logs')]
model.compile("adam", "categorical_crossentropy", metrics=["accuracy"])
history = model.fit(X_train, y_train, batch_size=128, epochs=100,
                    callbacks=my_callbacks, validation_split=0.1)
# list all data in history
print(history.history.keys())

# Older Keras records the metric as 'acc'/'val_acc', newer versions as
# 'accuracy'/'val_accuracy'; pick whichever key this run produced.
acc_key = 'acc' if 'acc' in history.history else 'accuracy'

# summarize history for accuracy
fig_acc, ax_acc = plt.subplots()
ax_acc.plot(history.history[acc_key])
ax_acc.plot(history.history['val_' + acc_key])
ax_acc.set_title('model accuracy')
ax_acc.set_ylabel('accuracy')
ax_acc.set_xlabel('epoch')
ax_acc.legend(['train', 'validation'], loc='upper left')

# summarize history for loss
fig_loss, ax_loss = plt.subplots()
ax_loss.plot(history.history['loss'])
ax_loss.plot(history.history['val_loss'])
ax_loss.set_title('model loss')
ax_loss.set_ylabel('loss')
ax_loss.set_xlabel('epoch')
ax_loss.legend(['train', 'validation'], loc='upper left')

# NOTE(review): depending on the Keras version, restore_best_weights may only
# take effect when early stopping actually triggers; if all epochs run, this
# file may hold the last-epoch weights rather than the best ones -- confirm.
model.save('best_lstm.h5')

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 2048, 16)]        0         
_________________________________________________________________
cu_dnnlstm_2 (CuDNNLSTM)     (None, 2048, 16)          2176      
_________________________________________________________________
cu_dnnlstm_3 (CuDNNLSTM)     (None, 16)                2176      
_________________________________________________________________
dense_1 (Dense)              (None, 4)                 68        
Total params: 4,420
Trainable params: 4,420
Non-trainable params: 0
_________________________________________________________________
Train on 14400 samples, validate on 1600 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 1

Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100
dict_keys(['loss', 'acc', 'val_loss', 'val_acc'])


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

### Test the model on unseen data

In [6]:
from tensorflow.keras.models import load_model

# Reload the model saved by the training cell, so the evaluation uses what is
# on disk rather than whatever is left in the kernel.
model = load_model(filepath='best_lstm.h5')
# Bare last expression: the notebook displays the returned values for the
# held-out test set (loss followed by the compiled metrics).
model.evaluate(x=X_test, y=y_test)



[0.00838570858235471, 0.998]