In [1]:
"""-------------------------------------------------
Author: Rajkumar Conjeevaram Mohan
Email: rajkumarcm@yahoo.com
Program: Language Model with Attention mechanism
-------------------------------------------------"""

'-------------------------------------------------\nAuthor: Rajkumar Conjeevaram Mohan\nEmail: rajkumarcm@yahoo.com\nProgram: Language Model with Attention mechanism\n-------------------------------------------------'

In [2]:
import pdb
import pickle
import math
import os
os.environ['CUDA_VISIBLE_DEVICES']='1'
os.environ['TF_FORCE_GPU_ALLOW_GROWTH']='true'
import numpy as np
%matplotlib notebook
import matplotlib
from matplotlib import pyplot as plt
import tensorflow as tf
from tensorflow.keras import Sequential, Model
from tensorflow.keras.layers import *
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow_datasets as tfds

INFO:tensorflow:Enabling eager execution
INFO:tensorflow:Enabling v2 tensorshape
INFO:tensorflow:Enabling resource variables
INFO:tensorflow:Enabling tensor equality
INFO:tensorflow:Enabling control flow v2


In [3]:
%config Completer.use_jedi = False

### Load data

In [4]:
imdb, info = tfds.load("imdb_reviews", with_info=True, as_supervised=True)

In [5]:
train_data, test_data = imdb['train'], imdb['test']

training_sentences = []
training_labels = []

testing_sentences = []
testing_labels = []

for s, _ in train_data:
    training_sentences.append(s.numpy().decode('utf8'))
for s, _ in test_data:
    testing_sentences.append(s.numpy().decode('utf8'))

N_TR = len(training_sentences)
N_TS = len(testing_sentences)
N_VL = int(0.2 * N_TR)
validation_sentences = training_sentences[-N_VL:]
training_sentences = training_sentences[:-N_VL]
N_TR = len(training_sentences)

In [6]:
print(f'Length of training set: {N_TR}')
print(f'Length of testing set: {N_TS}')
print(f'Length of validation set: {N_VL}')

Length of training set: 20000
Length of testing set: 25000
Length of validation set: 5000


#### Limit data to subset of 1000 examples

In [7]:
# Temporary
training_sentences = training_sentences[:1000]
validation_sentences = validation_sentences[:200]
N_TR = 1000
N_VL = 200

#### Tokenizer

In [8]:
VOCAB_SIZE = 10000
tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token='<OOV>')
tokenizer.fit_on_texts(training_sentences)
index_word = tokenizer.index_word

In [9]:
tmp_length1 = []
tmp_length2 = []

for sentence in training_sentences:
    tmp_tokens = tokenizer.texts_to_sequences([sentence])[0]
    tmp_length1.append(len(tmp_tokens))

for sentence in testing_sentences:
    tmp_tokens = tokenizer.texts_to_sequences([sentence])[0]
    tmp_length2.append(len(tmp_tokens))

tmp_length1 = np.max(tmp_length1)
tmp_length2 = np.max(tmp_length2)
MAX_LEN = np.max([tmp_length1, tmp_length2])

#### Create mask

In [10]:
noise = 0.15
MASK = ['X']
from numpy.random import random
def get_masks(tokens_seq):
    masks = []
    for tokens in tokens_seq:
        previously_masked = False
        mask = []
        for token in tokens:
            if random() < noise:
                if not previously_masked:
                    mask.append(True)
                else:
                    mask.append(False)
            else:
                mask.append(False)
        masks.append(mask)
    return masks

### Data preprocessing

In [11]:
def preprocess(review):
    tokenized_review = tokenizer.texts_to_sequences([str(review)])[0]
    masks = get_masks([tokenized_review])
    tokenized_review = pad_sequences([tokenized_review], maxlen=MAX_LEN)
    tokenized_review = tokenized_review.astype(np.int16)
    masks = pad_sequences(masks, maxlen=MAX_LEN)
    masks = masks.astype(np.bool)
    return np.squeeze(tokenized_review), np.squeeze(masks)

In [12]:
def stream(data):
    for i in range(len(data)):
        review = data[i]
        tokenized_review, mask = preprocess(review)
        yield tokenized_review, mask

In [13]:
def make_dict(tokenized_review, mask):
    return {'input_key':tokenized_review, 'input_query':tokenized_review}, \
           {'mask':mask, 'target':tokenized_review}

#### Verify whether the preprocessing steps work as they should

In [14]:
train_gen = stream(training_sentences)
for review, mask in train_gen:
    tmp_review = review
    tmp_mask = mask
    print('Stream output shape-----------------------')
    print(f'train_token.shape={review.shape}, dtype={review.dtype}')
    print(f'mask.shape={mask.shape}, dtype={mask.dtype}')
    print('------------------------------------------\n')
    break

Stream output shape-----------------------
train_token.shape=(2332,), dtype=int16
mask.shape=(2332,), dtype=bool
------------------------------------------



In [15]:
# input_x, target_x = make_dict(tmp_review, tmp_mask)
# print(f'input_x["input_key"].shape={input_x["input_key"].shape}')
# print(f'input_x["input_key"].shape={input_x["input_key"].shape}')
# print(f'mask.shape={target_x["mask"].shape}')

### Now, let's create data pipeline

In [16]:
BATCH_SIZE = 1

train_gen = tf.data.Dataset.from_generator(generator=stream, 
                                           args=[training_sentences], 
                                           output_shapes=([MAX_LEN], [MAX_LEN]),
                                           output_types=(tf.int16, tf.bool)
                                          )
train_gen = train_gen.repeat().batch(BATCH_SIZE)#.map(make_dict).prefetch(3)

val_gen = tf.data.Dataset.from_generator(generator=stream,
                                         args=[validation_sentences],
                                         output_shapes=([MAX_LEN], [MAX_LEN]),
                                         output_types=(tf.int16, tf.bool))
val_gen = val_gen.repeat().batch(BATCH_SIZE)#.map(make_dict).prefetch(3)

tes_gen = tf.data.Dataset.from_generator(generator=stream,
                                         args=[testing_sentences],
                                         output_shapes=([MAX_LEN], [MAX_LEN]),
                                         output_types=(tf.int16, tf.bool))
tes_gen = val_gen.repeat().batch(BATCH_SIZE)#.map(make_dict).prefetch(3)

### Now once again let's confirm whether data generator work as they should

##### Use the following snippet when map function is enabled

In [17]:
# for review_dict, target_dict in train_gen.as_numpy_iterator():
#     print(f'review: {review_dict["input_key"].shape}')
#     print(f'mask: {target_dict["mask"].shape}')
#     break

##### Use this when map is disabled

In [18]:
for tokenized_review, mask in train_gen:
    input_x, target_x = make_dict(tokenized_review, mask)
    print('Output signature of train generator-------------')
    print(f'review: {tokenized_review.shape}')
    print(f'mask: {mask.shape}')
    print('------------------------------------------------\n')
    break

Output signature of train generator-------------
review: (1, 2332)
mask: (1, 2332)
------------------------------------------------



In [19]:
def accuracy(y_true, y_pred):
    y_pred = tf.cast(tf.argmax(y_pred, axis=-1), tf.int16)
    y_pred = tf.squeeze(y_pred)  # batch_size x max_len
    target = tf.cast(y_true['target'], tf.int16)  # batch_size x max_len
    correct = tf.cast(tf.equal(y_pred, target), dtype=tf.int16)
    mask = tf.cast(y_true['mask'], tf.int16)
    acc = tf.reduce_sum(correct * mask)/tf.reduce_sum(mask)
    return acc

In [20]:
def loss(y_true, y_pred):
    mask = y_true['mask']
    target = tf.one_hot(depth=VOCAB_SIZE+1, indices=tf.cast(y_true['target'], tf.int32))
    loss = -1 * (target * tf.math.log(y_pred + 1e-9))
    loss = tf.reduce_sum(loss, axis=-1)
    loss = tf.squeeze(loss)
    loss = loss * tf.cast(mask, dtype=tf.float32)
    loss = tf.reduce_sum(loss, axis=-1)
    loss = tf.reduce_mean(loss)
    return loss

#### Verify whether the loss work as it should

In [21]:
tmp_pred = tf.one_hot(indices=tf.cast(input_x['input_key'], dtype=tf.int32), depth=VOCAB_SIZE+1)
print(f'tmp_pred.shape={tmp_pred.shape}')
loss(target_x, tmp_pred)

tmp_pred.shape=(1, 2332, 10001)


<tf.Tensor: shape=(), dtype=float32, numpy=0.0>

#### Verify whether accuracy is working correctly

In [22]:
accuracy(target_x, tmp_pred)

<tf.Tensor: shape=(), dtype=float32, numpy=1.0>

In [23]:
def get_angles(pos, i, d_model):
    angle_rates = 1 / np.power(10000, (2 * (i//2)) / np.float32(d_model))
    return pos * angle_rates

def positional_encoding(position, d_model):
    angle_rads = get_angles(np.arange(position)[:, np.newaxis],
                          np.arange(d_model)[np.newaxis, :],
                          d_model)

    # apply sin to even indices in the array; 2i
    angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])

    # apply cos to odd indices in the array; 2i+1
    angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])

    pos_encoding = angle_rads[np.newaxis, ...]

    return tf.cast(pos_encoding, dtype=tf.float32)

In [24]:
def create_model(d_model, n_heads, dropout):

    D_MODEL = d_model

    input_key = Input(shape=[MAX_LEN], batch_size=BATCH_SIZE, dtype=tf.int16, name='input_key')
    input_query = Input(shape=[MAX_LEN], batch_size=BATCH_SIZE, dtype=tf.int16, name='input_query')

    # # Shift Right
    y = Lambda(lambda y: tf.pad(y, paddings=[[0, 0], [1, 0]]))(input_query)

    x = Embedding(input_dim=VOCAB_SIZE, output_dim=D_MODEL)(input_key)
    x += positional_encoding(MAX_LEN, D_MODEL)
    
    y = Embedding(input_dim=VOCAB_SIZE, output_dim=D_MODEL)(y)
    y += positional_encoding(MAX_LEN+1, D_MODEL)

    mask = np.tril(np.ones([1, MAX_LEN+1, MAX_LEN])) # batch_size x (length_of_query+1) x length_of_key

    output = MultiHeadAttention(num_heads=n_heads, key_dim=D_MODEL, value_dim=D_MODEL, 
                                dropout=dropout, use_bias=True)(query=y, value=x)
    
    output = Lambda(lambda x: x[:, :-1])(output)
    output = LayerNormalization(axis=-1)(output)
    output = Dense(VOCAB_SIZE+1, use_bias=True, kernel_regularizer='l2', bias_regularizer='l2')(output)  # 0 for padding, and (1-VOCAB_SIZE) for words

    output = Softmax(axis=-1)(output)
    model = Model(inputs=[input_key, input_query], outputs=output)
    tb_callback = tf.keras.callbacks.TensorBoard('logs_full')
    tb_callback.set_model(model)
#     model.summary()
    return model

In [25]:
def train(model, epochs, initial_epoch=0):
    
    train_loss_results = []
    train_accuracy_results = []

    vl_loss_results = []
    vl_accuracy_results = []
    
    optimizer = tf.keras.optimizers.RMSprop(learning_rate=1e-4)
    lr_switch = {
                  1: 1e-5, # [20 - 40)
                  2: 1e-6, # [40 - 60)
                  3: 5e-7, # [60 - 80)
                  4: 1e-7, # [80 - 100]
    }
    
    def grad(model, inputs, targets):
        with tf.GradientTape() as tape:
            y_pred = model(inputs)
            loss_value = loss(targets, y_pred)
            acc = accuracy(targets, y_pred)
        return loss_value, acc, tape.gradient(loss_value, model.trainable_variables)
    
    for epoch in range(initial_epoch, epochs):
        epoch_loss_avg = tf.keras.metrics.Mean()
        epoch_accuracy = tf.keras.metrics.Mean()
        
        vl_loss_avg = tf.keras.metrics.Mean()
        vl_accuracy = tf.keras.metrics.Mean()

        for (tk_reviews, mask), _ in zip(train_gen, range(N_TR//BATCH_SIZE)):
            input_dict, target_dict = make_dict(tk_reviews, mask)

            tr_loss, tr_acc, grads = grad(model, input_dict, target_dict)
            optimizer.apply_gradients(zip(grads, model.trainable_variables))

            epoch_loss_avg.update_state(tr_loss)
            epoch_accuracy.update_state(tr_acc)

        tr_avg_loss = epoch_loss_avg.result().numpy()
        tr_avg_acc = epoch_accuracy.result().numpy()
        epoch_loss_avg.reset_states()
        epoch_accuracy.reset_states()

        train_loss_results.append(tr_avg_loss)
        train_accuracy_results.append(tr_avg_acc)
        
        for (tk_reviews, mask), _ in zip(val_gen, range(N_VL//BATCH_SIZE)):
            input_dict, target_dict = make_dict(tk_reviews, mask)
            y_pred = model(input_dict)
            vl_loss = loss(target_dict, y_pred)
            vl_acc = accuracy(target_dict, y_pred)
            
            vl_loss_avg.update_state(vl_loss)
            vl_accuracy.update_state(vl_acc)
        
        vl_loss = vl_loss_avg.result().numpy()
        vl_acc = vl_accuracy.result().numpy()
        vl_loss_avg.reset_states()
        vl_accuracy.reset_states()
        
        vl_loss_results.append(vl_loss)    
        vl_accuracy_results.append(vl_acc)

        print("Epoch %d Loss: %.3f Accuracy: %.3f Val_Loss: %.3f Val_Accuracy: %.3f" %\
                      (epoch, tr_avg_loss, tr_avg_acc, vl_loss, vl_acc))
        
        old_lr = optimizer.learning_rate.numpy()
        if epoch%20 == 0:
            period = old_lr//20
            optimizer.learning_rate.assign(lr_switch.get(period, 1e-4))
            
    return [ np.array(train_loss_results)[..., None], \
             np.array(train_accuracy_results)[..., None], \
             np.array(vl_loss_results)[..., None], \
             np.array(vl_accuracy_results)[..., None] ]

In [26]:
EPOCHS = 50

In [26]:
performance = {}

for d_model in [128, 256, 512]:
    for n_heads in [2, 4, 6, 8]:
        for dropout in [0.1, 0.15, 0.2]:
            tmp_model = create_model(d_model=d_model, n_heads=n_heads, dropout=dropout)
            performance[f'{d_model},{n_heads},{dropout}'] = np.hstack(train(tmp_model, epochs=EPOCHS))

perf_file = open('performance_1l.pkl', 'wb')
pickle.dump(performance, perf_file)
perf_file.close()

Epoch 0 Loss: 242.644 Accuracy: 0.055 Val_Loss: 236.806 Val_Accuracy: 0.076
Epoch 1 Loss: 232.148 Accuracy: 0.055 Val_Loss: 234.053 Val_Accuracy: 0.054
Epoch 2 Loss: 231.919 Accuracy: 0.055 Val_Loss: 234.038 Val_Accuracy: 0.055
Epoch 3 Loss: 229.862 Accuracy: 0.060 Val_Loss: 232.382 Val_Accuracy: 0.064
Epoch 4 Loss: 229.429 Accuracy: 0.065 Val_Loss: 232.826 Val_Accuracy: nan
Epoch 5 Loss: 228.430 Accuracy: 0.067 Val_Loss: 229.938 Val_Accuracy: 0.070
Epoch 6 Loss: 225.704 Accuracy: nan Val_Loss: 229.592 Val_Accuracy: 0.069
Epoch 7 Loss: 226.894 Accuracy: 0.069 Val_Loss: 225.772 Val_Accuracy: 0.087
Epoch 8 Loss: 226.552 Accuracy: 0.071 Val_Loss: 231.028 Val_Accuracy: 0.086
Epoch 9 Loss: 223.213 Accuracy: 0.073 Val_Loss: 228.645 Val_Accuracy: 0.072
Epoch 10 Loss: 224.786 Accuracy: 0.072 Val_Loss: 229.725 Val_Accuracy: 0.074
Epoch 11 Loss: 224.219 Accuracy: 0.071 Val_Loss: 231.298 Val_Accuracy: 0.076
Epoch 12 Loss: 224.210 Accuracy: 0.076 Val_Loss: 231.896 Val_Accuracy: 0.077
Epoch 13 Loss

#### Plot the performance

In [29]:
perf_file = open('performance_1l.pkl', 'rb')
performance = pickle.load(perf_file)
perf_file.close()

keys = list(performance.keys())
n_keys = len(keys)
c = np.random.rand(n_keys) * 3
norm = matplotlib.colors.Normalize(vmin=0, vmax=3)
cmap = plt.cm.rainbow

import matplotlib.pyplot as plt

fig, axes = plt.subplots(2, 1, figsize=(10, 15))
axes[0].set_title('Loss')
axes[0].set_xlabel('Epochs')
axes[0].set_ylabel('CrossEntropy')
axes[1].set_title('Accuracy')
axes[1].set_xlabel('Epochs')
axes[1].set_ylabel('Accuracy')
# axes[0].plot(range(len(train_loss_results)), train_loss_results, '-r', label='training')

for key, i in zip(keys, range(n_keys)):
    tmp_loss_acc = performance[key]
    axes[0].plot(range(EPOCHS), tmp_loss_acc[:, 0], c=cmap(norm(c[i])), label=key)
    axes[1].plot(range(EPOCHS), tmp_loss_acc[:, 1], c=cmap(norm(c[i])), label=key)

axes[0].grid(True)
# axes[0].legend()
axes[1].grid(True)
# axes[1].legend()
# axes[1].plot(range(len(train_accuracy_results)), train_accuracy_results, '-r', label='training')
plt.show()

<IPython.core.display.Javascript object>

In [63]:
# Moving average
# for key in  keys:
#     if (performance[key][:, 0] == np.nan).any() or (performance[key][:, 1] == np.nan).any():
#         print('nan found')
print(performance['128,4,0.15'][:, -1])
print(performance['128,4,0.15'][:, -1][2])
print(type(performance['128,4,0.15'][:, -1][2]))
any([math.isnan(e) for e in performance['128,4,0.15'][:, -1]])

[0.05450804 0.05023003        nan 0.06703164 0.07115673 0.07462306
 0.06734305 0.07148367 0.07062805 0.07736326 0.07495247 0.07420566
 0.08355295 0.07579081 0.0793337  0.08671837 0.08895482 0.09128039
 0.08546905 0.08844045 0.0947853  0.08530635 0.09331235 0.09881746
 0.08861834 0.0896976  0.10228613 0.09431574 0.10484477 0.09653278
 0.09739635        nan 0.10056282 0.09750094 0.10021588 0.10182509
 0.10233739 0.1021123  0.1084828         nan 0.10244526 0.10589474
 0.10546809 0.10965379 0.10780047 0.10513072 0.1098827  0.1099591
        nan 0.11084422]
nan
<class 'numpy.float32'>


True

#### Nothing is easily traceable at this point, so we shall do a moving average to smoothen the plot

In [30]:
def filterout_nans(x):
    b_indices = [math.isnan(e) is False for e in x]
    return x[b_indices]

perf_file = open('performance_1l.pkl', 'rb')
performance = pickle.load(perf_file)
perf_file.close()

w_size = 10
for key in keys:
    for i in range(EPOCHS-w_size):
        for j in range(4):
            filtered = filterout_nans(performance[key][:, j][i:i+w_size])
            performance[key][:, j][i] = np.mean(filtered)
    

In [37]:
def update_annot(line, ind):
    x,y = line.get_data()
    annot.xy = (0, 224)
    # print(line.get_label())
    text = f"{line.get_label()}"
    annot.set_text(text)
    annot.get_bbox_patch().set_alpha(0.4)

def hover(event):
    print('hover activated')
    vis = annot.get_visible()
    if event.inaxes == axes[0]:
        line = None
        for i in range(len(lines)):
            cont, ind = lines[i].contains(event)
            if cont:
                line = lines[i]
                break
        if cont:
            update_annot(line, ind)
            annot.set_visible(True)
            fig.canvas.draw_idle()
        else:
            if vis:
                annot.set_visible(False)
                fig.canvas.draw_idle()

In [38]:
fig, axes = plt.subplots(2, 1, figsize=(10, 15))
axes[0].set_title('Loss')
axes[0].set_xlabel('Epochs')
axes[0].set_ylabel('CrossEntropy')
axes[1].set_title('Accuracy')
axes[1].set_xlabel('Epochs')
axes[1].set_ylabel('Accuracy')
# axes[0].plot(range(len(train_loss_results)), train_loss_results, '-r', label='training')
lines = []

for key, i in zip(keys, range(n_keys)):
    tmp_loss_acc = performance[key]
    line, = axes[0].plot(range(EPOCHS-w_size), tmp_loss_acc[:, 2][:-w_size], c=cmap(norm(c[i])), label=key)
    lines.append(line)
    axes[1].plot(range(EPOCHS-w_size), tmp_loss_acc[:, 3][:-w_size], c=cmap(norm(c[i])), label=key)

for i in range(2):
    annot = axes[i].annotate("", xy=(0,0), xytext=(-20,20), textcoords="offset points",
                        bbox=dict(boxstyle="round", fc="w"),
                        arrowprops=dict(arrowstyle="->"))
    annot.set_visible(False)

axes[0].grid(True)
# axes[0].legend()
axes[1].grid(True)
fig.canvas.mpl_connect("motion_notify_event", hover)
# axes[1].legend()
# axes[1].plot(range(len(train_accuracy_results)), train_accuracy_results, '-r', label='training')
plt.show()

<IPython.core.display.Javascript object>

In [29]:
# Best parameter found from the above plot using hover method on Pycharm... (256, 4, 0.1)

def create_model_2l(d_model, n_heads, dropout):

    D_MODEL = d_model

    input_key = Input(shape=[MAX_LEN], batch_size=BATCH_SIZE, dtype=tf.int16, name='input_key')
    input_query = Input(shape=[MAX_LEN], batch_size=BATCH_SIZE, dtype=tf.int16, name='input_query')

    # # Shift Right
    y = Lambda(lambda y: tf.pad(y, paddings=[[0, 0], [1, 0]]))(input_query)

    x = Embedding(input_dim=VOCAB_SIZE, output_dim=D_MODEL)(input_key)
    x += positional_encoding(MAX_LEN, D_MODEL)
    x = MultiHeadAttention(num_heads=n_heads, key_dim=D_MODEL, value_dim=D_MODEL, 
                           dropout=dropout, use_bias=True, kernel_regularizer='l2', 
                           bias_regularizer='l2')(query=x, value=x)
    
    y = Embedding(input_dim=VOCAB_SIZE, output_dim=D_MODEL)(y)
    y += positional_encoding(MAX_LEN+1, D_MODEL)
    y = MultiHeadAttention(num_heads=n_heads, key_dim=D_MODEL, value_dim=D_MODEL, 
                           dropout=dropout, use_bias=True, kernel_regularizer='l2', 
                           bias_regularizer='l2')(query=y, value=y)

    mask = np.tril(np.ones([1, MAX_LEN+1, MAX_LEN])) # batch_size x (length_of_query+1) x length_of_key

    output = MultiHeadAttention(num_heads=n_heads, key_dim=D_MODEL, value_dim=D_MODEL, 
                                dropout=dropout, use_bias=True, kernel_regularizer='l2', 
                                bias_regularizer='l2')(query=y, value=x)
    output = Lambda(lambda x: x[:, :-1])(output)
    output = LayerNormalization(axis=-1)(output)
    output = Dense(VOCAB_SIZE+1, use_bias=True, kernel_regularizer='l2', bias_regularizer='l2')(output)  # 0 for padding, and (1-VOCAB_SIZE) for words

    output = Softmax(axis=-1)(output)
    model = Model(inputs=[input_key, input_query], outputs=output)
    tb_callback = tf.keras.callbacks.TensorBoard('logs_full')
    tb_callback.set_model(model)
#     model.summary()
    return model

In [30]:
train_loss_results = []
train_accuracy_results = []

vl_loss_results = []
vl_accuracy_results = []

best_model_key = max(performance, key=performance.get)
d_model, n_heads, dropout = best_model_key.split(',')
d_model = int(d_model)
n_heads = int(n_heads)
dropout = float(dropout)

model3 = create_model_2l(128)
train(model=model3, epochs=10)

fig, axes = plt.subplots(1, 2, figsize=(15, 6))
axes[0].set_title('Loss')
axes[0].set_xlabel('Epochs')
axes[0].set_ylabel('CrossEntropy')
axes[0].plot(range(len(train_loss_results)), train_loss_results, '-r', label='training')
axes[0].plot(range(len(vl_loss_results)), vl_loss_results, '-b', label='validation')
axes[0].grid(True)
axes[0].legend()
axes[1].set_title('Accuracy')
axes[1].set_xlabel('Epochs')
axes[1].set_ylabel('Accuracy')
axes[1].plot(range(len(train_accuracy_results)), train_accuracy_results, '-r', label='training')
axes[1].plot(range(len(vl_accuracy_results)), vl_accuracy_results, '-b', label='validation')
axes[1].grid(True)
axes[1].legend()
plt.show()

Epoch 0 Loss: 237.938 Accuracy: 0.055 Val_Loss: 237.454 Val_Accuracy: 0.055
Epoch 1 Loss: 231.840 Accuracy: 0.055 Val_Loss: 232.943 Val_Accuracy: 0.055
Epoch 2 Loss: 230.886 Accuracy: 0.055 Val_Loss: 231.977 Val_Accuracy: 0.055


KeyboardInterrupt: 