In [1]:
from tensorflow.keras.layers import Bidirectional, Concatenate, Permute, Dot, Input, LSTM, Multiply, Softmax
from tensorflow.keras.layers import RepeatVector, Dense, Activation, Lambda
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.activations import softmax
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import load_model, Model
import tensorflow.keras.backend as K
import tensorflow as tf
import numpy as np

from faker import Faker
import random
from tqdm import tqdm
from babel.dates import format_date
from nmt_utils import *
import matplotlib.pyplot as plt
%matplotlib inline

Loading the Dataset: 
This includes 10,000 arbitarily made dates in different formats that are only human-readable; the task is to transform these dates into the farmat that is machine-readable (yyyy-mm-dd)

In [35]:
m = 10000
dataset, human_vocab, machine_vocab, inv_machine_vocab = load_dataset(m)

100%|██████████████████████████████████| 10000/10000 [00:00<00:00, 85842.78it/s]


In [36]:
#Dataset is a list of tuples. In each tuple we have the human-readable and machine-readable dates. 
dataset[0:10] #note the different formats. 

[('nov 11 1994', '1994-11-11'),
 ('tuesday july 7 2015', '2015-07-07'),
 ('06.04.82', '1982-04-06'),
 ('30 jul 2000', '2000-07-30'),
 ('19 nov 1999', '1999-11-19'),
 ('27 03 77', '1977-03-27'),
 ('23 oct 1992', '1992-10-23'),
 ('14 apr 2023', '2023-04-14'),
 ('6 august 1988', '1988-08-06'),
 ('friday november 4 1994', '1994-11-04')]

In [46]:
def preprocess_data(dataset, human_vocab, machine_vocab, Tx, Ty):
    
    X, Y = zip(*dataset) #the start command unlists the dataset into seperate tuples. The zip command then creates two tuples one with the first elements and one with the seocnd elements. 
    
    X = np.array([string_to_int(i, Tx, human_vocab) for i in X]) #this command converts each element of X into a vector of integers length 30. integers are based on the human vocab and extra padding is added at the end. 
    Y = [string_to_int(t, Ty, machine_vocab) for t in Y]
    
    Xoh = np.array(list(map(lambda x: to_categorical(x, num_classes=len(human_vocab)), X))) #one-hot vector of each X element
    Yoh = np.array(list(map(lambda x: to_categorical(x, num_classes=len(machine_vocab)), Y)))

    return X, np.array(Y), Xoh, Yoh

In [51]:
X, Y, Xoh, Yoh = preprocess_data(dataset, human_vocab, machine_vocab, Tx, Ty)

In [45]:
human_vocab

{' ': 0,
 '.': 1,
 '/': 2,
 '0': 3,
 '1': 4,
 '2': 5,
 '3': 6,
 '4': 7,
 '5': 8,
 '6': 9,
 '7': 10,
 '8': 11,
 '9': 12,
 'a': 13,
 'b': 14,
 'c': 15,
 'd': 16,
 'e': 17,
 'f': 18,
 'g': 19,
 'h': 20,
 'i': 21,
 'j': 22,
 'l': 23,
 'm': 24,
 'n': 25,
 'o': 26,
 'p': 27,
 'r': 28,
 's': 29,
 't': 30,
 'u': 31,
 'v': 32,
 'w': 33,
 'y': 34,
 '<unk>': 35,
 '<pad>': 36}

In [42]:
#
X, Y = zip(*dataset)

In [43]:
X[0]

'nov 11 1994'

In [52]:
Y[0]

array([ 2, 10, 10,  5,  0,  2,  2,  0,  2,  2])

In [47]:
print(X[:1])
type(X)

('nov 11 1994',)


tuple

In [7]:
print(Y[0:5])
type(Y)

('1992-09-30', '1970-07-22', '2015-01-22', '1986-04-22', '1990-02-12')


tuple

In [8]:
print(*dataset[0:10])

('30 sep 1992', '1992-09-30') ('22.07.70', '1970-07-22') ('1/22/15', '2015-01-22') ('tuesday april 22 1986', '1986-04-22') ('monday february 12 1990', '1990-02-12') ('tuesday july 29 1980', '1980-07-29') ('monday november 27 2000', '2000-11-27') ('30 oct 1978', '1978-10-30') ('14 oct 1976', '1976-10-14') ('sunday august 22 1993', '1993-08-22')


In [48]:
Tx = 30
X = np.array([string_to_int(i, Tx, human_vocab) for i in X])

In [10]:
X[0]

array([ 6,  3,  0, 29, 17, 27,  0,  4, 12, 12,  5, 36, 36, 36, 36, 36, 36,
       36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36])

In [11]:
human_vocab

{' ': 0,
 '.': 1,
 '/': 2,
 '0': 3,
 '1': 4,
 '2': 5,
 '3': 6,
 '4': 7,
 '5': 8,
 '6': 9,
 '7': 10,
 '8': 11,
 '9': 12,
 'a': 13,
 'b': 14,
 'c': 15,
 'd': 16,
 'e': 17,
 'f': 18,
 'g': 19,
 'h': 20,
 'i': 21,
 'j': 22,
 'l': 23,
 'm': 24,
 'n': 25,
 'o': 26,
 'p': 27,
 'r': 28,
 's': 29,
 't': 30,
 'u': 31,
 'v': 32,
 'w': 33,
 'y': 34,
 '<unk>': 35,
 '<pad>': 36}

In [50]:
Ty = 10
Y = [string_to_int(t, Ty, machine_vocab) for t in Y]

AttributeError: 'list' object has no attribute 'lower'

In [13]:
Y[0:2]

[[2, 10, 10, 3, 0, 1, 10, 0, 4, 1], [2, 10, 8, 1, 0, 1, 8, 0, 3, 3]]

In [14]:
Xoh = np.array(list(map(lambda x: to_categorical(x, num_classes=len(human_vocab)), X)))
Xoh[0][0]

array([0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0.])

In [53]:
Tx = 30
Ty = 10
X, Y, Xoh, Yoh = preprocess_data(dataset, human_vocab, machine_vocab, Tx, Ty)

print("X.shape:", X.shape)
print("Y.shape:", Y.shape)
print("Xoh.shape:", Xoh.shape)
print("Yoh.shape:", Yoh.shape)

X.shape: (10000, 30)
Y.shape: (10000, 10)
Xoh.shape: (10000, 30, 37)
Yoh.shape: (10000, 10, 11)


In [54]:
print(f"First element of X is :\n{X[0]}")
print(f"First element of Y is :\n{Y[0]}")
print(f"First one-hot vector encoding for the first element of X is: \n{Xoh[0][0]}")

First element of X is :
[25 26 32  0  4  4  0  4 12 12  7 36 36 36 36 36 36 36 36 36 36 36 36 36
 36 36 36 36 36 36]
First element of Y is :
[ 2 10 10  5  0  2  2  0  2  2]
First one-hot vector encoding for the first element of X is: 
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


Note that X is padded to ensure it's length is constant. 

Xoh and Yoh are one-hot vector versions of the X and Y lists. 

In [55]:
index = 0
print("Source date:", dataset[index][0])
print("Target date:", dataset[index][1])
print()
print("Source after preprocessing (indices):", X[index])
print("Target after preprocessing (indices):", Y[index])
print()
print("Source after preprocessing (one-hot):", Xoh[index])
print("Target after preprocessing (one-hot):", Yoh[index])


Source date: nov 11 1994
Target date: 1994-11-11

Source after preprocessing (indices): [25 26 32  0  4  4  0  4 12 12  7 36 36 36 36 36 36 36 36 36 36 36 36 36
 36 36 36 36 36 36]
Target after preprocessing (indices): [ 2 10 10  5  0  2  2  0  2  2]

Source after preprocessing (one-hot): [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 1.]]
Target after preprocessing (one-hot): [[0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]]


### One_step attention 
In this step, we define a function that takes the previous hidden state of post-attention LSTM network, $s^{(t-1)}$, as well as the hidden states of the pre-attention bidirectional LSTM $a^{(t')}$ for all t' $\in$  $T_x$ as inputs. This function then runs these two inputs through a fully connected neural network to calculate the energies. Once the energies are calculated, they are run through a Softmax layer to get the alpha attention weights. $\alpha^{t'}$ weights are then multiplied with their respective $a^{(t')}$ to give the context vector $C^{(t)}$. 

In [18]:
def NeuralAttention(a,s_prev): 
    """
    Implements one step of attention mechanism
    
    Arguments:
    a -- output of the Bi-LSTM of shape (m, Tx, 2* n_a)  #(#samples, #rows, #columns)
    s_prev -- previous hidden state of the LSTM of shape (m, n_s)
    Tx -- length of the input sequence (Global Variable)

    Returns:
    context -- context vector, input of the next LSTM cell
    """
    #Create copies of s_prev 
    s_prev = RepeatVector(Tx)(s_prev) #what about all samples together 
    
    #Concatenate s_prev and a: 
    concat = Concatenate(axis = -1)([a,s_prev])
    
    #Run through the first layer of FFN with activation tanh and with 10 neurons: 
    dense1 = Dense(10, activation = "tanh")(concat) #[m,30,10+len(s_prev]-> [m,1,30]
    
    #Run through the final layer of FFN with activation ReLU and 1 neuron: 
    energies = Dense(1,activation = "relu")(dense1)
    
    #Run through a Softmax function to find alphas: 
    alphas = Softmax(axis = 1)(energies)
    
    #Multiply the alphas with their respective a<t'>: 
    Context = Dot(axes=1)([alphas,a])
    
    return(Context,alphas)



In [66]:
#Example: 
np.random.seed(10)
tf.random.set_seed(10)
m = 10 #samples 
Tx = 30 #length of seq
n_a = 32 #neurons 
n_s = 64
a = np.random.uniform(1, 0, (m, Tx, 2 * n_a)).astype(np.float32)
s_prev =np.random.uniform(1, 0, (m, n_s)).astype(np.float32) * 1
Context, alphas= NeuralAttention(a,s_prev)

In [68]:
print(a.shape)
s_prev.shape

(10, 30, 64)


(10, 64)

In [69]:
s_prev = RepeatVector(Tx)(s_prev)
s_prev.shape

TensorShape([10, 30, 64])

In [70]:
concat = Concatenate(axis = -1)([a,s_prev])
concat.shape

TensorShape([10, 30, 128])

In [72]:
dense1 = Dense(10, activation = "tanh")(concat)
dense1.shape

TensorShape([10, 30, 10])

In [73]:
energies = Dense(1,activation = "relu")(dense1)
energies.shape

TensorShape([10, 30, 1])

In [74]:
alphas = Softmax(axis = 1)(energies)
alphas.shape #attention scores

TensorShape([10, 30, 1])

In [75]:
Context = Dot(axes=1)([alphas,a]) #alpha1 * hidden_state1 + alpha2 * hidden_state2 + ... alphaTx * hidden_stateTx
Context.shape

TensorShape([10, 1, 64])

#### Now let's get the $a^{(t')}$ values; the network will be a bi-directional LSTM. 

The pre-attention bi-directional LSTM will have 32 hidden neurons at each time step, meaning that the outcome of each hidden state at time t is 64 (since bidirectional) and since the maximum sequence length is 30, the output of the bi-directional LSTM will be a tensor of size (30,64). 

On the other hand, the context vector will represent the output of the bi-directional pre-attention encoder but with the attention scores multiplied to the output of every hidden state. So, when predicting the $t^{th}$ word in the y vector, we will have the output of the hidden states from the pre-attention bi-directional LSTM, where the outputs are weighed by the attention scores. We will also need to have the previous hidden state and cell state of the decoder LSTM. The decoder LSTM is also bi-directional and has 64 neurons in each hidden layer. Note that this means that there are 64 neurons in the hidden state and 64 neurons in cell state at every time step. 

Therefore: 
* a dim at each time step t: (None, 30, 64) --> n_a = 32
* alphas at each time step t: (None, 30, 1) --> T_x = 30 : for every layer of hidden state encoder we get one attention score. 
* context vector at time t: (None, 1, 64)  --> n_a = 32 + 32 = 64 bi-directional

#### Decoder LSTM: 

We'll have an LSTM structure with n_s = 64 neurons in each hidden state, which equals to the number of neurons in each cell state. So, in the decoder structure, we'll have an LSTM layer and then a Dense layer to make prediction using a softmax activation function. 

two questions: 
1, what does the LSTM cell output once we run the initial state and the context vector in it? 
2, what exactly does the Dense layer use? 
3, does using the Dense command mean that we are going to add another neural network to make predictions? 
4, what are the arguments of the Dense layer and 
5, why is the first argument len(machine_vocab)? 

In [22]:
n_a = 32 # number of units for the pre-attention, bi-directional LSTM's hidden state 'a'
n_s = 64 # number of units for the post-attention LSTM's hidden state "s"

post_activation_LSTM_cell = LSTM(n_s, return_state = True) 
output_layer = Dense(len(machine_vocab), activation=softmax)

In [23]:
# UNQ_C2 (UNIQUE CELL IDENTIFIER, DO NOT EDIT)
# GRADED FUNCTION: model

def modelf(Tx, Ty, n_a, n_s, human_vocab_size, machine_vocab_size):
    """
    Arguments:
    Tx -- length of the input sequence
    Ty -- length of the output sequence
    n_a -- hidden state size of the Bi-LSTM
    n_s -- hidden state size of the post-attention LSTM
    human_vocab_size -- size of the python dictionary "human_vocab"
    machine_vocab_size -- size of the python dictionary "machine_vocab"

    Returns:
    model -- Keras model instance
    """
    
    # Define the inputs of your model with a shape (Tx, human_vocab_size)
    # Define s0 (initial hidden state) and c0 (initial cell state)
    # for the decoder LSTM with shape (n_s,)
    X = Input(shape=(Tx, human_vocab_size))
    # initial hidden state
    s0 = Input(shape=(n_s,), name='s0')
    # initial cell state
    c0 = Input(shape=(n_s,), name='c0')
    # hidden state
    s = s0
    # cell state
    c = c0
    
    # Initialize empty list of outputs
    outputs = []
    
    ### START CODE HERE ###
    
    # Step 1: Define your pre-attention Bi-LSTM. (≈ 1 line)
    a = Bidirectional(LSTM(units=n_a, return_sequences=True))(X)

    
    # Step 2: Iterate for Ty steps
    for t in range(Ty):
    
        # Step 2.A: Perform one step of the attention mechanism to get back the context vector at step t (≈ 1 line)
        context = NeuralAttention(a,s)
        
        # Step 2.B: Apply the post-attention LSTM cell to the "context" vector. (≈ 1 line)
        # Don't forget to pass: initial_state = [hidden state, cell state] 
        # Remember: s = hidden state, c = cell state
        s, _, c = post_activation_LSTM_cell(context, initial_state=[s, c])
        #s, _, c = post_activation_LSTM_cell(context,initial_state=[s, c])
        
        # Step 2.C: Apply Dense layer to the hidden state output of the post-attention LSTM (≈ 1 line)
        out = output_layer(s)
        
        # Step 2.D: Append "out" to the "outputs" list (≈ 1 line)
        outputs.append(out)
    
    # Step 3: Create model instance taking three inputs and returning the list of outputs. (≈ 1 line)
    model = Model(inputs = [X,s0,c0], outputs = outputs)
    
    ### END CODE HERE ###
    
    return model

In [24]:
Tx = 30
n_a = 32
n_s = 64
len_human_vocab = 37
len_machine_vocab = 11
    
    
model = modelf(Tx, Ty, n_a, n_s, len_human_vocab, len_machine_vocab)
    
print(summary(model))


ValueError: Layer 'lstm' expected 1 input(s). Received 2 instead.

In [None]:
model = modelf(Tx, Ty, n_a, n_s, len(human_vocab), len(machine_vocab))
model.summary()

#### Now that the structure of the model is defined, we will also define what loss function to use and the learning rate: 

In [None]:
opt = Adam(0.005,beta_1 = 0.9, beta_2 = 0.999, decay = 0.01) 
model.compile(loss = "categorical_crossentropy", optimizer = opt, metrics = ["accuracy"])

In [None]:
s0 = np.zeros((m, n_s))
c0 = np.zeros((m, n_s))
outputs = list(Yoh.swapaxes(0,1))

In [None]:
model.fit([Xoh, s0, c0], outputs, epochs=1, batch_size=100)

In [None]:
outputs[0].shape

In [None]:
# GRADED FUNCTION: model

def model(Tx, Ty, n_a, n_s, human_vocab_size, machine_vocab_size):
    """
    Arguments:
    Tx -- length of the input sequence
    Ty -- length of the output sequence
    n_a -- hidden state size of the Bi-LSTM
    n_s -- hidden state size of the post-attention LSTM
    human_vocab_size -- size of the python dictionary "human_vocab"
    machine_vocab_size -- size of the python dictionary "machine_vocab"

    Returns:
    model -- Keras model instance
    """
    
    # Define the inputs of your model with a shape (Tx,)
    # Define s0 and c0, initial hidden state for the decoder LSTM of shape (n_s,)
    X = Input(shape=(Tx, human_vocab_size))
    s0 = Input(shape=(n_s,), name='s0')
    c0 = Input(shape=(n_s,), name='c0')
    s = s0
    c = c0
    
    # Initialize empty list of outputs
    outputs = []
    
    ### START CODE HERE ###
    
    # Step 1: Define your pre-attention Bi-LSTM. Remember to use return_sequences=True. (≈ 1 line)
    a = Bidirectional(LSTM(n_a, return_sequences=True))(X)
    
    # Step 2: Iterate for Ty steps
    for t in range(Ty):
    
        # Step 2.A: Perform one step of the attention mechanism to get back the context vector at step t (≈ 1 line)
        context = one_step_attention(a,s)
        
        # Step 2.B: Apply the post-attention LSTM cell to the "context" vector.
        # Don't forget to pass: initial_state = [hidden state, cell state] (≈ 1 line)
        s, _, c = post_activation_LSTM_cell(context,initial_state=[s,c])
        
        # Step 2.C: Apply Dense layer to the hidden state output of the post-attention LSTM (≈ 1 line)
        out = output_layer(s)
        
        # Step 2.D: Append "out" to the "outputs" list (≈ 1 line)
        outputs.append(out)
    
    # Step 3: Create model instance taking three inputs and returning the list of outputs. (≈ 1 line)
    model = Model(inputs = [X,s0,c0],outputs=outputs)
    
    ### END CODE HERE ###
    
    return model

In [25]:
# Defined shared layers as global variables
repeator = RepeatVector(Tx)
concatenator = Concatenate(axis=-1)
densor1 = Dense(10, activation = "tanh")
densor2 = Dense(1, activation = "relu")
activator = Activation(softmax, name='attention_weights') # We are using a custom softmax(axis = 1) loaded in this notebook
dotor = Dot(axes = 1)





In [26]:
def one_step_attention(a, s_prev):
    """
    Performs one step of attention: Outputs a context vector computed as a dot product of the attention weights
    "alphas" and the hidden states "a" of the Bi-LSTM.
    
    Arguments:
    a -- hidden state output of the Bi-LSTM, numpy-array of shape (m, Tx, 2*n_a)
    s_prev -- previous hidden state of the (post-attention) LSTM, numpy-array of shape (m, n_s)
    
    Returns:
    context -- context vector, input of the next (post-attetion) LSTM cell
    """
    
    ### START CODE HERE ###
    # Use repeator to repeat s_prev to be of shape (m, Tx, n_s) so that you can concatenate it with all hidden states "a" (≈ 1 line)
    s_prev = repeator(s_prev)
    # Use concatenator to concatenate a and s_prev on the last axis (≈ 1 line)
    concat = concatenator([a, s_prev])
    # Use densor1 to propagate concat through a small fully-connected neural network to compute the "intermediate energies" variable e. (≈1 lines)
    e = densor1(concat)
    # Use densor2 to propagate e through a small fully-connected neural network to compute the "energies" variable energies. (≈1 lines)
    energies = densor2(e)
    # Use "activator" on "energies" to compute the attention weights "alphas" (≈ 1 line)
    alphas = activator(energies)
    # Use dotor together with "alphas" and "a" to compute the context vector to be given to the next (post-attention) LSTM-cell (≈ 1 line)
    context = dotor([alphas, a])
    ### END CODE HERE ###
    
    return context


In [27]:
n_a = 32
n_s = 64
post_activation_LSTM_cell = LSTM(n_s, return_state = True)
output_layer = Dense(len(machine_vocab), activation=softmax)

In [28]:
# GRADED FUNCTION: model

def model(Tx, Ty, n_a, n_s, human_vocab_size, machine_vocab_size):
    """
    Arguments:
    Tx -- length of the input sequence
    Ty -- length of the output sequence
    n_a -- hidden state size of the Bi-LSTM
    n_s -- hidden state size of the post-attention LSTM
    human_vocab_size -- size of the python dictionary "human_vocab"
    machine_vocab_size -- size of the python dictionary "machine_vocab"

    Returns:
    model -- Keras model instance
    """
    
    # Define the inputs of your model with a shape (Tx,)
    # Define s0 and c0, initial hidden state for the decoder LSTM of shape (n_s,)
    X = Input(shape=(Tx, human_vocab_size))
    s0 = Input(shape=(n_s,), name='s0')
    c0 = Input(shape=(n_s,), name='c0')
    s = s0
    c = c0
    
    # Initialize empty list of outputs
    outputs = []
    
    ### START CODE HERE ###
    
    # Step 1: Define your pre-attention Bi-LSTM. Remember to use return_sequences=True. (≈ 1 line)
    a = Bidirectional(LSTM(n_a, return_sequences=True))(X)
    
    # Step 2: Iterate for Ty steps
    for t in range(Ty):
    
        # Step 2.A: Perform one step of the attention mechanism to get back the context vector at step t (≈ 1 line)
        context = one_step_attention(a,s)
        
        # Step 2.B: Apply the post-attention LSTM cell to the "context" vector.
        # Don't forget to pass: initial_state = [hidden state, cell state] (≈ 1 line)
        s, _, c = post_activation_LSTM_cell(context,initial_state=[s,c])
        
        # Step 2.C: Apply Dense layer to the hidden state output of the post-attention LSTM (≈ 1 line)
        out = output_layer(s)
        
        # Step 2.D: Append "out" to the "outputs" list (≈ 1 line)
        outputs.append(out)
    
    # Step 3: Create model instance taking three inputs and returning the list of outputs. (≈ 1 line)
    model = Model(inputs = [X,s0,c0],outputs=outputs)
    
    ### END CODE HERE ###
    
    return model

In [29]:
model = model(Tx, Ty, n_a, n_s, len(human_vocab), len(machine_vocab))

In [30]:
model.summary()

In [32]:
opt = model.compile(optimizer=Adam(0.005, beta_1=0.9, beta_2=0.999, decay=0.01),
                    metrics=['accuracy'],
                    loss = 'categorical_crossentropy')

In [33]:
s0 = np.zeros((m, n_s))
c0 = np.zeros((m, n_s))
outputs = list(Yoh.swapaxes(0,1))

In [34]:
model.fit([Xoh, s0, c0], outputs, epochs=1, batch_size=100)

ValueError: Data cardinality is ambiguous. Make sure all arrays contain the same number of samples.'x' sizes: 10000, 10, 10
'y' sizes: 10000, 10000, 10000, 10000, 10000, 10000, 10000, 10000, 10000, 10000
