# This notebook is a tutorial on NetDebugger
Author: Rishi Gurnani, Georgia Institute of Technology<br />
Creation Date: July 21, 2021 4:54 PM EST

# Import
Some python packages are needed to run this notebook. We import all of those below.

In [6]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
from torch import tensor, cuda, manual_seed, zeros, nn, optim
from torch import float as torch_float
from torch_geometric.data import DataLoader
from torch import device as torch_device
import random
from sklearn.model_selection import train_test_split
from torch_geometric.data import Data

In [7]:
from nndebugger import constants, loss, dl_debug
from nndebugger import torch_utils as utils

# TODO For Rishi before publishing notebook:

1. Remove all 'importlib' statements
1. Consider using `trainer` function for all tests in `dl_debug`
1. Clean up arguments passed into DebugSession
1. Run all cells and verify that the outputs are what you expected
1. Try using all the polymers for "chart dependencies" instead of a small sample
1. Change FFNet to MyNet
1. Change overfit small batch to an R2 requirement
1. Delete this cell

# Fix random seeds to ensure reproducible results

In [8]:
random.seed(constants.RANDOM_SEED)
manual_seed(constants.RANDOM_SEED)
np.random.seed(constants.RANDOM_SEED)

# Load data set 

In [9]:
data_df = pd.read_csv('data/export.csv',index_col=0)
data_df.head()

Unnamed: 0,smiles,property,value
822,[*]C[*],Egc,6.8972
823,[*]CC([*])C,Egc,6.5196
824,[*]CC([*])CC,Egc,6.517
825,[*]CC([*])CCC,Egc,6.7336
826,[*]CC([*])CC(C)C,Egc,6.7394


In [10]:
len(data_df)

3380

# Featurize data set

In [11]:
N_FEATURES = 512
N_DATA = len(data_df)

def featurize_smiles(smile):
    smile = smile.replace('*', 'H')
    mol = Chem.MolFromSmiles(smile)
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=N_FEATURES, useChirality=True)
    return np.array(fp)

feature_array = np.zeros((N_DATA, N_FEATURES))

ind = 0
for smiles in data_df.smiles.values:
    feature_array[ind,:] = featurize_smiles(smiles)
    ind += 1

# Write a logical architecture that will pass all test cases

In [12]:
class FFNet(nn.Module):
    
    def __init__(self, input_dim, output_dim, capacity):

        super(FFNet,self).__init__()
        self.layers = nn.ModuleList()
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.n_hidden = capacity
        unit_sequence = utils.unit_sequence(self.input_dim, 
                                            self.output_dim, 
                                            self.n_hidden)
        self.relu = nn.ReLU()
        # set up hidden layers
        for ind,n_units in enumerate(unit_sequence[:-2]):
            size_out_ = unit_sequence[ind+1]
            layer = nn.Linear(n_units, size_out_)
            self.layers.append(layer)

        # set up output layer
        size_in_ = unit_sequence[-2]
        size_out_ = unit_sequence[-1]
        layer = nn.Linear(size_in_, size_out_)
        self.layers.append(layer)
    
    def forward(self, data):
        x = data.x
        for i in range(len(self.layers)):
            x = self.layers[i](x)
            if i < (self.n_hidden - 1):
                x = self.relu(x)
   
        return x.view(data.num_graphs,)

# a list of models that are bug free!
capacity_ls = [1,2,3]
correct_model_class_ls = [lambda : FFNet(N_FEATURES, 1, capacity) for capacity in
                          capacity_ls]

# Prepare inputs for DebugSession

In [13]:
# bug free processing pipeline!
model_type = 'mlp'
# data_set
n_test = int(np.floor(N_DATA*constants.TRAIN_FRAC))
n_train = N_DATA - n_test
(X_train, X_test, label_train, 
label_test) = train_test_split(
                                    feature_array,
                                    data_df.value.values.tolist(),
                                    test_size=n_test,
                                    shuffle=True,
                                    random_state=constants.RANDOM_SEED
                                )

train_X = [Data(x=tensor(X_train[ind,:], dtype=torch_float).view(1,N_FEATURES),
                y=tensor(label_train[ind], dtype=torch_float)
            ) 
            for ind in range(n_train)]
zero_data_set = [Data(x=zeros((1,N_FEATURES)), y=x.y) for x in train_X]
data_set = {}
data_set['train'] = train_X
loss_fn = loss.st_loss()
target_mean = np.mean(label_train)
epsilon = constants.DL_DBG_OVERFIT_EPS_RATIO*(target_mean)
device = torch_device('cuda' if cuda.is_available() else 'cpu')

# Test output shape

The shape of the model output should match the shape of the labels.

In [14]:
# this cell should pass since it uses a bug-free model

ds = dl_debug.DebugSession(correct_model_class_ls, model_type, capacity_ls, data_set, zero_data_set, loss_fn, epsilon,
                 device, do_test_output_shape=True)
ds.main()

Training data contains 676 points


Verified that shape of model predictions is equal to shape of labels


Debug session complete.


In [15]:
# buggy model. Can you spot the bug?

class BuggyNet(nn.Module):
    
    def __init__(self, input_dim, output_dim, capacity):

        super(BuggyNet,self).__init__()
        self.layers = nn.ModuleList()
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.n_hidden = capacity
        unit_sequence = utils.unit_sequence(self.input_dim, 
                                            self.output_dim, 
                                            self.n_hidden)
        self.relu = nn.ReLU()
        # set up hidden layers
        for ind,n_units in enumerate(unit_sequence[:-2]):
            size_out_ = unit_sequence[ind+1]
            layer = nn.Linear(n_units, size_out_)
            self.layers.append(layer)

        # set up output layer
        size_in_ = unit_sequence[-2]
        size_out_ = unit_sequence[-1]
        layer = nn.Linear(size_in_, size_out_)
        self.layers.append(layer)
    
    def forward(self, data):
        x = data.x
        for i in range(len(self.layers)):
            x = self.layers[i](x)
            if i < (self.n_hidden - 1):
                x = self.relu(x)
   
        return x # Spoiler! The bug is here. The correct line is 'return x.view(data.num_graphs,)'

# a list of models that are buggy
capacity_ls = [1,2,3]
buggy_model_class_ls = [lambda : BuggyNet(N_FEATURES, 1, capacity) for capacity in
                          capacity_ls]

In [16]:
# this cell should NOT pass since it uses a buggy model 

ds = dl_debug.DebugSession(buggy_model_class_ls, model_type, capacity_ls, data_set, zero_data_set, loss_fn, epsilon,
                 device, do_test_output_shape=True)
ds.main()

Training data contains 676 points



  return F.mse_loss(input, target, reduction=self.reduction)


AssertionError: The model output shape torch.Size([6, 1]) and label shape torch.Size([6]) are not the same

# Test input independent baseline
The loss of the model should be lower when real features are passed in than when zeroed features are passed in.

In [17]:
# trainer without bugs!

def trainer(model, data_set, batch_size, learning_rate, n_epochs, device, loss_obj):
    
    data_loader = DataLoader(data_set, batch_size=batch_size, shuffle=True)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate) # Adam optimization
    model.train() # set model to train mode
    loss_history = []
    for epoch in range(n_epochs):
        per_epoch_loss = 0
        for ind, data in enumerate(data_loader): # loop through training batches
            data = data.to(device) # send data to GPU, if available
            optimizer.zero_grad() # zero the gradients
            output = model(data) # perform forward pass
            loss = loss_obj(output, data) # compute loss
            per_epoch_loss += loss.detach().cpu().numpy()
            loss.backward() # perform backward pass
            optimizer.step() # update weights
        loss_history.append(per_epoch_loss)
    
    return loss_history

In [18]:
# this test should pass since we are using a trainer without bugs

ds = dl_debug.DebugSession(correct_model_class_ls, model_type, capacity_ls, data_set, zero_data_set, loss_fn, epsilon,
                 device, do_test_input_independent_baseline=True, trainer=trainer)
ds.main()

Training data contains 676 points


Checking input-independent baseline
..last epoch real_data_loss 0.22421006858348846
..last epoch zero_data_loss 14.373095989227295
Input-independent baseline is verified


Debug session complete.


In [19]:
# trainer with bugs! Can you spot the bug?

def buggy_trainer(model, data_set, batch_size, learning_rate, n_epochs, device, loss_obj):
    
    data_loader = DataLoader(data_set, batch_size=batch_size, shuffle=True)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate) # Adam optimization
    model.train() # set model to train mode
    loss_history = []
    for epoch in range(n_epochs):
        per_epoch_loss = 0
        for ind, data in enumerate(data_loader): # loop through training batches
            data = data.to(device) # send data to GPU, if available
            optimizer.zero_grad() # zero the gradients
            output = model(data) # perform forward pass
            loss = loss_obj(output, data) # compute loss
            per_epoch_loss += loss.detach().cpu().numpy()
            optimizer.step() # update weights
        loss_history.append(per_epoch_loss)
    
    return loss_history

# Spoiler! The bug is that there is no backward pass being performed!

In [20]:
# this test should NOT pass since we are using a buggy trainer

ds = dl_debug.DebugSession(correct_model_class_ls, model_type, capacity_ls, data_set, zero_data_set, loss_fn, epsilon,
                 device, do_test_input_independent_baseline=True, trainer=buggy_trainer)
ds.main()

Training data contains 676 points


Checking input-independent baseline
..last epoch real_data_loss 134.40209197998047
..last epoch zero_data_loss 134.75465393066406


ValueError: The loss of zeroed inputs is nearly the same as the loss of
                    real inputs. This may indicate that your model is not learning anything
                    during training. Check your trainer function and your model architecture.

# Overfit small batch
If you hope to learn a good map on your whole data set using model archicture ***A***, then ***A*** should have enough capacity to completely overfit a small batch of the data set.

In [21]:
# this test should pass since we are using a good model

ds = dl_debug.DebugSession(correct_model_class_ls, model_type, capacity_ls, data_set, zero_data_set, loss_fn, epsilon,
                 device, do_test_overfit_small_batch=True, trainer=trainer)
ds.main()

Training data contains 676 points


Checking if a small batch can be overfit
epsilon is 0.22499709319526628
..Epoch 0
....Loss: 5.164145787847168
....Outputs -0.1179 -0.0832 -0.0910 -0.1023 -0.1179
....Labels  5.6991 5.3739 5.0497 2.9694 5.7012
..Epoch 1
....Loss: 5.1002019730011945
....Outputs -0.0351 -0.0503 -0.0165 -0.0337 -0.0503
....Labels  2.9694 5.7012 5.3739 5.0497 5.6991
..Epoch 2
....Loss: 5.040911384104809
....Outputs 0.0199 0.0257 0.0519 0.0095 0.0095
....Labels  5.0497 2.9694 5.3739 5.7012 5.6991
..Epoch 3
....Loss: 4.973188421236432
....Outputs 0.0799 0.1340 0.0778 0.0778 0.0908
....Labels  5.0497 5.3739 5.6991 5.7012 2.9694
..Epoch 4
....Loss: 4.888306674583275
....Outputs 0.2393 0.1645 0.1709 0.1514 0.1645
....Labels  5.3739 5.6991 2.9694 5.0497 5.7012
..Epoch 5
....Loss: 4.7807606964063485
....Outputs 0.2755 0.2755 0.2396 0.2719 0.3727
....Labels  5.6991 5.7012 5.0497 2.9694 5.3739
..Epoch 6
....Loss: 4.644630629981769
....Outputs 0.5403 0.4175 0.3498 0.4175 0.4008
...

In [22]:
# buggy model. Can you spot the "bug"?

class BuggyNet(nn.Module):
    
    def __init__(self, input_dim, output_dim, capacity):

        super(BuggyNet,self).__init__()
        self.layers = nn.ModuleList()
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.n_hidden = capacity
        unit_sequence = utils.unit_sequence(self.input_dim, 
                                            self.output_dim, 
                                            self.n_hidden)
        self.sigmoid = nn.Sigmoid() 
        # set up hidden layers
        for ind,n_units in enumerate(unit_sequence[:-2]):
            size_out_ = unit_sequence[ind+1]
            layer = nn.Linear(n_units, size_out_)
            self.layers.append(layer)

        # set up output layer
        size_in_ = unit_sequence[-2]
        size_out_ = unit_sequence[-1]
        layer = nn.Linear(size_in_, size_out_)
        self.layers.append(layer)
    
    def forward(self, data):
        x = data.x
        for i in range(len(self.layers)):
            x = self.layers[i](x)
            if i < (self.n_hidden - 1):
                x = self.sigmoid(x) # Spoiler! The "bug" is here.
   
        return x.view(data.num_graphs,) 

# a list of models that are buggy
capacity_ls = [1,2,3]
buggy_model_class_ls = [lambda : BuggyNet(N_FEATURES, 1, capacity) for capacity in
                          capacity_ls]

In [23]:
# this test should not pass since we are using a buggy model

ds = dl_debug.DebugSession(buggy_model_class_ls, model_type, capacity_ls, data_set, zero_data_set, loss_fn, epsilon,
                 device, do_test_overfit_small_batch=True, trainer=trainer)
ds.main()

Training data contains 676 points


Checking if a small batch can be overfit
epsilon is 0.22499709319526628
..Epoch 0
....Loss: 4.805500439356576
....Outputs 0.2629 0.2636 0.2640 0.2640 0.2620
....Labels  5.3739 5.0497 5.6991 5.7012 2.9694
..Epoch 1
....Loss: 4.435932796055207
....Outputs 0.6406 0.6434 0.6434 0.6420 0.6415
....Labels  2.9694 5.7012 5.6991 5.3739 5.0497
..Epoch 2
....Loss: 4.065092697306583
....Outputs 1.0243 1.0260 1.0221 1.0260 1.0225
....Labels  5.3739 5.7012 2.9694 5.6991 5.0497
..Epoch 3
....Loss: 3.689960887050353
....Outputs 1.4155 1.4155 1.4135 1.4100 1.4102
....Labels  5.7012 5.6991 5.3739 5.0497 2.9694
..Epoch 4
....Loss: 3.3075887871990797
....Outputs 1.8080 1.8161 1.8138 1.8161 1.8091
....Labels  5.0497 5.6991 5.3739 5.7012 2.9694
..Epoch 5
....Loss: 2.9168664772983215
....Outputs 2.2281 2.2307 2.2216 2.2307 2.2195
....Labels  5.3739 5.7012 2.9694 5.6991 5.0497
..Epoch 6
....Loss: 2.519406243113757
....Outputs 2.6488 2.6607 2.6607 2.6577 2.6456
....Labels  2

ValueError: Error: Your model was not able to overfit a small batch 
                               of data. The minimum RMSE over 100 epochs was not less than 0.22499709319526628

# Chart Dependencies
The `forward` method should not mix information from separate instances.

![image info](./images/graph.png)

![image info](./images/graphnet.png)

![image info](./images/graph_batch2.png)

In [24]:
# data to illustrate the point

np.random.seed(constants.RANDOM_SEED)
polymer_indices = data_df.sample(n=4).index
polymer_smiles = data_df.loc[polymer_indices, 'smiles'].values.tolist()
polymer_smiles

['[*]C(C#N)=C([*])c1ccccc1',
 '[*]CCCCOC(=O)C(=O)O[*]',
 '[*]CC(CCl)(CCl)C(=O)O[*]',
 '[*]c1[nH]c([*])c(C(=O)O)c1C']

In [25]:
feature_dict = {'C': np.array([1,0,0,0]),
    'O': np.array([0,1,0,0]),
    'N': np.array([0,0,1,0]),
    'Cl': np.array([0,0,0,1])
}
N_FEATURES_ = len(feature_dict)
N_DATA_ = len(polymer_smiles)
MAX_N_ATOMS = max([Chem.MolFromSmiles(smile).GetNumAtoms() for smile in polymer_smiles])
PROJECTOR_DIM = 100

def featurize_smiles_by_atom(smile):
    smile = smile.replace('*', 'H')
    mol = Chem.MolFromSmiles(smile)
    features = np.zeros((MAX_N_ATOMS, N_FEATURES_))
    for ind,atom in enumerate(mol.GetAtoms()):
        atom_feature = feature_dict[atom.GetSymbol()]
        features[ind, :] = atom_feature

    return features

# feature_array = np.zeros((N_DATA_, MAX_N_ATOMS, N_FEATURES_))
labels = data_df.loc[polymer_indices, 'value'].values
# for ind, smiles in enumerate(polymer_smiles):
#     feature_array[ind, ].append(featurize_smiles_by_atom(smiles))

train_X_ = [Data(x=tensor(featurize_smiles_by_atom(polymer_smiles[ind]), dtype=torch_float),
                    y=tensor(labels[ind], dtype=torch_float)
            ) 
            for ind in range(N_DATA_)
]
# for smiles,data in zip(polymer_smiles,train_X_):
#     data.num_atoms = Chem.MolFromSmiles(smiles).GetNumAtoms()
data_set_ = {'train': train_X_}

In [26]:
class GraphNet(nn.Module):
    
    def __init__(self, input_dim, output_dim, capacity):

        super(GraphNet,self).__init__()
        self.layers = nn.ModuleList()
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.n_hidden = capacity
        unit_sequence = utils.unit_sequence(self.input_dim, 
                                            self.output_dim, 
                                            self.n_hidden)
        self.node_projector = nn.Linear(N_FEATURES_, PROJECTOR_DIM)
        self.relu = nn.ReLU()
        # set up hidden layers
        for ind,n_units in enumerate(unit_sequence[:-2]):
            size_out_ = unit_sequence[ind+1]
            layer = nn.Linear(n_units, size_out_)
            self.layers.append(layer)

        # set up output layer
        size_in_ = unit_sequence[-2]
        size_out_ = unit_sequence[-1]
        layer = nn.Linear(size_in_, size_out_)
        self.layers.append(layer)
    
    def forward(self, data):
        x = data.x
        x = x.view(data.num_graphs, MAX_N_ATOMS, N_FEATURES_)
        x = self.node_projector(x)
        x_mean = x.mean(dim=2)
        x = x - x_mean[:, :, None] # make use of broadcasting
        x = x.sum(dim=1)
        for i in range(len(self.layers)):
            x = self.layers[i](x)
            if i < (self.n_hidden - 1):
                x = self.relu(x)
   
        return x.view(data.num_graphs,)

# a list of models that are bug free!
capacity_ls = [1,2,3]
correct_graphnet_class_ls = [lambda : GraphNet(PROJECTOR_DIM, 1, capacity) for capacity in
                          capacity_ls]

In [28]:
# this test should pass since we are using a bug-free model

ds = dl_debug.DebugSession(correct_graphnet_class_ls, 'gnn', capacity_ls, data_set_, zero_data_set, loss_fn, epsilon,
                 device, do_chart_dependencies=True)
ds.main()

Training data contains 4 points


Beginning to chart dependencies
..Epoch 0
....Outputs 0.5427 0.4155 0.3948 0.4339
....Labels  4.3452 5.0922 6.5510 3.2017
....Loss: 0.542738676071167
Finished charting dependencies. Data is not getting mixed between instances in the same batch.


Debug session complete.


In [29]:
# this is a buggy model. Can you spot the bugs?

class BuggyGraphNet(nn.Module):
    
    def __init__(self, input_dim, output_dim, capacity):

        super(BuggyGraphNet,self).__init__()
        self.layers = nn.ModuleList()
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.n_hidden = capacity
        unit_sequence = utils.unit_sequence(self.input_dim, 
                                            self.output_dim, 
                                            self.n_hidden)
        self.node_projector = nn.Linear(N_FEATURES_, PROJECTOR_DIM)
        self.relu = nn.ReLU()
        # set up hidden layers
        for ind,n_units in enumerate(unit_sequence[:-2]):
            size_out_ = unit_sequence[ind+1]
            layer = nn.Linear(n_units, size_out_)
            self.layers.append(layer)

        # set up output layer
        size_in_ = unit_sequence[-2]
        size_out_ = unit_sequence[-1]
        layer = nn.Linear(size_in_, size_out_)
        self.layers.append(layer)
    
    def forward(self, data):
        x = data.x
        x = x.view(data.num_graphs, MAX_N_ATOMS, N_FEATURES_)
        x = self.node_projector(x)
        x_mean = x.mean(dim=0)
        x = x - x_mean[None, :, :] # make use of broadcasting
        x = x.sum(dim=1)
        for i in range(len(self.layers)):
            x = self.layers[i](x)
            if i < (self.n_hidden - 1):
                x = self.relu(x)
   
        return x.view(data.num_graphs,)

# a list of models that are bug free!
capacity_ls = [1,2,3]
buggy_graphnet_class_ls = [lambda : BuggyGraphNet(PROJECTOR_DIM, 1, capacity) for capacity in
                          capacity_ls]

In [31]:
# this test should not pass since we are using a buggy model

ds = dl_debug.DebugSession(buggy_graphnet_class_ls, 'gnn', capacity_ls, data_set_, zero_data_set, loss_fn, epsilon,
                 device, do_chart_dependencies=True)
best_model_capacity = ds.main()

Training data contains 4 points


Beginning to chart dependencies
..Epoch 0
....Outputs -0.1925 -0.2155 -0.2169 -0.1876
....Labels  4.3452 5.0922 6.5510 3.2017
....Loss: -0.19253747165203094


ValueError: Data is getting mixed between instances in the same batch.

# Overfit training data
The capacity of your architecture should be just large enough to overfit the training data. 

In [36]:
ds = dl_debug.DebugSession(correct_model_class_ls, model_type, capacity_ls, data_set, zero_data_set, loss_fn, epsilon,
                 device, do_choose_model_size_by_overfit=True, trainer=trainer)
ds.main()

Training data contains 676 points


Beginning model size search

..Training model 0 


....Epoch 0
......[rmse] 4.567168941972658 [r2] -7.786489497257955
......Outputs 0.0509 0.0604 0.0698 0.0604 0.0365 0.0808 0.0562 0.0560 0.0603 0.0559
......Labels  5.0497 5.6991 2.9694 5.7012 5.3739 2.8673 6.7030 5.1590 2.1188 3.1085
......[best rmse] 4.567168941972658 [best r2] -7.786489497257955

....Epoch 1
......[rmse] 3.6914496056835606 [r2] -4.740042775357589
......Outputs 0.4901 0.7204 0.5924 0.7204 0.6978 0.9185 0.5859 0.6776 0.5902 0.8029
......Labels  5.0497 5.6991 2.9694 5.7012 5.3739 2.8673 6.7030 5.1590 2.1188 3.1085
......[best rmse] 3.6914496056835606 [best r2] -4.740042775357589

....Epoch 2
......[rmse] 2.6362727976802036 [r2] -1.9275339331338874
......Outputs 2.2987 3.6586 2.9156 3.6586 3.7082 4.9065 2.7765 3.5014 2.7815 4.1027
......Labels  5.0497 5.6991 2.9694 5.7012 5.3739 2.8673 6.7030 5.1590 2.1188 3.1085
......[best rmse] 2.6362727976802036 [best r2] -1.9275339331338874

....

1

# Run all tests

In [38]:
ds = dl_debug.DebugSession(correct_model_class_ls, model_type, capacity_ls, data_set, zero_data_set, loss_fn, epsilon,
                 device, do_all_tests=True, trainer=trainer)
ds.main()

Training data contains 676 points


Verified that shape of model predictions is equal to shape of labels


Checking input-independent baseline
..last epoch real_data_loss 0.26398674584925175
..last epoch zero_data_loss 14.002456188201904
Input-independent baseline is verified


Checking if a small batch can be overfit
epsilon is 0.22499709319526628
..Epoch 0
....Loss: 4.918188840078714
....Outputs 0.1491 0.1444 0.1491 0.1363 0.1560
....Labels  5.6991 5.0497 5.7012 2.9694 5.3739
..Epoch 1
....Loss: 4.861387173518942
....Outputs 0.2143 0.1915 0.2085 0.2085 0.2008
....Labels  5.3739 2.9694 5.7012 5.6991 5.0497
..Epoch 2
....Loss: 4.803974477072832
....Outputs 0.2729 0.2704 0.2704 0.2444 0.2559
....Labels  5.3739 5.6991 5.7012 2.9694 5.0497
..Epoch 3
....Loss: 4.738009527580887
....Outputs 0.3409 0.3409 0.3183 0.3039 0.3430
....Labels  5.6991 5.7012 5.0497 2.9694 5.3739
..Epoch 4
....Loss: 4.658560570012228
....Outputs 0.3929 0.4257 0.4257 0.4283 0.3762
....Labels  5.0497 5.6991 5.7012 5.3

1