# This notebook is a tutorial on NetDebugger
Author: Rishi Gurnani, Georgia Institute of Technology<br />
Creation Date: July 21, 2021 4:54 PM EST

# Import
Some python packages are needed to run this notebook. We import all of those below.

In [1]:
# standard libraries
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
from torch import tensor, cuda, manual_seed, zeros, nn, optim
from torch import float as torch_float
from torch_geometric.data import DataLoader
from torch import device as torch_device
import random
from sklearn.model_selection import train_test_split
from torch_geometric.data import Data

# nndebugger functions
from nndebugger import constants, loss, dl_debug
from nndebugger import torch_utils as utils

# Fix random seeds to ensure reproducible results

In [2]:
random.seed(constants.RANDOM_SEED)
manual_seed(constants.RANDOM_SEED)
np.random.seed(constants.RANDOM_SEED)

# Load data set 

In [3]:
data_df = pd.read_csv('data/export.csv',index_col=0)
data_df.head()

Unnamed: 0,smiles,property,value
822,[*]C[*],Egc,6.8972
823,[*]CC([*])C,Egc,6.5196
824,[*]CC([*])CC,Egc,6.517
825,[*]CC([*])CCC,Egc,6.7336
826,[*]CC([*])CC(C)C,Egc,6.7394


In [4]:
len(data_df)

3380

# Featurize data set

In [5]:
N_FEATURES = 512
N_DATA = len(data_df)

def featurize_smiles(smile):
    smile = smile.replace('*', 'H')
    mol = Chem.MolFromSmiles(smile)
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=N_FEATURES, useChirality=True)
    return np.array(fp)

feature_array = np.zeros((N_DATA, N_FEATURES))

ind = 0
for smiles in data_df.smiles.values:
    feature_array[ind,:] = featurize_smiles(smiles)
    ind += 1

# Write a logical architecture that will pass all test cases

In [6]:
class MyNet(nn.Module):
    
    def __init__(self, input_dim, output_dim, capacity):

        super(MyNet,self).__init__()
        self.layers = nn.ModuleList()
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.n_hidden = capacity
        unit_sequence = utils.unit_sequence(self.input_dim, 
                                            self.output_dim, 
                                            self.n_hidden)
        self.relu = nn.ReLU()
        # set up hidden layers
        for ind,n_units in enumerate(unit_sequence[:-2]):
            size_out_ = unit_sequence[ind+1]
            layer = nn.Linear(n_units, size_out_)
            self.layers.append(layer)

        # set up output layer
        size_in_ = unit_sequence[-2]
        size_out_ = unit_sequence[-1]
        layer = nn.Linear(size_in_, size_out_)
        self.layers.append(layer)
    
    def forward(self, data):
        x = data.x
        for i in range(len(self.layers)):
            x = self.layers[i](x)
            if i < (self.n_hidden - 1):
                x = self.relu(x)
   
        return x.view(data.num_graphs,)

# a list of models that are bug free!
capacity_ls = [1,2,3]
correct_model_class_ls = [lambda : MyNet(N_FEATURES, 1, capacity) for capacity in
                          capacity_ls]

# Prepare inputs for DebugSession

In [7]:
# bug free processing pipeline!
model_type = 'mlp'
# data_set
n_test = int(np.floor(N_DATA*constants.TRAIN_FRAC))
n_train = N_DATA - n_test
(X_train, X_test, label_train, 
label_test) = train_test_split(
                                    feature_array,
                                    data_df.value.values.tolist(),
                                    test_size=n_test,
                                    shuffle=True,
                                    random_state=constants.RANDOM_SEED
                                )

train_X = [Data(x=tensor(X_train[ind,:], dtype=torch_float).view(1,N_FEATURES),
                y=tensor(label_train[ind], dtype=torch_float)
            ) 
            for ind in range(n_train)]
data_set = train_X
zero_data_set = [Data(x=zeros((1,N_FEATURES)), y=x.y) for x in train_X]
loss_fn = loss.st_loss()
target_mean = np.mean(label_train)
device = torch_device('cuda' if cuda.is_available() else 'cpu')

# Test output shape

The shape of the model output should match the shape of the labels.

In [8]:
# this cell should pass since it uses a bug-free model

ds = dl_debug.DebugSession(correct_model_class_ls, model_type, capacity_ls, data_set, zero_data_set, loss_fn,
                 device, do_test_output_shape=True)
ds.main()

Training data contains 676 points

Skipping test_target_abs_mean

target_abs_mean 4.499941825866699 


Verified that shape of model predictions is equal to shape of labels


Debug session complete. No errors detected.


In [9]:
# buggy model. Can you spot the bug?

class BuggyNet(nn.Module):
    
    def __init__(self, input_dim, output_dim, capacity):

        super(BuggyNet,self).__init__()
        self.layers = nn.ModuleList()
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.n_hidden = capacity
        unit_sequence = utils.unit_sequence(self.input_dim, 
                                            self.output_dim, 
                                            self.n_hidden)
        self.relu = nn.ReLU()
        # set up hidden layers
        for ind,n_units in enumerate(unit_sequence[:-2]):
            size_out_ = unit_sequence[ind+1]
            layer = nn.Linear(n_units, size_out_)
            self.layers.append(layer)

        # set up output layer
        size_in_ = unit_sequence[-2]
        size_out_ = unit_sequence[-1]
        layer = nn.Linear(size_in_, size_out_)
        self.layers.append(layer)
    
    def forward(self, data):
        x = data.x
        for i in range(len(self.layers)):
            x = self.layers[i](x)
            if i < (self.n_hidden - 1):
                x = self.relu(x)
   
        return x # Spoiler! The bug is here. The correct line is 'return x.view(data.num_graphs,)'

# a list of models that are buggy
capacity_ls = [1,2,3]
buggy_model_class_ls = [lambda : BuggyNet(N_FEATURES, 1, capacity) for capacity in
                          capacity_ls]

In [10]:
# this cell should NOT pass since it uses a buggy model 

ds = dl_debug.DebugSession(buggy_model_class_ls, model_type, capacity_ls, data_set, zero_data_set, loss_fn,
                 device, do_test_output_shape=True)
ds.main()

Training data contains 676 points

Skipping test_target_abs_mean

target_abs_mean 4.499941825866699 



  return F.mse_loss(input, target, reduction=self.reduction)


AssertionError: The model output shape torch.Size([6, 1]) and label shape torch.Size([6]) are not the same

# Test input independent baseline
The loss of the model should be lower when real features are passed in than when zeroed features are passed in.

In [11]:
# trainer without bugs!

def trainer(model, data_set, batch_size, learning_rate, n_epochs, device, loss_obj):
    
    data_loader = DataLoader(data_set, batch_size=batch_size, shuffle=True)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate) # Adam optimization
    model.train() # set model to train mode
    loss_history = []
    for epoch in range(n_epochs):
        per_epoch_loss = 0
        for ind, data in enumerate(data_loader): # loop through training batches
            data = data.to(device) # send data to GPU, if available
            optimizer.zero_grad() # zero the gradients
            output = model(data) # perform forward pass
            loss = loss_obj(output, data) # compute loss
            per_epoch_loss += loss.detach().cpu().numpy()
            loss.backward() # perform backward pass
            optimizer.step() # update weights
        loss_history.append(per_epoch_loss)
    
    return loss_history

In [12]:
# this test should pass since we are using a trainer without bugs

ds = dl_debug.DebugSession(correct_model_class_ls, model_type, capacity_ls, data_set, zero_data_set, loss_fn,
                 device, do_test_input_independent_baseline=True, trainer=trainer)
ds.main()

Training data contains 676 points

Skipping test_target_abs_mean

target_abs_mean 4.499941825866699 


Checking input-independent baseline




..last epoch real_data_loss 0.2033699443563819
..last epoch zero_data_loss 14.290782451629639
Input-independent baseline is verified


Debug session complete. No errors detected.


In [13]:
# trainer with bugs! Can you spot the bug?

def buggy_trainer(model, data_set, batch_size, learning_rate, n_epochs, device, loss_obj):
    
    data_loader = DataLoader(data_set, batch_size=batch_size, shuffle=True)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate) # Adam optimization
    model.train() # set model to train mode
    loss_history = []
    for epoch in range(n_epochs):
        per_epoch_loss = 0
        for ind, data in enumerate(data_loader): # loop through training batches
            data = data.to(device) # send data to GPU, if available
            optimizer.zero_grad() # zero the gradients
            output = model(data) # perform forward pass
            loss = loss_obj(output, data) # compute loss
            per_epoch_loss += loss.detach().cpu().numpy()
            optimizer.step() # update weights
        loss_history.append(per_epoch_loss)
    
    return loss_history

# Spoiler! The bug is that there is no backward pass being performed!

In [14]:
# this test should NOT pass since we are using a buggy trainer

ds = dl_debug.DebugSession(correct_model_class_ls, model_type, capacity_ls, data_set, zero_data_set, loss_fn,
                 device, do_test_input_independent_baseline=True, trainer=buggy_trainer)
ds.main()

Training data contains 676 points

Skipping test_target_abs_mean

target_abs_mean 4.499941825866699 


Checking input-independent baseline
..last epoch real_data_loss 133.59508323669434
..last epoch zero_data_loss 135.39850997924805


AssertionError: The loss of zeroed inputs is nearly the same as the loss of
                    real inputs. This may indicate that your model is not learning anything
                    during training. Check your trainer function and your model architecture.

# Overfit small batch
If you hope to learn a good map on your whole data set using model archicture ***A***, then ***A*** should have enough capacity to completely overfit a small batch of the data set.

In [15]:
# this test should pass since we are using a good model

ds = dl_debug.DebugSession(correct_model_class_ls, model_type, capacity_ls, data_set, zero_data_set, loss_fn,
                 device, do_test_overfit_small_batch=True, trainer=trainer)
ds.main()

Training data contains 676 points

Skipping test_target_abs_mean

target_abs_mean 4.499941825866699 


Checking if a small batch can be overfit
..Epoch 0
....Outputs [0.138 0.149 0.138 0.138 0.144]
....Labels  [5.699 5.05  5.701 5.374 2.969]
....Loss: [4.925]
....R2: -22.158624877757855
..Epoch 1
....Outputs [0.208 0.206 0.204 0.206 0.203]
....Labels  [5.05  5.699 2.969 5.701 5.374]
....Loss: [4.862]
....R2: -21.56688609289492
..Epoch 2
....Outputs [0.267 0.278 0.267 0.265 0.278]
....Labels  [5.05  5.699 5.374 2.969 5.701]
....Loss: [4.797]
....R2: -20.970442835281403
..Epoch 3
....Outputs [0.345 0.332 0.336 0.363 0.363]
....Labels  [5.374 5.05  2.969 5.699 5.701]
....Loss: [4.721]
....R2: -20.279669495607827
..Epoch 4
....Outputs [0.467 0.423 0.412 0.442 0.467]
....Labels  [5.699 2.969 5.05  5.374 5.701]
....Loss: [4.628]
....R2: -19.44770725648312
..Epoch 5
....Outputs [0.567 0.532 0.596 0.596 0.511]
....Labels  [5.374 2.969 5.701 5.699 5.05 ]
....Loss: [4.511]
....R2: -18.4266484834

In [16]:
# buggy model. Can you spot the "bug"?

class BuggyNet(nn.Module):
    
    def __init__(self, input_dim, output_dim, capacity):

        super(BuggyNet,self).__init__()
        self.layers = nn.ModuleList()
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.n_hidden = capacity
        unit_sequence = utils.unit_sequence(self.input_dim, 
                                            self.output_dim, 
                                            self.n_hidden)
        self.sigmoid = nn.Sigmoid() 
        # set up hidden layers
        for ind,n_units in enumerate(unit_sequence[:-2]):
            size_out_ = unit_sequence[ind+1]
            layer = nn.Linear(n_units, size_out_)
            self.layers.append(layer)

        # set up output layer
        size_in_ = unit_sequence[-2]
        size_out_ = unit_sequence[-1]
        layer = nn.Linear(size_in_, size_out_)
        self.layers.append(layer)
    
    def forward(self, data):
        x = data.x
        for i in range(len(self.layers)):
            x = self.layers[i](x)
            if i < (self.n_hidden - 1):
                x = self.sigmoid(x) # Spoiler! The "bug" is here.
   
        return x.view(data.num_graphs,) 

# a list of models that are buggy
capacity_ls = [1,2,3]
buggy_model_class_ls = [lambda : BuggyNet(N_FEATURES, 1, capacity) for capacity in
                          capacity_ls]

In [17]:
# this test should not pass since we are using a buggy model

ds = dl_debug.DebugSession(buggy_model_class_ls, model_type, capacity_ls, data_set, zero_data_set, loss_fn,
                 device, do_test_overfit_small_batch=True, trainer=trainer)
ds.main()

Training data contains 676 points

Skipping test_target_abs_mean

target_abs_mean 4.499941825866699 


Checking if a small batch can be overfit
..Epoch 0
....Outputs [-0.049 -0.049 -0.045 -0.047 -0.045]
....Labels  [5.699 5.701 5.374 5.05  2.969]
....Loss: [5.109]
....R2: -23.920606107374308
..Epoch 1
....Outputs [0.342 0.337 0.337 0.339 0.337]
....Labels  [5.374 5.699 5.701 2.969 5.05 ]
....Loss: [4.732]
....R2: -20.37939789027356
..Epoch 2
....Outputs [0.721 0.721 0.726 0.721 0.719]
....Labels  [5.701 5.699 5.374 2.969 5.05 ]
....Loss: [4.359]
....R2: -17.138601481545766
..Epoch 3
....Outputs [1.106 1.106 1.105 1.102 1.112]
....Labels  [5.701 5.699 2.969 5.05  5.374]
....Loss: [3.986]
....R2: -14.167050679121228
..Epoch 4
....Outputs [1.496 1.493 1.49  1.496 1.503]
....Labels  [5.701 2.969 5.05  5.699 5.374]
....Loss: [3.61]
....R2: -11.44431820457998
..Epoch 5
....Outputs [1.895 1.895 1.885 1.888 1.902]
....Labels  [5.699 5.701 5.05  2.969 5.374]
....Loss: [3.231]
....R2: -8.9665448

AssertionError: Error: Your model was not able to overfit a small batch 
            of data. The maximum R2 over 1000 epochs was not greater than 0.99

# Chart Dependencies
The `forward` method should not mix information from separate instances.

![image info](./images/graph.png)

![image info](./images/graphnet.png)

![image info](./images/graph_batch2.png)

In [18]:
# data to illustrate the point

np.random.seed(constants.RANDOM_SEED)
polymer_indices = data_df.sample(n=4).index
polymer_smiles = data_df.loc[polymer_indices, 'smiles'].values.tolist()

In [19]:
feature_dict = {'C': np.array([1,0,0,0]),
    'O': np.array([0,1,0,0]),
    'N': np.array([0,0,1,0]),
    'Cl': np.array([0,0,0,1])
}
N_FEATURES_ = len(feature_dict)
N_DATA_ = len(polymer_smiles)
MAX_N_ATOMS = max([Chem.MolFromSmiles(smile).GetNumAtoms() for smile in polymer_smiles])
PROJECTOR_DIM = 100

def featurize_smiles_by_atom(smile):
    smile = smile.replace('*', 'H')
    mol = Chem.MolFromSmiles(smile)
    features = np.zeros((MAX_N_ATOMS, N_FEATURES_))
    for ind,atom in enumerate(mol.GetAtoms()):
        atom_feature = feature_dict[atom.GetSymbol()]
        features[ind, :] = atom_feature

    return features

labels = data_df.loc[polymer_indices, 'value'].values

train_X_ = [Data(x=tensor(featurize_smiles_by_atom(polymer_smiles[ind]), dtype=torch_float),
                    y=tensor(labels[ind], dtype=torch_float)
            ) 
            for ind in range(N_DATA_)
]

data_set_ = train_X_

In [20]:
class GraphNet(nn.Module):
    
    def __init__(self, input_dim, output_dim, capacity):

        super(GraphNet,self).__init__()
        self.layers = nn.ModuleList()
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.n_hidden = capacity
        unit_sequence = utils.unit_sequence(self.input_dim, 
                                            self.output_dim, 
                                            self.n_hidden)
        self.node_projector = nn.Linear(N_FEATURES_, PROJECTOR_DIM)
        self.relu = nn.ReLU()
        # set up hidden layers
        for ind,n_units in enumerate(unit_sequence[:-2]):
            size_out_ = unit_sequence[ind+1]
            layer = nn.Linear(n_units, size_out_)
            self.layers.append(layer)

        # set up output layer
        size_in_ = unit_sequence[-2]
        size_out_ = unit_sequence[-1]
        layer = nn.Linear(size_in_, size_out_)
        self.layers.append(layer)
    
    def forward(self, data):
        x = data.x
        x = x.view(data.num_graphs, MAX_N_ATOMS, N_FEATURES_)
        x = self.node_projector(x)
        x_mean = x.mean(dim=2)
        x = x - x_mean[:, :, None] # make use of broadcasting
        x = x.sum(dim=1)
        for i in range(len(self.layers)):
            x = self.layers[i](x)
            if i < (self.n_hidden - 1):
                x = self.relu(x)
   
        return x.view(data.num_graphs,)

# a list of models that are bug free!
capacity_ls = [1,2,3]
correct_graphnet_class_ls = [lambda : GraphNet(PROJECTOR_DIM, 1, capacity) for capacity in
                          capacity_ls]

In [21]:
# this test should pass since we are using a bug-free model

ds = dl_debug.DebugSession(correct_graphnet_class_ls, 'gnn', capacity_ls, data_set_, zero_data_set, loss_fn,
                 device, do_chart_dependencies=True)
ds.main()

Training data contains 4 points

Skipping test_target_abs_mean

target_abs_mean 4.797524929046631 


Beginning to chart dependencies
..Epoch 0
....Outputs [-0.016 -0.056 -0.023 -0.031]
....Labels  [4.345 5.092 6.551 3.202]
....Loss: -0.01612958312034607
Finished charting dependencies. Data is not getting mixed between instances in the same batch.


Debug session complete. No errors detected.


In [22]:
# this is a buggy model. Can you spot the bugs?

class BuggyGraphNet(nn.Module):
    
    def __init__(self, input_dim, output_dim, capacity):

        super(BuggyGraphNet,self).__init__()
        self.layers = nn.ModuleList()
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.n_hidden = capacity
        unit_sequence = utils.unit_sequence(self.input_dim, 
                                            self.output_dim, 
                                            self.n_hidden)
        self.node_projector = nn.Linear(N_FEATURES_, PROJECTOR_DIM)
        self.relu = nn.ReLU()
        # set up hidden layers
        for ind,n_units in enumerate(unit_sequence[:-2]):
            size_out_ = unit_sequence[ind+1]
            layer = nn.Linear(n_units, size_out_)
            self.layers.append(layer)

        # set up output layer
        size_in_ = unit_sequence[-2]
        size_out_ = unit_sequence[-1]
        layer = nn.Linear(size_in_, size_out_)
        self.layers.append(layer)
    
    def forward(self, data):
        x = data.x
        x = x.view(data.num_graphs, MAX_N_ATOMS, N_FEATURES_)
        x = self.node_projector(x)
        x_mean = x.mean(dim=0) # Spoiler! this is the bug.
        x = x - x_mean[None, :, :] # make use of broadcasting
        x = x.sum(dim=1)
        for i in range(len(self.layers)):
            x = self.layers[i](x)
            if i < (self.n_hidden - 1):
                x = self.relu(x)
   
        return x.view(data.num_graphs,)

# a list of models that are bug free!
capacity_ls = [1,2,3]
buggy_graphnet_class_ls = [lambda : BuggyGraphNet(PROJECTOR_DIM, 1, capacity) for capacity in
                          capacity_ls]

In [23]:
# this test should not pass since we are using a buggy model

ds = dl_debug.DebugSession(buggy_graphnet_class_ls, 'gnn', capacity_ls, data_set_, zero_data_set, loss_fn,
                 device, do_chart_dependencies=True)
best_model_capacity = ds.main()

Training data contains 4 points

Skipping test_target_abs_mean

target_abs_mean 4.797524929046631 


Beginning to chart dependencies
..Epoch 0
....Outputs [-0.062 -0.093 -0.118 -0.071]
....Labels  [4.345 5.092 6.551 3.202]
....Loss: -0.06184771656990051


AssertionError: Data is getting mixed between instances in the same batch.

# Overfit training data
The capacity of your architecture should be just large enough to overfit the training data. 

In [24]:
# this cell should return the integer 1

ds = dl_debug.DebugSession(correct_model_class_ls, model_type, capacity_ls, data_set, zero_data_set, loss_fn,
                 device, do_choose_model_size_by_overfit=True, trainer=trainer)
ds.main()

Training data contains 676 points

Skipping test_target_abs_mean

target_abs_mean 4.499941825866699 


Beginning model size search

..Training model 0 


....Epoch 0
......[rmse] 4.616294722833711 [r2] -7.740300747067694
......Outputs [0.03264283388853073, 0.04629906639456749, 0.04549751430749893, 0.038314089179039, 0.03908307105302811, 0.03337091952562332, 0.03660642355680466, 0.025915656238794327, 0.04784395545721054, 0.04190545156598091]
......Labels  [0.10930000245571136, 5.328400135040283, 5.512800216674805, 6.876200199127197, 5.369800090789795, 6.88040018081665, 3.8441998958587646, 5.745699882507324, 4.430699825286865, 6.022600173950195]
......Total time til this epoch 0.017838478088378906
......[best rmse] 4.616294722833711 [best r2] -7.740300747067694

....Epoch 1
......[rmse] 3.9318583925991195 [r2] -5.477115142147944
......Outputs [0.5904588103294373, 0.2525731027126312, 0.5672978758811951, 0.5274205207824707, 0.49651095271110535, 0.791317880153656, 0.2772960364818573, 0.3837

# Run all tests

In [25]:
ds = dl_debug.DebugSession(correct_model_class_ls, model_type, capacity_ls, data_set, zero_data_set, loss_fn,
                 device, do_all_tests=True, trainer=trainer)
ds.main()

Training data contains 676 points

Skipping test_target_abs_mean

target_abs_mean 4.499941825866699 


Verified that shape of model predictions is equal to shape of labels


Checking input-independent baseline




..last epoch real_data_loss 0.511672668159008
..last epoch zero_data_loss 14.159886598587036
Input-independent baseline is verified


Checking if a small batch can be overfit
..Epoch 0
....Outputs [0.125 0.111 0.123 0.117 0.117]
....Labels  [5.374 2.969 5.05  5.701 5.699]
....Loss: [4.947]
....R2: -22.358456036526945
..Epoch 1
....Outputs [0.188 0.187 0.173 0.188 0.201]
....Labels  [5.701 5.05  2.969 5.699 5.374]
....Loss: [4.878]
....R2: -21.71802491625147
..Epoch 2
....Outputs [0.283 0.26  0.233 0.26  0.256]
....Labels  [5.374 5.699 2.969 5.701 5.05 ]
....Loss: [4.808]
....R2: -21.068315133224168
..Epoch 3
....Outputs [0.302 0.334 0.381 0.344 0.344]
....Labels  [2.969 5.05  5.374 5.699 5.701]
....Loss: [4.726]
....R2: -20.320096205411648
..Epoch 4
....Outputs [0.45  0.383 0.502 0.45  0.428]
....Labels  [5.699 2.969 5.374 5.701 5.05 ]
....Loss: [4.624]
....R2: -19.413510447427782
..Epoch 5
....Outputs [0.582 0.483 0.582 0.655 0.543]
....Labels  [5.701 2.969 5.699 5.374 5.05 ]
....Loss