# This notebook is a tutorial on NetDebugger
Author: Rishi Gurnani, Georgia Institute of Technology<br />
Creation Date: July 21, 2021 4:54 PM EST

# Import
Some python packages are needed to run this notebook. We import all of those below.

In [1]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
from torch import tensor, cuda, manual_seed, zeros, nn, optim, reshape
from torch import float as torch_float
from torch_geometric.data import DataLoader
from torch import device as torch_device
import torch.nn.functional as F
import random
from sklearn.model_selection import train_test_split
from torch_geometric.data import Data

In [2]:
from nndebugger import constants, loss, dl_debug
from nndebugger import torch_utils as utils

# TODO For Rishi before publishing notebook:

1. Consider using `trainer` function for all tests in `dl_debug`
1. Remove all 'importlib' statements
1. Run all cells and verify that the outputs are what you expected
1. Try using all the polymers for "chart_dependencies" instead of a small sample
1. Delete this cell
1. Delete the cell below

# TODO for Rishi on presentation:
1. Describe GNN?

# Fix random seeds to ensure reproducible results

In [3]:
random.seed(constants.RANDOM_SEED)
manual_seed(constants.RANDOM_SEED)
np.random.seed(constants.RANDOM_SEED)

# Load data set 

In [4]:
data_df = pd.read_csv('data/export.csv',index_col=0)
data_df.head()

Unnamed: 0,smiles,property,value
822,[*]C[*],Egc,6.8972
823,[*]CC([*])C,Egc,6.5196
824,[*]CC([*])CC,Egc,6.517
825,[*]CC([*])CCC,Egc,6.7336
826,[*]CC([*])CC(C)C,Egc,6.7394


# Featurize data set

In [5]:
N_FEATURES = 512
N_DATA = len(data_df)

def featurize_smiles(smile):
    smile = smile.replace('*', 'H')
    mol = Chem.MolFromSmiles(smile)
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=N_FEATURES, useChirality=True)
    return np.array(fp)

feature_array = np.zeros((N_DATA, N_FEATURES))

ind = 0
for smiles in data_df.smiles.values:
    feature_array[ind,:] = featurize_smiles(smiles)
    ind += 1

# Prepare inputs for DebugSession

In [6]:
# bug free processing pipeline!
model_type = 'mlp'
# data_set
n_test = int(np.floor(N_DATA*constants.TRAIN_FRAC))
n_train = N_DATA - n_test
(X_train, X_test, label_train, 
label_test) = train_test_split(
                                    feature_array,
                                    data_df.value.values.tolist(),
                                    test_size=n_test,
                                    shuffle=True,
                                    random_state=constants.RANDOM_SEED
                                )

train_X = [Data(x=tensor(X_train[ind,:], dtype=torch_float).view(1,N_FEATURES),
                y=tensor(label_train[ind], dtype=torch_float)
            ) 
            for ind in range(n_train)]
zero_data_set = [Data(x=zeros((1,N_FEATURES)), y=x.y) for x in train_X]
data_set = {}
data_set['train'] = train_X
loss_fn = loss.st_loss()
target_mean = np.mean(label_train)
epsilon = constants.DL_DBG_OVERFIT_EPS_RATIO*(target_mean)
device = torch_device('cuda' if cuda.is_available() else 'cpu')

# Write a logical architecture that will pass all test cases

In [7]:
class FFNet(nn.Module):
    
    def __init__(self, input_dim, output_dim, complexity):

        super(FFNet,self).__init__()
        self.layers = nn.ModuleList()
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.n_hidden = complexity
        unit_sequence = utils.unit_sequence(self.input_dim, 
                                            self.output_dim, 
                                            self.n_hidden)
        self.relu = nn.ReLU()
        # set up hidden layers
        for ind,n_units in enumerate(unit_sequence[:-2]):
            size_out_ = unit_sequence[ind+1]
            layer = nn.Linear(n_units, size_out_)
            self.layers.append(layer)

        # set up output layer
        size_in_ = unit_sequence[-2]
        size_out_ = unit_sequence[-1]
        layer = nn.Linear(size_in_, size_out_)
        self.layers.append(layer)
    
    def forward(self, data):
        x = data.x
        for i in range(len(self.layers)):
            x = self.layers[i](x)
            if i < (self.n_hidden - 1):
                x = self.relu(x)
   
        return x.view(data.num_graphs,)

# a list of models that are bug free!
complexity_ls = [1,2,3]
correct_model_class_ls = [lambda : FFNet(N_FEATURES, 1, complexity) for complexity in
                          complexity_ls]

# Test output shape

The shape of the model output should match the shape of the labels.

In [None]:
# this cell should pass since it uses a bug-free model

ds = dl_debug.DebugSession(model_type, correct_model_class_ls, complexity_ls, data_set, zero_data_set, loss_fn, epsilon,
                 device, do_test_output_shape=True)
ds.main()

In [None]:
# buggy model. Can you spot the bug?

class BuggyNet(nn.Module):
    
    def __init__(self, input_dim, output_dim, complexity):

        super(BuggyNet,self).__init__()
        self.layers = nn.ModuleList()
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.n_hidden = complexity
        unit_sequence = utils.unit_sequence(self.input_dim, 
                                            self.output_dim, 
                                            self.n_hidden)
        self.relu = nn.ReLU()
        # set up hidden layers
        for ind,n_units in enumerate(unit_sequence[:-2]):
            size_out_ = unit_sequence[ind+1]
            layer = nn.Linear(n_units, size_out_)
            self.layers.append(layer)

        # set up output layer
        size_in_ = unit_sequence[-2]
        size_out_ = unit_sequence[-1]
        layer = nn.Linear(size_in_, size_out_)
        self.layers.append(layer)
    
    def forward(self, data):
        x = data.x
        for i in range(len(self.layers)):
            x = self.layers[i](x)
            if i < (self.n_hidden - 1):
                x = self.relu(x)
   
        return x # Spoiler! The bug is here.

# a list of models that are buggy
complexity_ls = [1,2,3]
buggy_model_class_ls = [lambda : BuggyNet(N_FEATURES, 1, complexity) for complexity in
                          complexity_ls]

In [None]:
# this cell should NOT pass since it uses a buggy model 

ds = dl_debug.DebugSession(model_type, buggy_model_class_ls, complexity_ls, data_set, zero_data_set, loss_fn, epsilon,
                 device, do_test_output_shape=True)
ds.main()

# Test input independent baseline
The loss of the model should be lower when real features are passed in than when zeroed features are passed in.

In [None]:
# trainer without bugs!

def trainer(model, data_set, batch_size, learning_rate, n_epochs, device, loss_obj):
    
    data_loader = DataLoader(data_set, batch_size=batch_size, shuffle=True)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate) # Adam optimization
    model.train() # set model to train mode
    loss_history = []
    for epoch in range(n_epochs):
        per_epoch_loss = 0
        for ind, data in enumerate(data_loader): # loop through training batches
            data = data.to(device) # send data to GPU, if available
            optimizer.zero_grad() # zero the gradients
            output = model(data) # perform forward pass
            loss = loss_obj(output, data) # compute loss
            per_epoch_loss += loss.detach().cpu().numpy()
            loss.backward() # perform backward pass
            optimizer.step() # update weights
        loss_history.append(per_epoch_loss)
    
    return loss_history

In [None]:
# this test should pass since we are using a trainer without bugs

ds = dl_debug.DebugSession(model_type, correct_model_class_ls, complexity_ls, data_set, zero_data_set, loss_fn, epsilon,
                 device, do_test_input_independent_baseline=True, trainer=trainer)
ds.main()

In [None]:
# trainer with bugs! Can you spot the bug?

def buggy_trainer(model, data_set, batch_size, learning_rate, n_epochs, device, loss_obj):
    
    data_loader = DataLoader(data_set, batch_size=batch_size, shuffle=True)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate) # Adam optimization
    model.train() # set model to train mode
    loss_history = []
    for epoch in range(n_epochs):
        per_epoch_loss = 0
        for ind, data in enumerate(data_loader): # loop through training batches
            data = data.to(device) # send data to GPU, if available
            optimizer.zero_grad() # zero the gradients
            output = model(data) # perform forward pass
            loss = loss_obj(output, data) # compute loss
            per_epoch_loss += loss.detach().cpu().numpy()
            optimizer.step() # update weights
        loss_history.append(per_epoch_loss)
    
    return loss_history

# Spoiler! The bug is that there is no backward pass being performed!

In [None]:
import importlib
importlib.reload(dl_debug)

In [None]:
# this test should NOT pass since we are using a buggy trainer

ds = dl_debug.DebugSession(model_type, correct_model_class_ls, complexity_ls, data_set, zero_data_set, loss_fn, epsilon,
                 device, do_test_input_independent_baseline=True, trainer=buggy_trainer)
ds.main()

# Overfit small batch
If you hope to learn a good map on your whole data set using model archicture ***A***, then ***A*** should have enough capacity to completely overfit a small batch of the data set.

In [None]:
import importlib
importlib.reload(constants)
importlib.reload(dl_debug)

In [None]:
# this test should pass since we are using a good model

ds = dl_debug.DebugSession(model_type, correct_model_class_ls, complexity_ls, data_set, zero_data_set, loss_fn, epsilon,
                 device, do_test_overfit_small_batch=True, trainer=trainer)
ds.main()

In [None]:
# buggy model. Can you spot the "bug"?

class BuggyNet(nn.Module):
    
    def __init__(self, input_dim, output_dim, complexity):

        super(BuggyNet,self).__init__()
        self.layers = nn.ModuleList()
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.n_hidden = complexity
        unit_sequence = utils.unit_sequence(self.input_dim, 
                                            self.output_dim, 
                                            self.n_hidden)
        self.sigmoid = nn.Sigmoid() # Spoiler! The "bug" is here.
        # set up hidden layers
        for ind,n_units in enumerate(unit_sequence[:-2]):
            size_out_ = unit_sequence[ind+1]
            layer = nn.Linear(n_units, size_out_)
            self.layers.append(layer)

        # set up output layer
        size_in_ = unit_sequence[-2]
        size_out_ = unit_sequence[-1]
        layer = nn.Linear(size_in_, size_out_)
        self.layers.append(layer)
    
    def forward(self, data):
        x = data.x
        for i in range(len(self.layers)):
            x = self.layers[i](x)
            if i < (self.n_hidden - 1):
                x = self.sigmoid(x)
   
        return x.view(data.num_graphs,) 

# a list of models that are buggy
complexity_ls = [1,2,3]
buggy_model_class_ls = [lambda : BuggyNet(N_FEATURES, 1, complexity) for complexity in
                          complexity_ls]

In [None]:
# this test should not pass since we are using a buggy model

ds = dl_debug.DebugSession(model_type, buggy_model_class_ls, complexity_ls, data_set, zero_data_set, loss_fn, epsilon,
                 device, do_test_overfit_small_batch=True, trainer=trainer)
ds.main()

# Visualize predictions of a large batch as a function of epoch
There should not be a large jump in predicted value between epochs (except, perhaps, in the first few epochs). However, predictions should not stay constant between epochs either.

# Chart Dependencies
The `forward` method should not pass information along the batch dimension.

In [36]:
# data to illustrate the point

import importlib
importlib.reload(dl_debug)
np.random.seed(constants.RANDOM_SEED)
polymer_indices = data_df.sample(n=4).index
polymer_smiles = data_df.loc[polymer_indices, 'smiles'].values.tolist()
polymer_smiles

['[*]C(C#N)=C([*])c1ccccc1',
 '[*]CCCCOC(=O)C(=O)O[*]',
 '[*]CC(CCl)(CCl)C(=O)O[*]',
 '[*]c1[nH]c([*])c(C(=O)O)c1C']

In [37]:
feature_dict = {'C': np.array([1,0,0,0]),
    'O': np.array([0,1,0,0]),
    'N': np.array([0,0,1,0]),
    'Cl': np.array([0,0,0,1])
}
N_FEATURES_ = len(feature_dict)
N_DATA_ = len(polymer_smiles)
MAX_N_ATOMS = max([Chem.MolFromSmiles(smile).GetNumAtoms() for smile in polymer_smiles])
PROJECTOR_DIM = 100

def featurize_smiles_by_atom(smile):
    smile = smile.replace('*', 'H')
    mol = Chem.MolFromSmiles(smile)
    features = np.zeros((MAX_N_ATOMS, N_FEATURES_))
    for ind,atom in enumerate(mol.GetAtoms()):
        atom_feature = feature_dict[atom.GetSymbol()]
        features[ind, :] = atom_feature

    return features

# feature_array = np.zeros((N_DATA_, MAX_N_ATOMS, N_FEATURES_))
labels = data_df.loc[polymer_indices, 'value'].values
# for ind, smiles in enumerate(polymer_smiles):
#     feature_array[ind, ].append(featurize_smiles_by_atom(smiles))

train_X_ = [Data(x=tensor(featurize_smiles_by_atom(polymer_smiles[ind]), dtype=torch_float),
                    y=tensor(labels[ind], dtype=torch_float)
            ) 
            for ind in range(N_DATA_)
]
# for smiles,data in zip(polymer_smiles,train_X_):
#     data.num_atoms = Chem.MolFromSmiles(smiles).GetNumAtoms()
data_set_ = {'train': train_X_}

In [38]:
[(smile, featurize_smiles_by_atom(smile)) for smile in polymer_smiles]

[('[*]C(C#N)=C([*])c1ccccc1',
  array([[1., 0., 0., 0.],
         [1., 0., 0., 0.],
         [0., 0., 1., 0.],
         [1., 0., 0., 0.],
         [1., 0., 0., 0.],
         [1., 0., 0., 0.],
         [1., 0., 0., 0.],
         [1., 0., 0., 0.],
         [1., 0., 0., 0.],
         [1., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.]])),
 ('[*]CCCCOC(=O)C(=O)O[*]',
  array([[1., 0., 0., 0.],
         [1., 0., 0., 0.],
         [1., 0., 0., 0.],
         [1., 0., 0., 0.],
         [0., 1., 0., 0.],
         [1., 0., 0., 0.],
         [0., 1., 0., 0.],
         [1., 0., 0., 0.],
         [0., 1., 0., 0.],
         [0., 1., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.]])),
 ('[*]CC(CCl)(CCl)C(=O)O[*]',
  array([[1., 0., 0., 0.],
         [1., 0., 0., 0.],
         [1., 0., 0., 0.],
         [0., 0., 0., 1.],
         [1., 0., 0., 0.],
         [0., 0., 0., 1.],
         [1., 0., 0., 0.],
         [0., 1., 0., 0.],
         [0., 1., 0., 0.],
         [0., 0

In [39]:
class GraphNet(nn.Module):
    
    def __init__(self, input_dim, output_dim, complexity):

        super(GraphNet,self).__init__()
        self.layers = nn.ModuleList()
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.n_hidden = complexity
        unit_sequence = utils.unit_sequence(self.input_dim, 
                                            self.output_dim, 
                                            self.n_hidden)
        self.node_projector = nn.Linear(N_FEATURES_, PROJECTOR_DIM)
        self.relu = nn.ReLU()
        # set up hidden layers
        for ind,n_units in enumerate(unit_sequence[:-2]):
            size_out_ = unit_sequence[ind+1]
            layer = nn.Linear(n_units, size_out_)
            self.layers.append(layer)

        # set up output layer
        size_in_ = unit_sequence[-2]
        size_out_ = unit_sequence[-1]
        layer = nn.Linear(size_in_, size_out_)
        self.layers.append(layer)
    
    def forward(self, data):
        x = data.x
        x = x.view(data.num_graphs, MAX_N_ATOMS, N_FEATURES_)
        x = self.node_projector(x)
        print(x.shape)
        x_mean = x.mean(dim=1)
        x = x - x_mean[:, None, :] # make use of broadcasting
        x = x.sum(dim=1)
        print(x.shape)
        for i in range(len(self.layers)):
            x = self.layers[i](x)
            if i < (self.n_hidden - 1):
                x = self.relu(x)
   
        return x.view(data.num_graphs,)

# a list of models that are bug free!
complexity_ls = [1,2,3]
correct_graphnet_class_ls = [lambda : GraphNet(PROJECTOR_DIM, 1, complexity) for complexity in
                          complexity_ls]

In [40]:
# this test should pass since we are using a bug-free model

ds = dl_debug.DebugSession('gnn', correct_graphnet_class_ls, complexity_ls, data_set_, zero_data_set, loss_fn, epsilon,
                 device, do_chart_dependencies=True)
ds.main()

Training data contains 4 points

torch.Size([4, 12, 100])


RuntimeError: The size of tensor a (100) must match the size of tensor b (4) at non-singleton dimension 2

In [19]:
# this is a buggy model. Can you spot the bugs?
class BuggyGraphNet(nn.Module):
    
    def __init__(self, input_dim, output_dim, complexity):

        super(BuggyGraphNet,self).__init__()
        self.layers = nn.ModuleList()
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.n_hidden = complexity
        unit_sequence = utils.unit_sequence(self.input_dim, 
                                            self.output_dim, 
                                            self.n_hidden)
        self.relu = nn.ReLU()
        # set up hidden layers
        for ind,n_units in enumerate(unit_sequence[:-2]):
            size_out_ = unit_sequence[ind+1]
            layer = nn.Linear(n_units, size_out_)
            self.layers.append(layer)

        # set up output layer
        size_in_ = unit_sequence[-2]
        size_out_ = unit_sequence[-1]
        layer = nn.Linear(size_in_, size_out_)
        self.layers.append(layer)
    
    def forward(self, data):
        x = data.x
        x = x.view(data.num_graphs, MAX_N_ATOMS, N_FEATURES_)
        x_mean = x.mean(dim=0) # Spoiler! This is the bug, Part 1
        x = x - x_mean[None, :, :] # This is the bug, Part 2
        x = x.sum(dim=1)
        # print(x)
        for i in range(len(self.layers)):
            x = self.layers[i](x)
            if i < (self.n_hidden - 1):
                x = self.relu(x)
   
        return x.view(data.num_graphs,)

# a list of models that are bug free!
complexity_ls = [1,2,3]
buggy_graphnet_class_ls = [lambda : BuggyGraphNet(N_FEATURES_, 1, complexity) for complexity in
                          complexity_ls]

In [20]:
# this test should not pass since we are using a buggy model

ds = dl_debug.DebugSession('gnn', buggy_graphnet_class_ls, complexity_ls, data_set_, zero_data_set, loss_fn, epsilon,
                 device, do_chart_dependencies=True)
ds.main()

Training data contains 4 points


Beginning to chart dependencies
..Epoch 0
....Outputs 0.1773 0.2965 0.2670 0.2801
....Labels  4.3452 5.0922 6.5510 3.2017
....Loss: 0.17732521891593933
Finished charting dependencies. Data is not getting passed along the batch dimension.


Debug session complete.


# Overfit training data & gradient check
The capacity of your architecture should be just large enough to overfit the training data. Also, the gradients should not equal zero before overfitting all training data.