# This notebook is a tutorial on NetDebugger
Author: Rishi Gurnani, Georgia Institute of Technology<br />
Creation Date: July 21, 2021 4:54 PM EST

# Import
Some python packages are needed to run this notebook. We import all of those below.

In [1]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
from torch import tensor, cuda, manual_seed, zeros, nn
from torch import float as torch_float
from torch_geometric.data import DataLoader
from torch import optim
from torch import device as torch_device
import torch.nn.functional as F
import random
from sklearn.model_selection import train_test_split
from torch_geometric.data import Data

In [2]:
from nndebugger import constants, loss, dl_debug
from nndebugger import torch_utils as utils

# TODO For Rishi before publishing:

1) Remove all 'importlib' statements
2) Run all cells and verify that the outputs are what you expected

# Fix random seeds to ensure reproducible results

In [3]:
random.seed(constants.RANDOM_SEED)
manual_seed(constants.RANDOM_SEED)
np.random.seed(constants.RANDOM_SEED)

# Load data set 

In [4]:
data_df = pd.read_csv('data/export.csv',index_col=0)
data_df.head()

Unnamed: 0,smiles,property,value
822,[*]C[*],Egc,6.8972
823,[*]CC([*])C,Egc,6.5196
824,[*]CC([*])CC,Egc,6.517
825,[*]CC([*])CCC,Egc,6.7336
826,[*]CC([*])CC(C)C,Egc,6.7394


# Featurize data set

In [5]:
N_FEATURES = 512
N_DATA = len(data_df)

def featurize_smiles(smile):
    smile = smile.replace('*', 'H')
    mol = Chem.MolFromSmiles(smile)
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=N_FEATURES, useChirality=True)
    return np.array(fp)

feature_array = np.zeros((N_DATA, N_FEATURES))

ind = 0
for smiles in data_df.smiles.values:
    feature_array[ind,:] = featurize_smiles(smiles)
    ind += 1

# Prepare inputs for DebugSession

In [6]:
# bug free processing pipeline!
model_type = 'mlp'
# data_set
n_test = int(np.floor(N_DATA*constants.TRAIN_FRAC))
n_train = N_DATA - n_test
(X_train, X_test, label_train, 
label_test) = train_test_split(
                                    feature_array,
                                    data_df.value.values.tolist(),
                                    test_size=n_test,
                                    shuffle=True,
                                    random_state=constants.RANDOM_SEED
                                )

train_X = [Data(x=tensor(X_train[ind,:], dtype=torch_float).view(1,N_FEATURES),
                y=tensor(label_train[ind], dtype=torch_float)
            ) 
            for ind in range(n_train)]
zero_data_set = [Data(x=zeros((1,N_FEATURES)), y=x.y) for x in train_X]
data_set = {}
data_set['train'] = train_X
loss_fn = loss.st_loss()
target_mean = np.mean(label_train)
epsilon = constants.DL_DBG_OVERFIT_EPS_RATIO*(target_mean)
device = torch_device('cuda' if cuda.is_available() else 'cpu')

# Write a logical architecture that will pass all test cases

In [7]:
class FFNet(nn.Module):
    
    def __init__(self, input_dim, output_dim, complexity):
        """
        Keyword arguments:
        shift -- A *tensor* to add to model outputs      
        """
        super(FFNet,self).__init__()
        self.layers = nn.ModuleList()
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.n_hidden = complexity
        unit_sequence = utils.unit_sequence(self.input_dim, 
                                            self.output_dim, 
                                            self.n_hidden)
        self.relu = nn.ReLU()
        # set up hidden layers
        for ind,n_units in enumerate(unit_sequence[:-2]):
            size_out_ = unit_sequence[ind+1]
            layer = nn.Linear(n_units, size_out_)
            self.layers.append(layer)

        # set up output layer
        size_in_ = unit_sequence[-2]
        size_out_ = unit_sequence[-1]
        layer = nn.Linear(size_in_, size_out_)
        self.layers.append(layer)
    
    def forward(self, data):
        x = data.x
        for i in range(len(self.layers)):
            x = self.layers[i](x)
            if i < (self.n_hidden - 1):
                x = self.relu(x)
   
        return x.view(data.num_graphs,)

# a list of models that are bug free!
complexity_ls = [1,2,3]
correct_model_class_ls = [lambda : FFNet(N_FEATURES, 1, complexity) for complexity in
                          complexity_ls]

# Test output shape

The shape of the model output should match the shape of the labels.

In [8]:
# this cell should pass since it uses a bug-free model

ds = dl_debug.DebugSession(model_type, correct_model_class_ls, complexity_ls, data_set, zero_data_set, loss_fn, epsilon,
                 device, do_test_output_shape=True)
ds.main()

Training data contains 676 points


Verified that shape of model predictions is equal to shape of labels


Debug session complete.


In [9]:
# buggy model. Can you spot the bug?

class BuggyNet(nn.Module):
    
    def __init__(self, input_dim, output_dim, complexity):
        """
        Keyword arguments:
        shift -- A *tensor* to add to model outputs      
        """
        super(BuggyNet,self).__init__()
        self.layers = nn.ModuleList()
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.n_hidden = complexity
        unit_sequence = utils.unit_sequence(self.input_dim, 
                                            self.output_dim, 
                                            self.n_hidden)
        self.relu = nn.ReLU()
        # set up hidden layers
        for ind,n_units in enumerate(unit_sequence[:-2]):
            size_out_ = unit_sequence[ind+1]
            layer = nn.Linear(n_units, size_out_)
            self.layers.append(layer)

        # set up output layer
        size_in_ = unit_sequence[-2]
        size_out_ = unit_sequence[-1]
        layer = nn.Linear(size_in_, size_out_)
        self.layers.append(layer)
    
    def forward(self, data):
        x = data.x
        for i in range(len(self.layers)):
            x = self.layers[i](x)
            if i < (self.n_hidden - 1):
                x = self.relu(x)
   
        return x # Spoiler! The bug is here.

# a list of models that are buggy
complexity_ls = [1,2,3]
buggy_model_class_ls = [lambda : BuggyNet(N_FEATURES, 1, complexity) for complexity in
                          complexity_ls]

In [10]:
# this cell should NOT pass since it uses a buggy model 

ds = dl_debug.DebugSession(model_type, buggy_model_class_ls, complexity_ls, data_set, zero_data_set, loss_fn, epsilon,
                 device, do_test_output_shape=True)
ds.main()

Training data contains 676 points



  return F.mse_loss(input, target, reduction=self.reduction)


AssertionError: The model output shape torch.Size([6, 1]) and label shape torch.Size([6]) are not the same

# Test input independent baseline
The loss of the model should be lower when real features are passed in than when zeroed features are passed in.

In [30]:
import importlib
importlib.reload(constants)
importlib.reload(utils)
importlib.reload(dl_debug)

<module 'nndebugger.dl_debug' from '/data/rgur/nndebugger/nndebugger/dl_debug.py'>

In [31]:
# trainer without bugs!

def trainer(model, data_set, batch_size, learning_rate, n_epochs, device, loss_obj):
    
    data_loader = DataLoader(data_set, batch_size=batch_size, shuffle=True)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate) # Adam optimization
    model.train() # set model to train mode
    loss_history = []
    for epoch in range(n_epochs):
        per_epoch_loss = 0
        for ind, data in enumerate(data_loader): # loop through training batches
            data = data.to(device) # send data to GPU, if available
            optimizer.zero_grad() # zero the gradients
            output = model(data) # perform forward pass
            loss = loss_obj(output, data) # compute loss
            per_epoch_loss += loss.detach().cpu().numpy()
            loss.backward() # perform backward pass
            optimizer.step() # update weights
        loss_history.append(per_epoch_loss)
    
    return loss_history

In [32]:
ds = dl_debug.DebugSession(model_type, correct_model_class_ls, complexity_ls, data_set, zero_data_set, loss_fn, epsilon,
                 device, do_test_input_independent_baseline=True, trainer=trainer)
ds.main()

Training data contains 676 points


Checking input-independent baseline
..last epoch real_data_loss 0.25427124463021755
..last epoch zero_data_loss 14.015729784965515
Input-independent baseline is verified


Debug session complete.


In [33]:
# trainer with bugs! Can you spot the bug?

def buggy_trainer(model, data_set, batch_size, learning_rate, n_epochs, device, loss_obj):
    
    data_loader = DataLoader(data_set, batch_size=batch_size, shuffle=True)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate) # Adam optimization
    model.train() # set model to train mode
    loss_history = []
    for epoch in range(n_epochs):
        per_epoch_loss = 0
        for ind, data in enumerate(data_loader): # loop through training batches
            data = data.to(device) # send data to GPU, if available
            optimizer.zero_grad() # zero the gradients
            output = model(data) # perform forward pass
            loss = loss_obj(output, data) # compute loss
            per_epoch_loss += loss.detach().cpu().numpy()
            optimizer.step() # update weights
        loss_history.append(per_epoch_loss)
    
    return loss_history

# Spoiler! The bug is that there is no backward pass being performed!

In [34]:
ds = dl_debug.DebugSession(model_type, correct_model_class_ls, complexity_ls, data_set, zero_data_set, loss_fn, epsilon,
                 device, do_test_input_independent_baseline=True, trainer=buggy_trainer)
ds.main()

Training data contains 676 points


Checking input-independent baseline
..last epoch real_data_loss 128.15090370178223
..last epoch zero_data_loss 130.61536026000977


ValueError: The loss of zeroed inputs is nearly the same as the loss of                     real inputs. This may indicate that your model is not learning anything                     during training. Check your trainer function and your model architecture.

# Overfit small batch
If you hope to learn a good map on your whole data set using model archicture ***A***, then ***A*** should have enough capacity to completely overfit a small batch of the data set.

# Visualize predictions of a large batch as a function of epoch
There should not be a large jump in predicted value between epochs (except, perhaps, in the first few epochs). However, predictions should not stay constant between epochs either.

# Chart Dependencies
The `forward` method should not pass information along the batch dimension.

# Overfit training data & gradient check
The capacity of your architecture should be just large enough to overfit the training data. Also, the gradients should not equal zero before overfitting all training data.