# This notebook is a tutorial on NetDebugger
Author: Rishi Gurnani, Georgia Institute of Technology<br />
Creation Date: July 21, 2021 4:54 PM EST

# Import
Some Python packages are needed to run this notebook. We import all of those below.

In [1]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
from torch import tensor, cuda, manual_seed, zeros, numel, nn
from torch import float as torch_float
from torch import device as torch_device
import torch.nn.functional as F
import random
from sklearn.model_selection import train_test_split
from torch_geometric.data import Data


In [2]:
from nndebugger import constants, loss, dl_debug
from nndebugger import torch_utils as utils

# Fix random seeds to ensure reproducible results

In [3]:
# Seed the Python, PyTorch and NumPy random number generators with the same value
for seed_fn in (random.seed, manual_seed, np.random.seed):
    seed_fn(constants.RANDOM_SEED)

# Load data set 

In [4]:
# Read the exported table; the first CSV column is used as the row index
data_df = pd.read_csv(
    'data/export.csv',
    index_col=0,
)
data_df.head()

Unnamed: 0,smiles,property,value
822,[*]C[*],Egc,6.8972
823,[*]CC([*])C,Egc,6.5196
824,[*]CC([*])CC,Egc,6.517
825,[*]CC([*])CCC,Egc,6.7336
826,[*]CC([*])CC(C)C,Egc,6.7394


# Featurize data set

In [5]:
N_FEATURES = 512  # number of bits requested for each fingerprint
N_DATA = data_df.shape[0]  # one data point per row of data_df

def featurize_smiles(smile):
    """
    Convert a SMILES string into a Morgan fingerprint bit vector.

    Keyword arguments:
    smile -- SMILES string; every '*' character is replaced with 'H' before parsing

    Returns a numpy array built from the N_FEATURES-bit fingerprint
    (radius 2, chirality-aware).

    Raises ValueError if RDKit cannot parse the string.
    """
    capped_smile = smile.replace('*', 'H')
    mol = Chem.MolFromSmiles(capped_smile)
    # MolFromSmiles signals a parse failure by returning None; fail here with a
    # readable message instead of an opaque error inside the fingerprint call
    if mol is None:
        raise ValueError(
            f'RDKit could not parse SMILES {smile!r} (after capping: {capped_smile!r})'
        )
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=N_FEATURES, useChirality=True)
    return np.array(fp)

# One fingerprint row per SMILES string, in the same order as data_df
feature_array = np.zeros((N_DATA, N_FEATURES))
for row, smiles_str in enumerate(data_df.smiles.values):
    feature_array[row, :] = featurize_smiles(smiles_str)

# Prepare inputs for DebugSession

In [6]:
# bug free processing pipeline!
model_type = 'mlp'
# one model is built per entry; each entry is passed to FFNet as `complexity`
complexity_ls = [1,2,3]
# data_set
# NOTE(review): the size of the *test* split is computed from constants.TRAIN_FRAC,
# so the training split receives the remaining points -- confirm this is intended
n_test = int(np.floor(N_DATA*constants.TRAIN_FRAC))
n_train = N_DATA - n_test
(X_train, X_test, label_train, 
label_test) = train_test_split(
                                    feature_array,
                                    data_df.value.values.tolist(),
                                    test_size=n_test,
                                    shuffle=True,
                                    random_state=constants.RANDOM_SEED
                                )

# Wrap every training point in a Data object: x is a (1, N_FEATURES) float tensor,
# y is a float tensor built from the single label label_train[ind]
train_X = [Data(x=tensor(X_train[ind,:], dtype=torch_float).view(1,N_FEATURES),
                y=tensor(label_train[ind], dtype=torch_float)
            ) 
            for ind in range(n_train)]
# Same labels as train_X, but every feature vector is replaced by zeros
zero_data_set = [Data(x=zeros((1,N_FEATURES)), y=x.y) for x in train_X]
data_set = {}
data_set['train'] = train_X
loss_fn = loss.st_loss()
# epsilon is the mean training label scaled by constants.DL_DBG_OVERFIT_EPS_RATIO
target_mean = np.mean(label_train)
epsilon = constants.DL_DBG_OVERFIT_EPS_RATIO*(target_mean)
# use the GPU when one is available, otherwise fall back to the CPU
device = torch_device('cuda' if cuda.is_available() else 'cpu')

# Write a logical architecture that will pass all test cases

In [10]:
class FFNet(nn.Module):
    """
    Feed-forward network: a stack of linear layers with a ReLU after every
    layer except the output layer.
    """

    def __init__(self, input_dim, output_dim, complexity):
        """
        Keyword arguments:
        input_dim -- number of input features
        output_dim -- number of model outputs
        complexity -- stored as n_hidden and forwarded to utils.unit_sequence,
                      which returns the sequence of layer widths
        """
        super(FFNet,self).__init__()
        self.layers = nn.ModuleList()
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.n_hidden = complexity
        unit_sequence = utils.unit_sequence(self.input_dim, 
                                            self.output_dim, 
                                            self.n_hidden)
        self.relu = nn.ReLU()
        # set up hidden layers
        for ind,n_units in enumerate(unit_sequence[:-2]):
            size_out_ = unit_sequence[ind+1]
            layer = nn.Linear(n_units, size_out_)
            self.layers.append(layer)

        # set up output layer
        size_in_ = unit_sequence[-2]
        size_out_ = unit_sequence[-1]
        layer = nn.Linear(size_in_, size_out_)
        self.layers.append(layer)
    
    def forward(self, data):
        """
        Keyword arguments:
        data -- object whose `x` attribute holds the input feature tensor

        Returns the output of the last linear layer (no activation applied to it).
        """
        x = data.x
        # Gate the activation on the real number of layers rather than on n_hidden,
        # so every hidden layer is followed by a ReLU whatever length unit_sequence has
        output_layer_ind = len(self.layers) - 1
        for i, layer in enumerate(self.layers):
            x = layer(x)
            if i < output_layer_ind:
                x = self.relu(x)
   
        return x

# a list of models that are bug free!
# `complexity=complexity` freezes the current value when each lambda is created.
# A bare closure looks the variable up only when called, after the comprehension has
# finished, so every factory would build a model with the last entry of complexity_ls.
correct_model_class_ls = [lambda complexity=complexity: FFNet(N_FEATURES, 1, complexity)
                          for complexity in complexity_ls]

# Test output shape

The shape of the model output should match the shape of the labels.

In [11]:
# Expected to pass: all inputs come from the bug-free processing pipeline
ds = dl_debug.DebugSession(
    model_type,
    correct_model_class_ls,
    complexity_ls,
    data_set,
    zero_data_set,
    loss_fn,
    epsilon,
    device,
    do_test_output_shape=True,
)
ds.main()

Training data contains 676 points


target_abs_mean 4.499941825866699 


Verified that shape of model predictions is equal to shape of labels


Debug session complete.


In [18]:
# buggy processing pipeline. Can you spot the bug?
model_type = 'mlp'
complexity_ls = [1,2,3]
# data_set
# NOTE(review): the size of the *test* split is computed from constants.TRAIN_FRAC,
# so the training split receives the remaining points -- confirm this is intended
n_test = int(np.floor(N_DATA*constants.TRAIN_FRAC))
n_train = N_DATA - n_test
(X_train, X_test, label_train, 
label_test) = train_test_split(
                                    feature_array,
                                    data_df.value.values.tolist(),
                                    test_size=n_test,
                                    shuffle=True,
                                    random_state=constants.RANDOM_SEED
                                )
# The reshape turns the labels into a (1, n_train) array, so label_train[:, ind] below
# is a length-1 array and each y becomes a 1-element tensor instead of a single scalar
label_train = np.array(label_train).reshape(1, n_train) # Spoiler! The bug is this line
train_X = [Data(x=tensor(X_train[ind,:], dtype=torch_float).view(1,N_FEATURES),
                y=tensor(label_train[:, ind], dtype=torch_float) 
            ) 
            for ind in range(n_train)]
# Same labels as train_X, but every feature vector is replaced by zeros
zero_data_set = [Data(x=zeros((1,N_FEATURES)), y=x.y) for x in train_X]
data_set = {}
data_set['train'] = train_X
loss_fn = loss.st_loss()
# epsilon is the mean training label scaled by constants.DL_DBG_OVERFIT_EPS_RATIO
target_mean = np.mean(label_train)
epsilon = constants.DL_DBG_OVERFIT_EPS_RATIO*(target_mean)
# use the GPU when one is available, otherwise fall back to the CPU
device = torch_device('cuda' if cuda.is_available() else 'cpu')

In [19]:
# Expected NOT to pass: the inputs come from the buggy processing pipeline
# NOTE(review): the saved output of this cell reports the shape check as verified --
# re-run on a fresh kernel and confirm the output-shape test actually fails here
ds = dl_debug.DebugSession(
    model_type,
    correct_model_class_ls,
    complexity_ls,
    data_set,
    zero_data_set,
    loss_fn,
    epsilon,
    device,
    do_test_output_shape=True,
)
ds.main()

Training data contains 676 points


target_abs_mean 4.499941825866699 


Verified that shape of model predictions is equal to shape of labels


Debug session complete.


# Test input independent baseline
The loss of the model should be lower when real features are passed in than when zeroed features are passed in.

# Overfit small batch
If you hope to learn a good map on your whole data set using model architecture ***A***, then ***A*** should have enough capacity to completely overfit a small batch of the data set.

# Visualize predictions of a large batch as a function of epoch
There should not be a large jump in predicted value between epochs (except, perhaps, in the first few epochs). However, predictions should not stay constant between epochs either.

# Chart Dependencies
The `forward` method should not pass information along the batch dimension.

# Overfit training data & gradient check
The capacity of your architecture should be just large enough to overfit the training data. Also, the gradients should not equal zero before overfitting all training data.