Skip to content

PyTorch Dropout layers cause issue in candidate selection #20

@austinhoag

Description

@austinhoag

When the Seldonian model is a PyTorch model which contains calls to torch.nn.Dropout, gradient descent produces nans. Removing only the dropout calls, while keeping all other layers, resolves the issue.

Example:

from seldonian.models.pytorch_model import SupervisedPytorchBaseModel
import torch.nn as nn
import torch

class testCNN(nn.Module):
    """Small CNN: four conv/BN/dropout stages followed by three FC layers.

    The network emits the probability of the positive class only
    (``softmax(...)[:, 1]``), i.e. a 1-D tensor of shape ``(batch,)``.

    NOTE(review): ``fc1`` is sized ``128 * 1 * 1``, which implies 48x48
    single-channel inputs (48 -> 24 -> 11 -> 4 -> 1 after the four
    conv+maxpool stages) — confirm against the data pipeline.
    """

    def __init__(self):
        super(testCNN, self).__init__()
        # Layer construction order is deliberately kept stable so that
        # parameter initialization is reproducible under a fixed RNG seed.
        self.cnn1 = nn.Conv2d(in_channels=1, out_channels=16, kernel_size=3, padding=1)
        self.cnn2 = nn.Conv2d(in_channels=16, out_channels=32, kernel_size=3)
        self.cnn3 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3)
        self.cnn4 = nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3)

        self.relu = nn.ReLU()
        self.maxpool = nn.MaxPool2d(kernel_size=2)

        self.Batch1 = nn.BatchNorm2d(16)
        self.Batch2 = nn.BatchNorm2d(32)
        self.Batch3 = nn.BatchNorm2d(64)
        self.Batch4 = nn.BatchNorm2d(128)

        # Drop1 is shared by every conv stage; Drop2 by the FC stages.
        # nn.Dropout is stateless, so reuse is safe.
        self.Drop1 = nn.Dropout(0.2)
        self.Drop2 = nn.Dropout(0.5)

        # Fully connected readout head.
        self.fc1 = nn.Linear(128 * 1 * 1, 128)
        self.fc2 = nn.Linear(128, 256)
        self.fc3 = nn.Linear(256, 2)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Run the conv stages, flatten, run the FC head, return P(class 1)."""
        h = x
        # Each stage: conv -> ReLU -> maxpool -> batchnorm -> dropout,
        # in exactly this order (matches the original statement sequence,
        # so dropout RNG draws happen in the same order).
        conv_stages = (
            (self.cnn1, self.Batch1),
            (self.cnn2, self.Batch2),
            (self.cnn3, self.Batch3),
            (self.cnn4, self.Batch4),
        )
        for conv, bn in conv_stages:
            h = self.maxpool(self.relu(conv(h)))
            h = self.Drop1(bn(h))

        # Flatten everything but the batch dimension for the FC head.
        h = h.view(h.size(0), -1)

        h = self.Drop2(self.fc1(h))
        h = self.Drop2(self.fc2(h))
        h = self.fc3(h)

        # Probability of the positive class only.
        return self.softmax(h)[:, 1]

class SeldoPytorchTestModel(SupervisedPytorchBaseModel):
    """Seldonian wrapper exposing ``testCNN`` through the engine's
    SupervisedPytorchBaseModel interface."""

    def __init__(self, device):
        """Set up the base model on the given torch device.

        :param device: torch.device on which the model will run
        """
        super().__init__(device)

    def create_model(self, **kwargs):
        """Instantiate and return the underlying PyTorch module.

        Returns a ``testCNN``: four conv/BN/dropout stages plus a
        three-layer FC head, outputting the positive-class probability.
        NOTE(review): earlier comments described 28x28 inputs, but the
        network's fc1 sizing implies 48x48 — confirm with the dataset.
        """
        return testCNN()

import autograd.numpy as np   # Thinly-wrapped version of Numpy

from seldonian.spec import SupervisedSpec
from seldonian.dataset import SupervisedDataSet
from seldonian.utils.io_utils import load_pickle,save_pickle
from seldonian.models import objectives
from seldonian.seldonian_algorithm import SeldonianAlgorithm
from seldonian.parse_tree.parse_tree import (
	make_parse_trees_from_constraints)

import torch

if __name__ == "__main__":
	# Reproduction driver: load pre-pickled features/labels/sensitive attrs,
	# build a SupervisedSpec around SeldoPytorchTestModel, and run the
	# Seldonian algorithm with a disparate-impact style accuracy constraint.
	torch.manual_seed(0)
	regime = 'supervised_learning'
	sub_regime = 'classification'

	# NOTE(review): original comment claimed a batch size of 150, but the
	# hyperparams below use batch_size=237 (23700 / 237 = 100 batches per
	# full pass) — confirm which was intended.
	N = 23700  # total number of samples expected in the pickles

	# Get the data, load from file if already saved
	savename_features = './features.pkl'
	savename_labels = './labels.pkl'
	savename_sensitive_attrs = './sensitive_attrs.pkl'

	features = load_pickle(savename_features)
	labels = load_pickle(savename_labels)
	sensitive_attrs = load_pickle(savename_sensitive_attrs)

	assert len(features) == N
	assert len(labels) == N
	assert len(sensitive_attrs) == N
	frac_data_in_safety = 0.5
	sensitive_col_names = ['M', 'F']

	meta_information = {}
	meta_information['feature_col_names'] = ['img']
	meta_information['label_col_names'] = ['label']
	meta_information['sensitive_col_names'] = sensitive_col_names
	meta_information['sub_regime'] = sub_regime

	print("Making SupervisedDataSet...")
	dataset = SupervisedDataSet(
		features=features,
		labels=labels,
		sensitive_attrs=sensitive_attrs,
		num_datapoints=N,
		meta_information=meta_information)

	# Constraint: accuracy ratio between the M and F groups must be >= 0.8
	# (in both directions), with confidence 1 - delta.
	constraint_strs = ['min((ACC | [M])/(ACC | [F]),(ACC | [F])/(ACC | [M])) >= 0.8']
	deltas = [0.05]
	print("Making parse trees for constraint(s):")
	print(constraint_strs, " with deltas: ", deltas)
	parse_trees = make_parse_trees_from_constraints(
		constraint_strs, deltas, regime=regime,
		sub_regime=sub_regime, columns=sensitive_col_names)

	# NOTE(review): "mps" is the Apple-Silicon backend; switch to "cpu" or
	# "cuda" on other machines.
	device = torch.device("mps")
	# BUG FIX: was `SeldoPyTorchTestModel` (capital T), which raises a
	# NameError — the class defined above is `SeldoPytorchTestModel`.
	model = SeldoPytorchTestModel(device)

	initial_solution_fn = model.get_model_params
	spec = SupervisedSpec(
		dataset=dataset,
		model=model,
		parse_trees=parse_trees,
		frac_data_in_safety=frac_data_in_safety,
		primary_objective=objectives.binary_logistic_loss,
		use_builtin_primary_gradient_fn=False,
		sub_regime=sub_regime,
		initial_solution_fn=initial_solution_fn,
		optimization_technique='gradient_descent',
		optimizer='adam',
		optimization_hyperparams={
			'lambda_init'   : np.array([0.5]),
			'alpha_theta'   : 0.001,
			'alpha_lamb'    : 0.001,
			'beta_velocity' : 0.9,
			'beta_rmsprop'  : 0.95,
			'use_batches'   : True,
			'batch_size'    : 237,
			'n_epochs'      : 40,
			'gradient_library': "autograd",
			'hyper_search'  : None,
			'verbose'       : True,
		},
		batch_size_safety=2000
	)
	save_pickle('./spec.pkl', spec, verbose=True)
	SA = SeldonianAlgorithm(spec)

	passed_safety, solution = SA.run(debug=True, write_cs_logfile=True)
	if passed_safety:
		print("Passed safety test")
		st_primary_objective = SA.evaluate_primary_objective(theta=solution,
		branch='safety_test')
	else:
		print("Failed safety test")

Running the SeldoPytorchTestModel in the engine produces nans after only a few steps of gradient descent:

Have 40 epochs and 50 batches of size 237 for a total of 2000 iterations
Epoch: 0, batch iteration 0
epoch,batch_i,overall_i,f,g,theta,lambda: 0 0 0 0.6966847490660751 [-0.11317656] [-0.00249562  0.17881456 -0.27434838 ...  0.02548225 -0.05133569
  0.03980389] [0.5]

epoch,batch_i,overall_i,f,g,theta,lambda: 0 1 1 0.6979988501544743 [-0.11469971] [-0.00249562  0.17881456 -0.27434838 ...  0.02448849 -0.05033569
  0.03880389] [0.49988682]

epoch,batch_i,overall_i,f,g,theta,lambda: 0 2 2 0.6960756663028701 [-0.11772262] [-0.00249562  0.17881456 -0.27434838 ...  0.02493663 -0.04869457
  0.03716277] [0.49977212]

epoch,batch_i,overall_i,f,g,theta,lambda: 0 3 3 0.6964742956282217 [-0.12167626] [-0.00249562  0.17881456 -0.27434838 ...  0.02554206 -0.0465446
  0.0350128 ] [0.4996544]

  File '/Users/ahoag/anaconda3/envs/seldo-pytorch/lib/python3.9/site-packages/autograd/numpy/numpy_vjps.py', line 338
    divide by zero encountered in double_scalars
  File '/Users/ahoag/anaconda3/envs/seldo-pytorch/lib/python3.9/site-packages/autograd/numpy/numpy_vjps.py', line 340
    invalid value encountered in multiply
epoch,batch_i,overall_i,f,g,theta,lambda: 0 4 4 nan [inf] [-0.00249562  0.17881456 -0.27434838 ...         nan         nan
         nan] [0.49953272]

  File '/Users/ahoag/anaconda3/envs/seldo-pytorch/lib/python3.9/site-packages/seldonian/optimizers/gradient_descent.py', line 191
    Warning: a nan or inf was found during gradient descent. Stopping prematurely and returning NSF.
Wrote /Users/ahoag/beri/code/engine-repo-dev/examples/debug_dropout_issue/logs/candidate_selection_log1.p with candidate selection log info
Failed safety test

Removing the dropout layers from forward() makes the nans go away and gradient descent proceeds normally. We'd like to be able to support dropout layers, so understanding why dropouts produce this issue is a high priority.

Metadata

Metadata

Assignees

Labels

No labels
No labels

Type

No type

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions