Skip to content

PyTorch Dropout layers cause issue in candidate selection #20

@austinhoag

Description

@austinhoag

When the Seldonian model is a PyTorch model which contains calls to torch.nn.Dropout, gradient descent produces nans. Removing only the dropout calls, while keeping all other layers, resolves the issue.

Example:

from seldonian.models.pytorch_model import SupervisedPytorchBaseModel
import torch.nn as nn
import torch

class testCNN(nn.Module):
    """Small CNN: four conv/BN/dropout stages followed by three FC layers.

    The network emits the probability of the positive class only
    (``softmax(...)[:, 1]``), i.e. a 1-D tensor of shape ``(batch,)``.

    NOTE(review): ``fc1`` is sized ``128 * 1 * 1``, which implies 48x48
    single-channel inputs (48 -> 24 -> 11 -> 4 -> 1 after the four
    conv+maxpool stages) — confirm against the data pipeline.
    """

    def __init__(self):
        super(testCNN, self).__init__()
        # Layer construction order is deliberately kept stable so that
        # parameter initialization is reproducible under a fixed RNG seed.
        self.cnn1 = nn.Conv2d(in_channels=1, out_channels=16, kernel_size=3, padding=1)
        self.cnn2 = nn.Conv2d(in_channels=16, out_channels=32, kernel_size=3)
        self.cnn3 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3)
        self.cnn4 = nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3)

        self.relu = nn.ReLU()
        self.maxpool = nn.MaxPool2d(kernel_size=2)

        self.Batch1 = nn.BatchNorm2d(16)
        self.Batch2 = nn.BatchNorm2d(32)
        self.Batch3 = nn.BatchNorm2d(64)
        self.Batch4 = nn.BatchNorm2d(128)

        # Drop1 is shared by every conv stage; Drop2 by the FC stages.
        # nn.Dropout is stateless, so reuse is safe.
        self.Drop1 = nn.Dropout(0.2)
        self.Drop2 = nn.Dropout(0.5)

        # Fully connected readout head.
        self.fc1 = nn.Linear(128 * 1 * 1, 128)
        self.fc2 = nn.Linear(128, 256)
        self.fc3 = nn.Linear(256, 2)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Run the conv stages, flatten, run the FC head, return P(class 1)."""
        h = x
        # Each stage: conv -> ReLU -> maxpool -> batchnorm -> dropout,
        # in exactly this order (matches the original statement sequence,
        # so dropout RNG draws happen in the same order).
        conv_stages = (
            (self.cnn1, self.Batch1),
            (self.cnn2, self.Batch2),
            (self.cnn3, self.Batch3),
            (self.cnn4, self.Batch4),
        )
        for conv, bn in conv_stages:
            h = self.maxpool(self.relu(conv(h)))
            h = self.Drop1(bn(h))

        # Flatten everything but the batch dimension for the FC head.
        h = h.view(h.size(0), -1)

        h = self.Drop2(self.fc1(h))
        h = self.Drop2(self.fc2(h))
        h = self.fc3(h)

        # Probability of the positive class only.
        return self.softmax(h)[:, 1]

class SeldoPytorchTestModel(SupervisedPytorchBaseModel):
    """Seldonian wrapper exposing ``testCNN`` through the engine's
    SupervisedPytorchBaseModel interface."""

    def __init__(self, device):
        """Set up the base model on the given torch device.

        :param device: torch.device on which the model will run
        """
        super().__init__(device)

    def create_model(self, **kwargs):
        """Instantiate and return the underlying PyTorch module.

        Returns a ``testCNN``: four conv/BN/dropout stages plus a
        three-layer FC head, outputting the positive-class probability.
        NOTE(review): earlier comments described 28x28 inputs, but the
        network's fc1 sizing implies 48x48 — confirm with the dataset.
        """
        return testCNN()

import autograd.numpy as np   # Thinly-wrapped version of Numpy

from seldonian.spec import SupervisedSpec
from seldonian.dataset import SupervisedDataSet
from seldonian.utils.io_utils import load_pickle,save_pickle
from seldonian.models import objectives
from seldonian.seldonian_algorithm import SeldonianAlgorithm
from seldonian.parse_tree.parse_tree import (
	make_parse_trees_from_constraints)

import torch

if __name__ == "__main__":
	# Reproduction driver: load pre-pickled features/labels/sensitive attrs,
	# build a SupervisedSpec around SeldoPytorchTestModel, and run the
	# Seldonian algorithm with a disparate-impact style accuracy constraint.
	torch.manual_seed(0)
	regime = 'supervised_learning'
	sub_regime = 'classification'

	# NOTE(review): original comment claimed a batch size of 150, but the
	# hyperparams below use batch_size=237 (23700 / 237 = 100 batches per
	# full pass) — confirm which was intended.
	N = 23700  # total number of samples expected in the pickles

	# Get the data, load from file if already saved
	savename_features = './features.pkl'
	savename_labels = './labels.pkl'
	savename_sensitive_attrs = './sensitive_attrs.pkl'

	features = load_pickle(savename_features)
	labels = load_pickle(savename_labels)
	sensitive_attrs = load_pickle(savename_sensitive_attrs)

	assert len(features) == N
	assert len(labels) == N
	assert len(sensitive_attrs) == N
	frac_data_in_safety = 0.5
	sensitive_col_names = ['M', 'F']

	meta_information = {}
	meta_information['feature_col_names'] = ['img']
	meta_information['label_col_names'] = ['label']
	meta_information['sensitive_col_names'] = sensitive_col_names
	meta_information['sub_regime'] = sub_regime

	print("Making SupervisedDataSet...")
	dataset = SupervisedDataSet(
		features=features,
		labels=labels,
		sensitive_attrs=sensitive_attrs,
		num_datapoints=N,
		meta_information=meta_information)

	# Constraint: accuracy ratio between the M and F groups must be >= 0.8
	# (in both directions), with confidence 1 - delta.
	constraint_strs = ['min((ACC | [M])/(ACC | [F]),(ACC | [F])/(ACC | [M])) >= 0.8']
	deltas = [0.05]
	print("Making parse trees for constraint(s):")
	print(constraint_strs, " with deltas: ", deltas)
	parse_trees = make_parse_trees_from_constraints(
		constraint_strs, deltas, regime=regime,
		sub_regime=sub_regime, columns=sensitive_col_names)

	# NOTE(review): "mps" is the Apple-Silicon backend; switch to "cpu" or
	# "cuda" on other machines.
	device = torch.device("mps")
	# BUG FIX: was `SeldoPyTorchTestModel` (capital T), which raises a
	# NameError — the class defined above is `SeldoPytorchTestModel`.
	model = SeldoPytorchTestModel(device)

	initial_solution_fn = model.get_model_params
	spec = SupervisedSpec(
		dataset=dataset,
		model=model,
		parse_trees=parse_trees,
		frac_data_in_safety=frac_data_in_safety,
		primary_objective=objectives.binary_logistic_loss,
		use_builtin_primary_gradient_fn=False,
		sub_regime=sub_regime,
		initial_solution_fn=initial_solution_fn,
		optimization_technique='gradient_descent',
		optimizer='adam',
		optimization_hyperparams={
			'lambda_init'   : np.array([0.5]),
			'alpha_theta'   : 0.001,
			'alpha_lamb'    : 0.001,
			'beta_velocity' : 0.9,
			'beta_rmsprop'  : 0.95,
			'use_batches'   : True,
			'batch_size'    : 237,
			'n_epochs'      : 40,
			'gradient_library': "autograd",
			'hyper_search'  : None,
			'verbose'       : True,
		},
		batch_size_safety=2000
	)
	save_pickle('./spec.pkl', spec, verbose=True)
	SA = SeldonianAlgorithm(spec)

	passed_safety, solution = SA.run(debug=True, write_cs_logfile=True)
	if passed_safety:
		print("Passed safety test")
		st_primary_objective = SA.evaluate_primary_objective(theta=solution,
		branch='safety_test')
	else:
		print("Failed safety test")

Running the SeldoPytorchTestModel in the engine produces nans after only a few steps of gradient descent:

Have 40 epochs and 50 batches of size 237 for a total of 2000 iterations
Epoch: 0, batch iteration 0
epoch,batch_i,overall_i,f,g,theta,lambda: 0 0 0 0.6966847490660751 [-0.11317656] [-0.00249562  0.17881456 -0.27434838 ...  0.02548225 -0.05133569
  0.03980389] [0.5]

epoch,batch_i,overall_i,f,g,theta,lambda: 0 1 1 0.6979988501544743 [-0.11469971] [-0.00249562  0.17881456 -0.27434838 ...  0.02448849 -0.05033569
  0.03880389] [0.49988682]

epoch,batch_i,overall_i,f,g,theta,lambda: 0 2 2 0.6960756663028701 [-0.11772262] [-0.00249562  0.17881456 -0.27434838 ...  0.02493663 -0.04869457
  0.03716277] [0.49977212]

epoch,batch_i,overall_i,f,g,theta,lambda: 0 3 3 0.6964742956282217 [-0.12167626] [-0.00249562  0.17881456 -0.27434838 ...  0.02554206 -0.0465446
  0.0350128 ] [0.4996544]

  File '/Users/ahoag/anaconda3/envs/seldo-pytorch/lib/python3.9/site-packages/autograd/numpy/numpy_vjps.py', line 338
    divide by zero encountered in double_scalars
  File '/Users/ahoag/anaconda3/envs/seldo-pytorch/lib/python3.9/site-packages/autograd/numpy/numpy_vjps.py', line 340
    invalid value encountered in multiply
epoch,batch_i,overall_i,f,g,theta,lambda: 0 4 4 nan [inf] [-0.00249562  0.17881456 -0.27434838 ...         nan         nan
         nan] [0.49953272]

  File '/Users/ahoag/anaconda3/envs/seldo-pytorch/lib/python3.9/site-packages/seldonian/optimizers/gradient_descent.py', line 191
    Warning: a nan or inf was found during gradient descent. Stopping prematurely and returning NSF.
Wrote /Users/ahoag/beri/code/engine-repo-dev/examples/debug_dropout_issue/logs/candidate_selection_log1.p with candidate selection log info
Failed safety test

Removing the dropout layers from forward() makes the nans go away and gradient descent proceeds normally. We'd like to be able to support dropout layers, so understanding why dropouts produce this issue is a high priority.

Metadata

Metadata

Assignees

Labels

No labels
No labels

Type

No type

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions