In [9]:
import numpy as np

try:
    import tensorflow.compat.v1 as tf
except ImportError as error:
    from logging import warning
    warning("{}: AdversarialDebiasing will be unavailable. To install, run:\n"
            "pip install 'aif360[AdversarialDebiasing]'".format(error))

from aif360.algorithms import Transformer
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix

In [1]:
class AdversarialDebiasing(Transformer):
    """Adversarial debiasing is an in-processing technique that learns a
    classifier to maximize prediction accuracy and simultaneously reduce an
    adversary's ability to determine the protected attribute from the
    predictions [5]_. This approach leads to a fair classifier as the
    predictions cannot carry any group discrimination information that the
    adversary can exploit.

    References:
        .. [5] B. H. Zhang, B. Lemoine, and M. Mitchell, "Mitigating Unwanted
           Biases with Adversarial Learning," AAAI/ACM Conference on Artificial
           Intelligence, Ethics, and Society, 2018.
    """

    def __init__(self,
                 unprivileged_groups,
                 privileged_groups,
                 scope_name,
                 sess,
                 seed=None,
                 adversary_loss_weight=0.1,
                 num_epochs=50,
                 batch_size=128,
                 classifier_num_hidden_units=200,
                 debias=True):
        """
        Args:
            unprivileged_groups (tuple): Representation for unprivileged groups
            privileged_groups (tuple): Representation for privileged groups
            scope_name (str): scope name for the tenforflow variables
            sess (tf.Session): tensorflow session
            seed (int, optional): Seed to make `predict` repeatable.
            adversary_loss_weight (float, optional): Hyperparameter that chooses
                the strength of the adversarial loss.
            num_epochs (int, optional): Number of training epochs.
            batch_size (int, optional): Batch size.
            classifier_num_hidden_units (int, optional): Number of hidden units
                in the classifier model.
            debias (bool, optional): Learn a classifier with or without
                debiasing.
        """
        super(AdversarialDebiasing, self).__init__(
            unprivileged_groups=unprivileged_groups,
            privileged_groups=privileged_groups)

        self.scope_name = scope_name
        self.seed = seed

        self.unprivileged_groups = unprivileged_groups
        self.privileged_groups = privileged_groups
        if len(self.unprivileged_groups) > 1 or len(self.privileged_groups) > 1:
            raise ValueError("Only one unprivileged_group or privileged_group supported.")
        self.protected_attribute_name = list(self.unprivileged_groups[0].keys())[0]

        self.sess = sess
        self.adversary_loss_weight = adversary_loss_weight
        self.num_epochs = num_epochs
        self.batch_size = batch_size
        self.classifier_num_hidden_units = classifier_num_hidden_units
        self.debias = debias

        self.features_dim = None
        self.features_ph = None
        self.protected_attributes_ph = None
        self.true_labels_ph = None
        self.pred_labels = None

    def _classifier_model(self, features, features_dim, keep_prob):
        """Compute the classifier predictions for the outcome variable.

        Input
        features: a tensor representing the input features for the classifier
        features_dim: dimension of the feature dataset

        Return
        pred_label: predicted label
        pred_logits: raw logits output from the output layer that holds
        the unactivated score of prediction confidence
        """
        with tf.variable_scope("classifier_model"):
            W1 = tf.get_variable('W1', [features_dim, self.classifier_num_hidden_units],
                                  initializer=tf.initializers.glorot_uniform(seed=self.seed1))
            b1 = tf.Variable(tf.zeros(shape=[self.classifier_num_hidden_units]), name='b1')

            h1 = tf.nn.relu(tf.matmul(features, W1) + b1)
            h1 = tf.nn.dropout(h1, keep_prob=keep_prob, seed=self.seed2)

            W2 = tf.get_variable('W2', [self.classifier_num_hidden_units, 1],
                                 initializer=tf.initializers.glorot_uniform(seed=self.seed3))
            b2 = tf.Variable(tf.zeros(shape=[1]), name='b2')

            pred_logit = tf.matmul(h1, W2) + b2
            pred_label = tf.sigmoid(pred_logit) # predictive binary classification with sigmoid activation

        return pred_label, pred_logit

    def _adversary_model(self, pred_logits, true_labels):
        """Compute the adversary predictions for the protected attribute.
        """
        with tf.variable_scope("adversary_model"):
            c = tf.get_variable('c', initializer=tf.constant(1.0))
            s = tf.sigmoid((1 + tf.abs(c)) * pred_logits)

            W2 = tf.get_variable('W2', [3, 1],
                                 initializer=tf.initializers.glorot_uniform(seed=self.seed4))
            b2 = tf.Variable(tf.zeros(shape=[1]), name='b2')

            pred_protected_attribute_logit = tf.matmul(tf.concat([s, s * true_labels, s * (1.0 - true_labels)], axis=1), W2) + b2
            pred_protected_attribute_label = tf.sigmoid(pred_protected_attribute_logit)

        return pred_protected_attribute_label, pred_protected_attribute_logit

    def fit(self, dataset):
        """Compute the model parameters of the fair classifier using gradient
        descent.

        Args:
            dataset (BinaryLabelDataset): Dataset containing true labels.

        Returns:
            AdversarialDebiasing: Returns self.
        """
        if tf.executing_eagerly():
            raise RuntimeError("AdversarialDebiasing does not work in eager "
                    "execution mode. To fix, add `tf.disable_eager_execution()`"
                    " to the top of the calling script.")

        if self.seed is not None:
            np.random.seed(self.seed)
        ii32 = np.iinfo(np.int32)
        self.seed1, self.seed2, self.seed3, self.seed4 = np.random.randint(ii32.min, ii32.max, size=4)

        # Map the dataset labels to 0 and 1.
        temp_labels = dataset.labels.copy()

        temp_labels[(dataset.labels == dataset.favorable_label).ravel(),0] = 1.0
        temp_labels[(dataset.labels == dataset.unfavorable_label).ravel(),0] = 0.0

        with tf.variable_scope(self.scope_name):
            num_train_samples, self.features_dim = np.shape(dataset.features)

            # Setup placeholders
            self.features_ph = tf.placeholder(tf.float32, shape=[None, self.features_dim])
            self.protected_attributes_ph = tf.placeholder(tf.float32, shape=[None,1])
            self.true_labels_ph = tf.placeholder(tf.float32, shape=[None,1])
            self.keep_prob = tf.placeholder(tf.float32)

            # Obtain classifier predictions and classifier loss
            self.pred_labels, pred_logits = self._classifier_model(self.features_ph, self.features_dim, self.keep_prob)
            # cross entropy loss between true and predicted labels
            pred_labels_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=self.true_labels_ph, logits=pred_logits))

            if self.debias:
                # Obtain adversary predictions and adversary loss
                pred_protected_attributes_labels, pred_protected_attributes_logits = self._adversary_model(pred_logits, self.true_labels_ph)
                pred_protected_attributes_loss = tf.reduce_mean(
                    tf.nn.sigmoid_cross_entropy_with_logits(labels=self.protected_attributes_ph, logits=pred_protected_attributes_logits))

            # Setup optimizers with learning rates
            global_step = tf.Variable(0, trainable=False)
            starter_learning_rate = 0.001
            learning_rate = tf.train.exponential_decay(starter_learning_rate, global_step,
                                                       1000, 0.96, staircase=True)
            # Tensorflow optimizer for classifier
            classifier_opt = tf.train.AdamOptimizer(learning_rate)
            if self.debias:
                # Tensorflow optimizer for adversary
                adversary_opt = tf.train.AdamOptimizer(learning_rate)

            classifier_vars = [var for var in tf.trainable_variables(scope=self.scope_name) if 'classifier_model' in var.name]
            if self.debias:
                adversary_vars = [var for var in tf.trainable_variables(scope=self.scope_name) if 'adversary_model' in var.name]
                # Update classifier parameters
                adversary_grads = {var: grad for (grad, var) in adversary_opt.compute_gradients(pred_protected_attributes_loss,
                                                                                      var_list=classifier_vars)}
            normalize = lambda x: x / (tf.norm(x) + np.finfo(np.float32).tiny)


            classifier_grads = []
            for (grad,var) in classifier_opt.compute_gradients(pred_labels_loss, var_list=classifier_vars):
                if self.debias:
                    unit_adversary_grad = normalize(adversary_grads[var])
                    grad -= tf.reduce_sum(grad * unit_adversary_grad) * unit_adversary_grad
                    grad -= self.adversary_loss_weight * adversary_grads[var]
                classifier_grads.append((grad, var))
            classifier_minimizer = classifier_opt.apply_gradients(classifier_grads, global_step=global_step)

            if self.debias:
                # Update adversary parameters
                with tf.control_dependencies([classifier_minimizer]):
                    adversary_minimizer = adversary_opt.minimize(pred_protected_attributes_loss, var_list=adversary_vars)#, global_step=global_step)

            self.sess.run(tf.global_variables_initializer())
            self.sess.run(tf.local_variables_initializer())

            # Begin training
            for epoch in range(self.num_epochs):
                shuffled_ids = np.random.choice(num_train_samples, num_train_samples, replace=False)
                for i in range(num_train_samples//self.batch_size):
                    batch_ids = shuffled_ids[self.batch_size*i: self.batch_size*(i+1)]
                    batch_features = dataset.features[batch_ids]
                    batch_labels = np.reshape(temp_labels[batch_ids], [-1,1])
                    batch_protected_attributes = np.reshape(dataset.protected_attributes[batch_ids][:,
                                                 dataset.protected_attribute_names.index(self.protected_attribute_name)], [-1,1])

                    batch_feed_dict = {self.features_ph: batch_features,
                                       self.true_labels_ph: batch_labels,
                                       self.protected_attributes_ph: batch_protected_attributes,
                                       self.keep_prob: 0.8}
                    if self.debias:
                        _, _, pred_labels_loss_value, pred_protected_attributes_loss_vale = self.sess.run([classifier_minimizer,
                                       adversary_minimizer,
                                       pred_labels_loss,
                                       pred_protected_attributes_loss], feed_dict=batch_feed_dict)
                        if i % 200 == 0:
                            print("epoch %d; iter: %d; batch classifier loss: %f; batch adversarial loss: %f" % (epoch, i, pred_labels_loss_value,
                                                                                     pred_protected_attributes_loss_vale))
                    else:
                        _, pred_labels_loss_value = self.sess.run(
                            [classifier_minimizer,
                             pred_labels_loss], feed_dict=batch_feed_dict)
                        if i % 200 == 0:
                            print("epoch %d; iter: %d; batch classifier loss: %f" % (
                            epoch, i, pred_labels_loss_value))
        return self

    def predict(self, dataset):
        """Obtain the predictions for the provided dataset using the fair
        classifier learned.

        Args:
            dataset (BinaryLabelDataset): Dataset containing labels that needs
                to be transformed.
        Returns:
            dataset (BinaryLabelDataset): Transformed dataset.
        """

        if self.seed is not None:
            np.random.seed(self.seed)

        num_test_samples, _ = np.shape(dataset.features)

        samples_covered = 0
        pred_labels = []
        while samples_covered < num_test_samples:
            start = samples_covered
            end = samples_covered + self.batch_size
            if end > num_test_samples:
                end = num_test_samples
            batch_ids = np.arange(start, end)
            batch_features = dataset.features[batch_ids]
            batch_labels = np.reshape(dataset.labels[batch_ids], [-1,1])
            batch_protected_attributes = np.reshape(dataset.protected_attributes[batch_ids][:,
                                         dataset.protected_attribute_names.index(self.protected_attribute_name)], [-1,1])

            batch_feed_dict = {self.features_ph: batch_features,
                               self.true_labels_ph: batch_labels,
                               self.protected_attributes_ph: batch_protected_attributes,
                               self.keep_prob: 1.0}

            pred_labels += self.sess.run(self.pred_labels, feed_dict=batch_feed_dict)[:,0].tolist()
            samples_covered += len(batch_features)

        # Mutated, fairer dataset with new labels
        dataset_new = dataset.copy(deepcopy = True)
        dataset_new.scores = np.array(pred_labels, dtype=np.float64).reshape(-1, 1)
        dataset_new.labels = (np.array(pred_labels)>0.5).astype(np.float64).reshape(-1,1)


        # Map the dataset labels to back to their original values.
        temp_labels = dataset_new.labels.copy()

        temp_labels[(dataset_new.labels == 1.0).ravel(), 0] = dataset.favorable_label
        temp_labels[(dataset_new.labels == 0.0).ravel(), 0] = dataset.unfavorable_label

        dataset_new.labels = temp_labels.copy()

        return dataset_new



# Test local AD impl

## Adult dataset

### sensitive attribute = sex

In [2]:
from aif360.datasets import AdultDataset
tf.disable_eager_execution()

adult_dataset = AdultDataset()
adult_train, adult_test = adult_dataset.split([0.7], shuffle=True)
adult_privileged_groups = [{'sex': 1}]  # Male
adult_unprivileged_groups = [{'sex': 0}]  # Female

sess = tf.compat.v1.Session()

adult_ad = AdversarialDebiasing(
    privileged_groups=adult_privileged_groups,
    unprivileged_groups=adult_unprivileged_groups,
    scope_name="adult_debiasing",
    sess=sess,
)
adult_ad.fit(adult_train)
adult_predict = adult_ad.predict(adult_test)
sess.close()



Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


epoch 0; iter: 0; batch classifier loss: 48.314850; batch adversarial loss: 1.134260


I0000 00:00:1733092916.767180  639727 mlir_graph_optimization_pass.cc:401] MLIR V1 optimization pass is not enabled


epoch 0; iter: 200; batch classifier loss: 10.646967; batch adversarial loss: 1.049885
epoch 1; iter: 0; batch classifier loss: 8.462988; batch adversarial loss: 0.757050
epoch 1; iter: 200; batch classifier loss: 5.221793; batch adversarial loss: 0.823377
epoch 2; iter: 0; batch classifier loss: 2.613160; batch adversarial loss: 0.752772
epoch 2; iter: 200; batch classifier loss: 4.986633; batch adversarial loss: 0.707846
epoch 3; iter: 0; batch classifier loss: 5.351168; batch adversarial loss: 0.688093
epoch 3; iter: 200; batch classifier loss: 2.172413; batch adversarial loss: 0.664870
epoch 4; iter: 0; batch classifier loss: 1.712467; batch adversarial loss: 0.616332
epoch 4; iter: 200; batch classifier loss: 1.110458; batch adversarial loss: 0.625024
epoch 5; iter: 0; batch classifier loss: 0.852619; batch adversarial loss: 0.578249
epoch 5; iter: 200; batch classifier loss: 0.576080; batch adversarial loss: 0.646356
epoch 6; iter: 0; batch classifier loss: 1.557930; batch advers

#### Evaluation

In [3]:
from aif360.metrics import ClassificationMetric

  vect_normalized_discounted_cumulative_gain = vmap(
  monte_carlo_vect_ndcg = vmap(vect_normalized_discounted_cumulative_gain, in_dims=(0,))


In [4]:
adult_metric_test = ClassificationMetric(
    adult_test,
    adult_predict,
    unprivileged_groups=adult_unprivileged_groups,
    privileged_groups=adult_privileged_groups,
)

In [5]:
print({
        "Accuracy": adult_metric_test.accuracy(),
        "Precision":  adult_metric_test.precision(),
        "Equal Opportunity Difference": adult_metric_test.equal_opportunity_difference(),
        "Statistical Parity Difference": adult_metric_test.statistical_parity_difference(),
})

{'Accuracy': np.float64(0.833419326306479), 'Precision': np.float64(0.7505731315910132), 'Equal Opportunity Difference': np.float64(0.17897872422469224), 'Statistical Parity Difference': np.float64(-0.0572880754242837)}


As statistical parity differenece is negative, there still is bias againt unprivileged group

### sensitive attribute = race

In [6]:
adult_privileged_groups = [{'race': 1}]  # white
adult_unprivileged_groups = [{'race': 0}]  # non-white

In [7]:
sess = tf.compat.v1.Session()

adult_ad_race = AdversarialDebiasing(
    privileged_groups=adult_privileged_groups,
    unprivileged_groups=adult_unprivileged_groups,
    scope_name="adult_debiasing_race",
    sess=sess,
)
adult_train_race, adult_test_race = adult_dataset.split([0.7], shuffle=True)
adult_ad_race.fit(adult_train_race)
adult_predict_race = adult_ad_race.predict(adult_test_race)
sess.close()

epoch 0; iter: 0; batch classifier loss: 9.611891; batch adversarial loss: 0.782873
epoch 0; iter: 200; batch classifier loss: 11.640459; batch adversarial loss: 1.001259
epoch 1; iter: 0; batch classifier loss: 25.443785; batch adversarial loss: 0.918664
epoch 1; iter: 200; batch classifier loss: 26.085617; batch adversarial loss: 0.680850
epoch 2; iter: 0; batch classifier loss: 1.541882; batch adversarial loss: 0.580884
epoch 2; iter: 200; batch classifier loss: 3.901371; batch adversarial loss: 0.574877
epoch 3; iter: 0; batch classifier loss: 0.714578; batch adversarial loss: 0.575285
epoch 3; iter: 200; batch classifier loss: 2.649572; batch adversarial loss: 0.482377
epoch 4; iter: 0; batch classifier loss: 1.768612; batch adversarial loss: 0.495498
epoch 4; iter: 200; batch classifier loss: 1.800842; batch adversarial loss: 0.498334
epoch 5; iter: 0; batch classifier loss: 1.512531; batch adversarial loss: 0.481856
epoch 5; iter: 200; batch classifier loss: 1.164107; batch adve

In [8]:
adult_race_metric_test = ClassificationMetric(
    adult_test_race,
    adult_predict_race,
    unprivileged_groups=adult_unprivileged_groups,
    privileged_groups=adult_privileged_groups,
)
print({
        "Accuracy": adult_race_metric_test.accuracy(),
        "Precision":  adult_race_metric_test.precision(),
        "Equal Opportunity Difference": adult_race_metric_test.equal_opportunity_difference(),
        "Statistical Parity Difference": adult_race_metric_test.statistical_parity_difference(),
})

{'Accuracy': np.float64(0.7997346502542935), 'Precision': np.float64(0.6807570977917982), 'Equal Opportunity Difference': np.float64(0.06123116278622465), 'Statistical Parity Difference': np.float64(-0.011656239262248)}


Adversary debiasing achieves better fairness performance when debiasing protected variable **race**.

# Update adversary_model with multiple inputs

The following return the inner layer `h1` from the two-layer neural network, and leverages the combinataion of inner layer and output layer of classifier_model for predicting protected attributes.

In [12]:
class AdversarialDebiasing(Transformer):
    """Adversarial debiasing is an in-processing technique that learns a
    classifier to maximize prediction accuracy and simultaneously reduce an
    adversary's ability to determine the protected attribute from the
    predictions [5]_. This approach leads to a fair classifier as the
    predictions cannot carry any group discrimination information that the
    adversary can exploit.

    References:
        .. [5] B. H. Zhang, B. Lemoine, and M. Mitchell, "Mitigating Unwanted
           Biases with Adversarial Learning," AAAI/ACM Conference on Artificial
           Intelligence, Ethics, and Society, 2018.
    """

    def __init__(self,
                 unprivileged_groups,
                 privileged_groups,
                 scope_name,
                 sess,
                 seed=None,
                 adversary_loss_weight=0.1,
                 num_epochs=50,
                 batch_size=128,
                 classifier_num_hidden_units=200,
                 debias=True):
        """
        Args:
            unprivileged_groups (tuple): Representation for unprivileged groups
            privileged_groups (tuple): Representation for privileged groups
            scope_name (str): scope name for the tenforflow variables
            sess (tf.Session): tensorflow session
            seed (int, optional): Seed to make `predict` repeatable.
            adversary_loss_weight (float, optional): Hyperparameter that chooses
                the strength of the adversarial loss.
            num_epochs (int, optional): Number of training epochs.
            batch_size (int, optional): Batch size.
            classifier_num_hidden_units (int, optional): Number of hidden units
                in the classifier model.
            debias (bool, optional): Learn a classifier with or without
                debiasing.
        """
        super(AdversarialDebiasing, self).__init__(
            unprivileged_groups=unprivileged_groups,
            privileged_groups=privileged_groups)

        self.scope_name = scope_name
        self.seed = seed

        self.unprivileged_groups = unprivileged_groups
        self.privileged_groups = privileged_groups
        if len(self.unprivileged_groups) > 1 or len(self.privileged_groups) > 1:
            raise ValueError("Only one unprivileged_group or privileged_group supported.")
        self.protected_attribute_name = list(self.unprivileged_groups[0].keys())[0]

        self.sess = sess
        self.adversary_loss_weight = adversary_loss_weight
        self.num_epochs = num_epochs
        self.batch_size = batch_size
        self.classifier_num_hidden_units = classifier_num_hidden_units
        self.debias = debias

        self.features_dim = None
        self.features_ph = None
        self.protected_attributes_ph = None
        self.true_labels_ph = None
        self.pred_labels = None

    def _classifier_model(self, features, features_dim, keep_prob):
        """Compute the classifier predictions for the outcome variable.

        Input
        features: a tensor representing the input features for the classifier
        features_dim: dimension of the feature dataset

        Return
        pred_label: predicted label
        pred_logits: raw logits output from the output layer that holds
        the unactivated score of prediction confidence
        """
        with tf.variable_scope("classifier_model"):
            W1 = tf.get_variable('W1', [features_dim, self.classifier_num_hidden_units],
                                  initializer=tf.initializers.glorot_uniform(seed=self.seed1))
            b1 = tf.Variable(tf.zeros(shape=[self.classifier_num_hidden_units]), name='b1')

            h1 = tf.nn.relu(tf.matmul(features, W1) + b1)
            h1 = tf.nn.dropout(h1, keep_prob=keep_prob, seed=self.seed2)

            W2 = tf.get_variable('W2', [self.classifier_num_hidden_units, 1],
                                 initializer=tf.initializers.glorot_uniform(seed=self.seed3))
            b2 = tf.Variable(tf.zeros(shape=[1]), name='b2')

            pred_logit = tf.matmul(h1, W2) + b2
            pred_label = tf.sigmoid(pred_logit) # predictive binary classification with sigmoid activation

        return pred_label, pred_logit, h1

    def _adversary_model(self, pred_logits, true_labels, hidden_layer):
        """Compute the adversary predictions for the protected attribute.
        """
        with tf.variable_scope("adversary_model"):
            c = tf.get_variable('c', initializer=tf.constant(1.0))
            s = tf.sigmoid((1 + tf.abs(c)) * pred_logits)
            # Add hidden layer processing
            W_hidden = tf.get_variable('W_hidden', [self.classifier_num_hidden_units, 10],
                               initializer=tf.initializers.glorot_uniform(seed=self.seed4))
            hidden_features = tf.matmul(hidden_layer, W_hidden)

            W2 = tf.get_variable('W2', [13, 1],
                                 initializer=tf.initializers.glorot_uniform(seed=self.seed4))
            b2 = tf.Variable(tf.zeros(shape=[1]), name='b2')
            combined_features = tf.concat([s, s * true_labels, s * (1.0 - true_labels), hidden_features], axis=1)

            pred_protected_attribute_logit = tf.matmul(combined_features, W2) + b2
            pred_protected_attribute_label = tf.sigmoid(pred_protected_attribute_logit)

        return pred_protected_attribute_label, pred_protected_attribute_logit
        
    def evaluate_adversary(self, dataset):
        """Evaluate adversary's ability to predict protected attributes.
        
        Args:
            dataset (BinaryLabelDataset): Dataset containing features and protected attributes
            
        Returns:
            dict: Dictionary containing evaluation metrics
        """
        if self.seed is not None:
            np.random.seed(self.seed)

        num_samples = len(dataset.features)
        
        # Initialize lists to store predictions and true values
        pred_protected_attrs = []
        true_protected_attrs = []
        
        # Predict in batches
        for i in range(0, num_samples, self.batch_size):
            batch_features = dataset.features[i:i + self.batch_size]
            batch_labels = dataset.labels[i:i + self.batch_size].reshape(-1, 1)
            batch_protected = dataset.protected_attributes[i:i + self.batch_size, 
                dataset.protected_attribute_names.index(self.protected_attribute_name)].reshape(-1, 1)
            
            # Create feed dictionary
            feed_dict = {
                self.features_ph: batch_features,
                self.true_labels_ph: batch_labels,
                self.keep_prob: 1.0
            }
            
            # Get classifier predictions first
            _, pred_logits = self._classifier_model(self.features_ph, self.features_dim, self.keep_prob)
            
            # Get adversary predictions
            pred_protected, _ = self._adversary_model(pred_logits, self.true_labels_ph)
            
            # Run session
            pred_protected_batch = self.sess.run(pred_protected, feed_dict=feed_dict)
            
            # Store predictions and true values
            pred_protected_attrs.extend((pred_protected_batch > 0.5).astype(int))
            true_protected_attrs.extend(batch_protected)
        
        # Convert to numpy arrays
        pred_protected_attrs = np.array(pred_protected_attrs)
        true_protected_attrs = np.array(true_protected_attrs)
        
        # Calculate metrics
        metrics = {
            'accuracy': accuracy_score(true_protected_attrs, pred_protected_attrs),
            'auc': roc_auc_score(true_protected_attrs, pred_protected_attrs),
            'confusion_matrix': confusion_matrix(true_protected_attrs, pred_protected_attrs)
        }
        
        return metrics

    def fit(self, dataset):
        """Compute the model parameters of the fair classifier using gradient
        descent.

        Args:
            dataset (BinaryLabelDataset): Dataset containing true labels.

        Returns:
            AdversarialDebiasing: Returns self.
        """
        if tf.executing_eagerly():
            raise RuntimeError("AdversarialDebiasing does not work in eager "
                    "execution mode. To fix, add `tf.disable_eager_execution()`"
                    " to the top of the calling script.")

        if self.seed is not None:
            np.random.seed(self.seed)
        ii32 = np.iinfo(np.int32)
        self.seed1, self.seed2, self.seed3, self.seed4 = np.random.randint(ii32.min, ii32.max, size=4)

        # Map the dataset labels to 0 and 1.
        temp_labels = dataset.labels.copy()

        temp_labels[(dataset.labels == dataset.favorable_label).ravel(),0] = 1.0
        temp_labels[(dataset.labels == dataset.unfavorable_label).ravel(),0] = 0.0

        with tf.variable_scope(self.scope_name):
            num_train_samples, self.features_dim = np.shape(dataset.features)

            # Setup placeholders
            self.features_ph = tf.placeholder(tf.float32, shape=[None, self.features_dim])
            self.protected_attributes_ph = tf.placeholder(tf.float32, shape=[None,1])
            self.true_labels_ph = tf.placeholder(tf.float32, shape=[None,1])
            self.keep_prob = tf.placeholder(tf.float32)

            # Obtain classifier predictions and classifier loss
            self.pred_labels, pred_logits, hidden_layer = self._classifier_model(self.features_ph, self.features_dim, self.keep_prob)
            # cross entropy loss between true and predicted labels
            pred_labels_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=self.true_labels_ph, logits=pred_logits))

            if self.debias:
                # Obtain adversary predictions and adversary loss
                pred_protected_attributes_labels, pred_protected_attributes_logits = self._adversary_model(pred_logits, self.true_labels_ph, hidden_layer)
                pred_protected_attributes_loss = tf.reduce_mean(
                    tf.nn.sigmoid_cross_entropy_with_logits(labels=self.protected_attributes_ph, logits=pred_protected_attributes_logits))

            # Setup optimizers with learning rates
            global_step = tf.Variable(0, trainable=False)
            starter_learning_rate = 0.001
            learning_rate = tf.train.exponential_decay(starter_learning_rate, global_step,
                                                       1000, 0.96, staircase=True)
            # Tensorflow optimizer for classifier
            classifier_opt = tf.train.AdamOptimizer(learning_rate)
            if self.debias:
                # Tensorflow optimizer for adversary
                adversary_opt = tf.train.AdamOptimizer(learning_rate)

            classifier_vars = [var for var in tf.trainable_variables(scope=self.scope_name) if 'classifier_model' in var.name]
            if self.debias:
                adversary_vars = [var for var in tf.trainable_variables(scope=self.scope_name) if 'adversary_model' in var.name]
                # Update classifier parameters
                adversary_grads = {var: grad for (grad, var) in adversary_opt.compute_gradients(pred_protected_attributes_loss,
                                                                                      var_list=classifier_vars)}
            normalize = lambda x: x / (tf.norm(x) + np.finfo(np.float32).tiny)


            classifier_grads = []
            for (grad,var) in classifier_opt.compute_gradients(pred_labels_loss, var_list=classifier_vars):
                if self.debias:
                    unit_adversary_grad = normalize(adversary_grads[var])
                    grad -= tf.reduce_sum(grad * unit_adversary_grad) * unit_adversary_grad
                    grad -= self.adversary_loss_weight * adversary_grads[var]
                classifier_grads.append((grad, var))
            classifier_minimizer = classifier_opt.apply_gradients(classifier_grads, global_step=global_step)

            if self.debias:
                # Update adversary parameters
                with tf.control_dependencies([classifier_minimizer]):
                    adversary_minimizer = adversary_opt.minimize(pred_protected_attributes_loss, var_list=adversary_vars)#, global_step=global_step)

            self.sess.run(tf.global_variables_initializer())
            self.sess.run(tf.local_variables_initializer())

            # Begin training
            for epoch in range(self.num_epochs):
                shuffled_ids = np.random.choice(num_train_samples, num_train_samples, replace=False)
                for i in range(num_train_samples//self.batch_size):
                    batch_ids = shuffled_ids[self.batch_size*i: self.batch_size*(i+1)]
                    batch_features = dataset.features[batch_ids]
                    batch_labels = np.reshape(temp_labels[batch_ids], [-1,1])
                    batch_protected_attributes = np.reshape(dataset.protected_attributes[batch_ids][:,
                                                 dataset.protected_attribute_names.index(self.protected_attribute_name)], [-1,1])

                    batch_feed_dict = {self.features_ph: batch_features,
                                       self.true_labels_ph: batch_labels,
                                       self.protected_attributes_ph: batch_protected_attributes,
                                       self.keep_prob: 0.8}
                    if self.debias:
                        _, _, pred_labels_loss_value, pred_protected_attributes_loss_vale = self.sess.run([classifier_minimizer,
                                       adversary_minimizer,
                                       pred_labels_loss,
                                       pred_protected_attributes_loss], feed_dict=batch_feed_dict)
                        if i % 200 == 0:
                            print("epoch %d; iter: %d; batch classifier loss: %f; batch adversarial loss: %f" % (epoch, i, pred_labels_loss_value,
                                                                                     pred_protected_attributes_loss_vale))
                    else:
                        _, pred_labels_loss_value = self.sess.run(
                            [classifier_minimizer,
                             pred_labels_loss], feed_dict=batch_feed_dict)
                        if i % 200 == 0:
                            print("epoch %d; iter: %d; batch classifier loss: %f" % (
                            epoch, i, pred_labels_loss_value))
        return self

    def predict(self, dataset):
        """Obtain the predictions for the provided dataset using the fair
        classifier learned.

        Args:
            dataset (BinaryLabelDataset): Dataset containing labels that needs
                to be transformed.
        Returns:
            dataset (BinaryLabelDataset): Transformed dataset.
        """

        if self.seed is not None:
            np.random.seed(self.seed)

        num_test_samples, _ = np.shape(dataset.features)

        samples_covered = 0
        pred_labels = []
        while samples_covered < num_test_samples:
            start = samples_covered
            end = samples_covered + self.batch_size
            if end > num_test_samples:
                end = num_test_samples
            batch_ids = np.arange(start, end)
            batch_features = dataset.features[batch_ids]
            batch_labels = np.reshape(dataset.labels[batch_ids], [-1,1])
            batch_protected_attributes = np.reshape(dataset.protected_attributes[batch_ids][:,
                                         dataset.protected_attribute_names.index(self.protected_attribute_name)], [-1,1])

            batch_feed_dict = {self.features_ph: batch_features,
                               self.true_labels_ph: batch_labels,
                               self.protected_attributes_ph: batch_protected_attributes,
                               self.keep_prob: 1.0}

            pred_labels += self.sess.run(self.pred_labels, feed_dict=batch_feed_dict)[:,0].tolist()
            samples_covered += len(batch_features)

        # Mutated, fairer dataset with new labels
        dataset_new = dataset.copy(deepcopy = True)
        dataset_new.scores = np.array(pred_labels, dtype=np.float64).reshape(-1, 1)
        dataset_new.labels = (np.array(pred_labels)>0.5).astype(np.float64).reshape(-1,1)


        # Map the dataset labels to back to their original values.
        temp_labels = dataset_new.labels.copy()

        temp_labels[(dataset_new.labels == 1.0).ravel(), 0] = dataset.favorable_label
        temp_labels[(dataset_new.labels == 0.0).ravel(), 0] = dataset.unfavorable_label

        dataset_new.labels = temp_labels.copy()

        return dataset_new

## Adult dataset

### Sensitive attribute = sex

In [4]:
from aif360.datasets import AdultDataset
tf.disable_eager_execution()

adult_dataset = AdultDataset()
adult_train, adult_test = adult_dataset.split([0.7], shuffle=True)
adult_privileged_groups = [{'sex': 1}]  # Male
adult_unprivileged_groups = [{'sex': 0}]  # Female

sess = tf.compat.v1.Session()

adult_ad = AdversarialDebiasing(
    privileged_groups=adult_privileged_groups,
    unprivileged_groups=adult_unprivileged_groups,
    scope_name="adult_debiasing",
    sess=sess,
)
adult_ad.fit(adult_train)
adult_predict = adult_ad.predict(adult_test)
sess.close()



Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
I0000 00:00:1733361640.150598 1144487 mlir_graph_optimization_pass.cc:401] MLIR V1 optimization pass is not enabled


epoch 0; iter: 0; batch classifier loss: 119.901993; batch adversarial loss: 35.531502
epoch 0; iter: 200; batch classifier loss: 7.217835; batch adversarial loss: 4.258472
epoch 1; iter: 0; batch classifier loss: 8.410116; batch adversarial loss: 7.039089
epoch 1; iter: 200; batch classifier loss: 4.505497; batch adversarial loss: 9.530908
epoch 2; iter: 0; batch classifier loss: 2.817708; batch adversarial loss: 4.587976
epoch 2; iter: 200; batch classifier loss: 12.599235; batch adversarial loss: 10.079864
epoch 3; iter: 0; batch classifier loss: 4.690023; batch adversarial loss: 2.500001
epoch 3; iter: 200; batch classifier loss: 6.555622; batch adversarial loss: 6.149632
epoch 4; iter: 0; batch classifier loss: 4.313360; batch adversarial loss: 1.807899
epoch 4; iter: 200; batch classifier loss: 1.605585; batch adversarial loss: 2.160141
epoch 5; iter: 0; batch classifier loss: 2.469866; batch adversarial loss: 1.280397
epoch 5; iter: 200; batch classifier loss: 2.144440; batch ad

In [5]:
from aif360.metrics import ClassificationMetric

  vect_normalized_discounted_cumulative_gain = vmap(
  monte_carlo_vect_ndcg = vmap(vect_normalized_discounted_cumulative_gain, in_dims=(0,))


In [6]:
adult_metric_test = ClassificationMetric(
    adult_test,
    adult_predict,
    unprivileged_groups=adult_unprivileged_groups,
    privileged_groups=adult_privileged_groups,
)

In [8]:
print({
        "Accuracy": adult_metric_test.accuracy(),
        "Precision":  adult_metric_test.precision(),
        "Equal Opportunity Difference": adult_metric_test.equal_opportunity_difference(),
        "Statistical Parity Difference": adult_metric_test.statistical_parity_difference(),
})

{'Accuracy': np.float64(0.8452863566005749), 'Precision': np.float64(0.7516939019529693), 'Equal Opportunity Difference': np.float64(0.022709553296986873), 'Statistical Parity Difference': np.float64(-0.12879800558656207)}


### sensitive attribute = race

In [10]:
adult_privileged_groups = [{'race': 1}]  # white
adult_unprivileged_groups = [{'race': 0}]  # non-white

sess = tf.compat.v1.Session()

adult_ad_race = AdversarialDebiasing(
    privileged_groups=adult_privileged_groups,
    unprivileged_groups=adult_unprivileged_groups,
    scope_name="adult_debiasing_race",
    sess=sess,
)
adult_train_race, adult_test_race = adult_dataset.split([0.7], shuffle=True)
adult_ad_race.fit(adult_train_race)
adult_predict_race = adult_ad_race.predict(adult_test_race)
sess.close()

epoch 0; iter: 0; batch classifier loss: 15.089951; batch adversarial loss: 109.427422
epoch 0; iter: 200; batch classifier loss: 5.532632; batch adversarial loss: 6.326194
epoch 1; iter: 0; batch classifier loss: 6.681258; batch adversarial loss: 2.966672
epoch 1; iter: 200; batch classifier loss: 2.306801; batch adversarial loss: 2.686831
epoch 2; iter: 0; batch classifier loss: 3.488862; batch adversarial loss: 3.756477
epoch 2; iter: 200; batch classifier loss: 1.543002; batch adversarial loss: 1.238295
epoch 3; iter: 0; batch classifier loss: 0.667877; batch adversarial loss: 20.976915
epoch 3; iter: 200; batch classifier loss: 0.877956; batch adversarial loss: 0.717201
epoch 4; iter: 0; batch classifier loss: 1.004907; batch adversarial loss: 1.572234
epoch 4; iter: 200; batch classifier loss: 1.514891; batch adversarial loss: 2.660866
epoch 5; iter: 0; batch classifier loss: 0.419609; batch adversarial loss: 1.168668
epoch 5; iter: 200; batch classifier loss: 10.977070; batch ad

In [11]:
adult_race_metric_test = ClassificationMetric(
    adult_test_race,
    adult_predict_race,
    unprivileged_groups=adult_unprivileged_groups,
    privileged_groups=adult_privileged_groups,
)
print({
        "Accuracy": adult_race_metric_test.accuracy(),
        "Precision":  adult_race_metric_test.precision(),
        "Equal Opportunity Difference": adult_race_metric_test.equal_opportunity_difference(),
        "Statistical Parity Difference": adult_race_metric_test.statistical_parity_difference(),
})

{'Accuracy': np.float64(0.8408638608388), 'Precision': np.float64(0.7297297297297297), 'Equal Opportunity Difference': np.float64(0.04281353712275093), 'Statistical Parity Difference': np.float64(-0.05788950717593025)}


# Add adversary model evaluation impl

`evaluate_adversary` helps evaluate `adversary_model`'s ability to predict sensitive attributes.

In [23]:
class AdversarialDebiasing(Transformer):
    """Adversarial debiasing is an in-processing technique that learns a
    classifier to maximize prediction accuracy and simultaneously reduce an
    adversary's ability to determine the protected attribute from the
    predictions [5]_. This approach leads to a fair classifier as the
    predictions cannot carry any group discrimination information that the
    adversary can exploit.

    References:
        .. [5] B. H. Zhang, B. Lemoine, and M. Mitchell, "Mitigating Unwanted
           Biases with Adversarial Learning," AAAI/ACM Conference on Artificial
           Intelligence, Ethics, and Society, 2018.
    """

    def __init__(self,
                 unprivileged_groups,
                 privileged_groups,
                 scope_name,
                 sess,
                 seed=None,
                 adversary_loss_weight=0.1,
                 num_epochs=50,
                 batch_size=128,
                 classifier_num_hidden_units=200,
                 debias=True):
        """
        Args:
            unprivileged_groups (tuple): Representation for unprivileged groups
            privileged_groups (tuple): Representation for privileged groups
            scope_name (str): scope name for the tenforflow variables
            sess (tf.Session): tensorflow session
            seed (int, optional): Seed to make `predict` repeatable.
            adversary_loss_weight (float, optional): Hyperparameter that chooses
                the strength of the adversarial loss.
            num_epochs (int, optional): Number of training epochs.
            batch_size (int, optional): Batch size.
            classifier_num_hidden_units (int, optional): Number of hidden units
                in the classifier model.
            debias (bool, optional): Learn a classifier with or without
                debiasing.
        """
        super(AdversarialDebiasing, self).__init__(
            unprivileged_groups=unprivileged_groups,
            privileged_groups=privileged_groups)

        self.scope_name = scope_name
        self.seed = seed

        self.unprivileged_groups = unprivileged_groups
        self.privileged_groups = privileged_groups
        if len(self.unprivileged_groups) > 1 or len(self.privileged_groups) > 1:
            raise ValueError("Only one unprivileged_group or privileged_group supported.")
        self.protected_attribute_name = list(self.unprivileged_groups[0].keys())[0]

        self.sess = sess
        self.adversary_loss_weight = adversary_loss_weight
        self.num_epochs = num_epochs
        self.batch_size = batch_size
        self.classifier_num_hidden_units = classifier_num_hidden_units
        self.debias = debias

        self.features_dim = None
        self.features_ph = None
        self.protected_attributes_ph = None
        self.true_labels_ph = None
        self.pred_labels = None
        self.pred_protected_attributes = None

    def _classifier_model(self, features, features_dim, keep_prob):
        """Compute the classifier predictions for the outcome variable.

        Input
        features: a tensor representing the input features for the classifier
        features_dim: dimension of the feature dataset

        Return
        pred_label: predicted label
        pred_logits: raw logits output from the output layer that holds
        the unactivated score of prediction confidence
        """
        with tf.variable_scope("classifier_model"):
            W1 = tf.get_variable('W1', [features_dim, self.classifier_num_hidden_units],
                                  initializer=tf.initializers.glorot_uniform(seed=self.seed1))
            b1 = tf.Variable(tf.zeros(shape=[self.classifier_num_hidden_units]), name='b1')

            h1 = tf.nn.relu(tf.matmul(features, W1) + b1)
            h1 = tf.nn.dropout(h1, keep_prob=keep_prob, seed=self.seed2)

            W2 = tf.get_variable('W2', [self.classifier_num_hidden_units, 1],
                                 initializer=tf.initializers.glorot_uniform(seed=self.seed3))
            b2 = tf.Variable(tf.zeros(shape=[1]), name='b2')

            pred_logit = tf.matmul(h1, W2) + b2
            pred_label = tf.sigmoid(pred_logit) # predictive binary classification with sigmoid activation

        return pred_label, pred_logit, h1

    def _adversary_model(self, pred_logits, true_labels, hidden_layer):
        """Compute the adversary predictions for the protected attribute.
        """
        with tf.variable_scope("adversary_model"):
            c = tf.get_variable('c', initializer=tf.constant(1.0))
            s = tf.sigmoid((1 + tf.abs(c)) * pred_logits)
            # Add hidden layer processing
            W_hidden = tf.get_variable('W_hidden', [self.classifier_num_hidden_units, 10],
                               initializer=tf.initializers.glorot_uniform(seed=self.seed4))
            hidden_features = tf.matmul(hidden_layer, W_hidden)

            W2 = tf.get_variable('W2', [13, 1],
                                 initializer=tf.initializers.glorot_uniform(seed=self.seed4))
            b2 = tf.Variable(tf.zeros(shape=[1]), name='b2')
            combined_features = tf.concat([s, s * true_labels, s * (1.0 - true_labels), hidden_features], axis=1)

            pred_protected_attribute_logit = tf.matmul(combined_features, W2) + b2
            pred_protected_attribute_label = tf.sigmoid(pred_protected_attribute_logit)

        return pred_protected_attribute_label, pred_protected_attribute_logit
        
    def evaluate_adversary(self, dataset):
        """Evaluate adversary's ability to predict protected attributes.
        
        Args:
            dataset (BinaryLabelDataset): Dataset containing features and protected attributes
            
        Returns:
            dict: Dictionary containing evaluation metrics
        """
        if self.seed is not None:
            np.random.seed(self.seed)

        num_samples = len(dataset.features)
        pred_protected_attrs = []
        true_protected_attrs = []
        
        for i in range(0, num_samples, self.batch_size):
            batch_features = dataset.features[i:i + self.batch_size]
            batch_labels = dataset.labels[i:i + self.batch_size].reshape(-1, 1)
            batch_protected = dataset.protected_attributes[i:i + self.batch_size, 
                dataset.protected_attribute_names.index(self.protected_attribute_name)].reshape(-1, 1)
            
            feed_dict = {
                self.features_ph: batch_features,
                self.true_labels_ph: batch_labels,
                self.protected_attributes_ph: batch_protected,
                self.keep_prob: 1.0
            }
            
            pred_protected_batch = self.sess.run(self.pred_protected_attributes, feed_dict=feed_dict)
            
            pred_protected_attrs.extend((pred_protected_batch > 0.5).astype(int))
            true_protected_attrs.extend(batch_protected)

        pred_protected_attrs = np.array(pred_protected_attrs)
        true_protected_attrs = np.array(true_protected_attrs)
        
        metrics = {
            'accuracy': accuracy_score(true_protected_attrs, pred_protected_attrs),
            'auc': roc_auc_score(true_protected_attrs, pred_protected_attrs),
            'confusion_matrix': confusion_matrix(true_protected_attrs, pred_protected_attrs)
        }
        
        return metrics

    def fit(self, dataset):
        """Compute the model parameters of the fair classifier using gradient
        descent.

        Args:
            dataset (BinaryLabelDataset): Dataset containing true labels.

        Returns:
            AdversarialDebiasing: Returns self.
        """
        if tf.executing_eagerly():
            raise RuntimeError("AdversarialDebiasing does not work in eager "
                    "execution mode. To fix, add `tf.disable_eager_execution()`"
                    " to the top of the calling script.")

        if self.seed is not None:
            np.random.seed(self.seed)
        ii32 = np.iinfo(np.int32)
        self.seed1, self.seed2, self.seed3, self.seed4 = np.random.randint(ii32.min, ii32.max, size=4)

        # Map the dataset labels to 0 and 1.
        temp_labels = dataset.labels.copy()

        temp_labels[(dataset.labels == dataset.favorable_label).ravel(),0] = 1.0
        temp_labels[(dataset.labels == dataset.unfavorable_label).ravel(),0] = 0.0

        with tf.variable_scope(self.scope_name):
            num_train_samples, self.features_dim = np.shape(dataset.features)

            # Setup placeholders
            self.features_ph = tf.placeholder(tf.float32, shape=[None, self.features_dim])
            self.protected_attributes_ph = tf.placeholder(tf.float32, shape=[None,1])
            self.true_labels_ph = tf.placeholder(tf.float32, shape=[None,1])
            self.keep_prob = tf.placeholder(tf.float32)

            # Obtain classifier predictions and classifier loss
            self.pred_labels, pred_logits, hidden_layer = self._classifier_model(self.features_ph, self.features_dim, self.keep_prob)
            # cross entropy loss between true and predicted labels
            pred_labels_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=self.true_labels_ph, logits=pred_logits))

            if self.debias:
                # Obtain adversary predictions and adversary loss
                pred_protected_attributes_labels, pred_protected_attributes_logits = self._adversary_model(pred_logits, self.true_labels_ph, hidden_layer)
                self.pred_protected_attributes = pred_protected_attributes_labels
                pred_protected_attributes_loss = tf.reduce_mean(
                    tf.nn.sigmoid_cross_entropy_with_logits(labels=self.protected_attributes_ph, logits=pred_protected_attributes_logits))

            # Setup optimizers with learning rates
            global_step = tf.Variable(0, trainable=False)
            starter_learning_rate = 0.001
            learning_rate = tf.train.exponential_decay(starter_learning_rate, global_step,
                                                       1000, 0.96, staircase=True)
            # Tensorflow optimizer for classifier
            classifier_opt = tf.train.AdamOptimizer(learning_rate)
            if self.debias:
                # Tensorflow optimizer for adversary
                adversary_opt = tf.train.AdamOptimizer(learning_rate)

            classifier_vars = [var for var in tf.trainable_variables(scope=self.scope_name) if 'classifier_model' in var.name]
            if self.debias:
                adversary_vars = [var for var in tf.trainable_variables(scope=self.scope_name) if 'adversary_model' in var.name]
                # Update classifier parameters
                adversary_grads = {var: grad for (grad, var) in adversary_opt.compute_gradients(pred_protected_attributes_loss,
                                                                                      var_list=classifier_vars)}
            normalize = lambda x: x / (tf.norm(x) + np.finfo(np.float32).tiny)


            classifier_grads = []
            for (grad,var) in classifier_opt.compute_gradients(pred_labels_loss, var_list=classifier_vars):
                if self.debias:
                    unit_adversary_grad = normalize(adversary_grads[var])
                    grad -= tf.reduce_sum(grad * unit_adversary_grad) * unit_adversary_grad
                    grad -= self.adversary_loss_weight * adversary_grads[var]
                classifier_grads.append((grad, var))
            classifier_minimizer = classifier_opt.apply_gradients(classifier_grads, global_step=global_step)

            if self.debias:
                # Update adversary parameters
                with tf.control_dependencies([classifier_minimizer]):
                    adversary_minimizer = adversary_opt.minimize(pred_protected_attributes_loss, var_list=adversary_vars)#, global_step=global_step)

            self.sess.run(tf.global_variables_initializer())
            self.sess.run(tf.local_variables_initializer())

            # Begin training
            for epoch in range(self.num_epochs):
                shuffled_ids = np.random.choice(num_train_samples, num_train_samples, replace=False)
                for i in range(num_train_samples//self.batch_size):
                    batch_ids = shuffled_ids[self.batch_size*i: self.batch_size*(i+1)]
                    batch_features = dataset.features[batch_ids]
                    batch_labels = np.reshape(temp_labels[batch_ids], [-1,1])
                    batch_protected_attributes = np.reshape(dataset.protected_attributes[batch_ids][:,
                                                 dataset.protected_attribute_names.index(self.protected_attribute_name)], [-1,1])

                    batch_feed_dict = {self.features_ph: batch_features,
                                       self.true_labels_ph: batch_labels,
                                       self.protected_attributes_ph: batch_protected_attributes,
                                       self.keep_prob: 0.8}
                    if self.debias:
                        _, _, pred_labels_loss_value, pred_protected_attributes_loss_vale = self.sess.run([classifier_minimizer,
                                       adversary_minimizer,
                                       pred_labels_loss,
                                       pred_protected_attributes_loss], feed_dict=batch_feed_dict)
                        if i % 200 == 0:
                            print("epoch %d; iter: %d; batch classifier loss: %f; batch adversarial loss: %f" % (epoch, i, pred_labels_loss_value,
                                                                                     pred_protected_attributes_loss_vale))
                    else:
                        _, pred_labels_loss_value = self.sess.run(
                            [classifier_minimizer,
                             pred_labels_loss], feed_dict=batch_feed_dict)
                        if i % 200 == 0:
                            print("epoch %d; iter: %d; batch classifier loss: %f" % (
                            epoch, i, pred_labels_loss_value))
        return self

    def predict(self, dataset):
        """Obtain the predictions for the provided dataset using the fair
        classifier learned.

        Args:
            dataset (BinaryLabelDataset): Dataset containing labels that needs
                to be transformed.
        Returns:
            dataset (BinaryLabelDataset): Transformed dataset.
        """

        if self.seed is not None:
            np.random.seed(self.seed)

        num_test_samples, _ = np.shape(dataset.features)

        samples_covered = 0
        pred_labels = []
        while samples_covered < num_test_samples:
            start = samples_covered
            end = samples_covered + self.batch_size
            if end > num_test_samples:
                end = num_test_samples
            batch_ids = np.arange(start, end)
            batch_features = dataset.features[batch_ids]
            batch_labels = np.reshape(dataset.labels[batch_ids], [-1,1])
            batch_protected_attributes = np.reshape(dataset.protected_attributes[batch_ids][:,
                                         dataset.protected_attribute_names.index(self.protected_attribute_name)], [-1,1])

            batch_feed_dict = {self.features_ph: batch_features,
                               self.true_labels_ph: batch_labels,
                               self.protected_attributes_ph: batch_protected_attributes,
                               self.keep_prob: 1.0}

            pred_labels += self.sess.run(self.pred_labels, feed_dict=batch_feed_dict)[:,0].tolist()
            samples_covered += len(batch_features)

        # Mutated, fairer dataset with new labels
        dataset_new = dataset.copy(deepcopy = True)
        dataset_new.scores = np.array(pred_labels, dtype=np.float64).reshape(-1, 1)
        dataset_new.labels = (np.array(pred_labels)>0.5).astype(np.float64).reshape(-1,1)


        # Map the dataset labels to back to their original values.
        temp_labels = dataset_new.labels.copy()

        temp_labels[(dataset_new.labels == 1.0).ravel(), 0] = dataset.favorable_label
        temp_labels[(dataset_new.labels == 0.0).ravel(), 0] = dataset.unfavorable_label

        dataset_new.labels = temp_labels.copy()

        return dataset_new

## Evaluate adversary_model for protected attribute = sex case

In [24]:
adult_dataset = AdultDataset()
adult_train, adult_test = adult_dataset.split([0.7], shuffle=True)
adult_privileged_groups = [{'sex': 1}]  # Male
adult_unprivileged_groups = [{'sex': 0}]  # Female

sess = tf.compat.v1.Session()

adult_ad = AdversarialDebiasing(
    privileged_groups=adult_privileged_groups,
    unprivileged_groups=adult_unprivileged_groups,
    scope_name="ad_eval_sex",
    sess=sess,
)
adult_ad.fit(adult_train)
adult_predict = adult_ad.predict(adult_test)
adversary_model_eval = adult_ad.evaluate_adversary(adult_test)
sess.close()



epoch 0; iter: 0; batch classifier loss: 56.425098; batch adversarial loss: 15.921197
epoch 0; iter: 200; batch classifier loss: 3.644402; batch adversarial loss: 8.841759
epoch 1; iter: 0; batch classifier loss: 4.655004; batch adversarial loss: 5.581833
epoch 1; iter: 200; batch classifier loss: 3.012911; batch adversarial loss: 6.471652
epoch 2; iter: 0; batch classifier loss: 2.563766; batch adversarial loss: 4.264257
epoch 2; iter: 200; batch classifier loss: 0.291487; batch adversarial loss: 11.848980
epoch 3; iter: 0; batch classifier loss: 4.885805; batch adversarial loss: 2.573878
epoch 3; iter: 200; batch classifier loss: 0.851050; batch adversarial loss: 1.044282
epoch 4; iter: 0; batch classifier loss: 1.965853; batch adversarial loss: 8.589492
epoch 4; iter: 200; batch classifier loss: 2.457242; batch adversarial loss: 1.142101
epoch 5; iter: 0; batch classifier loss: 2.085353; batch adversarial loss: 1.294117
epoch 5; iter: 200; batch classifier loss: 1.567454; batch adve

In [25]:
adversary_model_eval

{'accuracy': 0.6677231517653129,
 'auc': np.float64(0.5015695604493574),
 'confusion_matrix': array([[ 110, 4309],
        [ 199, 8949]])}

## Evaluate adversary_model for protected attribute = race case

In [26]:
adult_privileged_groups = [{'race': 1}]  # white
adult_unprivileged_groups = [{'race': 0}]  # non-white

sess = tf.compat.v1.Session()

adult_ad_race = AdversarialDebiasing(
    privileged_groups=adult_privileged_groups,
    unprivileged_groups=adult_unprivileged_groups,
    scope_name="ad_eval_race",
    sess=sess,
)
adult_train_race, adult_test_race = adult_dataset.split([0.7], shuffle=True)
adult_ad_race.fit(adult_train_race)
adversary_model_eval_race = adult_ad_race.evaluate_adversary(adult_test)
sess.close()

epoch 0; iter: 0; batch classifier loss: 172.117157; batch adversarial loss: 74.262733
epoch 0; iter: 200; batch classifier loss: 7.617409; batch adversarial loss: 9.049286
epoch 1; iter: 0; batch classifier loss: 6.327208; batch adversarial loss: 3.348848
epoch 1; iter: 200; batch classifier loss: 7.633907; batch adversarial loss: 9.172488
epoch 2; iter: 0; batch classifier loss: 2.654064; batch adversarial loss: 0.427413
epoch 2; iter: 200; batch classifier loss: 3.535689; batch adversarial loss: 1.480289
epoch 3; iter: 0; batch classifier loss: 7.298629; batch adversarial loss: 2.112755
epoch 3; iter: 200; batch classifier loss: 1.964802; batch adversarial loss: 0.659369
epoch 4; iter: 0; batch classifier loss: 2.002451; batch adversarial loss: 0.830459
epoch 4; iter: 200; batch classifier loss: 2.081960; batch adversarial loss: 11.801059
epoch 5; iter: 0; batch classifier loss: 1.811082; batch adversarial loss: 1.593162
epoch 5; iter: 200; batch classifier loss: 2.418119; batch adv

In [27]:
adversary_model_eval_race

{'accuracy': 0.8670302940959682,
 'auc': np.float64(0.5),
 'confusion_matrix': array([[    0,  1804],
        [    0, 11763]])}