##### Copyright 2018 The TensorFlow Constrained Optimization Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at

> http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.

## Overview

In this notebook, we explore the problem of classification with fairness on the [https://www.researchconnections.org/icpsrweb/instructors/studies/36151] (School Staffing -008 and Medical Staffing-005). We show how to set up a classification problem with data-dependent fairness constraints using the TensorFlow Constrained Optimization library and then subsequently train to optimize fairness.

In [62]:
import math
import random
import numpy as np
import pandas as pd
import warnings
from six.moves import xrange
import tensorflow.compat.v1 as tf
import tensorflow_constrained_optimization as tfco
import matplotlib.pyplot as plt
from sklearn import model_selection

tf.disable_eager_execution()

warnings.filterwarnings('ignore')
%matplotlib inline

### Reading and processing dataset.

We load the [https://www.researchconnections.org/icpsrweb/instructors/studies/36151] and do some pre-processing. The dataset is based on India's Human Development Survey, is to predict whether someone who has higher education is appointed as a teaching staff or not. We construct three protected groups, two based on gender (SS7 - Male and Female) and two based on religion (SS11- Hindu, Muslim, Christian, Sikh, Tribal, etc) and on Jati Code (SS12 - Brahmi, Forward Class or Backward Class OBSC, etc).

We preprocess the features as done in works such as [[ZafarEtAl15]](https://arxiv.org/abs/1507.05259) and [[GohEtAl16]](https://arxiv.org/abs/1606.07558). We transform the categorical features into categorical codes. Also if you have continuous feature you can transofrm them into buckets based on each feature's 5 quantiles values in training.

The fairness goal is that of equal opportunity. That is, we would like the true positive rates of our classifier on the protected groups to match that of the overall dataset.

In [63]:
dataset_path = "data/staffing-teaching/School-Staff-Data.csv"

# Read dataset from the UCI web repository and assign column names.
data_df = pd.read_csv(dataset_path)
data_df.head()

Unnamed: 0,STATEID,DISTID,PSUID,SCHOOLID,SQGOVT,SS1,SS3,SS4,SS5,SS6,SS7,SS8,SS9,SS10,SS11,SS12,SS13
0,1,2,1,1,1,1,1,1,5,1,1,25,15,1.0,2,3,2
1,1,2,1,1,1,2,2,3,5,1,2,26,15,1.0,2,3,2
2,1,2,1,1,1,3,3,3,99,1,2,35,0,,2,5,0
3,1,2,1,2,2,1,1,3,2,2,2,25,15,0.0,2,3,4
4,1,2,1,2,2,2,2,1,2,1,2,21,15,0.0,2,3,1


In [64]:
data_df.size

543898

In [65]:
data_df.head(10)

Unnamed: 0,STATEID,DISTID,PSUID,SCHOOLID,SQGOVT,SS1,SS3,SS4,SS5,SS6,SS7,SS8,SS9,SS10,SS11,SS12,SS13
0,1,2,1,1,1,1,1,1,5,1,1,25,15,1.0,2,3,2
1,1,2,1,1,1,2,2,3,5,1,2,26,15,1.0,2,3,2
2,1,2,1,1,1,3,3,3,99,1,2,35,0,,2,5,0
3,1,2,1,2,2,1,1,3,2,2,2,25,15,0.0,2,3,4
4,1,2,1,2,2,2,2,1,2,1,2,21,15,0.0,2,3,1
5,1,2,2,1,1,1,1,1,5,1,1,35,17,1.0,2,2,5
6,1,2,2,1,1,2,2,2,5,1,2,32,17,1.0,2,2,1
7,1,2,2,2,2,1,1,1,5,1,1,27,15,1.0,2,2,0
8,1,2,2,2,2,2,2,3,5,1,1,24,15,1.0,2,2,0
9,1,2,2,2,2,3,2,3,5,1,1,34,12,0.0,2,2,0


In [66]:
cols = data_df.columns

# Transforming Categorical Features to Categorical Codes

In [67]:
for col in data_df.select_dtypes(include=[object]):

    data_df['org_'+col] =  data_df[col]
    data_df[col] = data_df[col].astype('category').cat.codes
    data_df[col] = data_df[col].astype(float)
    print(col)
    print(dict(zip( data_df['org_'+ col], data_df[col] ) ))


data_df = data_df[cols]
data_df.head(10)

SS1
{'1': 1.0, '2': 12.0, '3': 23.0, '4': 34.0, '5': 42.0, '6': 43.0, '7': 44.0, '8': 45.0, '9': 46.0, '10': 2.0, '11': 3.0, '12': 4.0, '13': 5.0, '14': 6.0, '15': 7.0, '16': 8.0, '17': 9.0, '18': 10.0, '19': 11.0, '20': 13.0, '21': 14.0, '22': 15.0, '23': 16.0, '24': 17.0, '25': 18.0, '26': 19.0, '27': 20.0, '28': 21.0, '29': 22.0, '30': 24.0, '31': 25.0, '32': 26.0, '33': 27.0, '34': 28.0, '35': 29.0, '36': 30.0, '37': 31.0, '38': 32.0, '39': 33.0, '40': 35.0, '41': 36.0, '42': 37.0, '43': 38.0, '44': 39.0, '45': 40.0, '46': 41.0, ' ': 0.0}
SS3
{'1': 1.0, '2': 2.0, '3': 3.0, '5': 5.0, '4': 4.0, ' ': 0.0}
SS4
{'1': 1.0, '3': 3.0, '2': 2.0, ' ': 0.0}
SS5
{'5': 9.0, '99': 15.0, '2': 5.0, '4': 8.0, '3': 7.0, '8': 13.0, '6': 11.0, '1': 1.0, '10': 2.0, '55': 10.0, '12': 4.0, '9': 14.0, '7': 12.0, ' ': 0.0, '11': 3.0, '22': 6.0}
SS6
{'1': 1.0, '2': 2.0, ' ': 0.0, '3': 3.0}
SS7
{'1': 1.0, '2': 2.0, ' ': 0.0}
SS8
{'25': 12.0, '26': 13.0, '35': 22.0, '21': 8.0, '32': 19.0, '27': 14.0, '24': 11

Unnamed: 0,STATEID,DISTID,PSUID,SCHOOLID,SQGOVT,SS1,SS3,SS4,SS5,SS6,SS7,SS8,SS9,SS10,SS11,SS12,SS13
0,1,2,1,1,1,1.0,1.0,1.0,9.0,1.0,1.0,12.0,4.0,2.0,2.0,3.0,13.0
1,1,2,1,1,1,12.0,2.0,3.0,9.0,1.0,2.0,13.0,4.0,2.0,2.0,3.0,13.0
2,1,2,1,1,1,23.0,3.0,3.0,15.0,1.0,2.0,22.0,1.0,0.0,2.0,5.0,1.0
3,1,2,1,2,2,1.0,1.0,3.0,5.0,2.0,2.0,12.0,4.0,1.0,2.0,3.0,35.0
4,1,2,1,2,2,12.0,2.0,1.0,5.0,1.0,2.0,8.0,4.0,1.0,2.0,3.0,2.0
5,1,2,2,1,1,1.0,1.0,1.0,9.0,1.0,1.0,22.0,5.0,2.0,2.0,2.0,43.0
6,1,2,2,1,1,12.0,2.0,2.0,9.0,1.0,2.0,19.0,5.0,2.0,2.0,2.0,2.0
7,1,2,2,2,2,1.0,1.0,1.0,9.0,1.0,1.0,14.0,4.0,2.0,2.0,2.0,1.0
8,1,2,2,2,2,12.0,2.0,3.0,9.0,1.0,1.0,11.0,4.0,2.0,2.0,2.0,1.0
9,1,2,2,2,2,23.0,2.0,3.0,9.0,1.0,1.0,21.0,3.0,1.0,2.0,2.0,1.0


In [68]:
data_df['SS9'] = data_df['SS9'].astype(int)
data_df['SS9'] = np.where((data_df['SS9'] != 5), 0, 1)
data_df['SS9'].unique()

array([0, 1])

In [69]:
labels_df = data_df[['SS9']]
labels_df['SS9'] = labels_df['SS9'].astype(int)
labels_df

Unnamed: 0,SS9
0,0
1,0
2,0
3,0
4,0
...,...
31989,0
31990,0
31991,0
31992,0


# Filling missing values

In [70]:
feature_names = data_df.columns
for feature_name in feature_names:  
    # Which rows have missing values?
    missing_rows = data_df[feature_name].isna()
    if missing_rows.any():  # Check if at least one row has a missing value.
        data_df[feature_name].fillna(0.0, inplace=True)  # Fill NaN with 0.
        missing_rows.rename(feature_name + "_is_missing", inplace=True)
        data_df = data_df.join(missing_rows)  # Append "is_missing" feature.

# Train Test Data Split

In [71]:
# Set random seed so that the results are reproducible.
np.random.seed(123456)

# Train and test indices.
train_indices, test_indices = model_selection.train_test_split(
    np.arange(data_df.shape[0]), test_size=1./3.)

# Train and test data.
x_train_df = data_df.loc[train_indices].astype(np.float32)
y_train_df = labels_df.loc[train_indices].astype(np.float32)
x_test_df = data_df.loc[test_indices].astype(np.float32)
y_test_df = labels_df.loc[test_indices].astype(np.float32)

# Convert data frames to NumPy arrays.
x_train = x_train_df.values
y_train = y_train_df.values
x_test = x_test_df.values
y_test = y_test_df.values

In [72]:
x_train_df.head(10)

Unnamed: 0,STATEID,DISTID,PSUID,SCHOOLID,SQGOVT,SS1,SS3,SS4,SS5,SS6,SS7,SS8,SS9,SS10,SS11,SS12,SS13
3780,6.0,6.0,3.0,1.0,1.0,12.0,2.0,2.0,1.0,1.0,1.0,15.0,1.0,2.0,1.0,2.0,14.0
30675,33.0,10.0,6.0,1.0,1.0,42.0,3.0,3.0,15.0,1.0,2.0,40.0,0.0,2.0,1.0,3.0,1.0
21116,24.0,16.0,5.0,1.0,1.0,1.0,1.0,3.0,5.0,2.0,1.0,27.0,0.0,2.0,1.0,5.0,2.0
26051,29.0,1.0,4.0,2.0,2.0,44.0,2.0,3.0,11.0,1.0,1.0,27.0,0.0,2.0,1.0,5.0,1.0
24647,28.0,5.0,6.0,2.0,2.0,3.0,5.0,3.0,15.0,1.0,2.0,31.0,0.0,1.0,1.0,3.0,2.0
24930,28.0,10.0,6.0,1.0,1.0,34.0,2.0,3.0,1.0,2.0,2.0,12.0,0.0,2.0,1.0,5.0,16.0
10066,9.0,60.0,1.0,2.0,2.0,23.0,2.0,3.0,9.0,1.0,1.0,32.0,0.0,1.0,1.0,3.0,1.0
23038,27.0,23.0,9.0,2.0,2.0,23.0,2.0,2.0,5.0,1.0,1.0,22.0,0.0,2.0,3.0,2.0,3.0
4468,6.0,18.0,7.0,2.0,2.0,2.0,2.0,3.0,5.0,1.0,2.0,17.0,1.0,2.0,1.0,2.0,2.0
27038,29.0,12.0,5.0,3.0,2.0,43.0,2.0,2.0,10.0,1.0,1.0,28.0,0.0,2.0,1.0,3.0,3.0


In [73]:
y_train_df.head(10)

Unnamed: 0,SS9
3780,1.0
30675,0.0
21116,0.0
26051,0.0
24647,0.0
24930,0.0
10066,0.0
23038,0.0
4468,1.0
27038,0.0


In [74]:
np.unique(y_train)

array([0., 1.], dtype=float32)

In [75]:
np.unique(y_test)

array([0., 1.], dtype=float32)

In [76]:
x_test_df.head(10)

Unnamed: 0,STATEID,DISTID,PSUID,SCHOOLID,SQGOVT,SS1,SS3,SS4,SS5,SS6,SS7,SS8,SS9,SS10,SS11,SS12,SS13
995,2.0,5.0,4.0,1.0,1.0,1.0,1.0,2.0,7.0,1.0,2.0,39.0,0.0,2.0,1.0,1.0,24.0
26735,29.0,9.0,2.0,1.0,1.0,34.0,2.0,2.0,13.0,1.0,2.0,31.0,0.0,2.0,1.0,4.0,0.0
12027,11.0,0.0,5.0,1.0,1.0,43.0,2.0,3.0,8.0,1.0,2.0,42.0,0.0,2.0,5.0,2.0,1.0
6060,8.0,6.0,7.0,2.0,2.0,1.0,1.0,1.0,9.0,1.0,1.0,27.0,0.0,2.0,1.0,3.0,19.0
13686,19.0,10.0,9.0,1.0,1.0,45.0,3.0,2.0,15.0,1.0,2.0,22.0,0.0,1.0,1.0,4.0,1.0
29663,32.0,8.0,3.0,2.0,2.0,44.0,2.0,3.0,8.0,1.0,2.0,9.0,0.0,2.0,1.0,2.0,7.0
30245,33.0,2.0,8.0,1.0,1.0,43.0,3.0,2.0,15.0,1.0,2.0,45.0,0.0,1.0,1.0,3.0,1.0
31036,33.0,15.0,2.0,1.0,1.0,12.0,2.0,1.0,8.0,1.0,2.0,29.0,1.0,2.0,1.0,1.0,1.0
2727,3.0,11.0,10.0,2.0,2.0,2.0,2.0,3.0,9.0,1.0,2.0,27.0,0.0,1.0,1.0,2.0,1.0
15907,21.0,9.0,2.0,2.0,2.0,4.0,2.0,3.0,13.0,1.0,1.0,35.0,0.0,1.0,1.0,3.0,2.0


### Model.

We use a linear model and predict positively or negatively based on threshold at 0.

In the following code, we initialize the placeholders and model. In build_train_op, we set up the constrained optimization problem. We create a rate context for the entire dataset, and compute the overall false positive rate as the positive prediction rate on the negatively labeled subset. We then construct a constraint for each of the protected groups based on the difference between the true positive rates of the protected group and that of the overall dataset. We then construct a minimization problem using RateMinimizationProblem and use the ProxyLagrangianOptimizerV1 as the solver. build_train_op initializes a training operation which will later be used to actually train the model. 

In [77]:
class Model(object):
    def __init__(self,
                 tpr_max_diff=0):
        tf.random.set_random_seed(123)
        self.tpr_max_diff = tpr_max_diff
        num_features = len(FEATURE_NAMES)
        self.features_placeholder = tf.placeholder(
            tf.float32, shape=(None, num_features), name='features_placeholder')
        self.labels_placeholder = tf.placeholder(
            tf.float32, shape=(None, 1), name='labels_placeholder')
        self.protected_placeholders = [tf.placeholder(tf.float32, shape=(None, 1), name=attribute+"_placeholder") for attribute in PROTECTED_COLUMNS]
        # We use a linear model.
        self.predictions_tensor = tf.layers.dense(inputs=self.features_placeholder, units=1, activation=None)


    def build_train_op(self,
                       learning_rate,
                       unconstrained=False):
        ctx = tfco.rate_context(self.predictions_tensor, self.labels_placeholder)
        positive_slice = ctx.subset(self.labels_placeholder > 0) 
        overall_tpr = tfco.positive_prediction_rate(positive_slice)
        constraints = []
        if not unconstrained:
            for placeholder in self.protected_placeholders:
                slice_tpr = tfco.positive_prediction_rate(ctx.subset((placeholder > 0) & (self.labels_placeholder > 0)))
                constraints.append(slice_tpr >= overall_tpr - self.tpr_max_diff)
        mp = tfco.RateMinimizationProblem(tfco.error_rate(ctx), constraints)
        opt = tfco.ProxyLagrangianOptimizerV1(tf.train.AdamOptimizer(learning_rate))
        self.train_op = opt.minimize(mp)
        return self.train_op
  
    def feed_dict_helper(self, dataframe):
        feed_dict = {self.features_placeholder:
                  dataframe[FEATURE_NAMES],
              self.labels_placeholder:
                  dataframe[[LABEL_COLUMN]],}
        for i, protected_attribute in enumerate(PROTECTED_COLUMNS):
            feed_dict[self.protected_placeholders[i]] = dataframe[[protected_attribute]]
        return feed_dict

### Training.

Below is the function which performs the training of our constrained optimization problem. Each call to the function does one epoch through the dataset and then yields the training and testing predictions.

In [78]:
FEATURE_NAMES = x_train_df.columns
PROTECTED_COLUMNS = ['SS7', 'SS11', 'SS12', ]  #Sex, Religion and Jati code
LABEL_COLUMN = 'SS9'

In [79]:
def training_generator(model,
                       train_df,
                       test_df,
                       minibatch_size,
                       num_iterations_per_loop=1,
                       num_loops=1):
    random.seed(31337)
    num_rows = train_df.shape[0]
    minibatch_size = min(minibatch_size, num_rows)
    permutation = list(range(train_df.shape[0]))
    random.shuffle(permutation)

    session = tf.Session()
    session.run((tf.global_variables_initializer(),
               tf.local_variables_initializer()))

    minibatch_start_index = 0
    for n in xrange(num_loops):
        for _ in xrange(num_iterations_per_loop):
            minibatch_indices = []
            while len(minibatch_indices) < minibatch_size:
                minibatch_end_index = (
                minibatch_start_index + minibatch_size - len(minibatch_indices))
                if minibatch_end_index >= num_rows:
                    minibatch_indices += range(minibatch_start_index, num_rows)
                    minibatch_start_index = 0
                else:
                    minibatch_indices += range(minibatch_start_index, minibatch_end_index)
                    minibatch_start_index = minibatch_end_index
                    
            session.run(
                  model.train_op,
                  feed_dict=model.feed_dict_helper(
                      train_df.iloc[[permutation[ii] for ii in minibatch_indices]]))

        train_predictions = session.run(
            model.predictions_tensor,
            feed_dict=model.feed_dict_helper(train_df))
        test_predictions = session.run(
            model.predictions_tensor,
            feed_dict=model.feed_dict_helper(test_df))

        yield (train_predictions, test_predictions)

### Computing accuracy and fairness metrics.

In [80]:
def error_rate(predictions, labels):
    signed_labels = (
      (labels > 0).astype(np.float32) - (labels <= 0).astype(np.float32))
    numerator = (np.multiply(signed_labels, predictions) <= 0).sum()
    denominator = predictions.shape[0]
    return float(numerator) / float(denominator)


def positive_prediction_rate(predictions, subset):
    numerator = np.multiply((predictions > 0).astype(np.float32),
                          (subset > 0).astype(np.float32)).sum()
    denominator = (subset > 0).sum()
    return float(numerator) / float(denominator)

def tpr(df):
    """Measure the true positive rate."""
    fp = sum((df['predictions'] >= 0.0) & (df[LABEL_COLUMN] > 0.5))
    ln = sum(df[LABEL_COLUMN] > 0.5)
    return float(fp) / float(ln)

def _get_error_rate_and_constraints(df, tpr_max_diff):
    """Computes the error and fairness violations."""
    error_rate_local = error_rate(df[['predictions']], df[[LABEL_COLUMN]])
    overall_tpr = tpr(df)
    return error_rate_local, [(overall_tpr - tpr_max_diff) - tpr(df[df[protected_attribute] > 0.5]) for protected_attribute in PROTECTED_COLUMNS]

def _get_exp_error_rate_constraints(cand_dist, error_rates_vector, constraints_matrix):
    """Computes the expected error and fairness violations on a randomized solution."""
    expected_error_rate = np.dot(cand_dist, error_rates_vector)
    expected_constraints = np.matmul(cand_dist, constraints_matrix)
    return expected_error_rate, expected_constraints

def training_helper(model,
                    train_df,
                    test_df,
                    minibatch_size,
                    num_iterations_per_loop=1,
                    num_loops=1):
    train_error_rate_vector = []
    train_constraints_matrix = []
    test_error_rate_vector = []
    test_constraints_matrix = []
    for train, test in training_generator(
      model, train_df, test_df, minibatch_size, num_iterations_per_loop,
      num_loops):
        train_df['predictions'] = train
        test_df['predictions'] = test

        train_error_rate, train_constraints = _get_error_rate_and_constraints(
          train_df, model.tpr_max_diff)
        train_error_rate_vector.append(train_error_rate)
        train_constraints_matrix.append(train_constraints)

        test_error_rate, test_constraints = _get_error_rate_and_constraints(
            test_df, model.tpr_max_diff)
        test_error_rate_vector.append(test_error_rate)
        test_constraints_matrix.append(test_constraints)

    return (train_error_rate_vector, train_constraints_matrix, test_error_rate_vector, test_constraints_matrix)

### Baseline without constraints.

We now declare the model, build the training op, and then perform the training. We use a linear classifier, and train using the ADAM optimizer with learning rate 0.01, with minibatch size of 100 over 40 epochs. We first train without fairness constraints to show the baseline performance. We see that without training fair fairness, we obtain a high fairness violation.


In [81]:
model = Model(tpr_max_diff=0.05)
model.build_train_op(0.01, unconstrained=True)

# training_helper returns the list of errors and violations over each epoch.
train_errors, train_violations, test_errors, test_violations = training_helper(
      model,
      x_train_df,
      x_test_df,
      128,
      num_iterations_per_loop=326,
      num_loops=250)

In [82]:
print("Train Error", train_errors[-1])
print("Train Violation", max(train_violations[-1]))
print()
print("Test Error", test_errors[-1])
print("Test Violation", max(test_violations[-1]))

Train Error 0.0
Train Violation -0.050000000000000044

Test Error 0.0
Test Violation -0.050000000000000044


### Training with fairness constraints.

We now show train with the constraints using the procedure of [[CoJiSr19]](https://arxiv.org/abs/1804.06500) and returning the last solution found. We see that the fairness violation improves.

We allow an additive fairness slack of 0.05. That is, when training and evaluating the fairness constraints, the true positive rate difference between protected group has to be at least that of the overall dataset up to a slack of at most 0.05. Thus, the fairness constraints would be of the form TPR_p >= TPR - 0.05, where TPR_p and TPR denotes the true positive rates of the protected group and the overall dataset, respectively.


In [90]:
model = Model(tpr_max_diff=0.1)
model.build_train_op(0.01, unconstrained=False)

# training_helper returns the list of errors and violations over each epoch.
train_errors, train_violations, test_errors, test_violations = training_helper(
      model,
      x_train_df,
      x_test_df,
      128,
      num_iterations_per_loop=326,
      num_loops=250)

In [91]:
print("Train Error", train_errors[-1])
print("Train Violation", max(train_violations[-1]))
print()
print("Test Error", test_errors[-1])
print("Test Violation", max(test_violations[-1]))

Train Error 0.0
Train Violation -0.09999999999999998

Test Error 0.0
Test Violation -0.09999999999999998


### Improving using Best Iterate instead of Last Iterate.

As discussed in [[CotterEtAl18b]](https://arxiv.org/abs/1809.04198), the last iterate may not be the best choice and suggests a simple heuristic to choose the best iterate out of the ones found after each epoch. The heuristic proceeds by ranking each of the solutions based on accuracy and fairness separately with respect to the training data. Any solutions which satisfy the constraints are equally ranked top in terms fairness. Each solution thus has two ranks. Then, the chosen solution is the one with the smallest maximum of the two ranks. We see that this improves the fairness and can find a better accuracy / fairness trade-off on the training data. 

This solution can be calculated using find_best_candidate_index given the list of training errors and violations associated with each of the epochs.

In [85]:
train_violations[-1]

[-0.050000000000000044, -0.050000000000000044, -0.050000000000000044]

In [86]:
print("Train Error", train_errors[best_cand_index])
print("Train Violation", max(train_violations[best_cand_index]))
print()
print("Test Error", test_errors[best_cand_index])
print("Test Violation", max(test_violations[best_cand_index]))

Train Error 0.0
Train Violation -0.050000000000000044

Test Error 0.0
Test Violation -0.050000000000000044


### Using stochastic solutions.

As discussed in [[CoJiSr19]](https://arxiv.org/abs/1804.06500), neither the best nor last iterate will come with theoretical guarantees. One can instead use randomized solutions, which come with theoretical guarantees. However, as discussed in [[CotterEtAl18b]](https://arxiv.org/abs/1809.04198), there may not always be a clear practical benefits. We show how to use these solutions here for sake of completeness.

#### T-stochastic solution.
The first and simplest randomized solution suggested is the T-stochastic, which simply takes the average of all of the iterates found at each epoch.

In [87]:
print("Train Error", np.mean(train_errors))
print("Train Violation", max(np.mean(train_violations, axis=0)))
print()
print("Test Error", np.mean(test_errors))
print("Test Violation", max(np.mean(test_violations, axis=0)))

Train Error 1.6878428430775002e-06
Train Violation -0.04999989383977878

Test Error 3.000468823253634e-06
Test Violation -0.04999983075940076


#### m-stochastic solution.
[[CoJiSr19]](https://arxiv.org/abs/1804.06500) presents a method which shrinks down the T-stochastic solution down to one that is supported on at most (m+1) points where m is the number of constraints and is guaranteed to be at least as good as the T-stochastic solution. Here we see that indeed there is benefit in performing the shrinking.

This solution can be computed using find_best_candidate_distribution by passing in the training errors and violations found at each epoch and returns the weight of each constituent. We see that indeed, it is sparse.

In [88]:
cand_dist = tfco.find_best_candidate_distribution(train_errors, train_violations)
print(cand_dist)

[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


In [89]:
m_stoch_error_train, m_stoch_violations_train = _get_exp_error_rate_constraints(cand_dist, train_errors, train_violations)
m_stoch_error_test, m_stoch_violations_test = _get_exp_error_rate_constraints(cand_dist, test_errors, test_violations)

print("Train Error", m_stoch_error_train)
print("Train Violation", max(m_stoch_violations_train))
print()
print("Test Error", m_stoch_error_test)
print("Test Violation", max(m_stoch_violations_test))

Train Error 0.0
Train Violation -0.050000000000000044

Test Error 0.0
Test Violation -0.050000000000000044
