## 1. Pre-task: Data and explainable classifier preparation
### 1.1 Code setup
The code for the explainable classifiers is in the following cell.


In [None]:
#@title Codes (Double click this area to collapse this massive cell)
'''
Author: Jonathan Dodge
'''
import pandas as pd
import numpy as np
import json

from copy import deepcopy
from random import randint

from sklearn.linear_model import LogisticRegression
from sklearn.metrics.pairwise import euclidean_distances


#The first two functions are used to show predictions/labels from the model.  Using them makes each explanation function depend slightly less on the particular data set the model was trained on
def stringPrediction(prediction):
    """Return the display phrase for a model prediction (truthy means "likely to reoffend")."""
    return "likely to reoffend" if prediction else "NOT likely to reoffend"
def stringInstanceLabel(label):
    """Return the display phrase for a ground-truth label (truthy means "reoffended")."""
    return "reoffended" if label else "did NOT reoffend"


#This function will be used by various explanation functions later to show a single data instance
def showInstance(dataInstance, baseline_columns, instanceName, silent=False):
    """Collect (and optionally print) the feature names that describe one data instance.

    input: dataInstance       A row of a pandas dataframe (a Series whose index holds the
                              dummy-coded column names and whose values are the 0/1 indicators)
    input: baseline_columns   A list of strings indicating the names of the features which are baseline,
                              meaning that they do not appear in the instance's dataframe columns
    input: instanceName       Name of the instance; currently not used by this function
    input: silent             Boolean, indicates whether print should occur
    OUTPUT: list of strings, the labels of the columns for the features which were "on" for the
            instance, plus the baseline label of every feature that had no category "on"
    """
    # features which were 'on'; read the values positionally with .iloc because the index is
    # made of column-name strings, so a plain [k] lookup is a deprecated positional fallback
    strings = [dataInstance.index[k] for k in range(len(dataInstance)) if dataInstance.iloc[k]]

    # features which were 'off' for all categories: the instance belongs to the baseline group.
    # Comparing the first three characters is primitive, but kinda works for these column names.
    droppedStrings = [
        baselineHere
        for baselineHere in baseline_columns
        if not any(stringHere[:3] == baselineHere[:3] for stringHere in strings)
    ]

    # combine the features which were "on" with the "baseline" features
    strings = strings + droppedStrings

    # sort and then rearrange the strings so they always appear in the same order.
    # The rearrangement is written for five descriptors; with fewer than five there is
    # nothing to rearrange safely, so the sorted order is kept instead of raising IndexError.
    strings.sort()
    if len(strings) >= 5:
        strings[4], strings[2] = strings[2], strings[4]
        strings[1], strings[2] = strings[2], strings[1]
        strings[1], strings[0] = strings[0], strings[1]

    # print the feature names (if necessary), followed by a blank separator line
    if not silent:
        for featureName in strings:
            print("\t", featureName)
        print()
    return strings


# A helper function to quantize coefficients based on the coefficient ranges
def quantizeCoeff(coeff, maxCoeff, minCoeff):
    """Quantize a coefficient into a string of minuses or pluses relative to the coefficient range.

    input: coeff      The coefficient to quantize and convert to an output string
    input: maxCoeff   The maximum coefficient (for quantizing)
    input: minCoeff   The minimum coefficient (for quantizing)
    OUTPUT: a string such as "-----", "0", or "++" describing where coeff falls in the range
            specified by min/max.  The range is cut into 11 equal bins; the string length is the
            distance (in bins) between coeff's bin and the bin that contains zero.
            When maxCoeff == minCoeff there is no scale to quantize against and "0" is returned.
    """
    binCount = 11
    binRange = (maxCoeff - minCoeff) / binCount
    # degenerate range: every coefficient is identical, so avoid dividing by zero
    if binRange == 0:
        return "0"

    binIndex = int((coeff - minCoeff) / binRange)
    # make it so that the max value is not by itself as its own bin, since the bins are
    # left associative, meaning shaped as |___
    if binIndex == binCount:
        binIndex -= 1

    # how many minuses should appear (negative means that many pluses instead)
    num_minuses = int(-minCoeff / binRange) - binIndex

    # check the middle bucket
    if num_minuses == 0:
        return "0"
    if num_minuses > 0:
        return "-" * num_minuses
    return "+" * (-num_minuses)

def getDecorator(decoratorOn):
    """Return the opening marker for an output row: bold-star when on, two spaces of padding when off."""
    return "<b>* " if decoratorOn else "  "

def getDecoratorClose(decoratorOn):
    """Return the closing marker for an output row: the bold end tag when on, two spaces when off."""
    return "</b>" if decoratorOn else "  "

#This class is a wrapper around the sklearn classifier, which we use to maintain various state we need to generate explanations.  The required state includes (but is not limited to), training examples, training labels, and feature labels.  The main reason this class is useful is that maintaining the columns that were dropped in the dummy coding is kind of a pain without maintaining some state information.
class ExplainableClassifier:
    def __init__(self, D_features, X_features, Y_features, trainExamples, baseline_columns):
        """Dummy-code the training data, drop the baseline columns and fit a logistic regression.

        D_features, X_features and Y_features are lists of column names of trainExamples;
        baseline_columns lists the dummy-coded column names that are removed before fitting.
        """
        # keep the feature groupings and the dropped (baseline) columns for the explanations
        self.D_features, self.X_features, self.Y_features = D_features, X_features, Y_features
        self.droppedStrings = baseline_columns

        # labels, and the dummy-coded training matrix without the baseline columns
        self.trainLabels = trainExamples[Y_features]
        dummyCoded = pd.get_dummies(trainExamples[D_features + X_features])
        self.trainDF = dummyCoded.drop(columns=self.droppedStrings)

        # fit the wrapped sklearn model on the raw arrays
        model = LogisticRegression()
        model.fit(self.trainDF.values, self.trainLabels.values.ravel())
        self.classifier = model

    '''
    Now, we get to the explanation functions themselves.  Some explanations will take a data instance to explain in the locality surrounding that point, while others are "global," meaning they will generate essentially the same explanation for all points.

    The other main axis which we will categorize explanations on is whether it "describes" or "justifies." To elaborate, we appeal to the XAI literature review:
    The distinction between 'explanation' and 'justification' had been made by some, although it may be more appropriate to think about a justification as a type of explanation. Nevertheless, early explaining systems focused on making the what visible to the user, but not the why. Both modes of explanation are useful, but our view is that most current systems focus on justification; justifying why a specific action was taken. Yet we have advocated earlier that an important goal of explanation is to help a user develop a robust and predictive mental model of a system. Justifications may be inefficient ways of doing this, because they are local and focus on specific cases. Global explanations regarding how the system works may be needed as well, and this may serve more directly to help a user understand the workings of a system than individual justifications for particular decisions.
    '''


    #First, we have input influence based classification.  We consider this explanation global and description.  Its geometric interpretation is to give the equation for the decision boundary as a description of the classifier (implemented by describing the weight each feature is given).
    def inputInfluenceExplanation(self, instance, instanceName, silent=False):
        """Print how strongly each feature category pushes the prediction (global description).

        input: instance           row of the dummy-coded dataframe, the instance we are explaining
                                  (only used to mark the categories that apply to it)
        input: instanceName       name passed on to showInstance
        input: silent             Boolean, indicates whether print should occur; when True nothing
                                  is printed at all (including the final table)
        OUTPUT: None

        NOTE: the final reordering is written for a 21-line table (5 features, see the swaps at the
        end) and raises IndexError for shorter tables.
        """
        instanceReshaped = np.asarray(instance).reshape(1,len(instance))
        instanceFeatures = showInstance(instance, self.droppedStrings, instanceName, silent)
        prediction = self.classifier.predict(instanceReshaped)[0]

        # print the prediction and the explanation header
        if not silent:
            print("Prediction:", stringPrediction(prediction))
            print()

            print("The predictive model assesses an individual's profile factors in the database to predict whether or not this individual is likely to reoffend.")
            print("How different factors influence the prediction is illustrated below.")
            print("The more +s (-s), means a person with that factor is more (less) likely to re-offend. Factors with (0) have little impact.")
            print("A * will appear next to features which are relevant for this individual")
            print()

        # coefficient range over the single row of weights, used as the quantization scale
        featureCoefs = self.classifier.coef_ # for other classifiers, change this line
        featureLabels = self.trainDF.columns
        coefRow = featureCoefs[0]
        minCoeff = min(coefRow)
        maxCoeff = max(coefRow)

        # armed with min/max feature importances, iterate through the features and quantize
        output = []
        zeroString = quantizeCoeff(0, maxCoeff, minCoeff)
        lastFeature = ""
        for k in range(0, len(coefRow)):
            # split the category label off from the feature label
            for feature in self.D_features + self.X_features:
                if feature in featureLabels[k]:
                    categoryLabel = featureLabels[k][len(feature)+1:]
                    break

            # check to see if we are still doing different categories of same feature or not
            if lastFeature != feature:
                lastFeature = feature
                output.append(feature + " :")

                # DUMMY_CODING_MOD now that we have printed the header, print the baseline category for this feature
                # baseline features have a coefficient of 0 by definition.
                for dropped in self.droppedStrings:
                    if feature in dropped:
                        output.append("\t" + getDecorator(dropped in instanceFeatures) + dropped[len(feature)+1:]
                                      + " (" + zeroString + ")")

            # handle the regular case of producing output for this feature and category
            plusMinusString = quantizeCoeff(coefRow[k], maxCoeff, minCoeff)
            output.append("\t" + getDecorator(instanceReshaped[0][k]) + categoryLabel + " (" + plusMinusString + ")")

        # reorder the output list so that items appear in semantic order, then emit output
        # these lines are NOT general
        output[4],output[6] = output[6],output[4]
        output[4],output[5] = output[5],output[4]
        output[13],output[15] = output[15],output[13]
        output[13],output[14] = output[14],output[13]

        # honor the silent flag for the table as well, like the other explanation output
        if not silent:
            for out in output:
                print(out)


    #Next, we have Sensitivity-based explanation, which is local and justification.  Its geometric interpretation is that from a specific instance's point in feature space, we search along axis-aligned lines until we hit the decision boundary.  Thus, the implicit justification of finding that the decision point is lower than the boundary for a feature X is that the decision was made "because feature X was too low."
    def sensitivityExplanation(self, instance, instanceName, silent=False):
        """Report which single-feature changes would flip the classifier's prediction.

        For every feature stem the categories of that feature are switched off (giving the
        baseline category) and then switched on one at a time; each perturbed vector is
        re-classified and the category is recorded when the prediction differs.  Runs of
        adjacent sensitive prior/age categories are then collapsed into a single
        "(or more)/(or fewer)" style line.

        Returns True when at least one single-feature perturbation flipped the prediction.

        NOTE: the reordering and the prior/age index ranges below are hard-coded for one
        specific 16-entry output layout ("these lines are NOT general").
        """
        # input: instance           single row dataframe, instance we are explaining (not used much for this function)
        # input: silent             Boolean, indicates whether print should occur
        # output: boolean, whether or not a single feature perturbation was found that flips the outcome.
        instanceReshaped = np.asarray(instance).reshape(1,len(instance))
        showInstance(instance, self.droppedStrings, instanceName, silent)

        # setup some local variables and run the classifier on the input
        featureLabelStems = self.D_features + self.X_features
        featureLabels = list(self.trainDF.columns)
        prediction = self.classifier.predict(instanceReshaped)[0]

        # print the prediction and confidence
        if not silent:
            print("Prediction:", stringPrediction(prediction))
            #print("Confidence:", str(int(max(self.classifier.predict_proba(instanceReshaped)[0])*100)) + "%")
            print()

            print("The predictive model assesses an individual's profile factors in the database to predict whether or not this individual is likely to reoffend.",)
            print("The prediction is based on:")
            print()

        # Since this formulation uses categorical indicator variables, we can deep copy the instance, zero out
        # all the mutually exclusive categories, then march a bit down each category  (i.e. start with 0 priors,
        # then 1-3 priors, then 3+ priors, repeat for next feature.)
        # NOTE: One of these cases will be duplicate of the ACTUAL data instance, but the prediction won't change in that
        # case, so nothing will happen and it is fine.  This is easier than determining which feature vectors
        # are actually different.
        foundSensitiveFeature = False
        # one slot per (feature, category) in visiting order: the category name when switching
        # to it flips the prediction, "" otherwise
        output = []
        for stem in featureLabelStems:
            # Start with a fresh copy of the instance to perturb
            perturbed = deepcopy(instanceReshaped)

            # now determine the feature index range corresponding to the feature label stem (a family of categories)
            # e.g. given the X feature "priors" we want data frame columns "priors_count_0", "priors_count_1_to_3", etc
            firstMatchIdx = 100000
            lastMatchIdx = -100000
            for i in range(0, len(featureLabels)):
                # substring test: any column whose name contains the stem counts as a match
                if stem in featureLabels[i]:
                    if i < firstMatchIdx:
                        firstMatchIdx = i
                    if i > lastMatchIdx:
                        lastMatchIdx = i

            # now we zero out all the relevant feature values for these mutually exclusive categories
            for j in range(firstMatchIdx, lastMatchIdx+1):
                perturbed[0][j] = 0

            #DUMMY_CODING_MOD run the classifier on the zero'd version
            # NOTE(review): newPrediction is the raw predict() result (not indexed with [0]); the
            # comparison below presumably relies on it holding exactly one element
            newPrediction = self.classifier.predict(perturbed)
            if(prediction != newPrediction):
                # record the baseline (dropped) column of this feature as the sensitive category
                for dropped in self.droppedStrings:
                    if stem in dropped:
                        foundSensitiveFeature = True
                        output.append(dropped)
            else:
                output.append("")

            # now we march a bit along each feature within these mutually exclusive categories
            for k in range(firstMatchIdx, lastMatchIdx+1):
                perturbed[0][k] = 1
                newPrediction = self.classifier.predict(perturbed)
                if(prediction != newPrediction):
                    foundSensitiveFeature = True
                    output.append(featureLabels[k])
                else:
                    output.append("")
                # switch the category back off before trying the next one
                perturbed[0][k] = 0


        # reorder the output list so that items appear in semantic order, then emit output
        # these lines are NOT general
        output[2],output[4] = output[4],output[2]
        output[2],output[3] = output[3],output[2]
        output[9],output[11] = output[11],output[9]
        output[9],output[10] = output[10],output[9]



        #9-12 priors
        # NOTE(review): the comment above says 9-12, but the code scans slots 9..13 inclusive
        priorStart = 9
        priorEnd = 13
        # bin priors UP
        if output[priorStart] == "":
            # find the first area of sensitivity
            for firstSensitivity in range(priorStart, priorEnd + 1):
                if output[firstSensitivity] != "":
                    break

            # zero out the relevant sensitivities, and also do a sanity check that we dont have a case of
            #        positive examples | negative examples | positive examples
            # This is because are using logistic regression, which is a linear model
            for endSensitivity in range(firstSensitivity + 1, priorEnd + 1):
                if output[endSensitivity] == "":
                    print("PANIC, WE HAVE NONLINEARITY (branch 1, prior)")
                else:
                    output[endSensitivity] = ""

            # make a mild adjustment to the feature that started the sequence of related sensitive features
            # (the embedded quote closes the quoted feature name in the final sentence)
            if output[firstSensitivity] != "":
                output[firstSensitivity] += "\" (or more)"

        # bin priors DOWN
        else:
            # find the first area of INsensitivity, and zero out the previous sensitivity as we find new ones
            for firstInsensitivity in range(priorStart, priorEnd + 1):
                if output[firstInsensitivity] == "":
                    break
                if firstInsensitivity > priorStart:
                    output[firstInsensitivity-1] = ""

            # make a mild adjustment to the feature that started the sequence of related INsensitive features
            if firstInsensitivity > priorStart+1:
                output[firstInsensitivity-1] += "\" (or fewer)"

            # do a sanity check that we dont have a case of
            #        positive examples | negative examples | positive examples
            # This is because are using logistic regression, which is a linear model
            for endInsensitivity in range(firstInsensitivity + 1, priorEnd + 1):
                if output[endInsensitivity] != "":
                    print("PANIC, WE HAVE NONLINEARITY (branch 2, prior)")


        #index 2-6 age
        # same collapsing procedure as for the priors, applied to the age slots
        ageStart = 2
        ageEnd = 6
        # bin ages UP
        if output[ageStart] == "":
            # find the first area of sensitivity
            for firstSensitivity in range(ageStart, ageEnd + 1):
                if output[firstSensitivity] != "":
                    break

            # zero out the relevant sensitivities, and also do a sanity check that we dont have a case of
            #        positive examples | negative examples | positive examples
            # This is because are using logistic regression, which is a linear model
            for endSensitivity in range(firstSensitivity + 1, ageEnd + 1):
                if output[endSensitivity] == "":
                    print("PANIC, WE HAVE NONLINEARITY (branch 1, age)")
                else:
                    output[endSensitivity] = ""

            # make a mild adjustment to the feature that started the sequence of related sensitive features
            if output[firstSensitivity] != "":
                output[firstSensitivity] += "\" (or older)"

        # bin ages DOWN
        else:
            # find the first area of INsensitivity, and zero out the previous sensitivity as we find new ones
            for firstInsensitivity in range(ageStart, ageEnd + 1):
                if output[firstInsensitivity] == "":
                    break
                if firstInsensitivity > ageStart:
                    output[firstInsensitivity-1] = ""

            # make a mild adjustment to the feature that started the sequence of related INsensitive features
            if firstInsensitivity > ageStart+1:
                output[firstInsensitivity-1] += "\" (or younger)"

            # do a sanity check that we dont have a case of
            #        positive examples | negative examples | positive examples
            # This is because are using logistic regression, which is a linear model
            for endInsensitivity in range(firstInsensitivity + 1, ageEnd + 1):
                if output[endInsensitivity] != "":
                    print("PANIC, WE HAVE NONLINEARITY (branch 2, age)")



        # check to see if we found any feature perturbation which flips the output
        if not silent:
            # the sentence describes the opposite outcome of the current prediction
            newPredictString = stringPrediction(not prediction)
            for outLineNum in range(0, len(output)):
                if output[outLineNum] != "":
                    # header: the feature stem, i.e. everything before the first underscore
                    print(instanceName+"'s", output[outLineNum][0:output[outLineNum].find("_")], ":",)
                    # entries collapsed above already contain their closing quote
                    if '\"' in output[outLineNum]:
                        print("If the individual had \""+ output[outLineNum]+\
                                    " they would have been predicted as "+ newPredictString)
                    else:
                        print("If the individual had \""+ output[outLineNum]+\
                                    "\" they would have been predicted as "+ newPredictString)

            if not foundSensitiveFeature:
                print("No single feature change was found that changes the prediction for this individual.\n")
            else:
                print("Changing other factors will not change the prediction for this individual.\n")
        return foundSensitiveFeature


    #Next is Case explanation, which is local and justification. Its geometric interpretation is that we find the nearest neighbor(s) in the training set and report their label(s).  It is possible we should hybridize with demographic explanation, depending on how complex and well covered the feature space is.  In some settings we find many exact matches in the training set for a given test instance.
    def caseExplanation(self, instance, instanceName, silent=False):
        """Explain a prediction through identical and nearest cases in the training set.

        input: instance           row of the dummy-coded dataframe, the instance we are explaining
        input: instanceName       name used in the printed sentences
        input: silent             Boolean, indicates whether print should occur
        OUTPUT: 2-tuple of ints; the first entry counts training rows identical to the instance in
                feature space, the second counts those that are also labelled like the prediction
        """
        instanceReshaped = np.asarray(instance).reshape(1,len(instance))
        showInstance(instance, self.droppedStrings, instanceName, silent)

        # run the classifier on the input, then print the prediction and the explanation header
        prediction = self.classifier.predict(instanceReshaped)[0]
        if not silent:
            print("Prediction:", stringPrediction(prediction))
            print()

            print("The predictive model assesses an individual's profile factors in the database to predict whether or not this individual is likely to reoffend.")
            print("The prediction is based on whether similar or identical cases in the past re-offended or not.")

        # compute distances between the instance and the training set, then find the min
        distances = euclidean_distances(instanceReshaped, self.trainDF)
        minDistanceIdx = np.argmin(distances)

        # how many EXACT matches this data instance had in the training set, regardless of label
        # index [0] first because distance is returned as a matrix, since the input can be more than 1 instance
        exactMatches = int(np.sum(distances[0] == 0.0))

        # distances in feature X label space: append the predicted label to the instance and the
        # true labels to the training rows, so a zero distance also requires a matching label
        instanceReshapedWithLabel = np.append(instanceReshaped, prediction).reshape(1, -1)
        trainAndLabelsDF = self.trainDF.assign(y=self.trainLabels.values.ravel())
        distancesWithLabel = euclidean_distances(instanceReshapedWithLabel, trainAndLabelsDF)
        exactMatchesWithLabel = int(np.sum(distancesWithLabel[0] == 0.0))

        if not silent:
            print("The training set contained", exactMatches, "individuals identical to this one.")
            print(exactMatchesWithLabel, "of them", stringInstanceLabel(prediction), )
            if exactMatches == 0:
                print("(0%)\n")
            else:
                print("("+str(int(float(exactMatchesWithLabel)/float(exactMatches)*100))+"%)\n")

        # now describe the data instance which was determined to be the nearest neighbor in the training set
        if not silent:
            print("Nearest Neighbor in training data")
        neighbor_name = "<NEIGHBOR>"
        strings = showInstance(self.trainDF.iloc[minDistanceIdx], self.droppedStrings, neighbor_name, silent=True)
        # read the neighbor's label positionally from the flattened labels (same access as for fitting)
        nnLabel = self.trainLabels.values.ravel()[minDistanceIdx]

        # Pick the neighbor's descriptor for each role in the sentence below.  Prefixes are used
        # (rather than the exact stems "c_charge"/"priors"/"juv_prior") so that both
        # "priors_count_*"-style and "prior convictions_*"/"juvenile priors_*"-style columns are
        # matched, and so that the priors role can never pick up a juvenile-priors column.
        #DUMMY_CODING_MOD if nothing describes the neighbor for a role, fall back to the dropped string
        candidates = list(strings) + list(self.droppedStrings)

        def pickDescriptor(matches, placeholder):
            for candidate in candidates:
                if matches(candidate):
                    return candidate
            return placeholder

        race_string = pickDescriptor(lambda s: s.startswith("race"), "race")
        age_string = pickDescriptor(lambda s: s.startswith("age"), "age")
        charge_string = pickDescriptor(lambda s: "charge" in s, "c_charge")
        priors_string = pickDescriptor(lambda s: s.startswith("prior"), "priors")
        juv_prior_string = pickDescriptor(lambda s: s.startswith("juv"), "juv_prior")

        if not silent:
            print("This decision was based on thousands of similar cases from the past.")
            # NOTE(review): the closing clause names instanceName but reports the NEIGHBOR's label — confirm intent
            print("For example, a similar case to this individual is ", neighbor_name, \
                ", a \""+ race_string+ \
                ",\" \""+age_string+ \
                ",\" with \""+ priors_string+ \
                "\" and \""+juv_prior_string+ \
                ",\" charged with a \""+charge_string+ \
                "\".\n", instanceName, stringInstanceLabel(nnLabel) )
        return exactMatches, exactMatchesWithLabel


    #Next is Demographic explanation, which is global and description. Its geometric interpretation is characterizing how the training examples are distributed in feature space, which induces the classifier.
    def demographicExplanation(self, instance, instanceName, silent=False):
        """Print, per feature category, the share of training cases whose label matches the prediction.

        input: instance           row of the dummy-coded dataframe, the instance we are explaining
                                  (only used for the prediction and to mark its own categories)
        input: instanceName       name passed on to showInstance
        input: silent             Boolean, indicates whether print should occur; when True nothing
                                  is printed at all (including the instance and the final table)
        OUTPUT: None

        NOTE: the final reordering is written for a 21-line table (5 features, see the swaps at the
        end) and raises IndexError for shorter tables.
        """
        instanceReshaped = np.asarray(instance).reshape(1,len(instance))
        showInstance(instance, self.droppedStrings, instanceName, silent)

        # setup some local variables and run the classifier on the input
        featureLabels = list(self.trainDF.columns)
        trainAndLabelsDF = self.trainDF.assign(y=self.trainLabels.values.ravel()) # need this to filter on y
        prediction = self.classifier.predict(instanceReshaped)[0]
        if not silent:
            print("Prediction:", stringPrediction(prediction))
            print()
            print("The predictive model assesses an individual's profile factors in the database to predict whether or not this individual is likely to reoffend.")
            print("The prediction is based on the likelihood of previous cases with different profile factors re-offended or not, as illustrated below.")
            print("A * will appear next to features which are relevant for this individual.")

        def percentMatchingPrediction(groupDF):
            # share (whole percent, as a string) of the group's rows labelled like the prediction;
            # an empty group is reported as "0", like the "(0%)" used for no identical cases
            denominator = len(groupDF)
            if denominator == 0:
                return "0"
            numerator = len(groupDF.query('y=='+str(prediction)))
            return str(int((float(numerator)/float(denominator))*100))

        # compute summary statistics on the training data for each feature and category
        lastFeature = ""
        output = []
        for k in range(0,len(featureLabels)):
            # split the category label off from the feature label
            for feature in self.D_features + self.X_features:
                if feature in featureLabels[k]:
                    categoryLabel = featureLabels[k][len(feature)+1:]
                    break

            # check to see if we are still doing different categories of same feature or not
            if lastFeature != feature:
                lastFeature = feature
                output.append(feature+" :")

                #DUMMY_CODING_MOD compute demographics for those in the baseline feature categories
                # First, determine the dropped column name that corresponds to this family of feature categories
                labelHere = ""
                for dropped in self.droppedStrings:
                    if feature in dropped:
                        labelHere = dropped

                # now determine the feature index range corresponding to the feature label stem
                # e.g. given the X feature "priors" we want data frame columns "priors_count_0", "priors_count_1_to_3", etc
                firstMatchIdx = 100000
                lastMatchIdx = -100000
                for i in range(0, len(featureLabels)):
                    if feature in featureLabels[i]:
                        if i < firstMatchIdx:
                            firstMatchIdx = i
                        if i > lastMatchIdx:
                            lastMatchIdx = i

                # successively remove from the dataframe until only those with "0" values for all feature categories remain
                thisInstanceInBaseline = True # also determine if this instance is 0 valued for all feature categories too
                baselineDF = trainAndLabelsDF
                for j in range(firstMatchIdx, lastMatchIdx+1):
                    # successively filter the DF such that each feature is "off"
                    baselineDF = baselineDF[baselineDF[featureLabels[j]] == 0]
                    if instanceReshaped[0][j]:
                        thisInstanceInBaseline = False

                # now, the baselineDF only contains rows with ALL categories of this feature "off"
                result = percentMatchingPrediction(baselineDF)
                output.append("\t"+getDecorator(thisInstanceInBaseline)+result+"% of those in the "+\
                              labelHere[len(feature)+1:]+" "+feature+" group "+ stringInstanceLabel(prediction))

            # select all training examples with the k-th category active.  A boolean mask always
            # yields a dataframe, also when exactly one row (or no row at all) has the category.
            kthFeatureDF = trainAndLabelsDF[trainAndLabelsDF[featureLabels[k]] == 1]
            result = percentMatchingPrediction(kthFeatureDF)
            output.append("\t"+getDecorator(instanceReshaped[0][k])+result+"% of those in the "+categoryLabel+" "\
                          +feature+" group "+stringInstanceLabel(prediction))

        # reorder the output list so that items appear in semantic order, then emit output
        # these lines are NOT general
        output[4],output[6] = output[6],output[4]
        output[4],output[5] = output[5],output[4]
        output[13],output[15] = output[15],output[13]
        output[13],output[14] = output[14],output[13]

        # honor the silent flag for the table as well
        if not silent:
            for out in output:
                print(out)



### 1.2 Load dataset

In [None]:
# Folder holding the CSV splits, and the exclusive upper bound of the column range to read.
path = 'data/' # use your path
lastColPlusOne = 7

# Read the four splits with identical options. usecols starts at 1, so the
# first column of every file is skipped.
TrainList, TestList, TrainNewList, TestNewList = (
    pd.read_csv(path + csvName, index_col=None, header=0, usecols=range(1, lastColPlusOne))
    for csvName in ("train_3.csv", "test_3.csv", "train_new_3.csv", "test_new_3.csv")
)

# Show the names of the columns that were loaded.
print(list(TrainList.columns))

['race', 'age', 'charge degree', 'prior convictions', 'juvenile priors', 'is_recid']


Set up the features and their types. D holds the protected features, X holds the non-protected ones, and Y is the output.

In [1]:
# Column names grouped by role: X = non-protected inputs, D = protected inputs,
# Y = the outcome column.
X_features = [
    'age',
    'charge degree',
    'prior convictions',
    'juvenile priors',
]
D_features = ['race']
Y_features = ['is_recid']

Baseline features are a requirement imposed by logistic regression, which compares to a baseline where all features are set to 0. Thus, dropping these columns ensures that a baseline sample is interpretable when dummy coding. Throughout the source code in this notebook, there will be tags that show a modification required by this type of dummy coding.

To determine which columns to drop, we used the median category for each feature (when possible)

In [None]:
# One dummy-coded column per feature, treated as that feature's baseline category.
baseline_columns = [
    'race_Caucasian',
    'age_40-49',
    'charge degree_Felony',
    'prior convictions_4-6',
    'juvenile priors_No',
]

### 1.3 Explainable classifier construction

Next, we construct the explainable classifiers based on the training data, then check accuracy on test set.

In [None]:
# Create one explainable classifier per training set (raw and processed data).
# Both use the same feature roles and the same baseline columns.
classifierRaw = ExplainableClassifier(D_features,X_features,Y_features, TrainList, baseline_columns)
classifierProc = ExplainableClassifier(D_features,X_features,Y_features, TrainNewList, baseline_columns)

# Dummy-code the test features.
testDF = pd.get_dummies(TestList[D_features+X_features])

# Align the test columns with the training columns (baseline columns excluded).
# reindex does three things at once:
#   * a category that never appears in the test data becomes a column of 0s
#     placed at its training position (appending it at the end would shift
#     testDF.values relative to the columns the classifiers were fit on);
#   * the baseline columns are left out, without failing if one of them is
#     absent from the test data;
#   * a test-only category that is not a training column is discarded.
# NOTE(review): assumes classifierRaw.trainDF.columns lists the feature columns
# in the order the classifiers were fit on -- confirm against ExplainableClassifier.
trainFeatureColumns = [column for column in classifierRaw.trainDF.columns
                       if column not in baseline_columns]
testDF = testDF.reindex(columns=trainFeatureColumns, fill_value=0)

# Flatten the single label column to 1-D, the shape scikit-learn expects for y.
testLabels = TestList[Y_features].values.ravel()

# Measure accuracy for each classifier on the same (raw) test split.
print("Accuracy RAW:", classifierRaw.classifier.score(testDF.values, testLabels))
print("Accuracy PROC:", classifierProc.classifier.score(testDF.values, testLabels))

Accuracy RAW: 0.6714015151515151
Accuracy PROC: 0.6761363636363636


## Task 2: Explanation Consumption
Please investigate the following cells to complete these subtasks:

A. **[Team, TURN THIS IN]** Sensitivity-based explanation varies features one-at-a-time. Suppose we varied features in a pairwise fashion. How might the output vary?

B. **[Team, TURN THIS IN]** Compare and contrast the explanation output on the various sample groups (see cell title "Sample Groups").

C. **[Each, TURN THIS IN]** In what ways and to what extent do you think these decision processes (as implemented by the raw and processed model) are biased? Compare and contrast the classifiers trained on raw and processed data. Consider the evidence in both the explanations and validation notebook for this question.

D. **[Each, TURN THIS IN]** Which pieces of evidence do you find most compelling in reaching the judgment you did in the previous question? And why are those pieces of evidence the most compelling to you?

E. **[Each, TURN THIS IN]** There is a tradeoff between having brief global descriptions and having meaningful local explanations (global explanations have a length scaling on number of BOTH features AND categories within each feature, but a low dimensional feature/category space means fewer perturbations are possible for sensitivity-based explanation and also that the distances used for case-based explanation are more likely to be small). Reflect on this tradeoff, particularly as it pertains to the amount of text you would like provided in each explanation.

F. **[Each, TURN THIS IN]** Which explanation strategy do you like most and why? And which explanation strategy do you like least and why?

Helper function to produce explanations en masse, without huge code duplication. Edit this function to adjust the explanations you would like to view together. You may find it helpful to edit the function in the cell titled "Output Control Function" to remove or rearrange pieces to help you more easily see interesting trends.

In [None]:
#@title Output Control Function
def allExplanationsFor(sample, name):
    """Print all four explanation types for one sample, first from the
    raw-data classifier and then from the processed-data classifier.

    input: sample   passed unchanged to every explanation method
    input: name     passed unchanged to every explanation method
    OUTPUT: None; each banner and explanation is printed
    """
    ##################################
    #####   Change the entries below to inspect a different group of explanations
    ##################################
    # Each entry is (banner, classifier, explanation method name).
    # Entries are printed from top to bottom.
    explanationPlan = [
        ("Influence-based Explanation (raw )------------------------------",
         classifierRaw, "inputInfluenceExplanation"),
        ("\n\nSensitivity-based Explanation (raw)------------------------------",
         classifierRaw, "sensitivityExplanation"),
        ("\n\nCase-based Explanation (raw)------------------------------",
         classifierRaw, "caseExplanation"),
        ("\n\nDemographic-based Explanation (raw)------------------------------",
         classifierRaw, "demographicExplanation"),

        ("\n\nInfluence-based Explanation (processed)------------------------------",
         classifierProc, "inputInfluenceExplanation"),
        ("\n\nSensitivity-based Explanation (processed)------------------------------",
         classifierProc, "sensitivityExplanation"),
        ("\n\nCase-based Explanation (processed)------------------------------",
         classifierProc, "caseExplanation"),
        ("\n\nDemographic-based Explanation (processed)------------------------------",
         classifierProc, "demographicExplanation"),
    ]

    for banner, model, methodName in explanationPlan:
        print(banner)
        getattr(model, methodName)(sample, name)

Create a bunch of sets we can use to evaluate the explanations.

* **rawImpactONLYSample** are individuals who are ONLY disparately impacted by the decision process of the raw-data-trained classifier (the decision changes when the protected feature is changed).

* **procBothSample** is the set of individuals who are BOTH disparately impacted by the processed-data-trained classifier AND have an individual fairness issue (decision differs between the two classifiers trained on raw and processed data). Note that this set is small, due to the impact of the debiasing step.

* **rawBothSample** is the set of individuals who are BOTH disparately impacted by the RAW-data-trained classifier AND have an individual fairness issue (decision differs between the two classifiers trained on raw and processed data).

* **noneSample** are those who do not have any individual fairness issues or any disparate impact issues.

In [None]:
#@title Sample Groups
# Each list holds integer row positions; the trailing comment on each closing
# bracket is the number of entries.
rawImpactONLYSample = [
    208, 365, 465, 579, 356, 368, 1031, 776, 174, 533, 537, 801,
    14, 614, 948, 315, 808, 360, 340, 193, 327, 652, 341,
]  # 23
procBothSample = [629, 311, 74]  # 3
rawBothSample = [
    69, 574, 74, 898, 442, 15, 753, 221,
    1048, 920, 626, 311, 147, 629, 944,
]  # 15
noneSample = [
    35, 57, 92, 100, 105, 205, 209, 270,
    304, 334, 377, 480, 723, 790, 807, 809,
]  # 16

##################################
#####   Change this line to inspect a different set above
##################################
sampleGroup = rawBothSample

In [None]:
# Print every explanation for each row position in the selected sample group.
for sampleIdx in sampleGroup:
    print("\n***  SAMPLE", sampleIdx, "\n")
    # Select the row by position before handing it to the explanation printer.
    sampleRow = testDF.iloc[sampleIdx]
    allExplanationsFor(sampleRow, "<NAME>")


***  SAMPLE 69 

Influence-based Explanation (raw )------------------------------
	 race_Caucasian
	 age_18-29
	 charge degree_Felony
	 prior convictions_None
	 juvenile priors_Yes
Prediction: NOT likely to reoffend
The predictive model assesses an individual's profile factors in the database to predict whether or not this individual is likely to reoffend.
How different factors influence the prediction is illustrated below.
The more +s (-s), means a person with that factor is more (less) likely to re-offend. Factors with (0) have little impact.
A * will appear next to features which are relevant for this individual

race :
	<b>* Caucasian (0)
	  African-American (+)
age :
	<b>* 18-29 (++++)
	  30-39 (+)
	  40-49 (0)
	  50-59 (-)
	  >59 (---)
charge degree :
	<b>* Felony (0)
	  Misdemeanor (-)
prior convictions :
	  1-3 (---)
	  7-10 (+++)
	  4-6 (0)
	  >10 (++++)
	<b>* None (------)
juvenile priors :
	  No (0)
	<b>* Yes (++)


Sensitivity-based Explanation (raw)-----------------------