In [44]:
import sys
import numpy as np
import pandas as pd
from aif360.datasets import BinaryLabelDataset
from aif360.metrics import BinaryLabelDatasetMetric, ClassificationMetric
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn import metrics, preprocessing
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, plot_tree
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import Markdown, display
import copy
import random
np.random.seed(1)

In [45]:
cols = ['age', 'workclass', 'fnlwgt', 'education', 'education.num', 'marital', 'occupation', 'relationship', 'race', 'gender', 'capgain', 'caploss', 'hours', 'country', 'income']
df_train = pd.read_csv('adult.data', names=cols, sep=",")
df_test = pd.read_csv('adult.test', names=cols, sep=",")

In [46]:
 def preprocess(df):
    df.isin(['?']).sum(axis=0)

    # replace missing values (?) to nan and then drop the columns
    df['country'] = df['country'].replace('?',np.nan)
    df['workclass'] = df['workclass'].replace('?',np.nan)
    df['occupation'] = df['occupation'].replace('?',np.nan)

    # dropping the NaN rows now
    df.dropna(how='any',inplace=True)
            
    df['income'] = df['income'].map({'<=50K': 0, '>50K': 1}).astype(int)
    df['gender'] = df['gender'].map({'Male': 1, 'Female': 0}).astype(int)
    df['workclass'] = df['workclass'].map({'State-gov': 0, 'Self-emp-not-inc': 1, 'Private': 2, 'Federal-gov': 3, 'Local-gov': 4, '?': 5,
                                           'Self-emp-inc': 6, 'Without-pay': 7, 'Never-worked': 8}).astype(int)
    df['education'] = df['education'].map({'Bachelors': 0, 'HS-grad': 1, '11th': 2, 'Masters': 3, '9th': 4, 
                                           'Some-college': 5, 'Assoc-acdm': 6, 'Assoc-voc': 7, '7th-8th': 8, 'Doctorate': 9, 
                                           'Prof-school': 10, '5th-6th': 11, '10th': 12, '1st-4th': 13, 'Preschool': 14, '12th': 15}).astype(int)
    df['marital'] = df['marital'].map({'Never-married': 0, 'Married-civ-spouse': 1, 'Divorced': 2, 'Married-spouse-absent': 3, 
                                                     'Separated': 4, 'Married-AF-spouse': 5, 'Widowed': 6}).astype(int)
    df['occupation'] = df['occupation'].map({'Adm-clerical': 0, 'Exec-managerial': 1, 'Handlers-cleaners': 2, 
                                             'Prof-specialty': 3, 'Other-service': 4, 'Sales': 5, 'Craft-repair': 6, 'Transport-moving': 7, 'Farming-fishing': 8, 
                                             'Machine-op-inspct': 9, 'Tech-support': 10, '?': 11, 'Protective-serv': 12, 'Armed-Forces': 13, 'Priv-house-serv': 14}).astype(int)
    df['relationship'] = df['relationship'].map({'Not-in-family': 0, 'Husband': 1, 'Wife': 2, 
                                                 'Own-child': 3, 'Unmarried': 4, 'Other-relative': 5}).astype(int)
    df['race'] = df['race'].map({'White': 0, 'Black': 1, 'Asian-Pac-Islander': 2, 'Amer-Indian-Eskimo': 3, 'Other': 4}).astype(int)
    df['country'] = df['country'].map({'United-States': 0, 'Cuba': 1, 'Jamaica': 2, 'India': 3, '?': 4, 'Mexico': 5, 'South': 6, 'Puerto-Rico': 7, 
                                       'Honduras': 8, 'England': 9, 'Canada': 10, 'Germany': 11, 'Iran': 12, 'Philippines': 13, 'Italy': 14, 
                                       'Poland': 15, 'Columbia': 16, 'Cambodia': 17, 'Thailand': 18, 'Ecuador': 19, 'Laos': 20, 'Taiwan': 21, 
                                       'Haiti': 22, 'Portugal': 23, 'Dominican-Republic': 24, 'El-Salvador': 25, 'France': 26, 'Guatemala': 27, 
                                       'China': 28, 'Japan': 29, 'Yugoslavia': 30, 'Peru': 31, 'Outlying-US(Guam-USVI-etc)': 32, 'Scotland': 33,
                                       'Trinadad&Tobago': 34, 'Greece': 35, 'Nicaragua': 36, 'Vietnam': 37, 'Hong': 38, 'Ireland': 39, 'Hungary': 40, 
                                       'Holand-Netherlands': 41}).astype(int)
    
    
    labels = df['age']
    proc = []
    for v in labels:
            if v <= 30:
                proc.append(1)
            elif v <= 40:
                proc.append(2)
            elif v <= 50:
                proc.append(3)
            else:
                proc.append(4)
    df['age']=proc 
    
    labels = df['hours']
    proc=[]
    for v in labels:
        if v<=25:
            proc.append(1)
        elif v<=41:
            proc.append(2)
        elif v<=55:
            proc.append(3)
        else:
            proc.append(4)
    df['hours']=proc
    
    df = df.drop(['fnlwgt', 'education.num', 'capgain', 'caploss', 'country'], axis = 1, inplace = True) 

**One-hot encoding**

In [47]:
 def one_hot_encode(df):
    df.isin(['?']).sum(axis=0)

    # replace missing values (?) to nan and then drop the columns
    df['country'] = df['country'].replace('?',np.nan)
    df['workclass'] = df['workclass'].replace('?',np.nan)
    df['occupation'] = df['occupation'].replace('?',np.nan)

    # dropping the NaN rows now
    df.dropna(how='any',inplace=True)
    df['income'] = df['income'].map({'<=50K': 0, '>50K': 1}).astype(int)
    df = pd.concat([df, pd.get_dummies(df['gender'], prefix='gender')],axis=1)
    df = pd.concat([df, pd.get_dummies(df['race'], prefix='race')],axis=1)
    df = pd.concat([df, pd.get_dummies(df['marital'], prefix='marital')],axis=1)
    df = pd.concat([df, pd.get_dummies(df['workclass'], prefix='workclass')],axis=1)
    df = pd.concat([df, pd.get_dummies(df['relationship'], prefix='relationship')],axis=1)
    df = pd.concat([df, pd.get_dummies(df['occupation'], prefix='occupation')],axis=1)

    df = df.drop(columns=['workclass', 'gender', 'fnlwgt', 'education', 'occupation', \
                      'relationship', 'marital', 'race', 'country', 'capgain', \
                      'caploss'])
    return df

In [48]:
# not one-hot (for randomforestclassifier and such)
# preprocess(df_train)
# preprocess(df_test)

# one-hot encoding (for regression mdoels)
df_train = one_hot_encode(df_train)
df_test = one_hot_encode(df_test)

In [49]:
df_train = df_train.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

**Protected, privileged**

In [50]:
# protected: 'gender'=0
# privileged: 'gender'=1

**Parametric Model**

In [51]:
X_train = df_train.drop(columns='income')
y_train = df_train['income']

X_test = df_test.drop(columns='income')
y_test = df_test['income']

X_test_orig = copy.deepcopy(X_test)

# Scale data
X_train = preprocessing.scale(X_train)
X_test = preprocessing.scale(X_test)

# Logistic regression
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(random_state=0)

clf.fit(X_train, y_train)
y_pred = clf.predict_proba(X_test)
num_params = len(clf.coef_.transpose())
# clf.classes_
# clf.coef_

**Loss function** (Log loss for logistic regression)

In [52]:
import math
def logistic_loss(y_test, y_pred):
    loss = 0
    for i in range(len(y_test)):
        loss += - y_test[i] * math.log(y_pred[i][1]) - (1 - y_test[i]) * math.log(y_pred[i][0])
    return loss/len(y_test)

print(logistic_loss(y_test, y_pred))

0.3609074672032075


**First-order derivative of loss function at z with respect to model parameters**

In [53]:
def del_L_del_theta_i(num_params, y_true, x, y_pred):
    del_L_del_theta = np.zeros((num_params, 1))
    for j in range(num_params):
        del_L_del_theta[j] = (1 - y_true) * y_pred[1] - y_true * y_pred[0]
        if j > 0:
            del_L_del_theta[j] *=  x[j]
    return del_L_del_theta

**Second-order partial derivative of loss function with respect to model parameters**

In [54]:
def hessian_one_point(num_params, y_true, x, y_pred):
    multiplier = y_pred[0] * y_pred[1]
    H = np.zeros((num_params, num_params))
    for i in range(num_params):
        for j in range(i + 1):
            if i == 0 and j == 0:
                H[i][j] = 1
            elif i > 0 and j == 0:
                H[i][j] = x[j]
#             elif i == 0 and j > 0:
#                 Hx[i][j] = x[i]
            elif i > 0 and j > 0:
                H[i][j] = x[i] * x[j]
    
    i_lower = np.tril_indices(num_params, -1)
    H.T[i_lower] = H[i_lower]     
    H *= multiplier
    return H

**First-order derivative of $P(y \mid \textbf{x})$ with respect to model parameters**

In [55]:
def del_f_del_theta_i(num_params, x, y_pred):
    del_f_del_theta = np.zeros((num_params, 1))
    for j in range(num_params):
        del_f_del_theta[j] = y_pred[0] * y_pred[1]
        if j > 0:
            del_f_del_theta[j] *=  x[j]
    return del_f_del_theta

In [13]:
# Return v = del(SPD)/del(theta)
del_f_protected = np.zeros((num_params, 1))
del_f_privileged = np.zeros((num_params, 1))
numProtected = X_test_orig['gender_Female'].sum()
numPrivileged = X_test_orig['gender_Male'].sum()
for i in range(len(X_test)):
    y_pred = clf.predict_proba(np.reshape(X_test[i], (1, num_params)))
    del_f_i = del_f_del_theta_i(num_params, X_test[i], y_pred[0])
#     del_f_i = del_L_del_theta_i(num_params, y_test[i], X_test[i], y_pred[0])
    if X_test_orig.iloc[i]['gender_Male'] == 1: #privileged
        del_f_privileged = np.add(del_f_privileged, del_f_i)
    elif X_test_orig.iloc[i]['gender_Female'] == 1:
        del_f_protected = np.add(del_f_protected, del_f_i)
del_f_privileged /= numPrivileged
del_f_protected /= numProtected
v = np.subtract(del_f_protected, del_f_privileged)
print(v.transpose())

[[-0.0761036  -0.00251122 -0.0485904   0.18883835 -0.18883835  0.00459085
   0.00283769  0.02099204  0.00360457 -0.0212755   0.05063701  0.01115698
  -0.10357413  0.00853188  0.05200718  0.01732764  0.03283201 -0.00385164
   0.01092429  0.01200229 -0.01752555 -0.01898312  0.00596354  0.00162999
  -0.16771663  0.05209307  0.01712104  0.03033389  0.05140912  0.14526574
   0.04561792 -0.00217271 -0.04538539 -0.00290335 -0.007349    0.0028575
  -0.00326059  0.02662191  0.00728675  0.0147451  -0.01824599 -0.00926515
   0.00470317 -0.02688753]]


**Stochastic estimation of $H_{\theta}^{-1}\nabla_{\theta}f(z, \theta)$**

In [46]:
# Uniformly sample t points from training data 
size = 100
sample = random.sample(range(len(X_train)), size)

hinv_v = copy.deepcopy(v)
for idx in range(size):
    i = sample[idx]
#     print(X_train[i])
    y_pred = clf.predict_proba(np.reshape(X_train[i], (1, num_params)))
    hessian_i = hessian_one_point(num_params, y_train[i], X_train[i], y_pred[0])
    hinv_v = np.matmul(np.subtract(np.identity(num_params), hessian_i), hinv_v)
#     print(hinv_v)
    hinv_v = np.add(hinv_v, v)
#     print(hinv_v)
print(hinv_v)

[[ 84180.18434209]
 [ -5033.86060874]
 [ -8252.88877302]
 [-10435.83505339]
 [-15837.10529412]
 [-13354.90222809]
 [ -6599.85682561]
 [-12011.56546963]
 [-13358.11835057]
 [-17157.16150282]
 [-11868.88558147]
 [-13159.79928081]
 [-17910.13235073]
 [-13419.35078755]
 [-12847.81765031]
 [ 16114.50204355]
 [-32985.6822442 ]
 [-15229.54666227]
 [-14342.00129333]
 [-10517.41484364]
 [-14139.50047434]
 [-14008.53999567]
 [-13404.99776501]
 [-13191.28564039]
 [-17996.88153585]
 [-12208.89779571]
 [-13563.28763127]
 [-13634.2227653 ]
 [ -5978.98051508]
 [-12996.15376702]
 [-13203.88115374]
 [-13180.76810836]
 [  8698.07451694]
 [-15273.7093843 ]
 [-13607.26954261]
 [-13619.5423121 ]
 [-14908.61946229]
 [-13484.20184909]
 [-13311.82662009]
 [  3381.33487934]
 [-14452.45929288]
 [ 10664.217132  ]
 [-32227.78911254]
 [-84258.19634007]]


**Influence of points computed using Hessian vector product**

In [25]:
for i in range(len(X_train)):
    y_pred = clf.predict_proba(np.reshape(X_train[i], (1, num_params)))
    del_L_del_theta = del_L_del_theta_i(num_params, y_train[i], X_train[i], y_pred[0])
    inf = -np.dot(del_L_del_theta.transpose(), hinv_v)
    print(inf)

[[147789.17473822]]
[[102069.14122837]]
[[91492.25566276]]
[[11749.90922098]]
[[-431199.58541261]]
[[-233970.62393776]]
[[-171939.51178529]]
[[-94498.17763366]]
[[198811.58391002]]
[[14994.52933481]]
[[-13227.32222326]]
[[-103813.21809296]]
[[-140920.97845614]]
[[-386452.69852651]]
[[208076.00050117]]
[[228126.11264423]]
[[147852.29027041]]
[[-522935.33418503]]
[[-264651.78768825]]
[[216.72223458]]
[[58982.20315481]]
[[63314.71425795]]
[[5396.83370031]]
[[11572.29430434]]
[[-40438.43974519]]
[[7549.93852302]]
[[59899.88562207]]
[[-71743.67632491]]
[[167277.07833944]]
[[-423495.61737941]]
[[143829.07627502]]
[[88933.56649783]]
[[100212.20839735]]
[[121282.94869259]]
[[62001.808787]]
[[-548279.47920281]]
[[93194.65932082]]
[[-112955.74536675]]
[[74933.74340586]]
[[-214.24610972]]
[[11702.39356224]]
[[-25976.8985718]]
[[-37788.20844359]]
[[-50241.76336332]]
[[22741.68153464]]
[[6783.00917445]]
[[-18706.30154149]]
[[-842874.35737404]]
[[95918.34908207]]
[[-100188.26438024]]
[[106553.788643

[[62018.0448831]]
[[-75233.53377313]]
[[-293253.97458092]]
[[53625.28942107]]
[[131522.7795199]]
[[-18457.33793979]]
[[-17573.80281595]]
[[-783960.12977577]]
[[106041.51079642]]
[[18590.78525477]]
[[-44694.6239346]]
[[298637.12527059]]
[[-33023.1572475]]
[[227974.87856958]]
[[-13500.43272244]]
[[45430.58615782]]
[[70130.76600111]]
[[3262.87989397]]
[[14525.48285984]]
[[-4339.47556007]]
[[41824.74557796]]
[[396860.04684445]]
[[-67589.79026691]]
[[-262623.9469681]]
[[-1224407.43108425]]
[[14994.52933481]]
[[-129701.98302196]]
[[143366.07855151]]
[[77813.06477519]]
[[252191.90540865]]
[[-4907.27927155]]
[[15256.07606922]]
[[273745.79037929]]
[[2022.57907907]]
[[63270.68671651]]
[[21168.51417386]]
[[72740.14362643]]
[[-225453.63114701]]
[[31572.19451588]]
[[85034.15017751]]
[[-112197.07608811]]
[[94160.83729312]]
[[-410411.07688704]]
[[-277002.74957092]]
[[-1567208.06981933]]
[[58019.63849179]]
[[28116.18767757]]
[[119074.14974719]]
[[300879.78259979]]
[[238437.27411247]]
[[-9695.95519364]

[[-39510.39318428]]
[[-5316.57459262]]
[[21158.09411185]]
[[-87.13088525]]
[[889275.05544139]]
[[-20112.78925167]]
[[-10950.00976282]]
[[3074.92244478]]
[[198353.92570165]]
[[15837.33037728]]
[[-123097.55601756]]
[[-415115.98819408]]
[[-143605.84129319]]
[[174801.00891622]]
[[49525.40510679]]
[[46315.05443024]]
[[-20246.83945441]]
[[-146821.93155018]]
[[582843.69312698]]
[[-51842.96148631]]
[[67801.74800193]]
[[-22352.37540215]]
[[-32200.24793105]]
[[-771029.9704978]]
[[-81902.81904606]]
[[16928.19219592]]
[[-76800.27660018]]
[[40264.92161939]]
[[-1200899.26306619]]
[[-113949.36136868]]
[[-209532.47878904]]
[[125035.4055409]]
[[-576207.94892739]]
[[-179730.79495789]]
[[-18425.95893699]]
[[287035.2281632]]
[[-295873.22100022]]
[[-71711.40552662]]
[[-106932.93110738]]
[[6528.98217867]]
[[-242435.98044941]]
[[176727.75063426]]
[[-85330.05646152]]
[[-841800.61174997]]
[[13913.64749318]]
[[-44786.82684243]]
[[134090.93040859]]
[[-4985.85734317]]
[[188660.36788715]]
[[76599.28730292]]
[[-274

[[184636.67271477]]
[[105307.16916465]]
[[140605.82911339]]
[[469473.87723367]]
[[-238628.55151017]]
[[-125046.81921047]]
[[106368.61612645]]
[[56340.52914616]]
[[180136.21009631]]
[[411866.92310086]]
[[-273232.8839237]]
[[-180465.52865783]]
[[-48620.7970238]]
[[49159.77462856]]
[[191546.27923616]]
[[31320.58907653]]
[[-784871.31527495]]
[[-653094.67503806]]
[[36322.99562927]]
[[-471756.81946008]]
[[-111697.92428327]]
[[-85666.040562]]
[[-167493.32399559]]
[[-41044.5494839]]
[[18301.55743135]]
[[-28711.81633416]]
[[103567.74260498]]
[[-129945.40371347]]
[[398594.22510779]]
[[-459443.08491599]]
[[570132.77643926]]
[[11790.14287333]]
[[47872.03455405]]
[[115868.48723898]]
[[51000.11781989]]
[[1634.469869]]
[[-777674.83760511]]
[[28786.60644156]]
[[159742.93923857]]
[[152754.42248056]]
[[-105093.41844298]]
[[-79631.06840567]]
[[-43953.70088038]]
[[-42040.74016951]]
[[-49735.02983293]]
[[-70263.8954158]]
[[263151.39079458]]
[[-1025726.49159984]]
[[-95123.63742669]]
[[-34668.27915955]]
[[-5

[[44487.21552488]]
[[298250.57970665]]
[[-29147.63478835]]
[[83037.13936282]]
[[-116968.48610018]]
[[-242833.1706424]]
[[145002.22046683]]
[[139958.9473365]]
[[-421679.93815416]]
[[-874246.85366249]]
[[126380.22044451]]
[[-61713.28732222]]
[[26526.34992197]]
[[23543.42608088]]
[[-203015.28507432]]
[[-15929.8240757]]
[[-198657.25069163]]
[[-28454.27245232]]
[[-103643.13725747]]
[[51002.84402518]]
[[56724.54218742]]
[[-128111.2914127]]
[[941502.21367127]]
[[127152.71321678]]
[[5987.250799]]
[[-190518.31814543]]
[[-88130.40144178]]
[[-57362.77143488]]
[[364045.18520534]]
[[-34415.97771144]]
[[351888.50553355]]
[[-55093.03500462]]
[[-36649.16364939]]
[[249218.99858529]]
[[17822.54322531]]
[[-24519.82615891]]
[[108147.98221462]]
[[4174.7747856]]
[[186073.49907726]]
[[67644.1159359]]
[[792089.0699494]]
[[92179.53768523]]
[[254758.90994401]]
[[-416455.30896355]]
[[163021.43255452]]
[[56035.28926871]]
[[146499.31996788]]
[[43246.75090783]]
[[38184.46278007]]
[[39853.76833037]]
[[-40128.3266111

[[-346872.86073838]]
[[181094.12206541]]
[[-65228.66550544]]
[[-29699.38303812]]
[[138476.37269379]]
[[-143884.34723225]]
[[37863.61740444]]
[[-1088422.64918278]]
[[965156.04382686]]
[[-37967.6778028]]
[[55090.88655258]]
[[67512.16569171]]
[[-436653.82087874]]
[[-36961.3447053]]
[[82488.06349869]]
[[250171.95282121]]
[[2650.13464216]]
[[-172365.28079207]]
[[14525.48285984]]
[[-643798.72551448]]
[[282583.31779444]]
[[37759.838342]]
[[17814.38753507]]
[[-26144.75730625]]
[[-123598.84761318]]
[[-123733.63453429]]
[[10582.83273542]]
[[129430.93989595]]
[[-120391.55092779]]
[[41184.44116796]]
[[17795.30770792]]
[[147998.12576709]]
[[260529.28232208]]
[[-91082.3257154]]
[[19189.58535587]]
[[-168021.29855913]]
[[145800.17709837]]
[[-28318.93168717]]
[[3324.44043172]]
[[69314.71267573]]
[[299994.64214611]]
[[123140.50560521]]
[[181401.9356785]]
[[-20040.77068833]]
[[151359.09770679]]
[[7860.00990019]]
[[-291561.94508452]]
[[-4857.48375421]]
[[-575815.80807751]]
[[133352.49538791]]
[[-473536.53

[[32296.50632539]]
[[-9646.65840561]]
[[-72978.34644147]]
[[72243.6065369]]
[[-39228.4053648]]
[[-16864.32016918]]
[[404736.14597586]]
[[463621.38019092]]
[[-75724.16127478]]
[[-85866.21683046]]
[[311207.53569906]]
[[23897.09818374]]
[[311466.74108293]]
[[133023.83094421]]
[[-4815.06277641]]
[[444414.18437446]]
[[-1190077.26312967]]
[[14266.31393709]]
[[5800.69791394]]
[[23234.89822474]]
[[-83734.64699383]]
[[-81454.90033828]]
[[50060.6757059]]
[[-1272065.71845728]]
[[33551.92532485]]
[[-79984.42141956]]
[[412828.0200735]]
[[34967.41183924]]
[[223833.39059806]]
[[96617.48840348]]
[[-57273.4915249]]
[[7545.79151835]]
[[184698.46350738]]
[[-744169.61796529]]
[[-150574.72650359]]
[[170451.76670469]]
[[-212852.50911954]]
[[-479086.90533322]]
[[20967.80241611]]
[[104712.56886077]]
[[-116553.9352634]]
[[-263834.14661678]]
[[119327.41253115]]
[[246317.54723175]]
[[-366021.27775504]]
[[-10674.06275258]]
[[-271482.95607558]]
[[1086535.79781254]]
[[233356.91898011]]
[[-51926.93713]]
[[-388288.88

[[450559.04212344]]
[[32734.70228614]]
[[31331.40615953]]
[[-142076.25014763]]
[[-181252.19309985]]
[[-236945.68057176]]
[[-258046.15217339]]
[[71125.22215611]]
[[-4395.96046202]]
[[-48616.0275981]]
[[-225590.04628554]]
[[386841.27773059]]
[[-1191739.01576385]]
[[84096.15802524]]
[[-25132.33317214]]
[[275541.68124408]]
[[-179374.16378253]]
[[30402.8179594]]
[[125441.2412025]]
[[7471.65983917]]
[[-99028.12765945]]
[[15365.5504279]]
[[163589.4743234]]
[[-50066.84715153]]
[[-304751.02324988]]
[[-6422.24166504]]
[[52693.23294877]]
[[290548.35952183]]
[[111200.1799441]]
[[-12085.28138114]]
[[-10191.28712248]]
[[315557.84346893]]
[[122619.3564952]]
[[-53000.61092809]]
[[-108719.28082358]]
[[1135347.33807442]]
[[-243071.20657491]]
[[-274743.46320898]]
[[267575.49839505]]
[[-254952.29043909]]
[[-59616.83367078]]
[[-64213.35905689]]
[[-869314.41058093]]
[[-640084.92674715]]
[[-36001.46220177]]
[[224748.46424802]]
[[626227.83931427]]
[[11302.49852603]]
[[-81868.27410842]]
[[1102918.84486933]]
[[

[[-775903.04507051]]
[[-570197.75216237]]
[[207108.25258634]]
[[-36001.82613174]]
[[-46532.74493222]]
[[48035.53236301]]
[[-1154006.45211509]]
[[-107013.3664405]]
[[-211120.41838065]]
[[45691.59159224]]
[[-14064.18927482]]
[[147268.54360254]]
[[-571450.22951365]]
[[136730.92967766]]
[[-70423.26310337]]
[[-117249.64774307]]
[[68882.54912128]]
[[-117647.86010216]]
[[-358665.43580098]]
[[7417.6656147]]
[[-33952.30310611]]
[[-309780.48598814]]
[[-884637.99269794]]
[[-156130.90569293]]
[[-30776.46150768]]
[[43775.01632014]]
[[77144.21020766]]
[[42034.63289665]]
[[-34623.58483837]]
[[109174.14071771]]
[[-204165.54072626]]
[[147851.58802768]]
[[63.63619129]]
[[6001.80089458]]
[[-177810.49094163]]
[[92563.12157814]]
[[388658.16556018]]
[[-240057.11847593]]
[[-43215.67269798]]
[[21093.187808]]
[[-35170.66916907]]
[[-74926.96223997]]
[[256156.30264954]]
[[-129792.96819838]]
[[8031.35216716]]
[[24119.02123226]]
[[21227.98359285]]
[[-56242.29273003]]
[[-46039.36510429]]
[[78366.35954412]]
[[-1329.

KeyboardInterrupt: 

**Compute statistical parity difference**

In [14]:
def computeFairness(y_pred, X_test): 
    protected_idx = X_test[X_test['gender_Female']==1].index
    numProtected = len(protected_idx)
    privileged_idx = X_test[X_test['gender_Male']==1].index
    numPrivileged = len(privileged_idx)
    
    p_protected = 0
    for i in range(len(protected_idx)):
        p_protected += y_pred[i][0]
    p_protected /= len(protected_idx)
    
    p_privileged = 0
    for i in range(len(privileged_idx)):
        p_privileged += y_pred[i][0]
    p_privileged /= len(privileged_idx)
    
    spd = p_protected - p_privileged
    return spd

**Influence of points computed using ground truth**

In [56]:
clf.fit(X_train, y_train)
y_pred = clf.predict_proba(X_test)
spd_0 = computeFairness(y_pred, X_test_orig)

delta_spd = []
# for i in range(len(X_train)):
#     X_removed = np.delete(X_train, i, 0)
#     y_removed = y_train.drop(index=i, inplace=False)
#     clf.fit(X_removed, y_removed)
#     y_pred = clf.predict_proba(X_test)
#     delta_spd_i = spd_0 - computeFairness(y_pred, X_test_orig)
#     delta_spd.append(delta_spd_i)
#     print(delta_spd)

In [63]:
clf.fit(X_train, y_train)
print(clf.coef_)

[[ 0.39055584  0.75703998  0.36234345 -0.20201628  0.20201628 -0.04144023
  -0.00762933 -0.00110559 -0.05638886  0.03038057 -0.20590097  0.04942052
   0.67150067 -0.07326494 -0.48134048 -0.11480844 -0.07297495  0.09611215
  -0.02770347  0.04546588  0.06302814 -0.09614586 -0.05737912 -0.15375256
  -0.03084835  0.15254117 -0.08590973 -0.30291253  0.04568383  0.26861483
  -0.01770805 -0.01511405  0.00295241  0.26166452 -0.17955657 -0.15909116
  -0.08675011 -0.30169731 -0.17979404  0.18537322  0.0716978   0.08082881
   0.09366023 -0.03406616]]


In [19]:
with open('delta_spd_ground_truth.txt', 'w') as filehandle:
    for listitem in delta_spd:
        filehandle.write('%s\n' % listitem)

**Hessian matrix of loss function with respect to model parameters**

Verified symmetric

In [58]:
clf.fit(X_train, y_train)

def compute_hessian(num_params, X_train, y_train):
    H = np.zeros((num_params, num_params))
    for i in range(len(X_train)):
        y_pred = clf.predict_proba(np.reshape(X_train[i], (1, num_params)))
        H = np.add(H, hessian_one_point(num_params, y_train[i], X_train[i], y_pred[0]))
    H /= len(X_train)
    return H

In [59]:
hxx = compute_hessian(num_params, X_train, y_train)
hxx

array([[ 0.11474172,  0.0346523 ,  0.0346523 , ...,  0.0346523 ,
         0.0346523 ,  0.0346523 ],
       [ 0.0346523 ,  0.10017688,  0.00974613, ...,  0.00164282,
         0.0044189 , -0.01879675],
       [ 0.0346523 ,  0.00974613,  0.10792799, ...,  0.00624262,
        -0.00487627,  0.0091847 ],
       ...,
       [ 0.0346523 ,  0.00164282,  0.00624262, ...,  0.12618   ,
        -0.01002048, -0.01260149],
       [ 0.0346523 ,  0.0044189 , -0.00487627, ..., -0.01002048,
         0.14048683, -0.00661232],
       [ 0.0346523 , -0.01879675,  0.0091847 , ..., -0.01260149,
        -0.00661232,  0.13211377]])

In [60]:
hinv_v = np.linalg.pinv(hxx)

In [64]:
for i in range(10):
    X_removed = np.delete(X_train, i, 0)
    y_removed = y_train.drop(index=i, inplace=False)
    clf.fit(X_removed, y_removed)
    print("Ground truth updated parameters")
    print(clf.coef_)
#     y_pred = clf.predict_proba(X_test)
#     delta_spd_i = spd_0 - computeFairness(y_pred, X_test_orig)
#     delta_spd.append(delta_spd_i)

    y_pred = clf.predict_proba(np.reshape(X_train[i], (1, num_params)))
    del_L_del_theta = del_L_del_theta_i(num_params, y_train[i], X_train[i], y_pred[0])
    updated_model_params = np.matmul(del_L_del_theta.transpose(), hinv_v)/len(X_train)
    clf.coef_ = np.add(clf.coef_, updated_model_params)
    print("Approximated parameters")
    print(clf.coef_)

Ground truth updated parameters
[[ 0.39056678  0.75705478  0.36234289 -0.20205895  0.20205895 -0.04144044
  -0.00763714 -0.00111484 -0.05638538  0.03039132 -0.20594506  0.04942333
   0.67148257 -0.07327921 -0.48127384 -0.11482732 -0.0729876   0.09608792
  -0.02771863  0.04542855  0.06303426 -0.09615068 -0.05725656 -0.15375514
  -0.03088295  0.15258774 -0.08591606 -0.3029528   0.04571113  0.26863218
  -0.01759744 -0.01511476  0.00294718  0.26165246 -0.17955955 -0.15909311
  -0.08674547 -0.30169847 -0.17979714  0.18531302  0.07166639  0.08082351
   0.09364591 -0.03407036]]
Approximated parameters
[[ 0.39056678  0.75708366  0.36234156 -0.20210247  0.20212076 -0.04143784
  -0.00763764 -0.00111521 -0.05638114  0.03041262 -0.20599443  0.04942592
   0.67152093 -0.07329616 -0.48122496 -0.11485152 -0.07300217  0.09606984
  -0.02771141  0.04541285  0.06303308 -0.09615134 -0.05712185 -0.15394718
  -0.03092811  0.15268274 -0.08592446 -0.30300748  0.04576651  0.26864385
  -0.01747918 -0.0151148   0

Ground truth updated parameters
[[ 0.39048989  0.75731025  0.36232435 -0.2020041   0.2020041  -0.04144134
  -0.00763911 -0.00110121 -0.05637732  0.03037909 -0.2058999   0.04942001
   0.67153786 -0.07327581 -0.48139091 -0.11479908 -0.07294904  0.0961896
  -0.02764172  0.04555986  0.06313236 -0.09651171 -0.05732217 -0.15376632
  -0.03094138  0.15255917 -0.08588487 -0.302848    0.04570775  0.26863119
  -0.0177326  -0.01512155  0.00305238  0.26140716 -0.17937199 -0.15906966
  -0.08673429 -0.30165898 -0.17984435  0.18533219  0.07168337  0.08089433
   0.09363262 -0.0340149 ]]
Approximated parameters
[[ 0.39048989  0.7575449   0.36234147 -0.2020389   0.20195065 -0.04143039
  -0.00763736 -0.00110199 -0.05637266  0.03035254 -0.20590654  0.04941344
   0.6714246  -0.07327476 -0.48144043 -0.11480734 -0.07294556  0.09620908
  -0.02763538  0.04559538  0.06316991 -0.09692182 -0.05730715 -0.1533854
  -0.03103823  0.15248722 -0.08588526 -0.30282557  0.04565903  0.26864285
  -0.0178124  -0.01512673  0.0

**Inverse of Hessian computed using pseudo-inverse, spd computed**

In [23]:
delta_spd_pinv = []
for i in range(len(X_train)):
    y_pred = clf.predict_proba(np.reshape(X_train[i], (1, num_params)))
    del_L_del_theta = del_L_del_theta_i(num_params, y_train[i], X_train[i], y_pred[0])
    updated_model_params = np.matmul(del_L_del_theta.transpose(), hinv_v)/len(X_train)
    clf.coef_ = np.add(clf.coef_, updated_model_params)
    y_pred = clf.predict_proba(X_test)
    delta_spd_i = spd_0 - computeFairness(y_pred, X_test_orig)
    delta_spd_pinv.append(delta_spd_i)

In [25]:
with open('delta_spd_pinv.txt', 'w') as filehandle:
    for listitem in delta_spd_pinv:
        filehandle.write('%s\n' % listitem)

In [33]:
from scipy.stats import rankdata
gt_rank = rankdata(delta_spd).astype(int)
pinv_rank = rankdata(delta_spd_pinv).astype(int)

**Challenges: (1) Hessian is ill-conditioned; (2) Inverting Hessian is computationally expensive.**
Solutions: (1) Resort to pseudo-inverse; (2) Use Hessian vector products

In [113]:
# Traditional way of computing hinv produces results that are prone 
# to numerical errors because hxx is ill-conditioned
# If the condition number is very large, then the matrix is said to be ill-conditioned. 
# Practically, such a matrix is almost singular, and the computation of its inverse, 
# or solution of a linear system of equations is prone to large numerical errors.
# Ax = b; Small change in elements of b effects huge change in the solution x. 
# Condition number >= 1. (=1 for identity matrix I)

# hinv = np.linalg.inv(hxx)

# We, therefore, compute pseudo-inverse
hpinv = np.linalg.pinv(hxx)
# print("Condition number of pseudo-inverse:", np.linalg.cond(hpinv))

for i in range(len(X_train)):
    y_pred = clf.predict_proba(np.reshape(X_train[i], (1, num_params)))
    loss_grad_at_z = del_L_del_theta_i(num_params, y_train[i], X_train[i], y_pred[0])
    delta_theta = np.matmul(hpinv, loss_grad_at_z)/len(X_train)
    delta_theta_max = delta_theta.max()
    delta_theta_min = delta_theta.min()
    if abs(delta_theta_max) > 1 or abs(delta_theta_min) > 1:
        print("i: ", i)
        print("Maximum change: ", delta_theta_max)
        print("Minimum change: ", delta_theta_min)

i:  0
Maximum change:  1.2736311704568954
Minimum change:  -0.255102107041807
i:  1
Maximum change:  0.8288609844597572
Minimum change:  -1.1969165625533587
i:  2
Maximum change:  2.4088418062740358
Minimum change:  -0.568077483922055
i:  3
Maximum change:  3.2829238158492258
Minimum change:  -1.5078677722105867
i:  4
Maximum change:  3.8507609025641623
Minimum change:  -2.1970274212329213
i:  5
Maximum change:  1.4531599084270606
Minimum change:  -1.159364016420451
i:  6
Maximum change:  4.623801470250959
Minimum change:  -2.061683589113863
