In [6]:
import tensorflow as tf
from tensorflow import keras
import pandas as pd
import numpy as np
from imblearn.under_sampling import RandomUnderSampler
from aix360.algorithms.protodash import ProtodashExplainer

In [7]:
# --- Load the data, derive a binary label, split, and balance the training set ---
# NOTE(review): absolute, machine-specific path; edit it (or make it relative
# to the repository) before running this notebook on another machine.
heart_csv_path = 'C:/Users/Rawan Alamily/Downloads/McSCert Co-op/explainable-ai-heart/predictive-models/personal-indicators-model/data/life-heart.csv'
dataframe = pd.read_csv(heart_csv_path)
# 'heartDisease' is a Yes/No string column; replace it with a 0/1 'target' column.
dataframe['target'] = np.where(dataframe['heartDisease']=='Yes', 1, 0)
dataframe = dataframe.drop(columns=['heartDisease'])
RANDOM_SEED = 42
# Seeds numpy's global RNG, which dataframe.sample() below draws from.
np.random.seed(RANDOM_SEED)
# Full-dataset features (X / df) and labels (y). `df` is kept as a module-level
# name because later cells read it as a global.
df = dataframe.copy()
y = df.pop('target')
X = df

# Shuffle once, then cut at 60% and 90%: train = first 60%, val = next 30%,
# test = last 10%. Plain iloc slices give the same partitions as
# np.split(frame, [i, j]) without relying on numpy splitting a DataFrame
# (which goes through the deprecated DataFrame.swapaxes).
shuffled = dataframe.sample(frac=1)
n_rows = len(shuffled)
train_end = int(0.6*n_rows)
val_end = int(0.9*n_rows)
# .copy() so the pop() calls below mutate independent frames, not slices.
train = shuffled.iloc[:train_end].copy()
val = shuffled.iloc[train_end:val_end].copy()
test = shuffled.iloc[val_end:].copy()
y_train = train.pop('target')
X_train = train
y_test = test.pop('target')
X_test = test

# resample via undersampling majority class - this is favoured over oversampling as the dataset is very large
# only resample training dataset; fit_resample() fits the sampler itself, so no
# separate rus.fit() on the full data is needed.
rus = RandomUnderSampler(random_state=0)
X_train_resampled, y_train_resampled = rus.fit_resample(X_train,y_train)
neg0, pos0 = np.bincount(y_train_resampled)
print("No.negative samples after undersampling",neg0)
print("No.positive samples after undersampling",pos0)

No.negative samples after undersampling 16507
No.positive samples after undersampling 16507


In [8]:
def df_to_dataset(features, labels, batch_size=512):
    """Turn a feature frame and its labels into a shuffled, batched tf.data.Dataset.

    Args:
        features: DataFrame of model inputs; ``dict(features)`` maps each
            column name to its column, so every column becomes a named input.
        labels: Labels aligned row-for-row with ``features``.
        batch_size: Number of rows per batch (default 512).

    Returns:
        A ``tf.data.Dataset`` yielding ``(feature_dict, labels)`` batches,
        cached, shuffled, and prefetched two batches ahead.
    """
    tf_dataset = tf.data.Dataset.from_tensor_slices((dict(features), labels)).cache()
    # Size the shuffle buffer from the data actually passed in (not from a
    # notebook-global frame) so the function is self-contained; a buffer that
    # covers every row gives a full shuffle. max(..., 1) keeps the buffer size
    # valid for an empty frame.
    shuffled_tf_dataset = tf_dataset.shuffle(buffer_size=max(len(features), 1))
    # Batches of `batch_size` rows, with two batches prefetched.
    return shuffled_tf_dataset.batch(batch_size).prefetch(2)

# Earlier approach (disabled): re-attach the label column to the resampled frame.
# train = X_train_resampled
# train['target'] = y_train_resampled
# NOTE: from here on `train` and `test` are tf.data datasets, no longer the
# DataFrames produced by the split; X_train / X_test still refer to the frames.
# The training dataset is built from the undersampled rows, the test dataset
# from the untouched test split.
train = df_to_dataset(X_train_resampled, y_train_resampled)
test = df_to_dataset(X_test, y_test)

0        80 or older
1              60-64
2              60-64
3              40-44
4              60-64
            ...     
33009          65-69
33010          60-64
33011          60-64
33012          65-69
33013    80 or older
Name: ageGroup, Length: 33014, dtype: object
0         No
1         No
2        Yes
3         No
4         No
        ... 
33009     No
33010     No
33011     No
33012     No
33013     No
Name: alcoholDrinking, Length: 33014, dtype: object
0         No
1         No
2         No
3         No
4         No
        ... 
33009    Yes
33010     No
33011     No
33012    Yes
33013     No
Name: asthma, Length: 33014, dtype: object
0        21.79
1        21.63
2        33.00
3        36.59
4        39.80
         ...  
33009    25.10
33010    25.75
33011    25.50
33012    25.09
33013    21.63
Name: bmi, Length: 33014, dtype: float64
0                             No
1                            Yes
2                             No
3                             No
4    

In [9]:
# Load the previously trained Keras model from disk.
# NOTE(review): absolute, machine-specific path; it must be edited before the
# notebook can run on another machine.
model = keras.models.load_model("C:/Users/Rawan Alamily/Downloads/McSCert Co-op/explainable-ai-heart/predictive-models/personal-indicators-model/saved-model")

0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
[]
0
[]
1
[]
0
ListWrapper([1])
ListWrapper([1])
[]
0
[]
1
[]
0
ListWrapper([1])
ListWrapper([1])
[]
0
[]
1
[]
0
ListWrapper([1])
ListWrapper([1])
[]
0
[]
1
[]
0
ListWrapper([1])
ListWrapper([1])
(56, 64)
-0.22360679774997896
0.22360679774997896
[88635597, 0]
[64]
0
(64, 128)
-0.1767766952966369
0.1767766952966369
[509738196, 0]
[128]
0
(128, 128)
-0.15309310892394862
0.15309310892394862
[412879409, 0]
[128]
0
(128, 64)
-0.1767766952966369
0.1767766952966369
[362797128, 0]
[64]
0
(64, 1)
-0.3038218101251
0.3038218101251
[128087662, 0]
[1]
0
[]
0
[]
0
[]
0
[]
0
[1 2]
[b'No' b'Yes']
[1 2]
[b'No' b'Yes']
[1 2]
[b'No' b'Yes']
[b'Good' b'Very good' b'Fair' b'Excellent' b'Poor']
[1 2 3 4 5]
[1 2]
[b'Yes' b'No']
[1 2 3 4]
[b'No' b'Yes' b'No, borderline diabetes' b'Yes (during pregnancy)']
[ 1  2  3  4  5  6  7  8  9 10 11 12 13]
[b'70-74' b'80 or older' b'65-69' b'60-64' b'75-79' b'55-59' b'50-54'
 b'45-49' b'40-44' b'35-39' b'18-24' b'30-

In [13]:
# Append p_test as an extra last column to the resampled training features,
# then keep only the rows whose last column equals 1.
# NOTE(review): p_test is not defined in any cell visible here (execution
# counts jump from 9 to 13); presumably it holds the model's rounded
# predictions for X_train_resampled - confirm. Despite the "test" in the
# names, these rows come from the resampled training set.
z_test = np.hstack((X_train_resampled,p_test))
z_test_pos = z_test[z_test[:,-1]==1,:]
print(z_test_pos.shape)

(17783, 17)


In [41]:
# Pick one row of the test features (by position) as the instance to explain,
# and print its true label plus the type/shape of the selected row.
sample_id = 104
sample = X_test.iloc[sample_id,:]
print(y_test.iloc[sample_id])
print(type(sample))
print(sample.shape)

1
<class 'pandas.core.series.Series'>
(16,)


In [42]:
# Convert the selected row into one tensor per model input and predict on it.
# Values are read by column label rather than by integer position: integer
# keys on a label-indexed Series are deprecated positional access, and labels
# stay correct even if the column order of the frame changes.
bmi = tf.convert_to_tensor(sample['bmi'],dtype=np.float64) #float
smoking = tf.convert_to_tensor(sample['smoking']) #str
alcohol = tf.convert_to_tensor(sample['alcoholDrinking']) #str
stroke = tf.convert_to_tensor(sample['stroke']) #str
physical = tf.convert_to_tensor(sample['physicalHealth'],dtype=np.int64) #int
mental = tf.convert_to_tensor(sample['mentalHealth'],dtype=np.int64) #int
walk = tf.convert_to_tensor(sample['diffWalk']) #str
sex = tf.convert_to_tensor(sample['sex']) #str
# Named age_group (not `age`) so it does not overwrite the age() encoder
# helper that other cells of this notebook define and call.
age_group = tf.convert_to_tensor(sample['ageGroup']) #str
diabetic = tf.convert_to_tensor(sample['diabetic']) #str
activity = tf.convert_to_tensor(sample['physicalActivity']) #str
health = tf.convert_to_tensor(sample['overallHealth']) #str
sleep = tf.convert_to_tensor(sample['sleepHours'],dtype=np.int64) #int
asthma = tf.convert_to_tensor(sample['asthma']) #str
kidney = tf.convert_to_tensor(sample['kidneyDisease']) #str
skinCancer = tf.convert_to_tensor(sample['skinCancer']) #str
# Keys are the model's input names.
X_dict = {'bmi': bmi, 'smoking': smoking,'alcoholDrinking': alcohol, 'stroke': stroke, 'physicalHealth': physical,
             'mentalHealth': mental, 'diffWalk': walk, 'sex': sex, 'ageGroup': age_group,'diabetic': diabetic, 'physicalActivity': activity,
             'overallHealth': health, 'sleepHours': sleep, 'asthma': asthma,  'kidneyDisease': kidney, 'skinCancer': skinCancer}
# A one-element dataset batched with size 1, i.e. a single-row batch.
X_ds = tf.data.Dataset.from_tensors((X_dict)).batch(1)
p_sample = model.predict(X_ds)

30.03
Yes
No
No
0
0
No
Female
65-69
No
Yes
Fair
6
No
No
No
1
False
0


In [43]:
# Round the model output to a hard 0/1 value and reshape it to (1, 1) so it
# can be appended as an extra column to the single-row feature array.
# NOTE: this overwrites p_sample; the raw model output is no longer available
# after this cell.
p_sample = tf.round(p_sample).numpy().flatten().reshape((1,1))
print(p_sample)
p_sample.shape

[[0.6293415]]
[[1.]]


(1, 1)

In [44]:
# Lay the sample out as a single row and append its rounded prediction as the
# last column, giving it the same layout as the rows of z_test_pos.
# reshape(1, -1) derives the row width from the sample itself rather than from
# the shape of an arbitrary row of another frame.
# NOTE: this rebinds X, which an earlier cell used for the full feature frame.
X = np.hstack((sample.to_numpy().reshape(1, -1), p_sample))
print(X)
print(z_test_pos[1])

[[30.03 'Yes' 'No' 'No' 0 0 'No' 'Female' '65-69' 'No' 'Yes' 'Fair' 6
  'No' 'No' 'No' 1.0]]
[21.63 'Yes' 'No' 'No' 0 0 'No' 'Female' '60-64' 'Yes' 'Yes' 'Good' 8 'No'
 'No' 'No' 1.0]


In [45]:
def encode(x):
    """Return 1 when x equals 'Yes', otherwise 0."""
    if x=='Yes':
        return 1
    return 0


def male(x):
    """Return 1 when x equals 'Male', otherwise 0."""
    if x=='Male':
        return 1
    return 0
def age(x):
    """Return the first two characters of x as an int.

    For age-group strings such as '65-69' or '80 or older' this is the lower
    bound of the group (65, 80).
    """
    return int(x[:2])
def diabetes(x):
    """Map a diabetic answer to an integer code.

    'Yes' -> 1, 'No' -> 0, any other value -> 2.
    """
    if x=='Yes':
        return 1
    if x=='No':
        return 0
    return 2
def genHealth(x):
    """Map an overall-health answer to an integer code.

    'Very good' -> 0, 'Good' -> 1, 'Excellent' -> 2, 'Fair' -> 3,
    any other value -> 4.
    """
    # NOTE(review): the codes do not follow the natural ordering of the
    # answers; if a consumer treats them as magnitudes, confirm that this
    # ordering is intended.
    if x=='Very good':
        return 0
    if x=='Good':
        return 1
    if x=='Excellent':
        return 2
    if x=='Fair':
        return 3
    return 4

In [46]:
# Sanity check: X should be a single row (features plus the prediction column).
print(X.shape)

(1, 17)


In [47]:
def encode_strings(arr):
    """Replace the categorical columns of every row of arr with integer codes.

    Rows are modified in place and the same array is returned. Columns
    1, 2, 3, 6, 10, 13, 14 and 15 go through encode(); column 7 through
    male(); column 8 through age(); column 9 through diabetes(); column 11
    through genHealth(). All other columns are left untouched.
    """
    yes_no_columns = (1, 2, 3, 6, 10, 13, 14, 15)
    for record in arr:
        for col in yes_no_columns:
            record[col] = encode(record[col])
        record[7] = male(record[7])
        record[8] = age(record[8])
        record[9] = diabetes(record[9])
        record[11] = genHealth(record[11])
    return arr
# Encode a copy of the candidate pool: encode_strings() writes into the array
# it is given, and z_test_pos must keep its original string values because it
# is printed below and indexed again when the prototype table is built.
z_test_pos_norm = z_test_pos.copy()
z_test_pos_norm = encode_strings(z_test_pos_norm)
# Show the (still un-encoded) sample row and one un-encoded pool row.
print(X)
print(z_test_pos[1])

[[30.03 'Yes' 'No' 'No' 0 0 'No' 'Female' '65-69' 'No' 'Yes' 'Fair' 6
  'No' 'No' 'No' 1.0]]
[21.63 'Yes' 'No' 'No' 0 0 'No' 'Female' '60-64' 'Yes' 'Yes' 'Good' 8 'No'
 'No' 'No' 1.0]


In [49]:
# Encode the sample row with the same helper used for the prototype pool, so
# both arrays are guaranteed to share one column encoding (the previous
# hand-written per-column assignments duplicated encode_strings line for line).
# A copy is encoded because encode_strings() writes into its argument and X
# is printed un-encoded later.
X_norm = encode_strings(X.copy())

In [50]:
def encode_strings(arr):
    """Replace the categorical columns of every row of arr with integer codes.

    Rows are modified in place and the same array is returned. Columns
    1, 2, 3, 6, 10, 13, 14 and 15 go through encode(); column 7 through
    male(); column 8 through age(); column 9 through diabetes(); column 11
    through genHealth(). All other columns are left untouched.
    """
    # NOTE(review): this helper is defined in more than one cell of the
    # notebook; whichever definition was executed last is the one in effect.
    yes_no_columns = (1, 2, 3, 6, 10, 13, 14, 15)
    for record in arr:
        for col in yes_no_columns:
            record[col] = encode(record[col])
        record[7] = male(record[7])
        record[8] = age(record[8])
        record[9] = diabetes(record[9])
        record[11] = genHealth(record[11])
    return arr
# Encode a copy of the candidate pool so that z_test_pos itself keeps its
# original string values (encode_strings() writes into the array it is given).
z_test_pos_norm = z_test_pos.copy()
z_test_pos_norm = encode_strings(z_test_pos_norm)
# Show the (still un-encoded) sample row and one un-encoded pool row.
print(X)
print(z_test_pos[1])

[[30.03 'Yes' 'No' 'No' 0 0 'No' 'Female' '65-69' 'No' 'Yes' 'Fair' 6
  'No' 'No' 'No' 1.0]]
[21.63 'Yes' 'No' 'No' 0 0 'No' 'Female' '60-64' 'Yes' 'Yes' 'Good' 8 'No'
 'No' 'No' 1.0]


In [51]:
# Select m=5 prototypes for the encoded sample from the encoded pool.
# Both arrays are object-dtype (they started out holding a mix of strings and
# numbers), so cast them to float64 before handing them to the explainer; the
# cast raises ValueError if any cell was left un-encoded.
# W is used below as the per-prototype weights and S as the row indices of
# the selected prototypes within the pool.
prototype_explainer = ProtodashExplainer()
(W,S,values) = prototype_explainer.explain(
    X_norm.astype(np.float64), z_test_pos_norm.astype(np.float64), m=5)

elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison


     pcost       dcost       gap    pres   dres
 0:  0.0000e+00 -2.0000e+04  4e+00  1e+00  1e+00
 1:  3.7454e+03 -5.8281e+07  9e+03  1e+00  1e+00
 2:  9.3891e+03 -1.6284e+08  3e+04  1e+00  1e+00
 3:  9.6298e+03 -1.6651e+08  3e+04  1e+00  1e+00
 4:  2.3577e+02 -3.0209e+08  7e+04  1e+00  1e+00
 5:  4.1054e+03 -7.4286e+08  2e+05  1e+00  1e+00
 6:  7.4914e+03 -2.3374e+09  5e+05  1e+00  1e+00
 7:  1.1844e+04 -2.9192e+10  7e+06  1e+00  1e+00
 8:  1.8491e+04 -7.8111e+13  3e+10  1e+00  1e+00
 9:  1.3384e+09 -5.6474e+20  6e+20  7e-14  7e-01
10:  1.3384e+09 -5.6474e+18  6e+18  6e-16  8e-03
11:  1.3384e+09 -5.6474e+16  6e+16  1e-16  8e-05
12:  1.3384e+09 -5.6474e+14  6e+14  2e-16  8e-07
13:  1.3378e+09 -5.6502e+12  6e+12  2e-16  9e-09
14:  1.2831e+09 -5.9107e+10  6e+10  2e-16  9e-11
15:  3.4602e+08 -1.1175e+09  1e+09  3e-16  1e-09
16:  5.1495e+07 -6.1366e+07  1e+08  2e-16  1e-11
17:  7.2936e+06 -8.2741e+06  2e+07  2e-16  2e-14
18:  1.0131e+06 -1.2234e+06  2e+06  1e-16  4e-15
19:  1.3209e+05 -1.87

In [53]:
# Remove the label column from `dataframe` in place (and display it), so that
# dataframe.columns lists only the feature columns for the cells that follow.
# NOTE(review): not idempotent - running this cell a second time raises
# KeyError because 'target' is already gone.
dataframe.pop('target')

0         0
1         0
2         0
3         0
4         0
         ..
319790    1
319791    0
319792    0
319793    0
319794    0
Name: target, Length: 319795, dtype: int32

In [54]:
# Table of the selected prototypes: rows S of the un-encoded pool, without the
# trailing prediction column.
dfs = pd.DataFrame.from_records(z_test_pos[S,0:-1])
# The pool was built from X_train_resampled, so its columns are exactly the
# feature names; taking them from there avoids depending on `dataframe`
# having had its 'target' column removed by another cell.
dfs.columns = X_train_resampled.columns

In [55]:
# Round the prototype weights to 5 decimals and normalise them to sum to 1.
dfs['Weights'] = np.around(W, 5)/np.sum(np.around(W, 5))

In [56]:
# Display the prototypes with one column per prototype and one row per feature.
dfs.transpose()

Unnamed: 0,0,1,2,3,4
bmi,30.91,13.44,15.66,22.67,31.01
smoking,Yes,No,Yes,Yes,Yes
alcoholDrinking,No,No,No,No,No
stroke,No,No,No,No,No
physicalHealth,0,10,0,2,0
mentalHealth,0,0,0,30,0
diffWalk,No,No,No,Yes,Yes
sex,Female,Female,Female,Female,Male
ageGroup,65-69,80 or older,80 or older,50-54,80 or older
diabetic,No,No,No,No,Yes


In [57]:
# Print the explained sample as a column, for side-by-side comparison with
# the prototype table.
print(X.transpose())

[[30.03]
 ['Yes']
 ['No']
 ['No']
 [0]
 [0]
 ['No']
 ['Female']
 ['65-69']
 ['No']
 ['Yes']
 ['Fair']
 [6]
 ['No']
 ['No']
 ['No']
 [1.0]]


In [66]:
# Second run of the analysis: pick another test row (by position) to explain,
# and print its true label plus the type/shape of the selected row.
sample_id = 114
sample = X_test.iloc[sample_id,:]
print(y_test.iloc[sample_id])
print(type(sample))
print(sample.shape)

1
<class 'pandas.core.series.Series'>
(16,)


In [67]:
# Convert the selected row into one tensor per model input and predict on it.
# Values are read by column label rather than by integer position: integer
# keys on a label-indexed Series are deprecated positional access, and labels
# stay correct even if the column order of the frame changes.
bmi = tf.convert_to_tensor(sample['bmi'],dtype=np.float64) #float
smoking = tf.convert_to_tensor(sample['smoking']) #str
alcohol = tf.convert_to_tensor(sample['alcoholDrinking']) #str
stroke = tf.convert_to_tensor(sample['stroke']) #str
physical = tf.convert_to_tensor(sample['physicalHealth'],dtype=np.int64) #int
mental = tf.convert_to_tensor(sample['mentalHealth'],dtype=np.int64) #int
walk = tf.convert_to_tensor(sample['diffWalk']) #str
sex = tf.convert_to_tensor(sample['sex']) #str
# Named age_group (not `age`) so it does not overwrite the age() encoder
# helper that other cells of this notebook define and call.
age_group = tf.convert_to_tensor(sample['ageGroup']) #str
diabetic = tf.convert_to_tensor(sample['diabetic']) #str
activity = tf.convert_to_tensor(sample['physicalActivity']) #str
health = tf.convert_to_tensor(sample['overallHealth']) #str
sleep = tf.convert_to_tensor(sample['sleepHours'],dtype=np.int64) #int
asthma = tf.convert_to_tensor(sample['asthma']) #str
kidney = tf.convert_to_tensor(sample['kidneyDisease']) #str
skinCancer = tf.convert_to_tensor(sample['skinCancer']) #str
# Keys are the model's input names.
X_dict = {'bmi': bmi, 'smoking': smoking,'alcoholDrinking': alcohol, 'stroke': stroke, 'physicalHealth': physical,
             'mentalHealth': mental, 'diffWalk': walk, 'sex': sex, 'ageGroup': age_group,'diabetic': diabetic, 'physicalActivity': activity,
             'overallHealth': health, 'sleepHours': sleep, 'asthma': asthma,  'kidneyDisease': kidney, 'skinCancer': skinCancer}
# A one-element dataset batched with size 1, i.e. a single-row batch.
X_ds = tf.data.Dataset.from_tensors((X_dict)).batch(1)
p_sample = model.predict(X_ds)

31.95
No
No
No
30
0
No
Male
65-69
Yes
Yes
Very good
5
No
No
No
1
False
0


In [68]:
# Round the model output to a hard 0/1 value and reshape it to (1, 1) so it
# can be appended as an extra column to the single-row feature array.
# NOTE: this overwrites p_sample; the raw model output is no longer available
# after this cell.
p_sample = tf.round(p_sample).numpy().flatten().reshape((1,1))
print(p_sample)
p_sample.shape

[[0.60240084]]
[[1.]]


(1, 1)

In [69]:
# Lay the sample out as a single row and append its rounded prediction as the
# last column, giving it the same layout as the rows of z_test_pos.
# reshape(1, -1) derives the row width from the sample itself rather than from
# the shape of an arbitrary row of another frame.
X = np.hstack((sample.to_numpy().reshape(1, -1), p_sample))
print(X)
print(z_test_pos[1])

[[31.95 'No' 'No' 'No' 30 0 'No' 'Male' '65-69' 'Yes' 'Yes' 'Very good' 5
  'No' 'No' 'No' 1.0]]
[21.63 'Yes' 'No' 'No' 0 0 'No' 'Female' '60-64' 'Yes' 'Yes' 'Good' 8 'No'
 'No' 'No' 1.0]


In [70]:
def encode(x):
    """Return 1 when x equals 'Yes', otherwise 0."""
    if x=='Yes':
        return 1
    return 0


def male(x):
    """Return 1 when x equals 'Male', otherwise 0."""
    if x=='Male':
        return 1
    return 0
def age(x):
    """Return the first two characters of x as an int.

    For age-group strings such as '65-69' or '80 or older' this is the lower
    bound of the group (65, 80).
    """
    return int(x[:2])
def diabetes(x):
    """Map a diabetic answer to an integer code.

    'Yes' -> 1, 'No' -> 0, any other value -> 2.
    """
    if x=='Yes':
        return 1
    if x=='No':
        return 0
    return 2
def genHealth(x):
    """Map an overall-health answer to an integer code.

    'Very good' -> 0, 'Good' -> 1, 'Excellent' -> 2, 'Fair' -> 3,
    any other value -> 4.
    """
    # NOTE(review): the codes do not follow the natural ordering of the
    # answers; if a consumer treats them as magnitudes, confirm that this
    # ordering is intended.
    if x=='Very good':
        return 0
    if x=='Good':
        return 1
    if x=='Excellent':
        return 2
    if x=='Fair':
        return 3
    return 4

In [71]:
# Encode the sample row with the same helper used for the prototype pool, so
# both arrays are guaranteed to share one column encoding (the previous
# hand-written per-column assignments duplicated encode_strings line for line).
# A copy is encoded because encode_strings() writes into its argument and X
# is printed un-encoded later.
X_norm = encode_strings(X.copy())

In [72]:
def encode_strings(arr):
    """Replace the categorical columns of every row of arr with integer codes.

    Rows are modified in place and the same array is returned. Columns
    1, 2, 3, 6, 10, 13, 14 and 15 go through encode(); column 7 through
    male(); column 8 through age(); column 9 through diabetes(); column 11
    through genHealth(). All other columns are left untouched.
    """
    # NOTE(review): this helper is defined in more than one cell of the
    # notebook; whichever definition was executed last is the one in effect.
    yes_no_columns = (1, 2, 3, 6, 10, 13, 14, 15)
    for record in arr:
        for col in yes_no_columns:
            record[col] = encode(record[col])
        record[7] = male(record[7])
        record[8] = age(record[8])
        record[9] = diabetes(record[9])
        record[11] = genHealth(record[11])
    return arr
# Encode a copy of the candidate pool so that z_test_pos itself keeps its
# original string values (encode_strings() writes into the array it is given).
z_test_pos_norm = z_test_pos.copy()
z_test_pos_norm = encode_strings(z_test_pos_norm)
# Show the (still un-encoded) sample row and one un-encoded pool row.
print(X)
print(z_test_pos[1])

[[31.95 'No' 'No' 'No' 30 0 'No' 'Male' '65-69' 'Yes' 'Yes' 'Very good' 5
  'No' 'No' 'No' 1.0]]
[21.63 'Yes' 'No' 'No' 0 0 'No' 'Female' '60-64' 'Yes' 'Yes' 'Good' 8 'No'
 'No' 'No' 1.0]


In [73]:
# Select m=5 prototypes for the encoded sample from the encoded pool.
# Both arrays are object-dtype (they started out holding a mix of strings and
# numbers), so cast them to float64 before handing them to the explainer; the
# cast raises ValueError if any cell was left un-encoded.
# W is used below as the per-prototype weights and S as the row indices of
# the selected prototypes within the pool.
prototype_explainer = ProtodashExplainer()
(W,S,values) = prototype_explainer.explain(
    X_norm.astype(np.float64), z_test_pos_norm.astype(np.float64), m=5)

elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison


     pcost       dcost       gap    pres   dres
 0:  0.0000e+00 -2.0000e+04  4e+00  1e+00  1e+00
 1:  5.9694e+03 -8.6589e+07  2e+04  1e+00  1e+00
 2:  2.3675e+04 -2.8015e+08  6e+04  1e+00  1e+00
 3:  2.5631e+04 -3.0241e+08  7e+04  1e+00  1e+00
 4:  2.8860e+04 -5.4833e+08  1e+05  1e+00  1e+00
 5:  3.8402e+03 -2.5755e+10  6e+06  1e+00  1e+00
 6:  8.3498e+02 -1.8942e+12  5e+08  1e+00  1e+00
 7:  1.8276e+11 -3.0162e+19  3e+19  1e-13  4e-05
 8:  1.8276e+11 -3.0162e+17  3e+17  1e-15  5e-05
 9:  1.8276e+11 -3.0173e+15  3e+15  9e-17  1e-06
10:  1.8268e+11 -3.1224e+13  3e+13  1e-16  2e-08
11:  1.7456e+11 -1.3152e+12  1e+12  2e-16  8e-10
12:  1.5064e+10 -3.4416e+11  4e+11  1e-16  6e-11
13:  5.3324e+09 -1.0674e+10  2e+10  3e-16  2e-12
14:  7.9938e+08 -8.7396e+08  2e+09  2e-16  4e-13
15:  1.1456e+08 -1.2918e+08  2e+08  1e-16  1e-13
16:  1.6293e+07 -1.8443e+07  3e+07  2e-16  3e-14
17:  2.2813e+06 -2.6932e+06  5e+06  2e-16  2e-14
18:  3.0535e+05 -4.0675e+05  7e+05  4e-16  3e-15
19:  3.4572e+04 -6.69

In [75]:
# Table of the selected prototypes: rows S of the un-encoded pool, without the
# trailing prediction column.
dfs = pd.DataFrame.from_records(z_test_pos[S,0:-1])
# The pool was built from X_train_resampled, so its columns are exactly the
# feature names; taking them from there avoids depending on `dataframe`
# having had its 'target' column removed by another cell.
dfs.columns = X_train_resampled.columns

In [76]:
# Round the prototype weights to 5 decimals and normalise them to sum to 1.
dfs['Weights'] = np.around(W, 5)/np.sum(np.around(W, 5))

In [77]:
# Display the prototypes with one column per prototype and one row per feature.
dfs.transpose()

Unnamed: 0,0,1,2,3,4
bmi,30.11,88.6,35.44,12.84,33.89
smoking,No,No,Yes,Yes,No
alcoholDrinking,No,No,No,No,No
stroke,No,No,No,No,No
physicalHealth,30,30,0,28,30
mentalHealth,0,0,0,0,0
diffWalk,No,Yes,Yes,Yes,No
sex,Female,Male,Male,Male,Male
ageGroup,65-69,55-59,80 or older,75-79,18-24
diabetic,No,No,"No, borderline diabetes",Yes,No


In [78]:
# Print the explained sample as a column, for side-by-side comparison with
# the prototype table.
print(X.transpose())

[[31.95]
 ['No']
 ['No']
 ['No']
 [30]
 [0]
 ['No']
 ['Male']
 ['65-69']
 ['Yes']
 ['Yes']
 ['Very good']
 [5]
 ['No']
 ['No']
 ['No']
 [1.0]]
