# DiCE

## Load data

In [1]:
import warnings
# Suppress every warning for the rest of the session to keep cell output compact.
# NOTE(review): this also hides deprecation warnings raised by the libraries used below.
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd

# Read the HELOC dataset; the path is relative to the notebook's working directory.
df = pd.read_csv("../../datasets/FICO/heloc_dataset_v1.csv")

In [3]:
# Name of the outcome column; it is integer-encoded below and passed to DiCE as the outcome name.
TARGET = "RiskPerformance"

In [4]:
from sklearn.model_selection import train_test_split

# Replace the outcome labels with integer codes (codes follow order of first appearance).
df[TARGET] = pd.factorize(df[TARGET])[0]

# Hold out a quarter of the rows; the fixed seed keeps the split reproducible.
train, test = train_test_split(df, random_state=42, test_size=0.25)

## DiCE dataset

In [5]:
import dice_ml
from dice_ml.utils import helpers # helper functions

In [6]:
# Every column except the outcome is handed to DiCE as a continuous feature.
continuous_features = [column for column in train.columns if column != TARGET]

In [8]:
# Give every feature the same permitted range for the range-only DiCE data interface.
# NOTE(review): [-20, 1000] is one hard-coded range for all features — presumably wide
# enough to cover the negative sentinel values and the largest feature values; TODO confirm
# against the data.
features = {f: [-20, 1000] for f in continuous_features}

In [9]:
# Data interface described only by feature ranges (no dataframe attached).
# Note: `d` is rebound in the KD-Tree section to a dataframe-backed object.
d = dice_ml.Data(features=features, outcome_name=TARGET)

## XGBoost

### Load model

In [10]:
from xgboost.sklearn import XGBClassifier

# Untrained classifier shell; the fitted parameters are loaded from disk in the next cell.
model = XGBClassifier()

In [11]:
# Restore the XGBoost model from its JSON dump (path relative to the working directory).
model.load_model("../../models/xgboost.json")

In [13]:
import utils

# Pick the query instance to explain.
# NOTE(review): `utils.get_negative_closest` is a project helper not shown here —
# presumably it returns a negatively classified sample related to the 0.75 score level;
# confirm against utils.py.
X = utils.get_negative_closest(model, 0.75)

In [14]:
# Wrap the query in a DataFrame and transpose it into the layout passed to DiCE below.
# Caution: X is rebound from itself, so re-running only this cell transposes it again;
# re-run the cell above first.
X = pd.DataFrame(X).T

In [15]:
# Wrap the XGBoost classifier for DiCE using the sklearn backend.
m = dice_ml.Model(model=model, backend="sklearn")

In [16]:
# Display-only check of the per-feature decimal precisions reported by the data interface;
# the result is not stored.
d.get_decimal_precisions()

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

### KD-Tree method

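The KD-Tree method requires access to the dataset itself, so the DiCE data object `d` is rebuilt below from the `train` dataframe (the earlier `d` only specified feature ranges).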

In [39]:
# Rebind `d` to a dataframe-backed data interface built from the training split;
# every explainer created below is constructed from this `d`.
# NOTE(review): this cell's execution count (39) is out of sequence with its neighbours —
# verify the notebook with Restart & Run All.
d = dice_ml.Data(dataframe=train, continuous_features=continuous_features, outcome_name=TARGET)

In [19]:
# KD-Tree explainer for the XGBoost model.
# NOTE(review): this cell was last run as In [19], before the data cell above (In [39]);
# re-run in order so the explainer is built from the dataframe-backed `d`.
explainer = dice_ml.Dice(d, m, method="kdtree")

In [20]:
# Number of counterfactuals requested from every DiCE method below.
N = 100

In [21]:
# Ask the KD-Tree explainer for N counterfactuals that move the query to class 1.
explanation = explainer.generate_counterfactuals(X, total_CFs=N, desired_class=1)

100%|██████████████████████████████████████████████████| 1/1 [00:06<00:00,  6.49s/it]


In [22]:
# Show the query instance followed by the generated counterfactual set.
explanation.visualize_as_dataframe()

Query instance (original outcome : 0)


Unnamed: 0,ExternalRiskEstimate,MSinceOldestTradeOpen,MSinceMostRecentTradeOpen,AverageMInFile,NumSatisfactoryTrades,NumTrades60Ever2DerogPubRec,NumTrades90Ever2DerogPubRec,PercentTradesNeverDelq,MSinceMostRecentDelq,MaxDelq2PublicRecLast12M,...,MSinceMostRecentInqexcl7days,NumInqLast6M,NumInqLast6Mexcl7days,NetFractionRevolvingBurden,NetFractionInstallBurden,NumRevolvingTradesWBalance,NumInstallTradesWBalance,NumBank2NatlTradesWHighUtilization,PercentTradesWBalance,RiskPerformance
0,63,309,36,112,17,2,1,68,7,2,...,0,1,1,54,-8,5,1,2,67,0



Diverse Counterfactual set (new outcome: 1.0)


Unnamed: 0,ExternalRiskEstimate,MSinceOldestTradeOpen,MSinceMostRecentTradeOpen,AverageMInFile,NumSatisfactoryTrades,NumTrades60Ever2DerogPubRec,NumTrades90Ever2DerogPubRec,PercentTradesNeverDelq,MSinceMostRecentDelq,MaxDelq2PublicRecLast12M,...,MSinceMostRecentInqexcl7days,NumInqLast6M,NumInqLast6Mexcl7days,NetFractionRevolvingBurden,NetFractionInstallBurden,NumRevolvingTradesWBalance,NumInstallTradesWBalance,NumBank2NatlTradesWHighUtilization,PercentTradesWBalance,RiskPerformance
1714,73.0,309.0,36.0,112.0,17.0,2.0,1.0,89.0,1.0,1.0,...,0.0,1.0,1.0,15.0,-8.0,4.0,-8.0,0.0,60.0,1
6114,71.0,309.0,10.0,112.0,32.0,2.0,1.0,100.0,-7.0,6.0,...,0.0,1.0,1.0,23.0,-8.0,5.0,5.0,2.0,59.0,1
10400,84.0,309.0,24.0,112.0,17.0,0.0,1.0,100.0,-7.0,7.0,...,0.0,1.0,1.0,28.0,-8.0,2.0,1.0,0.0,80.0,1
4243,75.0,365.0,13.0,130.0,17.0,0.0,1.0,89.0,13.0,6.0,...,0.0,1.0,1.0,45.0,-8.0,5.0,1.0,0.0,50.0,1
10404,79.0,344.0,24.0,135.0,24.0,0.0,1.0,100.0,-7.0,7.0,...,0.0,1.0,1.0,41.0,-8.0,5.0,1.0,2.0,50.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5550,90.0,309.0,12.0,99.0,17.0,0.0,1.0,100.0,-7.0,7.0,...,-8.0,1.0,1.0,0.0,-8.0,1.0,1.0,0.0,50.0,1
8142,71.0,309.0,3.0,112.0,17.0,0.0,1.0,86.0,20.0,6.0,...,2.0,1.0,1.0,43.0,20.0,5.0,1.0,0.0,75.0,1
1996,87.0,378.0,17.0,112.0,17.0,0.0,1.0,100.0,-7.0,7.0,...,0.0,1.0,1.0,14.0,-8.0,5.0,-8.0,0.0,60.0,1
6559,87.0,272.0,15.0,95.0,17.0,0.0,1.0,100.0,-7.0,7.0,...,0.0,1.0,1.0,21.0,-8.0,5.0,1.0,0.0,57.0,1


In [44]:
def create_result_df(_model, _explanation, method: str, model_type: str,
                     query=None, target: str = "RiskPerformance"):
    """Build a results table for the counterfactuals of one query instance.

    The counterfactuals of the first query in ``_explanation`` are scored with
    ``_model`` and laid out next to the original query values.

    Parameters
    ----------
    _model
        Classifier exposing ``predict_proba``; column 1 of its output is stored
        as ``GoalScore``.
    _explanation
        Object whose ``cf_examples_list[0].final_cfs_df`` is the DataFrame of
        counterfactuals.
    method : str
        Label written to the ``method`` column.
    model_type : str
        Label written to the ``model`` column.
    query : pandas.DataFrame, optional
        Original query instance; the first row of each column is used. When
        omitted, the notebook-level ``X`` is used so existing calls keep
        working.
    target : str, optional
        Outcome column name. It is removed from the counterfactuals (if
        present) before scoring and written to ``GoalName``.

    Returns
    -------
    pandas.DataFrame
        One row per counterfactual, keeping the counterfactual frame's index.
        Columns: the query columns (same value in every row), the
        counterfactual columns prefixed with ``Cf``, then ``GoalScore``,
        ``GoalValue``, ``GoalName``, ``method`` and ``model``.
    """
    if query is None:
        # Backward-compatible fallback to the notebook's global query instance.
        query = X

    # The outcome column must not be fed to the model; drop() returns a new
    # frame, so the explanation's own DataFrame is left untouched.
    cf_features = _explanation.cf_examples_list[0].final_cfs_df.drop(
        columns=[target], errors="ignore"
    )
    probabilities = _model.predict_proba(cf_features)

    result = cf_features.add_prefix("Cf")
    result['GoalScore'] = probabilities[:, 1]
    result['GoalValue'] = 1
    result['GoalName'] = target
    result['method'] = method
    result['model'] = model_type

    # Put the original query values in front, broadcast to every row.
    for position, column in enumerate(query.columns):
        result.insert(position, column, query[column].iloc[0])
    return result

In [45]:
# Tabulate the counterfactuals held in `explanation` under the KD-Tree / XGBoost labels.
# NOTE(review): `explanation` is rebound by every method section, so this cell labels
# whatever was generated last. The saved output below has a 0..98 index and values such
# as -4, unlike the KD-Tree table shown above (training-row index labels) — confirm it
# comes from the KD-Tree run by re-executing after Restart & Run All.
result_kdtree = create_result_df(model, explanation, "DiCE-kdtree", "XGBoost")
result_kdtree

Unnamed: 0,ExternalRiskEstimate,MSinceOldestTradeOpen,MSinceMostRecentTradeOpen,AverageMInFile,NumSatisfactoryTrades,NumTrades60Ever2DerogPubRec,NumTrades90Ever2DerogPubRec,PercentTradesNeverDelq,MSinceMostRecentDelq,MaxDelq2PublicRecLast12M,...,CfNetFractionInstallBurden,CfNumRevolvingTradesWBalance,CfNumInstallTradesWBalance,CfNumBank2NatlTradesWHighUtilization,CfPercentTradesWBalance,GoalScore,GoalValue,GoalName,method,model
0,63,309,36,112,17,2,1,68,7,2,...,-8,29,1,2,67,0.746128,1,RiskPerformance,DiCE-kdtree,XGBoost
1,63,309,36,112,17,2,1,68,7,2,...,-8,-4,12,2,67,0.518291,1,RiskPerformance,DiCE-kdtree,XGBoost
2,63,309,36,112,17,2,1,68,7,2,...,10,5,15,2,67,0.513132,1,RiskPerformance,DiCE-kdtree,XGBoost
3,63,309,36,112,17,2,1,68,7,2,...,-8,-4,12,2,67,0.540617,1,RiskPerformance,DiCE-kdtree,XGBoost
4,63,309,36,112,17,2,1,68,7,2,...,-8,2,10,2,86,0.534851,1,RiskPerformance,DiCE-kdtree,XGBoost
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,63,309,36,112,17,2,1,68,7,2,...,-8,5,17,2,67,0.611199,1,RiskPerformance,DiCE-kdtree,XGBoost
96,63,309,36,112,17,2,1,68,7,2,...,3,5,1,2,67,0.520912,1,RiskPerformance,DiCE-kdtree,XGBoost
97,63,309,36,112,17,2,1,68,7,2,...,-8,5,9,2,67,0.543316,1,RiskPerformance,DiCE-kdtree,XGBoost
98,63,309,36,112,17,2,1,68,7,2,...,-8,5,10,2,86,0.510151,1,RiskPerformance,DiCE-kdtree,XGBoost


### Random sampling

In [46]:
# Random-sampling explainer for the XGBoost model (rebinds `explainer`).
explainer = dice_ml.Dice(d, m, method="random")

In [47]:
# Fix the sampling seed so the random-search counterfactuals are reproducible across runs.
explanation = explainer.generate_counterfactuals(X, total_CFs=N, desired_class=1, random_seed=42)

100%|██████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.23s/it]


In [49]:
# Tabulate the random-sampling counterfactuals, scored with the XGBoost model.
result_random = create_result_df(model, explanation, "DiCE-random", "XGBoost")
result_random

Unnamed: 0,ExternalRiskEstimate,MSinceOldestTradeOpen,MSinceMostRecentTradeOpen,AverageMInFile,NumSatisfactoryTrades,NumTrades60Ever2DerogPubRec,NumTrades90Ever2DerogPubRec,PercentTradesNeverDelq,MSinceMostRecentDelq,MaxDelq2PublicRecLast12M,...,CfNetFractionInstallBurden,CfNumRevolvingTradesWBalance,CfNumInstallTradesWBalance,CfNumBank2NatlTradesWHighUtilization,CfPercentTradesWBalance,GoalScore,GoalValue,GoalName,method,model
0,63,309,36,112,17,2,1,68,7,2,...,-8,12,7,2,67,0.626933,1,RiskPerformance,DiCE-random,XGBoost
1,63,309,36,112,17,2,1,68,7,2,...,-8,5,1,-2,67,0.542209,1,RiskPerformance,DiCE-random,XGBoost
2,63,309,36,112,17,2,1,68,7,2,...,67,5,4,10,67,0.566169,1,RiskPerformance,DiCE-random,XGBoost
3,63,309,36,112,17,2,1,68,7,2,...,-8,5,12,-3,67,0.631115,1,RiskPerformance,DiCE-random,XGBoost
4,63,309,36,112,17,2,1,68,7,2,...,-8,5,1,2,36,0.507506,1,RiskPerformance,DiCE-random,XGBoost
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,63,309,36,112,17,2,1,68,7,2,...,-8,5,1,2,67,0.563153,1,RiskPerformance,DiCE-random,XGBoost
96,63,309,36,112,17,2,1,68,7,2,...,16,5,1,2,96,0.507414,1,RiskPerformance,DiCE-random,XGBoost
97,63,309,36,112,17,2,1,68,7,2,...,-8,5,9,2,67,0.555554,1,RiskPerformance,DiCE-random,XGBoost
98,63,309,36,112,17,2,1,68,7,2,...,-8,5,1,2,67,0.533025,1,RiskPerformance,DiCE-random,XGBoost


### Genetic

In [50]:
# Genetic-search explainer for the XGBoost model (rebinds `explainer`).
explainer = dice_ml.Dice(d, m, method="genetic")

In [51]:
# Ask the genetic explainer for N counterfactuals that move the query to class 1.
# NOTE(review): no seed is passed here — confirm whether this method is deterministic
# before comparing results across runs.
explanation = explainer.generate_counterfactuals(X, total_CFs=N, desired_class=1)

100%|██████████████████████████████████████████████████| 1/1 [00:39<00:00, 39.77s/it]


In [52]:
# Tabulate the genetic-search counterfactuals, scored with the XGBoost model.
# NOTE(review): in the saved output every row carries index label 0, so the index
# does not identify rows.
result_genetic = create_result_df(model, explanation, "DiCE-genetic", "XGBoost")
result_genetic

Unnamed: 0,ExternalRiskEstimate,MSinceOldestTradeOpen,MSinceMostRecentTradeOpen,AverageMInFile,NumSatisfactoryTrades,NumTrades60Ever2DerogPubRec,NumTrades90Ever2DerogPubRec,PercentTradesNeverDelq,MSinceMostRecentDelq,MaxDelq2PublicRecLast12M,...,CfNetFractionInstallBurden,CfNumRevolvingTradesWBalance,CfNumInstallTradesWBalance,CfNumBank2NatlTradesWHighUtilization,CfPercentTradesWBalance,GoalScore,GoalValue,GoalName,method,model
0,63,309,36,112,17,2,1,68,7,2,...,34.0,5.0,6.0,1.0,2.0,0.575931,1,RiskPerformance,DiCE-genetic,XGBoost
0,63,309,36,112,17,2,1,68,7,2,...,2.0,-8.0,3.0,1.0,2.0,0.577633,1,RiskPerformance,DiCE-genetic,XGBoost
0,63,309,36,112,17,2,1,68,7,2,...,-9.0,5.0,-9.0,2.0,0.0,0.538772,1,RiskPerformance,DiCE-genetic,XGBoost
0,63,309,36,112,17,2,1,68,7,2,...,-9.0,5.0,3.0,3.0,-9.0,0.582301,1,RiskPerformance,DiCE-genetic,XGBoost
0,63,309,36,112,17,2,1,68,7,2,...,-9.0,-9.0,3.0,-9.0,-9.0,0.543194,1,RiskPerformance,DiCE-genetic,XGBoost
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,63,309,36,112,17,2,1,68,7,2,...,1.0,-9.0,3.0,5.0,0.0,0.655021,1,RiskPerformance,DiCE-genetic,XGBoost
0,63,309,36,112,17,2,1,68,7,2,...,9.0,-9.0,-9.0,2.0,-9.0,0.506820,1,RiskPerformance,DiCE-genetic,XGBoost
0,63,309,36,112,17,2,1,68,7,2,...,33.0,-8.0,-9.0,2.0,-9.0,0.591551,1,RiskPerformance,DiCE-genetic,XGBoost
0,63,309,36,112,17,2,1,68,7,2,...,-9.0,-9.0,12.0,-9.0,-9.0,0.529796,1,RiskPerformance,DiCE-genetic,XGBoost


## MLP

### Load model

In [53]:
from joblib import load

# Load the MLP model from its joblib dump (path relative to the working directory).
# SECURITY: joblib files are pickle-based and can execute code on load — only load trusted files.
mlp_model = load('../../models/mlp.joblib')

### KD-Tree method

In [54]:
# Wrap the MLP for DiCE using the sklearn backend.
m_mlp = dice_ml.Model(model=mlp_model, backend="sklearn")

In [55]:
# KD-Tree explainer for the MLP model (rebinds `explainer`).
explainer = dice_ml.Dice(d, m_mlp, method="kdtree")

In [56]:
# Ask the KD-Tree explainer for N counterfactuals that move the query to class 1 under the MLP.
explanation = explainer.generate_counterfactuals(X, total_CFs=N, desired_class=1)

100%|██████████████████████████████████████████████████| 1/1 [00:05<00:00,  5.03s/it]


In [57]:
# Tabulate the KD-Tree counterfactuals, scored with the MLP model.
result_mlp_kdtree = create_result_df(mlp_model, explanation, "DiCE-kdtree", "MLP")
result_mlp_kdtree

Unnamed: 0,ExternalRiskEstimate,MSinceOldestTradeOpen,MSinceMostRecentTradeOpen,AverageMInFile,NumSatisfactoryTrades,NumTrades60Ever2DerogPubRec,NumTrades90Ever2DerogPubRec,PercentTradesNeverDelq,MSinceMostRecentDelq,MaxDelq2PublicRecLast12M,...,CfNetFractionInstallBurden,CfNumRevolvingTradesWBalance,CfNumInstallTradesWBalance,CfNumBank2NatlTradesWHighUtilization,CfPercentTradesWBalance,GoalScore,GoalValue,GoalName,method,model
6114,63,309,36,112,17,2,1,68,7,2,...,-8.0,5.0,5.0,2.0,59.0,0.800390,1,RiskPerformance,DiCE-kdtree,MLP
3385,63,309,36,112,17,2,1,68,7,2,...,-8.0,5.0,1.0,2.0,43.0,0.505903,1,RiskPerformance,DiCE-kdtree,MLP
10400,63,309,36,112,17,2,1,68,7,2,...,-8.0,2.0,1.0,0.0,80.0,0.658341,1,RiskPerformance,DiCE-kdtree,MLP
10404,63,309,36,112,17,2,1,68,7,2,...,-8.0,4.0,1.0,2.0,50.0,0.816829,1,RiskPerformance,DiCE-kdtree,MLP
4300,63,309,36,112,17,2,1,68,7,2,...,-8.0,4.0,1.0,1.0,45.0,0.640329,1,RiskPerformance,DiCE-kdtree,MLP
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4296,63,309,36,112,17,2,1,68,7,2,...,-8.0,6.0,1.0,1.0,44.0,0.537413,1,RiskPerformance,DiCE-kdtree,MLP
2921,63,309,36,112,17,2,1,68,7,2,...,-8.0,4.0,1.0,2.0,71.0,0.636599,1,RiskPerformance,DiCE-kdtree,MLP
6436,63,309,36,112,17,2,1,68,7,2,...,-8.0,4.0,-8.0,1.0,67.0,0.618862,1,RiskPerformance,DiCE-kdtree,MLP
8930,63,309,36,112,17,2,1,68,7,2,...,-8.0,1.0,1.0,-8.0,67.0,0.770597,1,RiskPerformance,DiCE-kdtree,MLP


### Random sampling

In [58]:
# Random-sampling explainer for the MLP model (rebinds `explainer`).
explainer = dice_ml.Dice(d, m_mlp, method="random")

In [59]:
# Fix the sampling seed so the random-search counterfactuals are reproducible across runs.
explanation = explainer.generate_counterfactuals(X, total_CFs=N, desired_class=1, random_seed=42)

100%|██████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.83it/s]


In [60]:
# Tabulate the random-sampling counterfactuals, scored with the MLP model.
result_mlp_random = create_result_df(mlp_model, explanation, "DiCE-random", "MLP")
result_mlp_random

Unnamed: 0,ExternalRiskEstimate,MSinceOldestTradeOpen,MSinceMostRecentTradeOpen,AverageMInFile,NumSatisfactoryTrades,NumTrades60Ever2DerogPubRec,NumTrades90Ever2DerogPubRec,PercentTradesNeverDelq,MSinceMostRecentDelq,MaxDelq2PublicRecLast12M,...,CfNetFractionInstallBurden,CfNumRevolvingTradesWBalance,CfNumInstallTradesWBalance,CfNumBank2NatlTradesWHighUtilization,CfPercentTradesWBalance,GoalScore,GoalValue,GoalName,method,model
0,63,309,36,112,17,2,1,68,7,2,...,190,5,1,2,67,0.530626,1,RiskPerformance,DiCE-random,MLP
1,63,309,36,112,17,2,1,68,7,2,...,-8,5,1,2,67,0.513075,1,RiskPerformance,DiCE-random,MLP
2,63,309,36,112,17,2,1,68,7,2,...,-8,5,1,2,67,0.561061,1,RiskPerformance,DiCE-random,MLP
3,63,309,36,112,17,2,1,68,7,2,...,-8,5,1,2,67,0.714999,1,RiskPerformance,DiCE-random,MLP
4,63,309,36,112,17,2,1,68,7,2,...,-8,5,1,2,67,0.646974,1,RiskPerformance,DiCE-random,MLP
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,63,309,36,112,17,2,1,68,7,2,...,-8,5,1,2,67,0.555043,1,RiskPerformance,DiCE-random,MLP
96,63,309,36,112,17,2,1,68,7,2,...,-8,5,1,2,67,0.578635,1,RiskPerformance,DiCE-random,MLP
97,63,309,36,112,17,2,1,68,7,2,...,-8,5,1,2,67,0.552087,1,RiskPerformance,DiCE-random,MLP
98,63,309,36,112,17,2,1,68,7,2,...,-8,-2,1,2,67,0.537666,1,RiskPerformance,DiCE-random,MLP


### Genetic

In [61]:
# Genetic-search explainer for the MLP model (rebinds `explainer`).
explainer = dice_ml.Dice(d, m_mlp, method="genetic")

In [62]:
# Ask the genetic explainer for N counterfactuals that move the query to class 1 under the MLP.
# NOTE(review): no seed is passed here — confirm whether this method is deterministic
# before comparing results across runs.
explanation = explainer.generate_counterfactuals(X, total_CFs=N, desired_class=1)

100%|██████████████████████████████████████████████████| 1/1 [00:33<00:00, 33.13s/it]


In [63]:
# Tabulate the genetic-search counterfactuals, scored with the MLP model.
# NOTE(review): in the saved output every row carries index label 0, so the index
# does not identify rows.
result_mlp_genetic = create_result_df(mlp_model, explanation, "DiCE-genetic", "MLP")
result_mlp_genetic

Unnamed: 0,ExternalRiskEstimate,MSinceOldestTradeOpen,MSinceMostRecentTradeOpen,AverageMInFile,NumSatisfactoryTrades,NumTrades60Ever2DerogPubRec,NumTrades90Ever2DerogPubRec,PercentTradesNeverDelq,MSinceMostRecentDelq,MaxDelq2PublicRecLast12M,...,CfNetFractionInstallBurden,CfNumRevolvingTradesWBalance,CfNumInstallTradesWBalance,CfNumBank2NatlTradesWHighUtilization,CfPercentTradesWBalance,GoalScore,GoalValue,GoalName,method,model
0,63,309,36,112,17,2,1,68,7,2,...,4.0,-8.0,4.0,1.0,2.0,0.761218,1,RiskPerformance,DiCE-genetic,MLP
0,63,309,36,112,17,2,1,68,7,2,...,9.0,-9.0,5.0,1.0,-9.0,0.962630,1,RiskPerformance,DiCE-genetic,MLP
0,63,309,36,112,17,2,1,68,7,2,...,4.0,-8.0,4.0,-9.0,-9.0,0.643243,1,RiskPerformance,DiCE-genetic,MLP
0,63,309,36,112,17,2,1,68,7,2,...,-9.0,-9.0,-9.0,2.0,2.0,0.857960,1,RiskPerformance,DiCE-genetic,MLP
0,63,309,36,112,17,2,1,68,7,2,...,28.0,-8.0,-9.0,1.0,0.0,0.708585,1,RiskPerformance,DiCE-genetic,MLP
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,63,309,36,112,17,2,1,68,7,2,...,9.0,-8.0,3.0,-8.0,-9.0,0.941911,1,RiskPerformance,DiCE-genetic,MLP
0,63,309,36,112,17,2,1,68,7,2,...,22.0,-9.0,2.0,-8.0,0.0,0.939157,1,RiskPerformance,DiCE-genetic,MLP
0,63,309,36,112,17,2,1,68,7,2,...,13.0,5.0,0.0,2.0,-9.0,0.959632,1,RiskPerformance,DiCE-genetic,MLP
0,63,309,36,112,17,2,1,68,7,2,...,-9.0,-9.0,4.0,2.0,-9.0,0.894425,1,RiskPerformance,DiCE-genetic,MLP


## Export results

In [64]:
# Stack the six per-method result tables (XGBoost first, then MLP) into one frame.
final_df = pd.concat(
    [
        result_kdtree,
        result_random,
        result_genetic,
        result_mlp_kdtree,
        result_mlp_random,
        result_mlp_genetic,
    ]
)
# Transposed view: one column per counterfactual row.
final_df.transpose()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,0.10
ExternalRiskEstimate,63,63,63,63,63,63,63,63,63,63,...,63,63,63,63,63,63,63,63,63,63
MSinceOldestTradeOpen,309,309,309,309,309,309,309,309,309,309,...,309,309,309,309,309,309,309,309,309,309
MSinceMostRecentTradeOpen,36,36,36,36,36,36,36,36,36,36,...,36,36,36,36,36,36,36,36,36,36
AverageMInFile,112,112,112,112,112,112,112,112,112,112,...,112,112,112,112,112,112,112,112,112,112
NumSatisfactoryTrades,17,17,17,17,17,17,17,17,17,17,...,17,17,17,17,17,17,17,17,17,17
NumTrades60Ever2DerogPubRec,2,2,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
NumTrades90Ever2DerogPubRec,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
PercentTradesNeverDelq,68,68,68,68,68,68,68,68,68,68,...,68,68,68,68,68,68,68,68,68,68
MSinceMostRecentDelq,7,7,7,7,7,7,7,7,7,7,...,7,7,7,7,7,7,7,7,7,7
MaxDelq2PublicRecLast12M,2,2,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2


In [65]:
# Persist the combined table; the DataFrame index is written as the first (unnamed) CSV column.
final_df.to_csv("../../results/cf-dice.csv")