In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Sklearn imports
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier

# DiCE imports
import dice_ml

# Original notebook from: https://medium.com/@bijil.subhash/explainable-ai-diverse-counterfactual-explanations-dice-315f058c0364

In [3]:
from sklearn import preprocessing

In [4]:
from sklearn.linear_model import LogisticRegression

In [5]:
# Load the Titanic training data and turn '?' placeholders into real missing
# values. np.nan is used because the np.NaN alias was removed in NumPy 2.0.
df = pd.read_csv('train.csv').replace('?', np.nan)

# Drop the columns that are not useful for classification first, and only then
# drop rows that still contain a missing value, so that gaps in the discarded
# columns do not cost us rows. A single dropna is sufficient; no in-place
# mutation is used, which keeps the cell safe to re-run.
df = (
    df
    .drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)
    .dropna(axis=0)
)

In [6]:
# Integer-encode the categorical columns with one LabelEncoder that is re-fitted
# per column. Codes per the original notes:
#   Sex      -> {'female': 0, 'male': 1}
#   Embarked -> {'C': 0, 'Q': 1, 'S': 2}
# NOTE: after the loop `le` only remembers the classes of the last column.
le = preprocessing.LabelEncoder()
for column in ('Sex', 'Embarked'):
    df[column] = le.fit_transform(df[column])

In [7]:
# Show which columns remain in df at this point
df.columns

Index(['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare',
       'Embarked'],
      dtype='object')

In [8]:
# Target vector: the 'Survived' column as a plain NumPy array
y = df["Survived"].to_numpy()

In [9]:
# Feature matrix: the predictor columns, selected explicitly by label
X = df.loc[:, ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']]

In [10]:
# Stratified 80/20 train/test split. The full frame (target column included) is
# split rather than a features-only frame, so each part still carries 'Survived'.
train_dataset, test_dataset, y_train, y_test = train_test_split(
    df,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)


In [11]:
# Feature-only views of both splits: strip the 'Survived' target column
X_train, X_test = (
    split.drop(columns='Survived')
    for split in (train_dataset, test_dataset)
)

In [12]:

# Model definition: logistic regression with max_iter set to 500.
# This only constructs the estimator; it is not fitted here.
model = LogisticRegression(max_iter=500)

In [13]:
# Sanity check: (rows, columns) of the training feature matrix
X_train.shape

(569, 7)

In [14]:

# Fit the classifier on the training features and labels
model.fit(X_train, y_train)

In [15]:
# DiCE data wrapper: built from the training frame, which still contains the
# 'Survived' outcome column; only Age and Fare are declared continuous.
d = dice_ml.Data(
    dataframe=train_dataset,
    continuous_features=['Age', 'Fare'],
    outcome_name='Survived',
)

# DiCE model wrapper around the fitted scikit-learn estimator
m = dice_ml.Model(model=model, backend="sklearn")

# Counterfactual explainer configured with the "random" generation method
exp = dice_ml.Dice(d, m, method="random")

In [16]:
# Counterfactuals for the first test row: request 5 alternatives that flip the
# model's predicted class. The explainer uses the random method, so a fixed
# random_seed is passed to make the result reproducible on Restart & Run All.
e = exp.generate_counterfactuals(
    X_test.iloc[0:1],  # explicit positional slice -> one-row DataFrame
    total_CFs=5,
    desired_class="opposite",
    random_seed=42,
)
# Display the query row and the counterfactuals, showing only changed values
e.visualize_as_dataframe(show_only_changes=True)

  0%|          | 0/1 [00:00<?, ?it/s]

100%|██████████| 1/1 [00:00<00:00,  8.10it/s]

Query instance (original outcome : 1)





Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Survived
0,2,0,32.0,0,0,13.0,2,1



Diverse Counterfactual set (new outcome: 0)


Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Survived
0,-,-,76.5,-,-,-,-,0.0
1,-,-,-,5.0,3.0,-,-,0.0
2,-,-,78.9,-,-,-,-,0.0
3,-,-,70.0,-,-,-,-,0.0
4,3.0,-,50.3,-,-,-,-,0.0


In [17]:
# Same query row as before, but only 'Age' may be changed by the explainer.
# The explainer uses the random method, so a fixed random_seed is passed to
# make the result reproducible on Restart & Run All.
e = exp.generate_counterfactuals(
    X_test.iloc[0:1],  # explicit positional slice -> one-row DataFrame
    total_CFs=5,
    desired_class="opposite",
    features_to_vary=['Age'],
    random_seed=42,
)
# Display the query row and the counterfactuals, showing only changed values
e.visualize_as_dataframe(show_only_changes=True)

100%|██████████| 1/1 [00:00<00:00, 18.45it/s]

Query instance (original outcome : 1)





Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Survived
0,2,0,32.0,0,0,13.0,2,1



Diverse Counterfactual set (new outcome: 0)


Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Survived
0,-,-,71.9,-,-,-,-,0.0
1,-,-,67.7,-,-,-,-,0.0
2,-,-,75.1,-,-,-,-,0.0
3,-,-,77.9,-,-,-,-,0.0
4,-,-,79.1,-,-,-,-,0.0


In [18]:
# Counterfactuals for the test row at position 10, with all features free to
# vary. The explainer uses the random method, so a fixed random_seed is passed
# to make the result reproducible on Restart & Run All.
e = exp.generate_counterfactuals(
    X_test.iloc[10:11],  # explicit positional slice -> one-row DataFrame
    total_CFs=5,
    desired_class="opposite",
    random_seed=42,
)
# Display the query row and the counterfactuals, showing only changed values
e.visualize_as_dataframe(show_only_changes=True)

100%|██████████| 1/1 [00:00<00:00, 14.50it/s]

Query instance (original outcome : 1)





Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Survived
0,3,0,21.0,0,0,7.75,1,1



Diverse Counterfactual set (new outcome: 0)


Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Survived
0,-,-,69.8,-,-,237.0,-,0.0
1,-,-,-,3.0,-,174.2,-,0.0
2,-,1.0,-,-,-,-,-,0.0
3,-,-,55.9,-,-,-,-,0.0
4,-,1.0,64.3,-,-,-,-,0.0
