<a href="https://colab.research.google.com/github/souravkrpathak/Attrition/blob/main/XAI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
!pip install -U scikit-learn



In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
import pickle
import warnings
warnings.filterwarnings('ignore')

In [8]:
!pip install lime
import lime
import lime.lime_tabular



In [9]:
!pip install shap
import shap



In [32]:
from sklearn.inspection import partial_dependence

In [10]:
file_path = input("Input file path: ")
input_df = pd.read_csv(file_path)
model_path = input("Input model file path: ")
model = pickle.load(open(model_path, 'rb'))
target = input("Enter target variable: ")

Input file path: /content/diabetes.csv
Input model file path: /content/diabetes_rf_classifier.pkl
Enter target variable: Outcome


In [11]:
#Function to create and export decile dataframe in json format

def create_decile(input_df, model, target):
  X = input_df.drop(target, 1)
  y = input_df[target]
  y_pred_prob = model.predict_proba( X )[:,1]
  y_pred = model.predict( X )
  df = pd.DataFrame({'Y': y_pred, 'Probability': y_pred_prob})
  X_dec = X.reset_index(drop=True)
  df2 = df.join(X_dec)
  df2['Decile'] = pd.cut(df2['Probability'], 10, labels=False)
  df2 = df2.sort_values(by=['Probability'], ascending = False)
  df2.reset_index(drop=True, inplace=True)
  df3 = pd.DataFrame({'Y': np.nan, 	'Probability': np.nan, 	'Glucose': np.nan,	'BloodPressure': np.nan,	'SkinThickness': np.nan,	'Insulin': np.nan, 'BMI': np.nan,'DiabetesPedigreeFunction': np.nan, 'Age': np.nan, 'Pregnancies': np.nan,'Decile': np.nan}, index=[0])
  for i in range(df2.shape[0]):
    if df2['Decile'][i] not in list(df3['Decile']):
      df3 = df3.append(df2.iloc[i], ignore_index=True)
  df3 = df3.dropna()
  df3.to_json('decile.json')
  return df3

In [12]:
create_decile(input_df, model, target)

Unnamed: 0,Y,Probability,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Pregnancies,Decile
1,1.0,0.59,127.0,46.0,21.0,335.0,34.4,0.176,22.0,2.0,9.0
2,1.0,0.55,119.0,44.0,47.0,63.0,35.5,0.28,25.0,1.0,8.0
3,1.0,0.51,103.0,60.0,33.0,192.0,24.0,0.966,33.0,4.0,7.0
4,0.0,0.471538,89.0,76.0,34.0,37.0,31.2,0.192,23.0,1.0,6.0
5,0.0,0.43,91.0,82.0,0.0,0.0,35.6,0.587,68.0,8.0,5.0
6,0.0,0.39,83.0,68.0,0.0,0.0,18.2,0.624,27.0,1.0,4.0
7,0.0,0.35,57.0,60.0,0.0,0.0,21.7,0.735,67.0,0.0,3.0
8,0.0,0.31,132.0,0.0,0.0,0.0,32.9,0.302,23.0,4.0,2.0
9,0.0,0.27,94.0,0.0,0.0,0.0,0.0,0.256,25.0,0.0,1.0
10,0.0,0.22,84.0,0.0,0.0,0.0,0.0,0.304,21.0,2.0,0.0


In [13]:
#Function to explain prediction using lime

def explain_lime(input_df, model, target):
  X = input_df.drop(target, 1)
  y = input_df[target]
  y_pred_prob = model.predict_proba( X )[:,1]
  y_pred = model.predict( X )
  X_features = list( X.columns )
  decile_df = create_decile(input_df, model, target)
  del decile_df['Y']
  del decile_df['Probability']
  del decile_df['Decile']
  explainer = (lime.lime_tabular.LimeTabularExplainer(training_data = X.to_numpy(), training_labels = y, feature_names = X_features, kernel_width=3,
                                                      verbose = True ))
  for i in [0,9]:
    exp = explainer.explain_instance(decile_df.iloc[i].to_numpy(), model.predict_proba )
    exp_list = exp.as_list()
    exp_keys = [exp_list[j][0] for j in range (len(exp_list))]
    exp_values = [exp_list[j][1] for j in range (len(exp_list))]
    exp_dict = dict(zip(exp_keys, exp_values))
    with open("lime_" + str(i) + ".json", "w") as outfile:
      json.dump(exp_dict, outfile)

In [14]:
explain_lime(input_df, model, target)

Intercept 0.483477814620771
Prediction_local [0.44521007]
Right: 0.2033544759334233
Intercept 0.5204000313722312
Prediction_local [0.39784312]
Right: 0.01615079365079365


In [15]:
#Function to explain prediction using Shap

def explain_shap(input_df, model, target):
  X = input_df.drop(target, 1)
  y = input_df[target]
  y_pred_prob = model.predict_proba( X )[:,1]
  y_pred = model.predict( X )
  X_features = list( X.columns )
  decile_df = create_decile(input_df, model, target)
  del decile_df['Y']
  del decile_df['Probability']
  del decile_df['Decile']
  
  for i in [0,9]:
    data_for_prediction = X.iloc[i]
    data_for_prediction_array = data_for_prediction.values.reshape(1, -1)
    model.predict_proba(data_for_prediction_array)
    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(data_for_prediction_array)
    exp_values = list(shap_values[1][0])
    exp_keys = X_features
    exp_dict = dict(zip(exp_keys, exp_values))
    with open("shap_" + str(i) + ".json", "w") as outfile:
      json.dump(exp_dict, outfile)

In [16]:
explain_shap(input_df, model, target)

In [55]:
#Create PDP json

def pdp(input_df, model):
  feature = [(input('Enter variable name: '))]
  X = input_df.drop(target, 1)
  X_features = list( X.columns )
  pdp, axes = partial_dependence(model, X, features = feature)
  pdp_df = pd.DataFrame({'Axes': list(axes[0]), 'PDP': list(pdp[0])})
  pdp_df.to_json('pdp_' + feature[0] + '.json')
  return pdp_df.head()

In [56]:
pdp(input_df, model)

Enter variable name: Insulin


Unnamed: 0,Axes,PDP
0,0.0,0.444486
1,2.97303,0.444486
2,5.946061,0.444486
3,8.919091,0.444486
4,11.892121,0.441401
