In [None]:
# Any results you write to the current directory are saved as output.
import os
# Print the full path of every file found under the Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# Ignore all warnings
# NOTE(review): this installs a blanket "ignore" filter, so every warning
# category raised after this point is hidden, including deprecation notices
# from the libraries used below.
import warnings
warnings.filterwarnings("ignore")
   
# Importing Libraries
import numpy as np # linear algebra
# Seed NumPy's global random generator. No call below passes an explicit
# random_state, so this seed is presumably what keeps the split and the
# forests repeatable on a fresh run -- TODO confirm.
np.random.seed(1)
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from pandas_profiling import ProfileReport
import matplotlib.pyplot as plt
import seaborn as sn
import sklearn
import sklearn.datasets
from sklearn.model_selection import train_test_split 
import sklearn.ensemble
import lime
import lime.lime_tabular

In [None]:
# Load the raw liver-patient records from the Kaggle input folder.
data = pd.read_csv("../input/indian-liver-patient-records/indian_liver_patient.csv")

# Positional indices of the categorical columns (only column 1 here).
categorical_features = range(1, 2)

# For every categorical column, remember the class labels a LabelEncoder
# finds in it, keyed by the column's position.
categorical_names = {}
for feature in categorical_features:
    le = sklearn.preprocessing.LabelEncoder().fit(data.iloc[:, feature])
    categorical_names[feature] = le.classes_
def partition(x):
    """Return 0 when ``x`` equals 2, and 1 for every other value."""
    return 0 if x == 2 else 1
# Recode the target column: partition maps the value 2 to 0 and every other
# value to 1.
data['Dataset'] = data['Dataset'].map(partition)

# Pick the target by name rather than by a hard-coded position (10) so the
# cell does not silently pick the wrong column if the CSV layout changes.
labels = data['Dataset']
data = data.drop(columns='Dataset')

# Names of the remaining predictor columns, in frame order.
features = data.columns.values


In [None]:
 # Count the missing values in each column.
 data.isnull().sum()


In [None]:
# Show the rows whose Albumin_and_Globulin_Ratio is missing.
data[data['Albumin_and_Globulin_Ratio'].isna()]


In [None]:
# Impute the missing Albumin_and_Globulin_Ratio values with the column mean.
# The filled column is assigned back instead of calling fillna(inplace=True)
# on the selected column: the in-place form acts on an intermediate object
# and is not guaranteed to update `data` (chained assignment, which stops
# working under pandas Copy-on-Write).
# NOTE(review): the mean is taken over all rows, i.e. before the train/test
# split, so rows that end up in the test set contribute to the fill value.
meanval = data['Albumin_and_Globulin_Ratio'].mean()
data['Albumin_and_Globulin_Ratio'] = data['Albumin_and_Globulin_Ratio'].fillna(meanval)

In [None]:
# Re-count missing values per column after the imputation step.
data.isna().sum()


# No missing values in the data
# End of Preprocessing

In [None]:
# Hold out 20% of the rows for testing; the remaining 80% are used for training.
train, test, labels_train, labels_test = train_test_split(
    data,
    labels,
    train_size=0.80,
)


In [None]:
def encode_and_bind(original_dataframe, feature_to_encode):
    """One-hot encode one column and return a frame with that column replaced.

    Args:
        original_dataframe: Source DataFrame; it is not modified.
        feature_to_encode: Name of the column to expand into dummy columns.

    Returns:
        A new DataFrame holding every other original column followed by the
        dummy columns ``pd.get_dummies`` generates for ``feature_to_encode``.

    Raises:
        KeyError: If ``feature_to_encode`` is not a column of the frame.
    """
    dummies = pd.get_dummies(original_dataframe[[feature_to_encode]])
    res = pd.concat([original_dataframe, dummies], axis=1)
    # Drop the column that was actually encoded. 'Gender' used to be
    # hard-coded here, which made the function fail (or drop the wrong
    # column) for any other feature name.
    res = res.drop(feature_to_encode, axis=1)
    return res

In [None]:
# One-hot encode Gender in both partitions.
encodedtrain = encode_and_bind(train, 'Gender')
# The dummy columns are derived from whatever categories each frame happens
# to contain, so align the test frame to the training columns: same set,
# same order, and an all-zero column for a category absent from the test rows.
encodedtest = encode_and_bind(test, 'Gender').reindex(
    columns=encodedtrain.columns, fill_value=0
)


In [None]:
# Fit a random forest with default hyper-parameters on the encoded training data.
rf = sklearn.ensemble.RandomForestClassifier()
rf.fit(X=encodedtrain, y=labels_train)

In [None]:
# Predict the held-out rows and print the per-class classification report.
from sklearn.metrics import classification_report
test_preds = rf.predict(X=encodedtest)
test_accuracy = classification_report(y_true=labels_test, y_pred=test_preds)
print(test_accuracy)

# Feature Importance by Random Forest Classifier

In [None]:
from matplotlib import pyplot
# Importances come back in the column order of the frame the forest was
# fitted on (`encodedtrain`), where the Gender dummy columns sit at the end.
# Labelling a positional slice of that array with `features` (where Gender is
# the second entry) shifts every label after the first, so the values are
# attached to the encoded column names and aggregated by name instead.
importance = rf.feature_importances_
encoded_importance = pd.Series(importance, index=encodedtrain.columns)

# Fold the Gender_* dummy columns back into a single 'Gender' entry, then
# restore the original feature order.
final_importance = (
    encoded_importance
    .groupby(lambda col: 'Gender' if str(col).startswith('Gender_') else col)
    .sum()
    .reindex(features)
)

# Summarize feature importance, largest first.
fimp = final_importance.to_frame(name='Values')
finalimp = fimp.sort_values(by=['Values'], ascending=False)




In [None]:
# Bar chart of the sorted importances; assigning to `_` suppresses the Axes repr.
_ = finalimp.plot.bar(
    title='Random Forest Classifier Feature Importance',
    legend=False,
)

# End of Random Forest Classifier

# LOCO Begins

In [None]:
# Class probabilities of the full model on the test set.
preds = rf.predict_proba(encodedtest)
# Column index of the most probable class for each row ...
maxidx = preds.argmax(axis=1)
# ... and the probability assigned to that class.
fout = preds.max(axis=1)

In [None]:
# Leave-One-Covariate-Out: for each feature, refit a forest without it and
# record how much the probability of the originally predicted class changes.
rf1 = sklearn.ensemble.RandomForestClassifier()

# Size the result from the data instead of hard-coding (117, 10), so the cell
# still works when the split size or the feature set changes.
preds_final = np.zeros((len(test), len(features)))
row_idx = np.arange(len(test))

# calculate LOCO for each variable
for k, i in enumerate(features):
    train_loco = train.drop(i, axis=1)
    test_loco = test.drop(i, axis=1)
    # Gender still needs one-hot encoding unless it is the column left out.
    if i != 'Gender':
        train_loco = encode_and_bind(train_loco, 'Gender')
        test_loco = encode_and_bind(test_loco, 'Gender')

    rf1.fit(train_loco, labels_train)
    preds_loco = rf1.predict_proba(test_loco)
    # Probability the reduced model gives to the class the full model chose.
    preds_result = preds_loco[row_idx, maxidx]

    # subtract the LOCO prediction from the original prediction
    preds_final[:, k] = fout - preds_result
    # update progress
    print('LOCO Progress: column ' + str(i) + ' completed')
print('Done.')

In [None]:
# Wrap the LOCO differences in a frame with one column per feature.
q = pd.DataFrame(data=preds_final, columns=features)
display(q)


In [None]:
# Row position of the test instance to explain; valid values run from 0 to
# len(preds_final) - 1.
p = 50

# LOCO differences of that single row, as a one-row frame.
a = preds_final[p, :]
predspandas1 = pd.DataFrame(a[np.newaxis, :], columns=features)

# Keep the five features with the largest differences.
predspandas = predspandas1.T.sort_values(by=0, ascending=False).head(5)

# Plot them, then show the full row for reference.
_ = predspandas.plot.bar(
    title='Top Five Reason Codes for Liver Prediction\n',
    legend=False,
)

display(q.loc[[p]])

In [None]:
# Signed global effect: sum of the LOCO differences over all test rows.
# The one-row shape is derived from the array itself rather than from `a`,
# a variable defined in a different cell.
result = preds_final.sum(axis=0)
b = pd.DataFrame(result.reshape(1, -1), columns=features)
display(b)
# Sort features by signed sum, largest first (at most ten entries).
c = b.T.sort_values(by=0, ascending=False)[:10]

# Magnitude of the global effect: sum of absolute LOCO differences.
d = np.absolute(preds_final)
e = d.sum(axis=0)
g = pd.DataFrame(e.reshape(1, -1), columns=features)
display(g)
# Sort features by absolute sum, largest first (at most ten entries).
h = g.T.sort_values(by=0, ascending=False)[:10]

In [None]:
# Plot the signed global LOCO sums. The title no longer says "Top Five"
# because `c` holds up to ten features, and it names the statistic so this
# chart can be told apart from the absolute-sum chart.
_ = c.plot(kind='bar',
                    title='Global Reason Codes for Liver Prediction (signed LOCO sum)\n',
                    legend=False)

In [None]:
# Plot the absolute global LOCO sums. The title no longer says "Top Five"
# because `h` holds up to ten features, and it names the statistic so this
# chart can be told apart from the signed-sum chart.
_ = h.plot(kind='bar',
                    title='Global Reason Codes for Liver Prediction (absolute LOCO sum)\n',
                    legend=False)

# End of LOCO

# Partial Dependency Plots

In [None]:
# Feature list handed to the partial-dependence code. It must match the
# columns the forest was fitted on, in the same order. The hand-built list
# (append 'Gender_Male', then 'Gender_Female') put the two dummy columns in
# the reverse of their order in `encodedtrain`, so take the names straight
# from the training frame.
features1 = encodedtrain.columns.tolist()


In [None]:
from matplotlib import pyplot as plt
from pdpbox import pdp, get_dataset, info_plots
# Draw one partial-dependence plot per model feature.
for feature_to_plot in features1:
    pdp_dist = pdp.pdp_isolate(
        model=rf,
        dataset=encodedtest,
        model_features=features1,
        feature=feature_to_plot,
    )
    pdp.pdp_plot(pdp_isolate_out=pdp_dist, feature_name=feature_to_plot)
    plt.show()

# End of Partial Dependency Plots

#  Shapley Values Start

In [None]:
import shap 
# Build a SHAP tree explainer around the fitted random forest.
explainershap = shap.TreeExplainer(model=rf)


In [None]:
# Load the JavaScript needed to render SHAP plots in the notebook.
shap.initjs()

# Row position of the test instance to explain (kept as a one-row frame).
shapvariable = 50
choosen_instance = encodedtest.iloc[[shapvariable]]
shap_values = explainershap.shap_values(choosen_instance)

# Force plot built from index 1 of the expected values and SHAP values.
shap.force_plot(
    base_value=explainershap.expected_value[1],
    shap_values=shap_values[1],
    features=choosen_instance,
)

# Shapley Values End

In [None]:
def predict_fn(x):
    """Return the forest's class predictions for ``x`` after one-hot encoding Gender."""
    return rf.predict(encode_and_bind(x, 'Gender'))


In [None]:
# Positions of the test rows whose predicted class differs from the true label.
np.flatnonzero(labels_test != predict_fn(test))


# LIME Start

In [None]:
def fct(x):
    """Return 'Male' when ``x`` equals 1 and 'Female' for any other value."""
    return 'Male' if x == 1 else 'Female'

def inner(x):
    """Build a DataFrame from ``x`` with ``features`` as columns and map the
    Gender column through ``fct``."""
    return pd.DataFrame(x, columns=features).assign(
        Gender=lambda frame: frame['Gender'].map(fct)
    )

In [None]:
def predict_proba_fn(x):
    """Return the forest's class probabilities for ``x``: ``inner`` rebuilds
    the frame, then Gender is one-hot encoded before prediction."""
    return rf.predict_proba(encode_and_bind(inner(x), 'Gender'))



In [None]:
# Integer-encode the categorical columns for LIME.
# The encoder is fitted once, on the training data, and the same mapping is
# applied to both partitions. Fitting a second encoder on the test set could
# assign different codes to the same category whenever the two partitions do
# not contain exactly the same set of values.
trainnew = train.copy()
testnew = test.copy()
for feature in categorical_features:
    le = sklearn.preprocessing.LabelEncoder()
    le.fit(trainnew.iloc[:, feature])
    trainnew.iloc[:, feature] = le.transform(trainnew.iloc[:, feature])
    # A category in the test data that was never seen in training raises a
    # ValueError here instead of being silently mis-coded.
    testnew.iloc[:, feature] = le.transform(testnew.iloc[:, feature])

In [None]:
# Show the label-encoded training and test frames.
display(trainnew, testnew)

In [None]:
# Display names for the two target classes (index 0 and index 1).
class_names = ['no', 'yes']

# LIME explainer built on the label-encoded training values.
explainer = lime.lime_tabular.LimeTabularExplainer(
    trainnew.values,
    class_names=class_names,
    feature_names=features,
    categorical_features=categorical_features,
    categorical_names=categorical_names,
    kernel_width=3,
    verbose=False,
)

In [None]:
# Explain a single test instance (row position `idx`) with the LIME explainer.
idx = 50
exp = explainer.explain_instance(
    testnew.values[idx],
    predict_proba_fn,
    num_features=5,
    top_labels=1,
)

exp.show_in_notebook(show_table=True, show_all=False)

# End of LIME