In [22]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pickle as pkl
from sklearn.model_selection import train_test_split

import sklearn
from sklearn import preprocessing
from sklearn.metrics import (precision_score,f1_score,recall_score,accuracy_score,make_scorer)
 
from alibi.explainers import AnchorTabular

pandas.util.testing is deprecated. Use the functions in the public API at pandas.testing instead.


#### Anchors

Function to get TP/TN/FP/FN

In [28]:
#Function to retrieve potentially interesting instances. It retrieves from the specified cells, n instances
#with the highest/lowest pred proba (head/tail)
def get_instances(model,X,y,categ,conf,n):
    y_pred = pd.DataFrame(model.predict(X), index=X.index, columns=['pred'])
    y_pred_prob = pd.DataFrame(model.predict_proba(X), index=X.index, columns=['0_pred_prob','1_pred_prob'])

    df_outcomes = pd.concat([X,y,y_pred,y_pred_prob],axis=1)
    
    if categ == "TP":
        df_TP = df_outcomes.loc[(df_outcomes['is_canceled'] == 1) & 
                                (df_outcomes['pred'] == 1)].sort_values(by='1_pred_prob',ascending=False)
        if conf == "High":
            return df_TP.head(n)
        else:
            return df_TP.tail(n)
    elif categ == "FP":
        df_FP = df_outcomes.loc[(df_outcomes['is_canceled'] == 0) & 
                                (df_outcomes['pred'] == 1)].sort_values(by='1_pred_prob',ascending=False)
        if conf == "High":
            return df_FP.head(n)
        else:
            return df_FP.tail(n)
    elif categ == "TN":
        df_TN = df_outcomes.loc[(df_outcomes['is_canceled'] == 0) & 
                                (df_outcomes['pred'] == 0)].sort_values(by='1_pred_prob',ascending=False)
        if conf == "High":
            return df_TN.head(n)
        else:
            return df_TN.tail(n)
    else:
        df_FN = df_outcomes.loc[(df_outcomes['is_canceled'] == 1) & 
                                (df_outcomes['pred'] == 0)].sort_values(by='1_pred_prob',ascending=False)
        if conf == "High":
            return df_FN.head(n)
        else:
            return df_FN.tail(n)

Dealing with categorical variables directly from original dataset

In [32]:
df = pd.read_csv("datasets/dataset_nocatenc.csv") 

df["hotel"] = df["hotel"].astype(dtype="category")
df["is_canceled"] = df["is_canceled"].astype(dtype="category")
df["meal"] = df["meal"].astype(dtype="category")
df["country"] = df["country"].astype(dtype="category")
df["market_segment"] = df["market_segment"].astype(dtype="category")
df["reserved_room_type"] = df["reserved_room_type"].astype(dtype="category")
df["assigned_room_type"] = df["assigned_room_type"].astype(dtype="category")
df["deposit_type"] = df["deposit_type"].astype(dtype="category")
df["customer_type"] = df["customer_type"].astype(dtype="category")

In [33]:
labels = df[["is_canceled"]]
features = df.drop(["is_canceled"],axis=1)

In [34]:
categorical_features = [0,9,10,11,14,15,17,19]

categorical_names = {}
for feature in categorical_features:
    le = sklearn.preprocessing.LabelEncoder()
    le.fit(features.iloc[:, feature])
    features.iloc[:, feature] = le.transform(features.iloc[:, feature])
    categorical_names[feature] = le.classes_

In [35]:
X_train_labEnc, X_test_labEnc, y_train_labEnc, y_test_labEnc = train_test_split(features, labels, test_size = 0.2, random_state = 0)
print(X_train_labEnc.shape,X_test_labEnc.shape, y_train_labEnc.shape, y_test_labEnc.shape)

(94980, 23) (23746, 23) (94980, 1) (23746, 1)


In [36]:
df_norefcat = pd.read_csv("datasets/dataset_norefcats.csv")
df_norefcat = df_norefcat.drop(["hotel_City Hotel"],axis=1)

labels_cde = df_norefcat.iloc[:,0]
features_cde = df_norefcat.iloc[:,1:]

X_train_cde, X_test_cde, y_train_cde, y_test_cde = train_test_split(features_cde, labels_cde, test_size = 0.2, random_state = 0)
print(X_train_cde.shape,X_test_cde.shape)

(94980, 89) (23746, 89)


In [37]:
cat_indices_in_X_test_labEnc = [9,10,11,14,15,17,19]

# list of length 23, for each feature in X_test_labEnc the column index of the corresponding feature in X_test_cde
# in case of categorical features, the index of the first encoding column 
feature_starting_index_in_X_test_cde = [15,0,1,2,3,4,5,6,7,16,21,52,8,9,60,70,10,82,11,85,12,13,14]

In [38]:
def predict_self(data):
    #data is a n time p matrix, where p is the number of features of X_test_labEnc (==23)
    data_cde = []
    for vec in data:
        
        vec_cde = np.zeros(X_test_cde.shape[1]) # 89 columns
        
        for idx, v in enumerate(vec):
            if not idx in cat_indices_in_X_test_labEnc:
                #print("num ",idx," ",v)
                # v is a value of a numeric feature
                vec_cde[feature_starting_index_in_X_test_cde[idx]] = v
                
            else:
                #print("cat ",idx," ",v)
                # v is a label encoding of a categorical feature
                vec_cde[feature_starting_index_in_X_test_cde[idx] + int(v)] = 1
        
        data_cde.append(vec_cde)
    
    data_cde = np.matrix(data_cde)
    # NOT NECESSARY FOR PREDICTION, BUT FOR TESTING
    data_cde = pd.DataFrame(data_cde)
    data_cde.columns = X_test_cde.columns
    
    return(grad_boost_cde.predict_proba(data_cde))

In [78]:
import lime
import lime.lime_tabular

grad_boost_cde = pkl.load(open('models/gb_lime.pkl', 'rb'))

explainer = AnchorTabular(predict_self, 
                          feature_names = np.array(X_train_labEnc.columns),
                          categorical_names=categorical_names)

Trying to unpickle estimator DummyClassifier from version 0.22.1 when using version 0.23.2. This might lead to breaking code or invalid results. Use at your own risk.
Trying to unpickle estimator DecisionTreeRegressor from version 0.22.1 when using version 0.23.2. This might lead to breaking code or invalid results. Use at your own risk.
Trying to unpickle estimator GradientBoostingClassifier from version 0.22.1 when using version 0.23.2. This might lead to breaking code or invalid results. Use at your own risk.


In [79]:
explainer.fit(X_train_labEnc.to_numpy(), disc_perc=(25, 50, 75))

AnchorTabular(meta={'name': 'AnchorTabular', 'type': ['blackbox'], 'explanations': ['local'], 'params': {'seed': None, 'disc_perc': (25, 50, 75)}})

In [80]:
x = get_instances(grad_boost_cde,
                  X_test_cde,
                  y_test_cde, 
                  'TP',
                  'High',
                  1)

instance = X_test_labEnc.loc[x.index[0],:]
print('Prediction: ', class_names[explainer.predictor(instance.to_numpy().reshape(-1,1))[0]])

Prediction:  0


In [81]:
explanation = explainer.explain(instance.to_numpy(), threshold=0.95)
print('Anchor: %s' % (' AND '.join(explanation.anchor)))
print('Precision: %.2f' % explanation.precision)
print('Coverage: %.2f' % explanation.coverage)

Anchor: previous_cancellations > 0.00 AND deposit_type = Non Refund
Precision: 1.00
Coverage: 0.05


In [60]:
x = get_instances(grad_boost_cde,
                  X_test_cde,
                  y_test_cde, 
                  'TN',
                  'High',
                  1)

instance = X_test_labEnc.loc[x.index[0],:]
print('Prediction: ', class_names[explainer.predictor(instance.to_numpy().reshape(-1,1))[0]])

Prediction:  0


In [61]:
explanation = explainer.explain(instance.to_numpy(), threshold=0.95)
print('Anchor: %s' % (' AND '.join(explanation.anchor)))
print('Precision: %.2f' % explanation.precision)
print('Coverage: %.2f' % explanation.coverage)

Anchor: country = DEU AND total_of_special_requests > 1.00 AND reserved_room_type = D
Precision: 0.98
Coverage: 0.06


In [62]:
x = get_instances(grad_boost_cde,
                  X_test_cde,
                  y_test_cde, 
                  'FP',
                  'High',
                  1)

instance = X_test_labEnc.loc[x.index[0],:]
print('Prediction: ', class_names[explainer.predictor(instance.to_numpy().reshape(-1,1))[0]])

Prediction:  0


In [63]:
explanation = explainer.explain(instance.to_numpy(), threshold=0.95)
print('Anchor: %s' % (' AND '.join(explanation.anchor)))
print('Precision: %.2f' % explanation.precision)
print('Coverage: %.2f' % explanation.coverage)

Could not find an result satisfying the 0.95 precision constraint. Now returning the best non-eligible result.


Anchor: lead_time > 18.00 AND customer_type = Transient AND booking_changes <= 0.00 AND required_car_parking_spaces <= 0.00 AND adr > 70.00 AND stays_in_week_nights > 3.00 AND adults <= 2.00 AND 0.00 < stays_in_weekend_nights <= 2.00 AND previous_bookings_not_canceled <= 0.00 AND children <= 0.00 AND babies <= 0.00 AND previous_cancellations <= 0.00 AND days_in_waiting_list <= 0.00 AND market_segment = Online TA AND hotel = Resort Hotel AND arrival_date_day_of_month > 23.00 AND arrival_date_week_number > 38.00
Precision: 0.73
Coverage: 0.33


In [64]:
x = get_instances(grad_boost_cde,
                  X_test_cde,
                  y_test_cde, 
                  'FN',
                  'High',
                  1)

instance = X_test_labEnc.loc[x.index[0],:]
print('Prediction: ', class_names[explainer.predictor(instance.to_numpy().reshape(-1,1))[0]])

Prediction:  0


In [65]:
explanation = explainer.explain(instance.to_numpy(), threshold=0.95)
print('Anchor: %s' % (' AND '.join(explanation.anchor)))
print('Precision: %.2f' % explanation.precision)
print('Coverage: %.2f' % explanation.coverage)

Anchor: country = GBR AND lead_time <= 69.00 AND hotel = Resort Hotel AND stays_in_week_nights <= 2.00 AND reserved_room_type = A
Precision: 0.95
Coverage: 0.33


In [66]:
x = get_instances(grad_boost_cde,
                  X_test_cde,
                  y_test_cde, 
                  'TP',
                  'Low',
                  1)

instance = X_test_labEnc.loc[x.index[0],:]
print('Prediction: ', class_names[explainer.predictor(instance.to_numpy().reshape(-1,1))[0]])

Prediction:  0


In [67]:
explanation = explainer.explain(instance.to_numpy(), threshold=0.95)
print('Anchor: %s' % (' AND '.join(explanation.anchor)))
print('Precision: %.2f' % explanation.precision)
print('Coverage: %.2f' % explanation.coverage)

Anchor: country = PRT AND total_of_special_requests <= 0.00 AND assigned_room_type = A AND 1.00 < stays_in_week_nights <= 3.00 AND booking_changes <= 0.00 AND customer_type = Transient AND reserved_room_type = A AND hotel = City Hotel AND meal = BB AND stays_in_weekend_nights <= 1.00 AND required_car_parking_spaces <= 0.00 AND arrival_date_week_number <= 38.00 AND previous_bookings_not_canceled <= 0.00 AND babies <= 0.00 AND children <= 0.00 AND lead_time <= 161.00 AND adr > 95.00 AND market_segment = Aviation
Precision: 0.97
Coverage: 0.41


In [68]:
x = get_instances(grad_boost_cde,
                  X_test_cde,
                  y_test_cde, 
                  'TN',
                  'Low',
                  1)

instance = X_test_labEnc.loc[x.index[0],:]
print('Prediction: ', class_names[explainer.predictor(instance.to_numpy().reshape(-1,1))[0]])

Prediction:  0


In [69]:
explanation = explainer.explain(instance.to_numpy(), threshold=0.95)
print('Anchor: %s' % (' AND '.join(explanation.anchor)))
print('Precision: %.2f' % explanation.precision)
print('Coverage: %.2f' % explanation.coverage)

Anchor: market_segment = Offline TA/TO AND booking_changes > 0.00 AND deposit_type = No Deposit AND country = ITA
Precision: 0.98
Coverage: 0.20


In [70]:
x = get_instances(grad_boost_cde,
                  X_test_cde,
                  y_test_cde, 
                  'FP',
                  'Low',
                  1)

instance = X_test_labEnc.loc[x.index[0],:]
print('Prediction: ', class_names[explainer.predictor(instance.to_numpy().reshape(-1,1))[0]])

Prediction:  0


In [71]:
explanation = explainer.explain(instance.to_numpy(), threshold=0.95)
print('Anchor: %s' % (' AND '.join(explanation.anchor)))
print('Precision: %.2f' % explanation.precision)
print('Coverage: %.2f' % explanation.coverage)

Anchor: lead_time > 161.00 AND assigned_room_type = A AND customer_type = Transient AND required_car_parking_spaces <= 0.00 AND total_of_special_requests <= 1.00 AND stays_in_week_nights <= 3.00 AND reserved_room_type = A AND previous_bookings_not_canceled <= 0.00 AND 16.00 < arrival_date_day_of_month <= 23.00 AND adults <= 2.00 AND stays_in_weekend_nights <= 1.00 AND hotel = City Hotel AND adr > 126.00 AND market_segment = Online TA
Precision: 0.98
Coverage: 0.25


In [72]:
x = get_instances(grad_boost_cde,
                  X_test_cde,
                  y_test_cde, 
                  'FN',
                  'Low',
                  1)

instance = X_test_labEnc.loc[x.index[0],:]
print('Prediction: ', class_names[explainer.predictor(instance.to_numpy().reshape(-1,1))[0]])

Prediction:  0


In [73]:
explanation = explainer.explain(instance.to_numpy(), threshold=0.95)
print('Anchor: %s' % (' AND '.join(explanation.anchor)))
print('Precision: %.2f' % explanation.precision)
print('Coverage: %.2f' % explanation.coverage)

Anchor: lead_time <= 18.00 AND total_of_special_requests > 0.00
Precision: 0.96
Coverage: 0.25
