## ü©∫ üë©üèº‚Äç‚öïÔ∏è üë®üèº‚Äç‚öïÔ∏è Glass-box classifiers for predicting no-show appointments üë©üèº‚Äçüíº üë®üèº‚Äçüíº

In [5]:
from collections import Counter

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from interpret.glassbox import ExplainableBoostingClassifier, LogisticRegression, ClassificationTree, DecisionListClassifier

from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

from sklearn.inspection import PartialDependenceDisplay

from imblearn.under_sampling import InstanceHardnessThreshold

from interpret.blackbox import LimeTabular
from interpret import show
import shap

from yellowbrick.model_selection import FeatureImportances

import wandb

In [2]:
# Read the cleaned appointment data from disk and print its column summary.
DATA_PATH = '../data/cleaned_data_final.csv'
df = pd.read_csv(DATA_PATH)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110521 entries, 0 to 110520
Data columns (total 16 columns):
 #   Column                Non-Null Count   Dtype
---  ------                --------------   -----
 0   Gender                110521 non-null  int64
 1   Scholarship           110521 non-null  int64
 2   Hypertension          110521 non-null  int64
 3   Diabetes              110521 non-null  int64
 4   Alcoholism            110521 non-null  int64
 5   Handicap              110521 non-null  int64
 6   SMS_received          110521 non-null  int64
 7   Show                  110521 non-null  int64
 8   ScheduledHour         110521 non-null  int64
 9   ScheduledMonth        110521 non-null  int64
 10  AppointmentMonth      110521 non-null  int64
 11  ScheduledDayOfWeek    110521 non-null  int64
 12  AppointmentDayOfWeek  110521 non-null  int64
 13  AgeGroupInt           110521 non-null  int64
 14  NeighbourhoodInt      110521 non-null  int64
 15  AwaitingTimeGroup     110521 non-n

In [3]:
# 'Show' is the label column; every remaining column is used as a feature.
y = df['Show']
X = df.drop(columns=['Show'])

##### ⚖️ IHT (Instance Hardness Threshold) for imbalanced data

This resampling approach is inspired by the following paper: Batool, Tasneem, et al. "Predicting hospital no-shows using machine learning." 2020 IEEE International Conference on Internet of Things and Intelligence System (IoTaIS). IEEE, 2021.

In [4]:
# Resample the full feature/label set with imbalanced-learn's
# InstanceHardnessThreshold under-sampler (random_state fixed at 42).
iht = InstanceHardnessThreshold(random_state=42)
X_res, y_res = iht.fit_resample(X, y)

In [7]:
# Hold out 30% of the resampled rows for evaluation (random_state fixed at 42).
# NOTE(review): the split is taken from X_res / y_res, i.e. after resampling,
# so the test rows have been through the sampler as well. Confirm that test
# scores are meant to describe resampled data rather than the raw distribution.
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.3, random_state=42)

### EBM (Explainable Boosting Machine) classifier


In [8]:
# Create an ExplainableBoostingClassifier with its default settings and fit it
# on the training split. The fit call is left as the cell's last expression.
ebm = ExplainableBoostingClassifier()
ebm.fit(X_train,y_train)

In [11]:
def get_metrics(model, X_train, y_train, X_test, y_test):
    """Compute accuracy and F1 for a fitted classifier on both data splits.

    Args:
        model: Fitted estimator exposing ``predict(X)``.
        X_train: Features of the training split.
        y_train: True labels of the training split.
        X_test: Features of the test split.
        y_test: True labels of the test split.

    Returns:
        A tuple ``(train_acc, train_f1, test_acc, test_f1)`` as produced by
        ``accuracy_score`` and ``f1_score`` with their default arguments.
    """
    # Predict once per split and reuse the result for both metrics, instead
    # of running inference separately for accuracy and for F1.
    train_pred = model.predict(X_train)
    test_pred = model.predict(X_test)

    train_acc = accuracy_score(y_train, train_pred)
    train_f1 = f1_score(y_train, train_pred)
    test_acc = accuracy_score(y_test, test_pred)
    test_f1 = f1_score(y_test, test_pred)
    return train_acc, train_f1, test_acc, test_f1

In [13]:
# Score the fitted EBM on both splits and print each metric as a percentage,
# one metric per line.
ebm_train_acc, ebm_train_f1, ebm_test_acc, ebm_test_f1 = get_metrics(
    ebm, X_train, y_train, X_test, y_test
)
print(
    f'Explainable Boosting Classifier Train Accuracy: {ebm_train_acc*100:.2f}%',
    f'Explainable Boosting Classifier Train F1 Score: {ebm_train_f1*100:.2f}%',
    f'Explainable Boosting Classifier Test Accuracy: {ebm_test_acc*100:.2f}%',
    f'Explainable Boosting Classifier Test F1 Score: {ebm_test_f1*100:.2f}%',
    sep='\n',
)

Explainable Boosting Classifier Train Accuracy: 93.34%
Explainable Boosting Classifier Train F1 Score: 93.55%
Explainable Boosting Classifier Test Accuracy: 93.33%
Explainable Boosting Classifier Test F1 Score: 93.47%


In [14]:
# Ask the fitted EBM for its global explanation and render it with
# interpret's show().
ebm_global = ebm.explain_global()
show(ebm_global)

In [15]:
# Ask the fitted EBM for local explanations of the test split (features plus
# true labels) and render them with interpret's show().
# NOTE(review): this passes the whole of X_test / y_test, whereas the LIME
# cell further down explains only the first five rows -- confirm that
# explaining the full split here is intended.
ebm_local = ebm.explain_local(X_test,y_test)
show(ebm_local)

In [18]:
# LIME explainer wrapped around the fitted EBM, with the training split
# supplied as its data (random_state fixed at 1).
lime = LimeTabular(model=ebm, data=X_train, random_state=1)

# Explain only the first five test rows; one shared slice keeps the feature
# and label selections in step.
first_five = slice(5)
lime_local = lime.explain_local(X_test[first_five], y_test[first_five], name='LIME')
show(lime_local)