In [None]:
# Width of the notebook container, as a percentage of the browser window.
display_width = 100

from IPython.display import display, HTML

# Inject a CSS override so wide tables and plots can use the chosen width.
container_css = f"<style>.container {{ width:{display_width}% !important; }}</style>"
display(HTML(container_css))

In [None]:
%reload_ext autoreload
%autoreload 2

In [None]:
import sys

# NOTE(review): machine-specific absolute path. Prefer installing the
# dependencies into the kernel's own environment so the notebook is portable.
USER_SITE_PACKAGES = "/Users/sudhanshugupta/Library/Python/3.9/lib/python/site-packages"

# Guard the append so that re-running this cell does not keep adding the
# same entry to sys.path.
if USER_SITE_PACKAGES not in sys.path:
    sys.path.append(USER_SITE_PACKAGES)

In [None]:
import pandas as pd
import numpy as np

import os

import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
from EDA import EDA_medical_appointment
from preprocessing import preprocMedicalAppointment
from feature_extraction import featuresMedicalAppointment, feature_selection_permutation_importance, feature_selection_hierarchical_clustering
from classification import LogisticRegressionClf, RandomForestClf, MLPClf, XGBClf
from utils import train_val_test_split, compare_performances_across_classifiers

# Read Data

In [None]:
# Load the raw appointments table and give the target column a valid identifier name.
df_medical_appointment = pd.read_csv("./data/medical_appointment_train.csv")
df_medical_appointment = df_medical_appointment.rename(columns={"No-show": "no_show"})

# Encode the target as 0/1 and assign it back to the frame. Calling
# `replace(..., inplace=True)` on `df['no_show']` is chained in-place
# assignment: it is deprecated and leaves the frame unchanged under
# pandas copy-on-write.
no_show_encoded = df_medical_appointment["no_show"].map({"No": 0, "Yes": 1})
if no_show_encoded.isna().any():
    # Fail fast instead of silently carrying unencoded labels into training.
    raise ValueError("Unexpected value in 'no_show' column; expected only 'No' or 'Yes'.")
df_medical_appointment["no_show"] = no_show_encoded.astype(int)

# Raw input columns; everything except the target.
feature_cols = [
    'PatientID', 'AppointmentID', 'Gender', 'ScheduledDay',
    'AppointmentDay', 'Age', 'Neighbourhood', 'Scholarship', 'Hypertension',
    'Diabetes', 'Alcoholism', 'Handicap', 'SMS_received',
]
df_medical_appointment.sample(10)

# Exploratory Data Analysis

In [None]:
# Build the project-local EDA helper (imported from EDA.py) from the raw
# feature columns and the 0/1 `no_show` target.
eda = EDA_medical_appointment(df_medical_appointment[feature_cols], df_medical_appointment['no_show'])

In [None]:
# General summary statistics from the EDA helper.
# NOTE(review): exact output is defined in EDA.py — presumably per-column descriptive stats.
eda.describe_general_stats()

Some records have a negative age (minimum Age < 0). These are invalid entries and should be removed from the dataset so that they do not degrade overall model performance.

In [None]:
# Per-patient statistics from the EDA helper.
# NOTE(review): presumably visit counts per PatientID — confirm in EDA.py.
eda.stats_per_unique_patient()

Since 83% of the patients have fewer than 3 visits, framing the task primarily as a sequence-classification problem is not desirable. A second-level sequence classifier could be added for repeat patients; however, this is not explored in this analysis.

**Conclusion:** Frame the problem as a binary classification per patient

In [None]:
# Plot the distribution of the target classes (show vs. no-show).
# The method name is spelled `visulaize_` in EDA.py, so it must be called that way here.
eda.visulaize_class_distribution()

***Conclusion***

**This is an imbalanced classification problem, which can be addressed with techniques such as:**
1. oversampling
2. assigning class weights when performing classification
3. creating synthetic data of the imbalanced class using techniques such as SMOTE

In [None]:
# Plot no-show probability broken down by each variable.
# NOTE(review): presumably marginal vs. conditional probabilities per categorical column — confirm in EDA.py.
eda.visulaize_no_show_prob_per_variable()

***Conclusion***

**For most variables, the independent and conditional probabilities are very similar. This implies that a person not showing up for an appointment is almost independent of these categorical variables. This means that knowing one of these variables or not doesn't substantially alter the probability of a person not showing up for an appointment. However, joint probabilities are not explored in this section, and a combination of these variables occurring together may impact the outcome.**

# Preprocessing

In [None]:
# Project-local preprocessing helper (imported from preprocessing.py).
preprocessor = preprocMedicalAppointment()

In [None]:
# Report the frame's shape on either side of the age-outlier filter so the
# number of removed rows is visible in the cell output.
print(f"before {df_medical_appointment.shape}")
df_medical_appointment = preprocessor.remove_outliers_age(df_medical_appointment)
print(f"after {df_medical_appointment.shape}")

In [None]:
# Split the cleaned data into train / test / validation partitions (helper from utils.py).
# NOTE(review): the unpacking order is train, test, val — confirm it matches the helper's
# return order. X_val / y_val are not used in the cells visible below.
X_train, y_train, X_test, y_test, X_val, y_val = train_val_test_split(df_medical_appointment, feature_cols=feature_cols, target_col='no_show', test_percent=20)

# Feature Extraction

In [None]:
# Project-local feature-engineering helper (imported from feature_extraction.py).
# The same instance is reused for the test set later, via its *_test methods.
feature_extractor = featuresMedicalAppointment()

In [None]:
# Training-set shape before any feature engineering, for comparison with the cells below.
X_train.shape

In [None]:
# How to treat rows scheduled after the appointment date.
# NOTE(review): presumably 'drop' removes such rows and a number substitutes a value — confirm in feature_extraction.py.
scheduled_after_appointment_strategy = 'drop' # numeric or 'drop'
# Returns both X and y because the helper may change the rows (X and y must stay aligned).
X_train, y_train = feature_extractor.feat_n_hours_scheduled_before(X_train, y_train, scheduled_after_appointment_strategy=scheduled_after_appointment_strategy)
X_train.shape, y_train.shape

In [None]:
# Add appointment-date features.
# NOTE(review): exact derived columns are defined in feature_extraction.py.
X_train = feature_extractor.feat_appointment_date(X_train)
X_train.shape

In [None]:
# One-hot encode the categorical columns on the training set.
# NOTE(review): presumably categories with fewer than `infrequent_threshold` rows are
# grouped together — confirm in feature_extraction.py.
infrequent_threshold = 100
X_train = feature_extractor.feat_categorical_to_one_hot_encoding_train(X_train, infrequent_threshold=infrequent_threshold)
X_train.shape

In [None]:
# Min-max scale the listed columns on the training set; `columns_to_scale` is reused
# for the test set later in the notebook.
columns_to_scale = ['Age']
X_train = feature_extractor.feat_minmax_norm_train(X_train, columns_to_scale)
X_train.shape

In [None]:
# Remove the identifier and raw timestamp columns before modelling.
# Rebind instead of using `inplace=True`: the in-place form mutates a frame that may be
# shared with (or be a slice of) the source data and can trigger SettingWithCopyWarning.
X_train = X_train.drop(columns=['PatientID', 'AppointmentID', 'ScheduledDay', 'AppointmentDay'])
X_train.shape

In [None]:
# Column names, dtypes and non-null counts of the final training matrix.
X_train.info()

# Feature Selection

>**Permutation Importance**

    One of the most basic questions we might ask of a model is: 
    What features have the biggest impact on predictions? This concept is called feature importance.

    In this notebook we use permutation importance. Compared to most other approaches, permutation importance is:
           - fast to calculate,
           - widely used and understood, and
           - consistent with properties we would want a feature importance measure to have.
    The basic question it answers is: If I randomly shuffle a single column of the validation data, leaving the target and all other columns in place, how would that affect the accuracy of predictions in that now-shuffled data? Randomly re-ordering a single column should cause less accurate predictions, since the resulting data no longer corresponds to anything observed in the real world. Model accuracy especially suffers if we shuffle a column that the model relied on heavily for predictions.

>**Hierarchical clustering**

    In hierarchical clustering, we look at every pair of objects and find the two that are closest. We then take the closest pair, remove them, and replace them with their midpoint, and repeat this again and again. Since we keep replacing points with their averages, we gradually reduce the number of points by pairwise combining. Here, rather than looking at points, we look at variables and see which two variables are the most similar. In the resulting dendrogram, the vertical axis shows how similar the two items being compared are: values closer to 0 mean they are very similar. In this case, (1 - Spearman’s R) is used as the distance. Correlation is closely related to R², but it is computed between two variables rather than between a variable and its prediction.

## Feature Selection Method 1: Permutation Importance + Hierarchical Clustering

### Permutation Importance 1

In [None]:
# Permutation importance is computed on the first 1000 training rows only,
# which keeps the repeated shuffling cheap.
head_rows = slice(None, 1000)
df_feature_importances = feature_selection_permutation_importance(
    X_train[head_rows],
    y_train[head_rows],
    n_jobs=4,
    n_repeats=4,
    plot=True,
)

### Hierarchical Clustering 1

In [None]:
# Thresholds passed to feature_selection_hierarchical_clustering in the next cell.
# NOTE(review): THRESHOLD_CLUSTERING is presumably the dendrogram cut distance — confirm in feature_extraction.py.
THRESHOLD_CLUSTERING = 0.75
THRESHOLD_IMPORTANCE = None # Set to None in first iteration

In [None]:
# First clustering pass over all engineered features, using the importances from
# the previous section. The result's 'feature' column is read in the next cell.
df_selected_features = feature_selection_hierarchical_clustering(
    X_train, 
    threshold_clustering=THRESHOLD_CLUSTERING, 
    threshold_importance=THRESHOLD_IMPORTANCE, 
    df_feature_importances=df_feature_importances, 
    plot=True,
)

display(df_selected_features)

In [None]:
# Features kept by the first pass; input to the second permutation-importance round.
FEATURES_PERMUTATION_IMPORTANCE_2 = df_selected_features['feature'].to_list()

### Permutation Importance 2

In [None]:
# Second importance round, restricted to the features kept by the first
# clustering pass and again limited to the first 1000 training rows.
head_rows = slice(None, 1000)
X_train_first_pass = X_train[FEATURES_PERMUTATION_IMPORTANCE_2]
df_feature_importances = feature_selection_permutation_importance(
    X_train_first_pass[head_rows],
    y_train[head_rows],
    n_jobs=4,
    n_repeats=4,
    plot=True,
)

### Hierarchical Clustering 2

In [None]:
# Thresholds for the second clustering pass; unlike the first pass, an importance
# threshold is now set.
# NOTE(review): 0.055 in the trailing comment looks like a previously tried value.
THRESHOLD_CLUSTERING = 0.75 #0.055
THRESHOLD_IMPORTANCE = 0.005

In [None]:
# Second clustering pass, restricted to the first-pass features and using the
# importances recomputed in the previous section.
df_selected_features = feature_selection_hierarchical_clustering(
    X_train[FEATURES_PERMUTATION_IMPORTANCE_2], 
    threshold_clustering=THRESHOLD_CLUSTERING, 
    threshold_importance=THRESHOLD_IMPORTANCE, 
    df_feature_importances=df_feature_importances, 
    plot=True,
)

display(df_selected_features)

In [None]:
# Final feature list from Method 1 (permutation importance + hierarchical clustering).
FEATURES_LIST_PI_HC = df_selected_features['feature'].to_list()
len(FEATURES_LIST_PI_HC)

## Feature Selection Method 2: Principal Components Analysis

In [None]:
# Plot used to choose the number of PCA components.
# NOTE(review): presumably explained variance vs. component count — confirm in feature_extraction.py.
feature_extractor.plot_optimal_PCA_components(X_train)

In [None]:
# feature_extractor.feat_PCA_train(X_train, n_components=10)

The fact that one principal component explains all the variance suggests that the original features are not providing distinct and independent information. Instead, they are highly correlated or redundant with each other.

This method is not used in this analysis further.

## Select the desired feature list

In [None]:
# Feature list used by every classifier below. Currently all engineered columns;
# assign FEATURES_LIST_PI_HC instead to use the Method 1 selection.
FEATURES_LIST_SELECTED = list(X_train.columns) # FEATURES_LIST_PI_HC / list(X_train.columns)
print(len(FEATURES_LIST_SELECTED))

# Classification

**TODO:** Add probability calibration.

**TODO:** Force the infrequent neighbourhood classes in the test set to be the same as in the training set,

or determine the infrequent classes up front and enforce them in both the test and training sets.

In [None]:
# Empty comparison table; each classifier section below appends one row via pd.concat.
df_metrics = pd.DataFrame()

In [None]:
# Apply the training feature pipeline to the held-out test set, using the *_test
# variants of the encoding/scaling steps.
# NOTE(review): presumably these reuse the encoder/scaler fitted on the training set — confirm in feature_extraction.py.
X_test, y_test = feature_extractor.feat_n_hours_scheduled_before(X_test, y_test, scheduled_after_appointment_strategy=scheduled_after_appointment_strategy)
X_test = feature_extractor.feat_appointment_date(X_test)
X_test = feature_extractor.feat_categorical_to_one_hot_encoding_test(X_test)
X_test = feature_extractor.feat_minmax_norm_test(X_test, columns_to_scale)
# Rebind instead of `inplace=True` so a frame that may be shared with (or be a slice of)
# the source data is not mutated.
X_test = X_test.drop(columns=['PatientID', 'AppointmentID', 'ScheduledDay', 'AppointmentDay'])
X_test.shape

## Logistic Regression Classifier

In [None]:
# Fit the project-local logistic-regression wrapper (from classification.py) on the selected features.
# NOTE(review): set_class_weights() presumably enables class weighting for the imbalanced target — confirm.
clf_logistic_regression = LogisticRegressionClf()
clf_logistic_regression.set_class_weights()
clf_logistic_regression.fit_classifier(X_train[FEATURES_LIST_SELECTED], y_train)

In [None]:
# Predict on the test set and unpack the four scores returned by the wrapper's evaluation helper.
y_pred = clf_logistic_regression.predict_classifier(X_test[FEATURES_LIST_SELECTED])
precision_lr, recall_lr, f1_lr, accuracy_lr = clf_logistic_regression.evaluate_classifier(y_test, y_pred)

In [None]:
# One-row frame holding the logistic-regression scores, appended to the comparison table.
metrics_row_lr = pd.DataFrame(
    {
        "classifier": "logistic_regression",
        "precision": precision_lr,
        "recall": recall_lr,
        "f1": f1_lr,
        "accuracy": accuracy_lr,
    },
    index=[0],
)
df_metrics = pd.concat([df_metrics, metrics_row_lr])
df_metrics

## Random Forest Classifier

In [None]:
# Fit the project-local random-forest wrapper (from classification.py) on the selected features.
# NOTE(review): set_class_weights() presumably enables class weighting for the imbalanced target — confirm.
clf_rf = RandomForestClf()
clf_rf.set_class_weights()
clf_rf.fit_classifier(X_train[FEATURES_LIST_SELECTED], y_train)

In [None]:
# Predict on the test set and unpack the four scores for the untuned random forest.
y_pred = clf_rf.predict_classifier(X_test[FEATURES_LIST_SELECTED])
precision_rf, recall_rf, f1_rf, accuracy_rf = clf_rf.evaluate_classifier(y_test, y_pred)

In [None]:
# One-row frame holding the untuned random-forest scores, appended to the comparison table.
metrics_row_rf = pd.DataFrame(
    {
        "classifier": "random_forest",
        "precision": precision_rf,
        "recall": recall_rf,
        "f1": f1_rf,
        "accuracy": accuracy_rf,
    },
    index=[0],
)
df_metrics = pd.concat([df_metrics, metrics_row_rf])
df_metrics

### Hyperparameter Tuning

**TODO:** Add graphs showing the hyperparameter tuning of the different classifiers, to see which factors make a difference.

In [None]:
# Search space for the random-forest tuning; each entry is a (low, high) range.
# NOTE(review): how the ranges are sampled depends on the tuner in classification.py — confirm.
param_space = {
    'n_estimators': (10, 500),
    'max_depth': (50, 200),
    'min_samples_split': (2, 100),
    'min_samples_leaf': (5, 100),
    'max_features': (0.2, 1.0),
}

# The method name is spelled `hyperparmater_tuning` in classification.py, so it must be called that way.
# NOTE(review): n_jobs=32 is machine-specific; 50 iterations x 5 folds makes this an expensive cell.
clf_rf.hyperparmater_tuning(X_train[FEATURES_LIST_SELECTED], y_train, param_space, n_iter=50, cv=5, n_jobs=32)

In [None]:
# Re-score the random forest on the test set after the tuning call.
# NOTE(review): assumes hyperparmater_tuning leaves clf_rf fitted with the best parameters — confirm in classification.py.
y_pred = clf_rf.predict_classifier(X_test[FEATURES_LIST_SELECTED])
precision_rf_best, recall_rf_best, f1_rf_best, accuracy_rf_best = clf_rf.evaluate_classifier(y_test, y_pred)

In [None]:
# Append the tuned random-forest scores to the comparison table.
# The scores come from the `*_rf_best` variables produced by the preceding evaluation cell;
# the previous `precision_best` / `recall_best` / `f1_best` / `accuracy_best` names were
# never defined and raised NameError.
df_metrics = pd.concat([
    df_metrics,
    pd.DataFrame({
        "classifier": "random_forest_HP_tuned",
        "precision": precision_rf_best,
        "recall": recall_rf_best,
        "f1": f1_rf_best,
        "accuracy": accuracy_rf_best,
    }, index=[0])
])
df_metrics

## XGBoost Classifier

In [None]:
# Fit the project-local XGBoost wrapper (from classification.py) on the selected features.
# NOTE(review): unlike the other classifiers here, set_class_weights() is not called — confirm this is intentional.
clf_xgb = XGBClf()
clf_xgb.fit_classifier(X_train[FEATURES_LIST_SELECTED], y_train)

In [None]:
# Predict on the test set and unpack the four scores for XGBoost.
y_pred = clf_xgb.predict_classifier(X_test[FEATURES_LIST_SELECTED])
precision_xgb, recall_xgb, f1_xgb, accuracy_xgb = clf_xgb.evaluate_classifier(y_test, y_pred)

In [None]:
# Sweep decision thresholds 0.00, 0.05, ..., 0.95 on the test set.
# NOTE(review): output of predict_per_threshold is defined in classification.py — presumably metrics per threshold.
threshold_values = np.arange(0, 1, 0.05)
clf_xgb.predict_per_threshold(X_test[FEATURES_LIST_SELECTED], y_test, threshold_values)

In [None]:
# Preview the XGBoost scores as a one-row table; this frame is displayed only,
# it is not added to df_metrics.
pd.DataFrame(
    {
        "classifier": ["XGBoost"],
        "precision": [precision_xgb],
        "recall": [recall_xgb],
        "f1": [f1_xgb],
        "accuracy": [accuracy_xgb],
    }
)

In [None]:
# One-row frame holding the XGBoost scores, appended to the comparison table.
metrics_row_xgb = pd.DataFrame(
    {
        "classifier": "XGBoost",
        "precision": precision_xgb,
        "recall": recall_xgb,
        "f1": f1_xgb,
        "accuracy": accuracy_xgb,
    },
    index=[0],
)
df_metrics = pd.concat([df_metrics, metrics_row_xgb])
df_metrics

## MLP Classifier

In [None]:
# Fit the project-local MLP wrapper (from classification.py) on the selected features.
# NOTE(review): set_class_weights() presumably enables class weighting for the imbalanced target — confirm.
clf_mlp = MLPClf()
clf_mlp.set_class_weights()
clf_mlp.fit_classifier(X_train[FEATURES_LIST_SELECTED], y_train)

In [None]:
# Predict on the test set and unpack the four scores for the MLP at its default decision rule.
y_pred = clf_mlp.predict_classifier(X_test[FEATURES_LIST_SELECTED])
precision_mlp, recall_mlp, f1_mlp, accuracy_mlp = clf_mlp.evaluate_classifier(y_test, y_pred)

In [None]:
# One-row frame holding the MLP scores, appended to the comparison table.
metrics_row_mlp = pd.DataFrame(
    {
        "classifier": "MLP",
        "precision": precision_mlp,
        "recall": recall_mlp,
        "f1": f1_mlp,
        "accuracy": accuracy_mlp,
    },
    index=[0],
)
df_metrics = pd.concat([df_metrics, metrics_row_mlp])
df_metrics

In [None]:
# Re-evaluate the MLP with a hand-picked decision threshold of 0.4.
# NOTE(review): if 0.4 was chosen by looking at test-set results, these scores are optimistic;
# pick the threshold on validation data instead.
y_pred = clf_mlp.predict_at_threshold(X_test[FEATURES_LIST_SELECTED], 0.4)
precision_mlp_thr_0_4, recall_mlp_thr_0_4, f1_mlp_thr_0_4, accuracy_mlp_thr_0_4 = clf_mlp.evaluate_classifier(y_test, y_pred)

In [None]:
# One-row frame holding the MLP scores at threshold 0.4, appended to the comparison table.
metrics_row_mlp_thr = pd.DataFrame(
    {
        "classifier": "MLP_thr_0.4",
        "precision": precision_mlp_thr_0_4,
        "recall": recall_mlp_thr_0_4,
        "f1": f1_mlp_thr_0_4,
        "accuracy": accuracy_mlp_thr_0_4,
    },
    index=[0],
)
df_metrics = pd.concat([df_metrics, metrics_row_mlp_thr])
df_metrics

In [None]:
# Sweep decision thresholds 0.00, 0.05, ..., 0.95 for the MLP on the test set.
# NOTE(review): output of predict_per_threshold is defined in classification.py — presumably metrics per threshold.
threshold_values = np.arange(0, 1, 0.05)
clf_mlp.predict_per_threshold(X_test[FEATURES_LIST_SELECTED], y_test, threshold_values)

## Compare Classifiers

In [None]:
# Compare all classifiers using the accumulated metrics table (helper from utils.py).
# NOTE(review): presumably renders a comparison plot — confirm in utils.py.
compare_performances_across_classifiers(df_metrics)

# Error analysis

In [None]:
import giskard

In [None]:
# Model inspected in the error analysis below; assign another fitted classifier to scan it instead.
# After the tuning section has run, clf_rf is the random forest in its post-tuning state.
classifier = clf_rf

In [None]:
# Wrap the chosen classifier for Giskard. `model` must be a prediction function that
# returns class probabilities for a DataFrame holding the `feature_names` columns.
giskard_model = giskard.Model(
    model=classifier.predict_proba_classifier,
    model_type="classification",  # Either regression, classification or text_generation.
    name="Medical Appointment",  # Optional
    classification_labels=np.unique(y_test),  # Order MUST match the prediction function's output columns
    feature_names=FEATURES_LIST_SELECTED,  # Default: all columns of the dataset
    # classification_threshold=0.5,  # Default: 0.5
)

# Attach the labels positionally so they stay aligned with X_test's rows even when
# y_test is an array or carries a different index. `assign` raises on a length mismatch
# instead of silently producing NaN rows.
giskard_df = X_test.assign(target=np.asarray(y_test))

# Tell Giskard which column holds the ground truth; without `target` the scan cannot
# use the labels for its performance-based checks.
giskard_dataset = giskard.Dataset(giskard_df, target="target")

# Then apply the scan
results = giskard.scan(giskard_model, giskard_dataset)

In [None]:
# Render the Giskard scan report inline.
display(results)