# ML Models

# Imports

#### Standard library imports

In [3]:
import sys
import os
sys.path.append("../..")

#### Third party imports

In [4]:
import pandas as pd
import pickle
import numpy as np
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import roc_auc_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.svm import SVC

#### Local application imports

In [5]:
%load_ext autoreload
%autoreload 2

from pkg_dir.config import *
from pkg_dir.src.utils import *
from pkg_dir.src.functions import *
from pkg_dir.src.parameters import *

# Pickle Extraction

In [3]:
path = '../../pkg_dir/data/pickles'
name = 'dataset_v1.pkl'

## Reading extract object saved as pickle locally
pkl_obj = path + "/" + name

with open(pkl_obj, 'rb') as obj_content:
    df = pickle.load(obj_content)

In [4]:
df

Unnamed: 0,appointment_id,appointment_weekday,appointment_start,appointment_dur_min,recurring_patient,creation_to_start_hrs,confirm_request_msg,patient_confirm,confirm_to_start_hrs,clinic,appointment_source,appointment_date_update,patient_age,patient_sex,doctor,medical_specialty,appointment_status_simplified
2,2021923564,5,2022-01-22 12:30:00,30.0,first,20.52,0,0,,MARINA NACIONAL,phone,no_update,32.0,F,DR. ORLANDO JOSE HERNANDEZ MARTINEZ,UROLOGIA,cancel
3,2021853399,0,2022-01-24 12:00:00,30.0,first,52.19,0,0,,MARINA NACIONAL,phone,no_update,32.0,F,DRA JUANA GUADALUPE CABALLERO MARTINEZ,GINECOLOGIA,cancel
4,2021845947,0,2022-01-31 13:00:00,30.0,first,221.78,0,0,,TLALPAN,phone,no_update,32.0,F,Mariana Sarao Pineda,DERMATOLOGIA,cancel
5,2021924951,1,2022-01-25 13:30:00,30.0,first,78.35,0,0,,BASILICA,phone,no_update,32.0,F,DRA ANA MARIA ESCOBEDO HERNANDEZ,DERMATOLOGIA,cancel
6,2021889374,5,2022-01-22 08:00:00,30.0,first,24.22,0,1,21.75,BASILICA,phone,no_update,40.0,F,DRA MONICA MARCELA MACIAS ORTEGA,ENDOCRINOLOGIA,completed
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
544083,2022136584,3,2022-12-29 10:00:00,30.0,recurrent,2181.63,1,1,49.28,COAPA,phone,no_update,60.0,M,DRA LAURA FABIOLA PORTILLO GARCIA,NEFROLOGIA,no_show
544084,2022141229,2,2022-11-30 08:30:00,30.0,recurrent,14.12,0,1,14.05,COAPA,phone,no_update,33.0,M,DR. MIGUEL ANGEL MONTERO MARQUEZ,OTORRINOLARINGOLOGIA,completed
544085,2022506277,5,2022-06-25 10:00:00,30.0,first,25.08,0,1,19.02,COAPA,phone,no_update,33.0,M,DRA SANDRA XOCHIQUETZAL CRUZ ORDOÑEZ,UROLOGIA,completed
544086,2023228381,3,2023-01-05 08:30:00,30.0,recurrent,39.87,0,1,15.32,COAPA,phone,no_update,33.0,M,DRA SANDRA XOCHIQUETZAL CRUZ ORDOÑEZ,UROLOGIA,no_show


In [5]:
df.columns
#relevant columns: appointment_weekday, appointment_dur_min, recurring_patient, creation_to_start_hrs, confirm_request_msg, 
#confirm_to_start_hrs, clinic, appointment_source, appointment_date_update, rounded_start_times

#missing data: recurring_patient, creation_to_start_hrs, confirm_to_start_hrs, patient_age, patient_sex 

Index(['appointment_id', 'appointment_weekday', 'appointment_start',
       'appointment_dur_min', 'recurring_patient', 'creation_to_start_hrs',
       'confirm_request_msg', 'patient_confirm', 'confirm_to_start_hrs',
       'clinic', 'appointment_source', 'appointment_date_update',
       'patient_age', 'patient_sex', 'doctor', 'medical_specialty',
       'appointment_status_simplified'],
      dtype='object')

In [6]:
df['appointment_weekday'].isna().mean()

0.0

In [7]:
rounded_times = pd.to_datetime(df['appointment_start']).dt.round('30min')
df['rounded_start_times'] = rounded_times.dt.strftime('%H:%M:%S')

In [8]:
df['confirm_to_start_hrs'].mean() #replace na

31.54249449537882

In [9]:
df.isna().mean()

appointment_id                   0.000000
appointment_weekday              0.000000
appointment_start                0.000000
appointment_dur_min              0.000000
recurring_patient                0.002816
creation_to_start_hrs            0.019948
confirm_request_msg              0.000000
patient_confirm                  0.000000
confirm_to_start_hrs             0.233176
clinic                           0.000000
appointment_source               0.000000
appointment_date_update          0.000000
patient_age                      0.003889
patient_sex                      0.015100
doctor                           0.000000
medical_specialty                0.000000
appointment_status_simplified    0.000000
rounded_start_times              0.000000
dtype: float64

In [10]:
df['medical_specialty'].replace({'Ultrasonido': 'ULTRASONIDO'}, inplace = True)

In [11]:
df['confirm_to_start_hrs'].fillna(df['confirm_to_start_hrs'].mean(), inplace = True)

In [12]:
df.dropna(subset= ['recurring_patient', 'creation_to_start_hrs', 'patient_age', 'patient_sex'], inplace = True)

In [13]:
df.drop(columns = ['appointment_id', 'appointment_start'], inplace = True)

In [14]:
X = df.drop(columns = ['appointment_status_simplified'])
y = df['appointment_status_simplified']

In [19]:
X

Unnamed: 0,appointment_weekday,appointment_dur_min,recurring_patient,creation_to_start_hrs,confirm_request_msg,patient_confirm,confirm_to_start_hrs,clinic,appointment_source,appointment_date_update,patient_age,patient_sex,doctor,medical_specialty,rounded_start_times
2,5,30.0,first,20.52,0,0,31.542494,MARINA NACIONAL,phone,no_update,32.0,F,DR. ORLANDO JOSE HERNANDEZ MARTINEZ,UROLOGIA,12:30:00
3,0,30.0,first,52.19,0,0,31.542494,MARINA NACIONAL,phone,no_update,32.0,F,DRA JUANA GUADALUPE CABALLERO MARTINEZ,GINECOLOGIA,12:00:00
4,0,30.0,first,221.78,0,0,31.542494,TLALPAN,phone,no_update,32.0,F,Mariana Sarao Pineda,DERMATOLOGIA,13:00:00
5,1,30.0,first,78.35,0,0,31.542494,BASILICA,phone,no_update,32.0,F,DRA ANA MARIA ESCOBEDO HERNANDEZ,DERMATOLOGIA,13:30:00
6,5,30.0,first,24.22,0,1,21.750000,BASILICA,phone,no_update,40.0,F,DRA MONICA MARCELA MACIAS ORTEGA,ENDOCRINOLOGIA,08:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
544082,3,30.0,recurrent,1510.57,0,1,26.570000,COAPA,phone,no_update,60.0,M,DRA LAURA FABIOLA PORTILLO GARCIA,NEFROLOGIA,11:00:00
544083,3,30.0,recurrent,2181.63,1,1,49.280000,COAPA,phone,no_update,60.0,M,DRA LAURA FABIOLA PORTILLO GARCIA,NEFROLOGIA,10:00:00
544084,2,30.0,recurrent,14.12,0,1,14.050000,COAPA,phone,no_update,33.0,M,DR. MIGUEL ANGEL MONTERO MARQUEZ,OTORRINOLARINGOLOGIA,08:30:00
544085,5,30.0,first,25.08,0,1,19.020000,COAPA,phone,no_update,33.0,M,DRA SANDRA XOCHIQUETZAL CRUZ ORDOÑEZ,UROLOGIA,10:00:00


In [15]:
X_train, X_test, y_train_old, y_test_old = train_test_split(X, y, test_size=0.25, random_state=42)

In [16]:
le = LabelEncoder()
y_train = le.fit_transform(y_train_old)
y_test = le.fit_transform(y_test_old)

In [19]:
y_train, y_test

(array([1, 2, 1, ..., 2, 1, 1]), array([1, 1, 1, ..., 1, 2, 0]))

In [20]:
most_common_label = np.argmax(np.bincount(y_train))
baseline_1_acc = (y_test == most_common_label).mean()
baseline_1_acc

0.6401005534756762

In [21]:
encoder = OneHotEncoder(handle_unknown="ignore")
encoder.fit(X_train)
X_train = encoder.transform(X_train)
X_test = encoder.transform(X_test)

In [22]:
model = LogisticRegression(multi_class='multinomial', solver='lbfgs')

In [23]:
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [24]:
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)

In [25]:
model_acc = accuracy_score(y_test, y_pred)

In [26]:
model_acc

0.7915781069621414

In [27]:
coefficients = model.coef_[0]

In [28]:
for feature, coef in zip(X.columns, coefficients):
    print(f"{feature}: {coef}")

appointment_weekday: -0.08053486387505376
appointment_dur_min: 0.060244979370688616
recurring_patient: 0.04577819009295222
creation_to_start_hrs: 0.08550159874942155
confirm_request_msg: 0.09098373318722333
patient_confirm: 0.05308090131474075
confirm_to_start_hrs: -0.3913676469438612
clinic: 0.004925207343934351
appointment_source: -0.11894096685114629
appointment_date_update: -0.6644510098856173
patient_age: 1.078243512136026
patient_sex: -0.2871608489375163
doctor: 0.26998263472340134
medical_specialty: 0.3464975853450315
rounded_start_times: 0.18992254670121425


In [29]:
most_significant_features = [feature for feature, coef in zip(X.columns, coefficients) if abs(coef) > 0.1]
print("Most significant features:", most_significant_features)

Most significant features: ['confirm_to_start_hrs', 'appointment_source', 'appointment_date_update', 'patient_age', 'patient_sex', 'doctor', 'medical_specialty', 'rounded_start_times']


In [33]:
roc_auc_micro = roc_auc_score(y_test, y_pred_proba, multi_class='ovr', average='micro')
print(f'Logistic Regression Test AUC: {roc_auc_micro:.4f}')
cm_logistic = confusion_matrix(y_test, y_pred)
print ("Confusion Matrix : \n", cm_logistic)
print ("\nAccuracy:", accuracy_score(y_test, y_pred))

Logistic Regression Test AUC: 0.8820
Confusion Matrix : 
 [[15524  4370   294]
 [ 1457 57494   378]
 [ 2282 10537   351]]

Accuracy: 0.7915781069621414


In [37]:
true_positives = np.diag(cm_logistic)
false_positives = np.sum(cm_logistic, axis=0) - true_positives
false_negatives = np.sum(cm_logistic, axis=1) - true_positives
true_negatives = np.sum(cm_logistic) - (true_positives + false_positives + false_negatives)


# Calculate False Positive Rate (FPR), False Negative Rate (FNR), and True Positive Rate (TPR)
fpr = false_positives / (false_positives + true_negatives)
fnr = false_negatives / (false_negatives + true_positives)
tpr = true_positives / (true_positives + false_negatives)

#0 cancel, #1 completed, #2 no_show
print("False Positive Rate (FPR):", fpr)
print("False Negative Rate (FNR):", fnr)
print("True Positive Rate (TPR):", tpr)


False Positive Rate (FPR): [0.05157313 0.44687931 0.00845102]
False Negative Rate (FNR): [0.23102833 0.03092923 0.97334852]
True Positive Rate (TPR): [0.76897167 0.96907077 0.02665148]


In [35]:
dtc = DecisionTreeClassifier(random_state = 10)
dtc = dtc.fit(X_train, y_train)

In [174]:
y_pred_dtc = dtc.predict(X_test)
cm_dtc = confusion_matrix(y_test, y_pred_dtc)
print ("Confusion Matrix : \n", cm_dtc)
print ("\nAccuracy:", accuracy_score(y_test, y_pred_dtc))

Confusion Matrix : 
 [[14123  3815  2250]
 [ 3803 47157  8369]
 [ 2188  8293  2689]]

Accuracy: 0.690161511323055


In [176]:
rf = RandomForestClassifier(max_depth=5)
rf.fit(X_train, y_train)

In [177]:
y_pred_rf = rf.predict(X_test)
cm_rf = confusion_matrix(y_test, y_pred_rf)
print ("Confusion Matrix: \n", cm_rf)
print ("\nAccuracy:", accuracy_score(y_test, y_pred_rf))

Confusion Matrix: 
 [[ 4213 15975     0]
 [  121 59208     0]
 [ 1026 12144     0]]

Accuracy: 0.684249139577287


In [28]:
lda = LinearDiscriminantAnalysis()
lda.fit(X_train,y_train)

: 

In [179]:
y_pred_lda = lda.predict(X_test)
lda_acc = accuracy_score(y_test, y_pred_lda)
print(f'LDA Test Accuracy: {lda_acc:.4f}')
cm_lda = confusion_matrix(y_test, y_pred_lda)
print ("Confusion Matrix: \n", cm_lda)

LDA Test Accuracy: 0.7927
Confusion Matrix: 
 [[15786  4352    50]
 [ 1697 57606    26]
 [ 2531 10556    83]]


In [22]:
path = '../../pkg_dir/data/pickles'
name = 'dataset_v2.pkl'

## Reading extract object saved as pickle locally
pkl_obj = path + "/" + name

with open(pkl_obj, 'rb') as obj_content:
    df2 = pickle.load(obj_content)

In [23]:
path = '../../pkg_dir/data/pickles'
name = 'dataset_v3.pkl'

## Reading extract object saved as pickle locally
pkl_obj = path + "/" + name

with open(pkl_obj, 'rb') as obj_content:
    df3 = pickle.load(obj_content)

In [43]:
path = '../../pkg_dir/data/pickles'
name = 'dataset_v4.pkl'

## Reading extract object saved as pickle locally
pkl_obj = path + "/" + name

with open(pkl_obj, 'rb') as obj_content:
    df4 = pickle.load(obj_content)

In [44]:
df4.drop(columns = ['clinic','doctor'], inplace = True)

In [55]:
rounded_times = pd.to_datetime(df4['appointment_start']).dt.round('60min')
df4['rounded_times'] = rounded_times.dt.strftime('%H:%M:%S')

In [56]:
months = [m.strftime('%B') for m in df4['appointment_start']]

In [57]:
df4['month'] = months
df4.drop(columns = ['appointment_start', 'appointment_id', 'appointment_dur_min'], inplace = True)

In [58]:
df4.dropna(inplace = True)

In [None]:
df4.drop(columns = ['time', 'month'], inplace = True)

In [59]:
df4

Unnamed: 0,appointment_weekday,recurring_patient,creation_to_start_hrs,confirm_request_msg,patient_confirm,confirm_to_start_hrs,appointment_source,appointment_date_update,patient_age,patient_sex,medical_specialty,appointment_status_simplified,rounded_times,month
21,2,first,776.55,0,1,34.18,phone,no_update,18.0,F,DERMATOLOGIA,completed,20:00:00,February
23,5,first,1415.13,0,1,23.58,phone,no_update,44.0,F,DERMATOLOGIA,completed,08:00:00,January
49,4,first,154.24,0,1,33.55,phone,no_update,44.0,F,DERMATOLOGIA,completed,19:00:00,January
64,0,first,97.30,0,1,46.28,phone,no_update,22.0,F,GINECOLOGIA,no_show,10:00:00,January
92,5,recurrent,332.75,0,1,23.68,phone,no_update,21.0,M,DERMATOLOGIA,completed,09:00:00,January
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
544022,1,first,78.32,0,1,78.27,phone,no_update,50.0,F,DERMATOLOGIA,completed,18:00:00,December
544045,3,first,19.37,0,1,19.37,phone,no_update,22.0,F,OTORRINOLARINGOLOGIA,completed,08:00:00,December
544052,4,first,30.73,0,1,-0.13,phone,no_update,32.0,F,GINECOLOGIA,completed,16:00:00,December
544058,4,first,26.68,0,1,26.68,phone,no_update,58.0,F,DERMATOLOGIA,completed,13:00:00,December


In [60]:
X = pd.get_dummies(df4.drop(columns = ['appointment_status_simplified']))
y = df4['appointment_status_simplified']

In [61]:
X_train, X_test, y_train_old, y_test_old = train_test_split(X, y, test_size=0.25, random_state=42)

In [62]:
le = LabelEncoder()
y_train = le.fit_transform(y_train_old)
y_test = le.fit_transform(y_test_old)

In [71]:
most_common_label = np.argmax(np.bincount(y_train))
baseline_1_acc = (y_test == most_common_label).mean()
baseline_1_acc

0.7718305169399817

In [101]:
logreg = LogisticRegression(multi_class='multinomial', solver='lbfgs')

In [102]:
logreg.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [104]:
y_pred_log = model.predict(X_test)

In [105]:
print(classification_report(y_test, y_pred_log))

              precision    recall  f1-score   support

           0       0.51      0.03      0.06       652
           1       0.77      1.00      0.87      9272
           2       0.14      0.00      0.00      2089

    accuracy                           0.77     12013
   macro avg       0.48      0.34      0.31     12013
weighted avg       0.65      0.77      0.68     12013



In [106]:
coefficients = logreg.coef_[0]
for feature, coef in zip(X.columns, coefficients):
    print(f"{feature}: {coef}")

appointment_weekday: -0.15537605457894074
creation_to_start_hrs: -0.0002788558711754355
confirm_request_msg: 0.01148890997880348
patient_confirm: -0.1407591505287899
confirm_to_start_hrs: 0.004056874919843806
patient_age: -0.004699666557646977
recurring_patient_first: -0.1055299233730342
recurring_patient_recurrent: -0.035229227155738334
appointment_source_online: -0.0014442059018435797
appointment_source_phone: -0.13931494462695684
appointment_date_update_no_update: -0.14735034979455533
appointment_date_update_update: 0.006591199265771638
patient_sex_F: -0.07819211076855033
patient_sex_M: -0.06256703976023759
medical_specialty_DERMATOLOGIA: -0.08998786319463495
medical_specialty_GINECOLOGIA: -0.03508402784959304
medical_specialty_OTORRINOLARINGOLOGIA: -0.006972589065830168
medical_specialty_UROLOGIA: -0.008714670418740182
rounded_times_07:00:00: -0.0020495951478421365
rounded_times_08:00:00: -0.017849415824627658
rounded_times_09:00:00: -0.007172600981775316
rounded_times_10:00:00: -0

In [74]:
params = {'max_features': np.arange(1, len(X_train.columns) + 1)}
rfc = RandomForestClassifier(random_state = 88, max_depth=5)
rfc_cv = GridSearchCV(rfc, params, cv = 5, scoring = 'accuracy',verbose = False)
rfc_cv.fit(X_train, y_train)
y_pred_rfc = rfc_cv.best_estimator_.predict(X_test)

0.774827270457005

In [99]:
print(classification_report(y_test, y_pred_rfc))

              precision    recall  f1-score   support

           0       0.58      0.07      0.13       652
           1       0.78      1.00      0.87      9272
           2       0.89      0.00      0.01      2089

    accuracy                           0.77     12013
   macro avg       0.75      0.36      0.34     12013
weighted avg       0.79      0.77      0.68     12013



In [109]:
feature_importances = rfc_cv.best_estimator_.feature_importances_
# Display feature importances
for feature, importance in zip(X.columns, feature_importances):
    print(f'{feature}: {importance}')

appointment_weekday: 0.020530756097385423
creation_to_start_hrs: 0.1769076133807584
confirm_request_msg: 0.0510704793670824
patient_confirm: 0.0
confirm_to_start_hrs: 0.4920649880925885
patient_age: 0.17350866221729996
recurring_patient_first: 0.005162449016152291
recurring_patient_recurrent: 0.004471230450894838
appointment_source_online: 0.0008323740731131491
appointment_source_phone: 0.0008823664143405849
appointment_date_update_no_update: 0.0030651101115215765
appointment_date_update_update: 0.0034959291704376246
patient_sex_F: 0.002252729681802592
patient_sex_M: 0.0008685566871489292
medical_specialty_DERMATOLOGIA: 0.014451741963892374
medical_specialty_GINECOLOGIA: 0.002741358239288189
medical_specialty_OTORRINOLARINGOLOGIA: 0.001794743461238444
medical_specialty_UROLOGIA: 0.008034734788390977
rounded_times_07:00:00: 0.0020171857207059453
rounded_times_08:00:00: 0.0012703757716167215
rounded_times_09:00:00: 0.0002825403196572334
rounded_times_10:00:00: 0.0009757866121490271
round

In [75]:
gbc = GradientBoostingClassifier(n_estimators = 3300, max_leaf_nodes = 10, random_state = 88)
gbc.fit(X_train, y_train) 
y_pred_gbc = gbc.predict(X_test)

0.7619245817031549

In [97]:
print(classification_report(y_test, y_pred_gbc))

              precision    recall  f1-score   support

           0       0.48      0.12      0.19       652
           1       0.78      0.97      0.86      9272
           2       0.28      0.05      0.09      2089

    accuracy                           0.76     12013
   macro avg       0.51      0.38      0.38     12013
weighted avg       0.68      0.76      0.69     12013



In [114]:
feature_importances = gbc.feature_importances_
# Display feature importances
for feature, importance in zip(X.columns, feature_importances):
    print(f'{feature}: {importance}')

appointment_weekday: 0.03338784509483776
creation_to_start_hrs: 0.3385837317272868
confirm_request_msg: 0.0204737966000304
patient_confirm: 0.0
confirm_to_start_hrs: 0.283955203104199
patient_age: 0.10784976983883166
recurring_patient_first: 0.005242498365300948
recurring_patient_recurrent: 0.004610612480363044
appointment_source_online: 0.00112337874717397
appointment_source_phone: 0.0011444677151788193
appointment_date_update_no_update: 0.0043375309275258496
appointment_date_update_update: 0.004085086070962494
patient_sex_F: 0.0041326961653400255
patient_sex_M: 0.004080394243453123
medical_specialty_DERMATOLOGIA: 0.010897008200580824
medical_specialty_GINECOLOGIA: 0.010169562211390577
medical_specialty_OTORRINOLARINGOLOGIA: 0.008183755422039533
medical_specialty_UROLOGIA: 0.00662312074128278
rounded_times_07:00:00: 0.00323321013503676
rounded_times_08:00:00: 0.00645005422204147
rounded_times_09:00:00: 0.005019517793892648
rounded_times_10:00:00: 0.005439617033446911
rounded_times_11:

In [76]:
dtc = DecisionTreeClassifier(random_state = 10)
dtc = dtc.fit(X_train, y_train)
y_pred_dtc = dtc.predict(X_test)

0.6357279613751768

In [95]:
print(classification_report(y_test, y_pred_dtc))

              precision    recall  f1-score   support

           0       0.12      0.15      0.13       652
           1       0.79      0.77      0.78      9272
           2       0.20      0.21      0.20      2089

    accuracy                           0.64     12013
   macro avg       0.37      0.37      0.37     12013
weighted avg       0.65      0.64      0.64     12013



In [115]:
feature_importances = dtc.feature_importances_
# Display feature importances
for feature, importance in zip(X.columns, feature_importances):
    print(f'{feature}: {importance}')

appointment_weekday: 0.058431905097469
creation_to_start_hrs: 0.20013785430367784
confirm_request_msg: 0.008422619790795945
patient_confirm: 0.0
confirm_to_start_hrs: 0.2013447851830968
patient_age: 0.1432398030779084
recurring_patient_first: 0.005984059001295175
recurring_patient_recurrent: 0.004933375087491277
appointment_source_online: 0.0008841458618540233
appointment_source_phone: 0.0008541134511232579
appointment_date_update_no_update: 0.00428987393115933
appointment_date_update_update: 0.0046355354469496705
patient_sex_F: 0.014331108331245894
patient_sex_M: 0.013678479721569657
medical_specialty_DERMATOLOGIA: 0.008661647049783262
medical_specialty_GINECOLOGIA: 0.012620342795349841
medical_specialty_OTORRINOLARINGOLOGIA: 0.010695985135775677
medical_specialty_UROLOGIA: 0.009170314519615783
rounded_times_07:00:00: 0.0037412906518573996
rounded_times_08:00:00: 0.011331425917505992
rounded_times_09:00:00: 0.007954531288129571
rounded_times_10:00:00: 0.015124688549951876
rounded_time

In [77]:
lda = LinearDiscriminantAnalysis()
lda.fit(X_train,y_train)
y_pred_lda = lda.predict(X_test)

0.7723299758594856

In [94]:
print(classification_report(y_test, y_pred_lda))

              precision    recall  f1-score   support

           0       0.67      0.02      0.03       652
           1       0.77      1.00      0.87      9272
           2       0.00      0.00      0.00      2089

    accuracy                           0.77     12013
   macro avg       0.48      0.34      0.30     12013
weighted avg       0.63      0.77      0.67     12013



In [79]:
lda.coef_[0]
for feature, coef in zip(X.columns, coefficients):
    print(f"{feature}: {coef}")

appointment_weekday: -0.15537605457894074
creation_to_start_hrs: -0.0002788558711754355
confirm_request_msg: 0.01148890997880348
patient_confirm: -0.1407591505287899
confirm_to_start_hrs: 0.004056874919843806
patient_age: -0.004699666557646977
recurring_patient_first: -0.1055299233730342
recurring_patient_recurrent: -0.035229227155738334
appointment_source_online: -0.0014442059018435797
appointment_source_phone: -0.13931494462695684
appointment_date_update_no_update: -0.14735034979455533
appointment_date_update_update: 0.006591199265771638
patient_sex_F: -0.07819211076855033
patient_sex_M: -0.06256703976023759
medical_specialty_DERMATOLOGIA: -0.08998786319463495
medical_specialty_GINECOLOGIA: -0.03508402784959304
medical_specialty_OTORRINOLARINGOLOGIA: -0.006972589065830168
medical_specialty_UROLOGIA: -0.008714670418740182
rounded_times_07:00:00: -0.0020495951478421365
rounded_times_08:00:00: -0.017849415824627658
rounded_times_09:00:00: -0.007172600981775316
rounded_times_10:00:00: -0

In [92]:
knn_classifier = KNeighborsClassifier(n_neighbors=30)

# Train the classifier
knn_classifier.fit(X_train, y_train)

# Make predictions on the test set
y_pred = knn_classifier.predict(X_test)

# Evaluate the classifier
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

# Display classification report
print('Classification Report:')
print(classification_report(y_test, y_pred))

Accuracy: 0.7732456505452426
Classification Report:
              precision    recall  f1-score   support

           0       0.49      0.08      0.13       652
           1       0.78      1.00      0.87      9272
           2       0.25      0.00      0.00      2089

    accuracy                           0.77     12013
   macro avg       0.51      0.36      0.34     12013
weighted avg       0.67      0.77      0.68     12013



In [12]:
path = '../../pkg_dir/data/pickles'
name = 'dataset_v6.pkl'

## Reading extract object saved as pickle locally
pkl_obj = path + "/" + name

with open(pkl_obj, 'rb') as obj_content:
    df6 = pickle.load(obj_content)

In [13]:
df6['medical_specialty'].replace({'Ultrasonido': 'ULTRASONIDO'}, inplace = True)
rounded_times = pd.to_datetime(df6['appointment_start']).dt.round('60min')
df6['rounded_start_times'] = rounded_times.dt.strftime('%H:%M:%S')
df6['confirm_to_start_hrs'].fillna(df6['confirm_to_start_hrs'].mean(), inplace = True)
df6.dropna(subset= ['recurring_patient', 'creation_to_start_hrs', 'patient_age', 'patient_sex'], inplace = True)
months = [m.strftime('%B') for m in df6['appointment_start']]
df6['month'] = months
# Mapping for days of the week
days_mapping = {0: 'Monday', 1: 'Tuesday', 2: 'Wednesday', 3: 'Thursday', 4: 'Friday', 5: 'Saturday', 6: 'Sunday'}

# Mapping for 01 to 'no' and 'yes'
binary_mapping = {0: 'no', 1: 'yes'}

# Replace values in the DataFrame
df6['appointment_weekday'] = df6['appointment_weekday'].replace(days_mapping)
df6['confirm_request_msg'] = df6['confirm_request_msg'].replace(binary_mapping)
df6['patient_confirm'] = df6['patient_confirm'].replace(binary_mapping)

In [14]:
df6

Unnamed: 0,appointment_id,appointment_weekday,appointment_start,appointment_dur_min,recurring_patient,creation_to_start_hrs,confirm_request_msg,patient_confirm,confirm_to_start_hrs,clinic,appointment_source,appointment_date_update,patient_age,patient_sex,doctor,medical_specialty,appointment_status_simplified,rounded_start_times,month
2,2021923564,Saturday,2022-01-22 12:30:00,30.0,first,20.52,no,no,30.535624,MARINA NACIONAL,phone,no_update,32.0,F,DR. ORLANDO JOSE HERNANDEZ MARTINEZ,UROLOGIA,fail,12:00:00,January
3,2021853399,Monday,2022-01-24 12:00:00,30.0,first,52.19,no,no,30.535624,MARINA NACIONAL,phone,no_update,32.0,F,DRA JUANA GUADALUPE CABALLERO MARTINEZ,GINECOLOGIA,fail,12:00:00,January
6,2021889374,Saturday,2022-01-22 08:00:00,30.0,first,24.22,no,yes,21.750000,BASILICA,phone,no_update,40.0,F,DRA MONICA MARCELA MACIAS ORTEGA,ENDOCRINOLOGIA,completed,08:00:00,January
8,2021757372,Saturday,2022-01-22 09:30:00,30.0,recurrent,1792.41,no,yes,23.170000,MARINA NACIONAL,phone,no_update,41.0,M,DR. SAID CHAYA SALGADO,DERMATOLOGIA,completed,10:00:00,January
13,2021991250,Monday,2022-10-03 11:00:00,30.0,recurrent,92.60,no,yes,50.210000,TLALPAN,phone,no_update,39.0,F,DRA VANESSA ELIZABETH PICHARDO SOSA,GINECOLOGIA,completed,11:00:00,October
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
544074,2022133177,Friday,2022-12-02 18:00:00,60.0,first,73.80,no,yes,55.520000,COAPA,phone,no_update,68.0,F,DRA EDITH GOMEZ CONTRERAS,GINECOLOGIA,completed,18:00:00,December
544075,2022133269,Friday,2022-12-16 14:30:00,30.0,recurrent,331.84,no,yes,23.320000,COAPA,phone,no_update,68.0,F,DRA EDITH GOMEZ CONTRERAS,GINECOLOGIA,completed,14:00:00,December
544084,2022141229,Wednesday,2022-11-30 08:30:00,30.0,recurrent,14.12,no,yes,14.050000,COAPA,phone,no_update,33.0,M,DR. MIGUEL ANGEL MONTERO MARQUEZ,OTORRINOLARINGOLOGIA,completed,08:00:00,November
544085,2022506277,Saturday,2022-06-25 10:00:00,30.0,first,25.08,no,yes,19.020000,COAPA,phone,no_update,33.0,M,DRA SANDRA XOCHIQUETZAL CRUZ ORDOÑEZ,UROLOGIA,completed,10:00:00,June


In [15]:
df6.drop(columns = ['appointment_id', 'appointment_start', 'doctor'], inplace = True)

In [16]:
X = pd.get_dummies(df6.drop(columns = ['appointment_status_simplified']))
y = df6['appointment_status_simplified']

In [18]:
X_train, X_test, y_train_old, y_test_old = train_test_split(X, y, test_size=0.25, random_state=42)

In [19]:
le = LabelEncoder()
y_train = le.fit_transform(y_train_old)
y_test = le.fit_transform(y_test_old)

In [20]:
most_common_label = np.argmax(np.bincount(y_train))
baseline_1_acc = (y_test == most_common_label).mean()
baseline_1_acc

0.7108896647309602

In [58]:
logreg = LogisticRegression(class_weight = 'balanced')

In [59]:
logreg.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [60]:
y_pred = logreg.predict(X_test)
y_pred_proba = logreg.predict_proba(X_test)

In [61]:
# Evaluate the classifier
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.7904524640783172

In [62]:
coefficients = logreg.coef_[0]
for feature, coef in zip(X.columns, coefficients):
    print(f"{feature}: {coef}")

appointment_dur_min: 0.012324963932175107
creation_to_start_hrs: 0.00037356016043653515
confirm_to_start_hrs: 0.00582708457573722
patient_age: -0.008770058525970945
appointment_weekday_Friday: 0.0991148822586555
appointment_weekday_Monday: -0.021674673825741616
appointment_weekday_Saturday: 0.1010756298782563
appointment_weekday_Sunday: -0.00756604110909659
appointment_weekday_Thursday: 0.0008116503671154585
appointment_weekday_Tuesday: 0.009760860048931105
appointment_weekday_Wednesday: -0.03719914616645228
recurring_patient_first: 0.21624508575144855
recurring_patient_recurrent: -0.07192192429967867
confirm_request_msg_no: -0.10313679259802659
confirm_request_msg_yes: 0.24745995404997473
patient_confirm_no: 1.6187525903171724
patient_confirm_yes: -1.474429428865802
clinic_BASILICA: 0.08692667372859403
clinic_COAPA: -0.13469888747047382
clinic_CUAJIMALPA: -0.1783561429220656
clinic_DEL VALLE: 0.06564135025524359
clinic_DOCTORES: 0.03309361825505298
clinic_MARINA NACIONAL: 0.1046428590

In [55]:
roc_auc_micro = roc_auc_score(y_test, y_pred)
print(f'Logistic Regression Test AUC: {roc_auc_micro:.4f}')
cm_logistic = confusion_matrix(y_test, y_pred)
print ("Confusion Matrix : \n", cm_logistic)
print ("\nAccuracy:", accuracy_score(y_test, y_pred))

Logistic Regression Test AUC: 0.6890
Confusion Matrix : 
 [[37664  2856]
 [ 9088  7391]]

Accuracy: 0.7904524640783172


In [27]:
true_positives = np.diag(cm_logistic)
false_positives = np.sum(cm_logistic, axis=0) - true_positives
false_negatives = np.sum(cm_logistic, axis=1) - true_positives
true_negatives = np.sum(cm_logistic) - (true_positives + false_positives + false_negatives)


# Calculate False Positive Rate (FPR), False Negative Rate (FNR), and True Positive Rate (TPR)
fpr = false_positives / (false_positives + true_negatives)
fnr = false_negatives / (false_negatives + true_positives)
tpr = true_positives / (true_positives + false_negatives)

#0 completed, 1 no_show/fail
print("False Positive Rate (FPR):", fpr)
print("False Negative Rate (FNR):", fnr)
print("True Positive Rate (TPR):", tpr)

False Positive Rate (FPR): [0.55148977 0.07048371]
False Negative Rate (FNR): [0.07048371 0.55148977]
True Positive Rate (TPR): [0.92951629 0.44851023]


In [63]:
logreg2 = LogisticRegression()

In [64]:
logreg2.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [65]:
y_pred = logreg2.predict(X_test)
y_pred_proba = logreg2.predict_proba(X_test)
# Evaluate the classifier
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.8101019316128353

In [66]:
coefficients = logreg2.coef_[0]
for feature, coef in zip(X.columns, coefficients):
    print(f"{feature}: {coef}")

appointment_dur_min: 0.00681348890825324
creation_to_start_hrs: 0.0003574628617685982
confirm_to_start_hrs: 0.004970894306840192
patient_age: -0.008921440572588146
appointment_weekday_Friday: 0.06522309031561324
appointment_weekday_Monday: -0.04099932410748946
appointment_weekday_Saturday: 0.0888276596119326
appointment_weekday_Sunday: -0.012135974570305854
appointment_weekday_Thursday: -0.018414681386726986
appointment_weekday_Tuesday: -0.00461274979525195
appointment_weekday_Wednesday: -0.052279570802799026
recurring_patient_first: 0.10942205343876048
recurring_patient_recurrent: -0.08381360417363842
confirm_request_msg_no: -0.20562361652914213
confirm_request_msg_yes: 0.2312320657943797
patient_confirm_no: 1.55954442802623
patient_confirm_yes: -1.5339359787615003
clinic_BASILICA: 0.021159926359416863
clinic_COAPA: -0.12426576855148115
clinic_CUAJIMALPA: -0.2013533984514467
clinic_DEL VALLE: 0.06270128919992345
clinic_DOCTORES: 0.03494934545512618
clinic_MARINA NACIONAL: 0.1014018452

In [31]:
roc_auc_micro = roc_auc_score(y_test, y_pred, multi_class='ovr', average='micro')
print(f'Logistic Regression Test AUC: {roc_auc_micro:.4f}')
cm_logistic = confusion_matrix(y_test, y_pred)
print ("Confusion Matrix : \n", cm_logistic)
print ("\nAccuracy:", accuracy_score(y_test, y_pred))

Logistic Regression Test AUC: 0.6896
Confusion Matrix : 
 [[39519  1001]
 [ 9823  6656]]

Accuracy: 0.8101019316128353


In [56]:
cm = cm_logistic

In [57]:
accuracy = (cm.ravel()[0] + cm.ravel()[3]) / sum(cm.ravel())
TPR = cm.ravel()[3] / (cm.ravel()[2] + cm.ravel()[3])
FPR = cm.ravel()[1] / (cm.ravel()[0] + cm.ravel()[1])
print('accuracy: ', accuracy)
print('TPR: ', TPR)
print('FPR: ', FPR)

accuracy:  0.7904524640783172
TPR:  0.4485102251350203
FPR:  0.0704837117472853


In [32]:
true_positives = np.diag(cm_logistic)
false_positives = np.sum(cm_logistic, axis=0) - true_positives
false_negatives = np.sum(cm_logistic, axis=1) - true_positives
true_negatives = np.sum(cm_logistic) - (true_positives + false_positives + false_negatives)


# Calculate False Positive Rate (FPR), False Negative Rate (FNR), and True Positive Rate (TPR)
fpr = false_positives / (false_positives + true_negatives)
fnr = false_negatives / (false_negatives + true_positives)
tpr = true_positives / (true_positives + false_negatives)

#0 = completed, 1 = no_show/fail
print("False Positive Rate (FPR):", fpr)
print("False Negative Rate (FNR):", fnr)
print("True Positive Rate (TPR):", tpr)

False Positive Rate (FPR): [0.596092   0.02470385]
False Negative Rate (FNR): [0.02470385 0.596092  ]
True Positive Rate (TPR): [0.97529615 0.403908  ]


In [43]:
np.sum(cm_logistic, axis=0)

array([49342,  7657])

In [33]:
# Display classification report
print('Classification Report:')
print(classification_report(y_test, y_pred))

Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.98      0.88     40520
           1       0.87      0.40      0.55     16479

    accuracy                           0.81     56999
   macro avg       0.84      0.69      0.72     56999
weighted avg       0.82      0.81      0.78     56999



In [34]:
[np.exp(x)/(1 + np.exp(x)) for x in logreg.coef_[0]]

[0.5030812019790448,
 0.5000933900390231,
 0.5014567670219033,
 0.49780749942130825,
 0.5247584554556303,
 0.49458154367062435,
 0.5252474165659555,
 0.49810849874596286,
 0.5002029125806394,
 0.5024401956382513,
 0.4907012857121648,
 0.5538515844390672,
 0.4820272656488026,
 0.47423863353267703,
 0.5615512105891386,
 0.8346230249161803,
 0.18627029598334807,
 0.5217179946105138,
 0.4663761015124562,
 0.45552879096355925,
 0.5164044477143652,
 0.5082726495689631,
 0.5261368689434952,
 0.5029169339959897,
 0.5387734522361867,
 0.4948912498729046,
 0.5410967942752982,
 0.5460890411823884,
 0.4898619328609503,
 0.5056441555063743,
 0.5303988567531354,
 0.49619344160765233,
 0.4706025956521704,
 0.5351315672131708,
 0.5178262864576675,
 0.5164024409108785,
 0.49988704432098285,
 0.5063217599257758,
 0.5105531804378457,
 0.5060236783163011,
 0.4772014581895131,
 0.4839465770123617,
 0.4850284946247145,
 0.511818721615606,
 0.5258711175631033,
 0.5040101712913924,
 0.4996521487210221,
 0.499

In [35]:
knn_classifier = KNeighborsClassifier()

# Train the classifier
knn_classifier.fit(X_train, y_train)

# Make predictions on the test set
y_pred = knn_classifier.predict(X_test)

# Evaluate the classifier
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

# Display classification report
print('Classification Report:')
print(classification_report(y_test, y_pred))

Accuracy: 0.7662590571764417
Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.90      0.85     40520
           1       0.64      0.43      0.52     16479

    accuracy                           0.77     56999
   macro avg       0.72      0.67      0.68     56999
weighted avg       0.75      0.77      0.75     56999



In [67]:
lda = LinearDiscriminantAnalysis()
lda.fit(X_train,y_train)
y_pred_lda = lda.predict(X_test)
accuracy = accuracy_score(y_test, y_pred_lda)
classification_report_result = classification_report(y_test, y_pred_lda)

print("Accuracy:", accuracy)
print("Classification Report:\n", classification_report_result)

Accuracy: 0.8097335040965631
Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.98      0.88     40520
           1       0.87      0.40      0.55     16479

    accuracy                           0.81     56999
   macro avg       0.84      0.69      0.71     56999
weighted avg       0.82      0.81      0.78     56999



In [68]:
coefficients = lda.coef_[0]
for feature, coef in zip(X.columns, coefficients):
    print(f"{feature}: {coef}")

appointment_dur_min: 0.0027425178990566337
creation_to_start_hrs: 0.00037410469683267267
confirm_to_start_hrs: 0.003400788708960249
patient_age: -0.009263016175394548
appointment_weekday_Friday: 0.05080733493246024
appointment_weekday_Monday: -0.015909304665684774
appointment_weekday_Saturday: 0.10564572088182446
appointment_weekday_Sunday: -0.07920959334754486
appointment_weekday_Thursday: -0.026596830477782517
appointment_weekday_Tuesday: -0.008360563918298071
appointment_weekday_Wednesday: -0.057706150270571965
recurring_patient_first: 0.10359808386210634
recurring_patient_recurrent: -0.10359808386210619
confirm_request_msg_no: -0.24664993013330766
confirm_request_msg_yes: 0.2466499301333069
patient_confirm_no: 2.058846008091765
patient_confirm_yes: -2.0588460080917645
clinic_BASILICA: 0.01868975331935496
clinic_COAPA: -0.13173623824041178
clinic_CUAJIMALPA: -0.2775462828632893
clinic_DEL VALLE: 0.17125453131246926
clinic_DOCTORES: 0.20672194424957152
clinic_MARINA NACIONAL: 0.09778

In [49]:
cm_lda= confusion_matrix(y_test, y_pred_lda)
accuracy = (cm_lda.ravel()[0] + cm_lda.ravel()[3]) / sum(cm_lda.ravel())
TPR = cm_lda.ravel()[3] / (cm_lda.ravel()[2] + cm_lda.ravel()[3])
FPR = cm_lda.ravel()[1] / (cm_lda.ravel()[0] + cm_lda.ravel()[1])
print('accuracy: ', accuracy)
print('TPR: ', TPR)
print('FPR: ', FPR)

accuracy:  0.8097335040965631
TPR:  0.4003276897870016
FPR:  0.02376604146100691


In [70]:
roc_auc_score(y_test, y_pred_lda)

0.6882808241629974

In [37]:
dtc = DecisionTreeClassifier(class_weight = 'balanced', random_state = 10)
dtc = dtc.fit(X_train, y_train)
y_pred_dtc = dtc.predict(X_test)
print(classification_report(y_test, y_pred_dtc))

              precision    recall  f1-score   support

           0       0.80      0.79      0.80     40520
           1       0.50      0.51      0.50     16479

    accuracy                           0.71     56999
   macro avg       0.65      0.65      0.65     56999
weighted avg       0.71      0.71      0.71     56999



In [39]:
params = {'max_features': np.arange(1, 10)}
rfc = RandomForestClassifier(random_state = 88, class_weight='balanced', max_depth=5)
rfc_cv = GridSearchCV(rfc, params, cv = 5, scoring = 'accuracy',verbose = False)
rfc_cv.fit(X_train, y_train)
y_pred_rfc = rfc_cv.best_estimator_.predict(X_test)
print(classification_report(y_test, y_pred_rfc))

KeyboardInterrupt: 

In [40]:
gbc = GradientBoostingClassifier(n_estimators = 3300, max_leaf_nodes = 10, random_state = 88)
gbc.fit(X_train, y_train) 
y_pred_gbc = gbc.predict(X_test)
print(classification_report(y_test, y_pred_gbc))

KeyboardInterrupt: 