# ML Models

# Imports

#### Standard library imports

In [1]:
import sys
import os
sys.path.append("../..")

#### Third party imports

In [2]:
import pandas as pd
import pickle
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import roc_auc_score

#### Local application imports

In [3]:
%load_ext autoreload
%autoreload 2

from pkg_dir.config import *
from pkg_dir.src.utils import *
from pkg_dir.src.functions import *
from pkg_dir.src.parameters import *

# Pickle Extraction

In [4]:
path = '../../pkg_dir/data/pickles'
name = 'dataset_v1.pkl'

## Reading extract object saved as pickle locally
pkl_obj = path + "/" + name

with open(pkl_obj, 'rb') as obj_content:
    df = pickle.load(obj_content)

In [5]:
df

Unnamed: 0,appointment_id,appointment_weekday,appointment_start,appointment_dur_min,recurring_patient,creation_to_start_hrs,confirm_request_msg,patient_confirm,confirm_to_start_hrs,clinic,appointment_source,appointment_date_update,patient_age,patient_sex,doctor,medical_specialty,appointment_status_simplified
2,2021923564,5,2022-01-22 12:30:00,30.0,first,20.52,0,0,,MARINA NACIONAL,phone,no_update,32.0,F,DR. ORLANDO JOSE HERNANDEZ MARTINEZ,UROLOGIA,cancel
3,2021853399,0,2022-01-24 12:00:00,30.0,first,52.19,0,0,,MARINA NACIONAL,phone,no_update,32.0,F,DRA JUANA GUADALUPE CABALLERO MARTINEZ,GINECOLOGIA,cancel
4,2021845947,0,2022-01-31 13:00:00,30.0,first,221.78,0,0,,TLALPAN,phone,no_update,32.0,F,Mariana Sarao Pineda,DERMATOLOGIA,cancel
5,2021924951,1,2022-01-25 13:30:00,30.0,first,78.35,0,0,,BASILICA,phone,no_update,32.0,F,DRA ANA MARIA ESCOBEDO HERNANDEZ,DERMATOLOGIA,cancel
6,2021889374,5,2022-01-22 08:00:00,30.0,first,24.22,0,1,21.75,BASILICA,phone,no_update,40.0,F,DRA MONICA MARCELA MACIAS ORTEGA,ENDOCRINOLOGIA,completed
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
544083,2022136584,3,2022-12-29 10:00:00,30.0,recurrent,2181.63,1,1,49.28,COAPA,phone,no_update,60.0,M,DRA LAURA FABIOLA PORTILLO GARCIA,NEFROLOGIA,no_show
544084,2022141229,2,2022-11-30 08:30:00,30.0,recurrent,14.12,0,1,14.05,COAPA,phone,no_update,33.0,M,DR. MIGUEL ANGEL MONTERO MARQUEZ,OTORRINOLARINGOLOGIA,completed
544085,2022506277,5,2022-06-25 10:00:00,30.0,first,25.08,0,1,19.02,COAPA,phone,no_update,33.0,M,DRA SANDRA XOCHIQUETZAL CRUZ ORDOÑEZ,UROLOGIA,completed
544086,2023228381,3,2023-01-05 08:30:00,30.0,recurrent,39.87,0,1,15.32,COAPA,phone,no_update,33.0,M,DRA SANDRA XOCHIQUETZAL CRUZ ORDOÑEZ,UROLOGIA,no_show


In [6]:
df.columns
#relevant columns: appointment_weekday, appointment_dur_min, recurring_patient, creation_to_start_hrs, confirm_request_msg, 
#confirm_to_start_hrs, clinic, appointment_source, appointment_date_update, rounded_start_times

#missing data: recurring_patient, creation_to_start_hrs, confirm_to_start_hrs, patient_age, patient_sex 

Index(['appointment_id', 'appointment_weekday', 'appointment_start',
       'appointment_dur_min', 'recurring_patient', 'creation_to_start_hrs',
       'confirm_request_msg', 'patient_confirm', 'confirm_to_start_hrs',
       'clinic', 'appointment_source', 'appointment_date_update',
       'patient_age', 'patient_sex', 'doctor', 'medical_specialty',
       'appointment_status_simplified'],
      dtype='object')

In [7]:
df['appointment_weekday'].isna().mean()

0.0

In [8]:
rounded_times = pd.to_datetime(df['appointment_start']).dt.round('30min')
df['rounded_start_times'] = rounded_times.dt.strftime('%H:%M:%S')

In [9]:
df['confirm_to_start_hrs'].mean() #replace na

31.54249449537882

In [10]:
df.isna().mean()

appointment_id                   0.000000
appointment_weekday              0.000000
appointment_start                0.000000
appointment_dur_min              0.000000
recurring_patient                0.002816
creation_to_start_hrs            0.019948
confirm_request_msg              0.000000
patient_confirm                  0.000000
confirm_to_start_hrs             0.233176
clinic                           0.000000
appointment_source               0.000000
appointment_date_update          0.000000
patient_age                      0.003889
patient_sex                      0.015100
doctor                           0.000000
medical_specialty                0.000000
appointment_status_simplified    0.000000
rounded_start_times              0.000000
dtype: float64

In [11]:
df['medical_specialty'].replace({'Ultrasonido': 'ULTRASONIDO'}, inplace = True)

In [12]:
df['confirm_to_start_hrs'].fillna(df['confirm_to_start_hrs'].mean(), inplace = True)

In [13]:
df.dropna(subset= ['recurring_patient', 'creation_to_start_hrs', 'patient_age', 'patient_sex'], inplace = True)

In [14]:
df.drop(columns = ['appointment_id', 'appointment_start'], inplace = True)

In [15]:
X = df.drop(columns = ['appointment_status_simplified'])
y = df['appointment_status_simplified']

In [16]:
X_train, X_test, y_train_old, y_test_old = train_test_split(X, y, test_size=0.25, random_state=42)

In [18]:
le = LabelEncoder()
y_train = le.fit_transform(y_train_old)
y_test = le.fit_transform(y_test_old)

In [19]:
y_train, y_test

(array([1, 2, 1, ..., 2, 1, 1]), array([1, 1, 1, ..., 1, 2, 0]))

In [20]:
most_common_label = np.argmax(np.bincount(y_train))
baseline_1_acc = (y_test == most_common_label).mean()
baseline_1_acc

0.6401005534756762

In [21]:
encoder = OneHotEncoder(handle_unknown="ignore")
encoder.fit(X_train)
X_train = encoder.transform(X_train)
X_test = encoder.transform(X_test)

In [22]:
model = LogisticRegression(multi_class='multinomial', solver='lbfgs')

In [23]:
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [24]:
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)

In [25]:
model_acc = accuracy_score(y_test, y_pred)

In [26]:
model_acc

0.7915781069621414

In [27]:
coefficients = model.coef_[0]

In [28]:
for feature, coef in zip(X.columns, coefficients):
    print(f"{feature}: {coef}")

appointment_weekday: -0.08053486387505376
appointment_dur_min: 0.060244979370688616
recurring_patient: 0.04577819009295222
creation_to_start_hrs: 0.08550159874942155
confirm_request_msg: 0.09098373318722333
patient_confirm: 0.05308090131474075
confirm_to_start_hrs: -0.3913676469438612
clinic: 0.004925207343934351
appointment_source: -0.11894096685114629
appointment_date_update: -0.6644510098856173
patient_age: 1.078243512136026
patient_sex: -0.2871608489375163
doctor: 0.26998263472340134
medical_specialty: 0.3464975853450315
rounded_start_times: 0.18992254670121425


In [29]:
most_significant_features = [feature for feature, coef in zip(X.columns, coefficients) if abs(coef) > 0.1]
print("Most significant features:", most_significant_features)

Most significant features: ['confirm_to_start_hrs', 'appointment_source', 'appointment_date_update', 'patient_age', 'patient_sex', 'doctor', 'medical_specialty', 'rounded_start_times']


In [33]:
roc_auc_micro = roc_auc_score(y_test, y_pred_proba, multi_class='ovr', average='micro')
print(f'Logistic Regression Test AUC: {roc_auc_micro:.4f}')
cm_logistic = confusion_matrix(y_test, y_pred)
print ("Confusion Matrix : \n", cm_logistic)
print ("\nAccuracy:", accuracy_score(y_test, y_pred))

Logistic Regression Test AUC: 0.8820
Confusion Matrix : 
 [[15524  4370   294]
 [ 1457 57494   378]
 [ 2282 10537   351]]

Accuracy: 0.7915781069621414


In [37]:
true_positives = np.diag(cm_logistic)
false_positives = np.sum(cm_logistic, axis=0) - true_positives
false_negatives = np.sum(cm_logistic, axis=1) - true_positives
true_negatives = np.sum(cm_logistic) - (true_positives + false_positives + false_negatives)


# Calculate False Positive Rate (FPR), False Negative Rate (FNR), and True Positive Rate (TPR)
fpr = false_positives / (false_positives + true_negatives)
fnr = false_negatives / (false_negatives + true_positives)
tpr = true_positives / (true_positives + false_negatives)

#0 cancel, #1 completed, #2 no_show
print("False Positive Rate (FPR):", fpr)
print("False Negative Rate (FNR):", fnr)
print("True Positive Rate (TPR):", tpr)


False Positive Rate (FPR): [0.05157313 0.44687931 0.00845102]
False Negative Rate (FNR): [0.23102833 0.03092923 0.97334852]
True Positive Rate (TPR): [0.76897167 0.96907077 0.02665148]


In [35]:
dtc = DecisionTreeClassifier(random_state = 10)
dtc = dtc.fit(X_train, y_train)

In [174]:
y_pred_dtc = dtc.predict(X_test)
cm_dtc = confusion_matrix(y_test, y_pred_dtc)
print ("Confusion Matrix : \n", cm_dtc)
print ("\nAccuracy:", accuracy_score(y_test, y_pred_dtc))

Confusion Matrix : 
 [[14123  3815  2250]
 [ 3803 47157  8369]
 [ 2188  8293  2689]]

Accuracy: 0.690161511323055


In [176]:
rf = RandomForestClassifier(max_depth=5)
rf.fit(X_train, y_train)

In [177]:
y_pred_rf = rf.predict(X_test)
cm_rf = confusion_matrix(y_test, y_pred_rf)
print ("Confusion Matrix: \n", cm_rf)
print ("\nAccuracy:", accuracy_score(y_test, y_pred_rf))

Confusion Matrix: 
 [[ 4213 15975     0]
 [  121 59208     0]
 [ 1026 12144     0]]

Accuracy: 0.684249139577287


In [28]:
lda = LinearDiscriminantAnalysis()
lda.fit(X_train,y_train)

: 

In [179]:
y_pred_lda = lda.predict(X_test)
lda_acc = accuracy_score(y_test, y_pred_lda)
print(f'LDA Test Accuracy: {lda_acc:.4f}')
cm_lda = confusion_matrix(y_test, y_pred_lda)
print ("Confusion Matrix: \n", cm_lda)

LDA Test Accuracy: 0.7927
Confusion Matrix: 
 [[15786  4352    50]
 [ 1697 57606    26]
 [ 2531 10556    83]]
