# ML Models

# Imports

#### Standard library imports

In [1]:
import sys
import os
sys.path.append("../..")

#### Third party imports

In [83]:
import pandas as pd
import pickle
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import roc_auc_score
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
from sklearn.linear_model import LinearRegression
from sklearn.metrics import classification_report

#### Local application imports

In [3]:
%load_ext autoreload
%autoreload 2

from pkg_dir.config import *
from pkg_dir.src.utils import *
from pkg_dir.src.functions import *
from pkg_dir.src.parameters import *

# Pickle Extraction

In [32]:
path = '../../pkg_dir/data/pickles'
name = 'dataset_v1.pkl'

## Reading extract object saved as pickle locally
pkl_obj = path + "/" + name

with open(pkl_obj, 'rb') as obj_content:
    df = pickle.load(obj_content)

In [33]:
df

Unnamed: 0,appointment_id,appointment_weekday,appointment_start,appointment_dur_min,recurring_patient,creation_to_start_hrs,confirm_request_msg,patient_confirm,confirm_to_start_hrs,clinic,appointment_source,appointment_date_update,patient_age,patient_sex,doctor,medical_specialty,appointment_status_simplified
2,2021923564,5,2022-01-22 12:30:00,30.0,first,20.52,0,0,,MARINA NACIONAL,phone,no_update,32.0,F,DR. ORLANDO JOSE HERNANDEZ MARTINEZ,UROLOGIA,cancel
3,2021853399,0,2022-01-24 12:00:00,30.0,first,52.19,0,0,,MARINA NACIONAL,phone,no_update,32.0,F,DRA JUANA GUADALUPE CABALLERO MARTINEZ,GINECOLOGIA,cancel
4,2021845947,0,2022-01-31 13:00:00,30.0,first,221.78,0,0,,TLALPAN,phone,no_update,32.0,F,Mariana Sarao Pineda,DERMATOLOGIA,cancel
5,2021924951,1,2022-01-25 13:30:00,30.0,first,78.35,0,0,,BASILICA,phone,no_update,32.0,F,DRA ANA MARIA ESCOBEDO HERNANDEZ,DERMATOLOGIA,cancel
6,2021889374,5,2022-01-22 08:00:00,30.0,first,24.22,0,1,21.75,BASILICA,phone,no_update,40.0,F,DRA MONICA MARCELA MACIAS ORTEGA,ENDOCRINOLOGIA,completed
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
544083,2022136584,3,2022-12-29 10:00:00,30.0,recurrent,2181.63,1,1,49.28,COAPA,phone,no_update,60.0,M,DRA LAURA FABIOLA PORTILLO GARCIA,NEFROLOGIA,no_show
544084,2022141229,2,2022-11-30 08:30:00,30.0,recurrent,14.12,0,1,14.05,COAPA,phone,no_update,33.0,M,DR. MIGUEL ANGEL MONTERO MARQUEZ,OTORRINOLARINGOLOGIA,completed
544085,2022506277,5,2022-06-25 10:00:00,30.0,first,25.08,0,1,19.02,COAPA,phone,no_update,33.0,M,DRA SANDRA XOCHIQUETZAL CRUZ ORDOÑEZ,UROLOGIA,completed
544086,2023228381,3,2023-01-05 08:30:00,30.0,recurrent,39.87,0,1,15.32,COAPA,phone,no_update,33.0,M,DRA SANDRA XOCHIQUETZAL CRUZ ORDOÑEZ,UROLOGIA,no_show


In [34]:
df['medical_specialty'].replace({'Ultrasonido': 'ULTRASONIDO'}, inplace = True)
rounded_times = pd.to_datetime(df['appointment_start']).dt.round('60min')
df['rounded_start_times'] = rounded_times.dt.strftime('%H:%M:%S')
df['confirm_to_start_hrs'].fillna(df['confirm_to_start_hrs'].mean(), inplace = True)
df.dropna(subset= ['recurring_patient', 'creation_to_start_hrs', 'patient_age', 'patient_sex'], inplace = True)
months = [m.strftime('%B') for m in df['appointment_start']]
df['month'] = months
# Mapping for days of the week
days_mapping = {0: 'Monday', 1: 'Tuesday', 2: 'Wednesday', 3: 'Thursday', 4: 'Friday', 5: 'Saturday', 6: 'Sunday'}

# Mapping for 01 to 'no' and 'yes'
binary_mapping = {0: 'no', 1: 'yes'}

# Replace values in the DataFrame
df['appointment_weekday'] = df['appointment_weekday'].replace(days_mapping)
df['confirm_request_msg'] = df['confirm_request_msg'].replace(binary_mapping)
df['patient_confirm'] = df['patient_confirm'].replace(binary_mapping)

In [35]:
df.drop(columns = ['appointment_id', 'appointment_start', 'doctor'], inplace = True)

In [39]:
gynecology = df.loc[(df['medical_specialty'] == 'GINECOLOGIA') & (df['appointment_status_simplified'] != 'cancel')]
gynecology.drop(columns = 'medical_specialty', inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ginecology.drop(columns = 'medical_specialty', inplace = True)


In [45]:
gynecology

Unnamed: 0,appointment_weekday,appointment_dur_min,recurring_patient,creation_to_start_hrs,confirm_request_msg,patient_confirm,confirm_to_start_hrs,clinic,appointment_source,appointment_date_update,patient_age,patient_sex,appointment_status_simplified,rounded_start_times,month
13,Monday,30.0,recurrent,92.60,no,yes,50.21,TLALPAN,phone,no_update,39.0,F,completed,11:00:00,October
14,Friday,30.0,recurrent,150.96,yes,yes,21.90,COAPA,phone,no_update,26.0,F,completed,14:00:00,January
20,Thursday,30.0,first,127.04,no,yes,8.13,TLALPAN,phone,no_update,38.0,F,completed,16:00:00,January
24,Saturday,30.0,recurrent,167.85,yes,yes,17.63,COAPA,phone,no_update,25.0,F,completed,08:00:00,January
30,Monday,30.0,first,99.54,no,yes,67.97,TLALPAN,phone,no_update,24.0,F,no_show,14:00:00,October
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
544052,Friday,60.0,first,30.73,no,yes,-0.13,BASILICA,phone,no_update,32.0,F,completed,16:00:00,December
544064,Friday,30.0,first,17.62,no,yes,17.62,COAPA,phone,no_update,24.0,F,completed,11:00:00,December
544072,Saturday,30.0,first,2.96,no,yes,2.96,CUAJIMALPA,phone,no_update,41.0,F,completed,12:00:00,December
544074,Friday,60.0,first,73.80,no,yes,55.52,COAPA,phone,no_update,68.0,F,completed,18:00:00,December


In [46]:
X = gynecology.drop(columns = ['appointment_status_simplified'])
y = gynecology['appointment_status_simplified']

In [47]:
X

Unnamed: 0,appointment_weekday,appointment_dur_min,recurring_patient,creation_to_start_hrs,confirm_request_msg,patient_confirm,confirm_to_start_hrs,clinic,appointment_source,appointment_date_update,patient_age,patient_sex,rounded_start_times,month
13,Monday,30.0,recurrent,92.60,no,yes,50.21,TLALPAN,phone,no_update,39.0,F,11:00:00,October
14,Friday,30.0,recurrent,150.96,yes,yes,21.90,COAPA,phone,no_update,26.0,F,14:00:00,January
20,Thursday,30.0,first,127.04,no,yes,8.13,TLALPAN,phone,no_update,38.0,F,16:00:00,January
24,Saturday,30.0,recurrent,167.85,yes,yes,17.63,COAPA,phone,no_update,25.0,F,08:00:00,January
30,Monday,30.0,first,99.54,no,yes,67.97,TLALPAN,phone,no_update,24.0,F,14:00:00,October
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
544052,Friday,60.0,first,30.73,no,yes,-0.13,BASILICA,phone,no_update,32.0,F,16:00:00,December
544064,Friday,30.0,first,17.62,no,yes,17.62,COAPA,phone,no_update,24.0,F,11:00:00,December
544072,Saturday,30.0,first,2.96,no,yes,2.96,CUAJIMALPA,phone,no_update,41.0,F,12:00:00,December
544074,Friday,60.0,first,73.80,no,yes,55.52,COAPA,phone,no_update,68.0,F,18:00:00,December


In [48]:
X_train, X_test, y_train_old, y_test_old = train_test_split(X, y, test_size=0.25, random_state=42)

In [49]:
le = LabelEncoder()
y_train = le.fit_transform(y_train_old)
y_test = le.fit_transform(y_test_old)

In [65]:
y_train_old, y_test_old

(534715    completed
 489764      no_show
 431058    completed
 423963    completed
 254814    completed
             ...    
 260991      no_show
 536224    completed
 480435    completed
 23308     completed
 298046    completed
 Name: appointment_status_simplified, Length: 34255, dtype: object,
 358511      no_show
 365145    completed
 185609    completed
 451336    completed
 531616    completed
             ...    
 238393    completed
 253550    completed
 326766    completed
 382728    completed
 481145    completed
 Name: appointment_status_simplified, Length: 11419, dtype: object)

In [50]:
y_train, y_test

(array([0, 1, 0, ..., 0, 0, 0]), array([1, 0, 0, ..., 0, 0, 0]))

In [51]:
most_common_label = np.argmax(np.bincount(y_train))
baseline_1_acc = (y_test == most_common_label).mean()
baseline_1_acc

0.8165338470969437

In [52]:
encoder = OneHotEncoder(handle_unknown="ignore")
encoder.fit(X_train)
X_train = encoder.transform(X_train)
X_test = encoder.transform(X_test)

In [85]:
model = LogisticRegression(class_weight='balanced')

In [86]:
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [87]:
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)

In [88]:
model_acc = accuracy_score(y_test, y_pred)
model_acc

In [90]:
coefficients = model.coef_[0]

In [91]:
for feature, coef in zip(X.columns, coefficients):
    print(f"{feature}: {coef}")

appointment_weekday: -0.05092829889355761
appointment_dur_min: 0.01622074882145207
recurring_patient: 0.10000658998745128
creation_to_start_hrs: 0.11735631402600437
confirm_request_msg: -0.09238978622880799
patient_confirm: -0.08906982357384997
confirm_to_start_hrs: -0.026894670811244383
clinic: -0.06349030900106144
appointment_source: -0.13132222759297607
appointment_date_update: 0.9798180900418912
patient_age: 0.34981457258312126
patient_sex: -0.6037099736731184
rounded_start_times: -0.6542011772801011
month: -0.37068584555301415


In [92]:
most_significant_features = [feature for feature, coef in zip(X.columns, coefficients) if abs(coef) > 0.1]
print("Most significant features:", most_significant_features)

Most significant features: ['recurring_patient', 'creation_to_start_hrs', 'appointment_source', 'appointment_date_update', 'patient_age', 'patient_sex', 'rounded_start_times', 'month']


In [93]:
cm= confusion_matrix(y_test, y_pred)
accuracy = (cm.ravel()[0] + cm.ravel()[3]) / sum(cm.ravel())
TPR = cm.ravel()[3] / (cm.ravel()[2] + cm.ravel()[3])
FPR = cm.ravel()[1] / (cm.ravel()[0] + cm.ravel()[1])
print('accuracy: ', accuracy)
print('TPR: ', TPR)
print('FPR: ', FPR)

accuracy:  0.6762413521324109
TPR:  0.3723150357995227
FPR:  0.2554697554697555


In [94]:
print(classification_report(y_test, y_pred))
print(roc_auc_score(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.84      0.74      0.79      9324
           1       0.25      0.37      0.30      2095

    accuracy                           0.68     11419
   macro avg       0.54      0.56      0.54     11419
weighted avg       0.73      0.68      0.70     11419

0.5584226401648836


In [95]:
y_pred_proba

array([[0.49249005, 0.50750995],
       [0.65674782, 0.34325218],
       [0.9205632 , 0.0794368 ],
       ...,
       [0.54933614, 0.45066386],
       [0.51643091, 0.48356909],
       [0.52581736, 0.47418264]])

In [69]:
X1 = pd.get_dummies(X)
y1 = le.fit_transform(y)

In [77]:
def calculate_vif(df, features):    
    vif, tolerance = {}, {}
    # all the features that you want to examine
    for feature in features:
        # extract all the other features you will regress against
        X = [f for f in features if f != feature]        
        X, y = df[X], df[feature]
        # extract r-squared from the fit
        r2 = LinearRegression().fit(X, y).score(X, y)                
        
        # calculate tolerance
        tolerance[feature] = 1 - r2
        # calculate VIF
        vif[feature] = 1/(tolerance[feature])
    # return VIF DataFrame
    return pd.DataFrame({'VIF': vif, 'Tolerance': tolerance})

In [78]:
X1.columns

Index(['appointment_dur_min', 'creation_to_start_hrs', 'confirm_to_start_hrs',
       'patient_age', 'appointment_weekday_Friday',
       'appointment_weekday_Monday', 'appointment_weekday_Saturday',
       'appointment_weekday_Sunday', 'appointment_weekday_Thursday',
       'appointment_weekday_Tuesday', 'appointment_weekday_Wednesday',
       'recurring_patient_first', 'recurring_patient_recurrent',
       'confirm_request_msg_no', 'confirm_request_msg_yes',
       'patient_confirm_no', 'patient_confirm_yes', 'clinic_BASILICA',
       'clinic_COAPA', 'clinic_CUAJIMALPA', 'clinic_DEL VALLE',
       'clinic_DOCTORES', 'clinic_MARINA NACIONAL', 'clinic_TLALPAN',
       'appointment_source_online', 'appointment_source_phone',
       'appointment_date_update_no_update', 'appointment_date_update_update',
       'patient_sex_F', 'patient_sex_M', 'rounded_start_times_07:00:00',
       'rounded_start_times_08:00:00', 'rounded_start_times_09:00:00',
       'rounded_start_times_10:00:00', 'roun

In [79]:
f = ['appointment_dur_min', 'creation_to_start_hrs', 'confirm_to_start_hrs',
       'patient_age', 'appointment_weekday_Friday',
       'appointment_weekday_Monday', 'appointment_weekday_Saturday',
       'appointment_weekday_Sunday', 'appointment_weekday_Thursday',
       'appointment_weekday_Tuesday', 'appointment_weekday_Wednesday',
       'recurring_patient_first', 'recurring_patient_recurrent',
       'confirm_request_msg_no', 'confirm_request_msg_yes',
       'patient_confirm_no', 'patient_confirm_yes', 'clinic_BASILICA',
       'clinic_COAPA', 'clinic_CUAJIMALPA', 'clinic_DEL VALLE',
       'clinic_DOCTORES', 'clinic_MARINA NACIONAL', 'clinic_TLALPAN',
       'appointment_source_online', 'appointment_source_phone',
       'appointment_date_update_no_update', 'appointment_date_update_update',
       'patient_sex_F', 'patient_sex_M', 'rounded_start_times_07:00:00',
       'rounded_start_times_08:00:00', 'rounded_start_times_09:00:00',
       'rounded_start_times_10:00:00', 'rounded_start_times_11:00:00',
       'rounded_start_times_12:00:00', 'rounded_start_times_13:00:00',
       'rounded_start_times_14:00:00', 'rounded_start_times_15:00:00',
       'rounded_start_times_16:00:00', 'rounded_start_times_17:00:00',
       'rounded_start_times_18:00:00', 'rounded_start_times_19:00:00',
       'rounded_start_times_20:00:00', 'month_April', 'month_August',
       'month_December', 'month_February', 'month_January', 'month_July',
       'month_June', 'month_March', 'month_May', 'month_November',
       'month_October', 'month_September']

In [80]:
calculate_vif(df=X1, features=f)

  vif[feature] = 1/(tolerance[feature])
  vif[feature] = 1/(tolerance[feature])
  vif[feature] = 1/(tolerance[feature])
  vif[feature] = 1/(tolerance[feature])
  vif[feature] = 1/(tolerance[feature])
  vif[feature] = 1/(tolerance[feature])
  vif[feature] = 1/(tolerance[feature])
  vif[feature] = 1/(tolerance[feature])
  vif[feature] = 1/(tolerance[feature])
  vif[feature] = 1/(tolerance[feature])
  vif[feature] = 1/(tolerance[feature])
  vif[feature] = 1/(tolerance[feature])
  vif[feature] = 1/(tolerance[feature])
  vif[feature] = 1/(tolerance[feature])
  vif[feature] = 1/(tolerance[feature])
  vif[feature] = 1/(tolerance[feature])
  vif[feature] = 1/(tolerance[feature])
  vif[feature] = 1/(tolerance[feature])
  vif[feature] = 1/(tolerance[feature])
  vif[feature] = 1/(tolerance[feature])
  vif[feature] = 1/(tolerance[feature])
  vif[feature] = 1/(tolerance[feature])
  vif[feature] = 1/(tolerance[feature])
  vif[feature] = 1/(tolerance[feature])
  vif[feature] = 1/(tolerance[feature])


Unnamed: 0,VIF,Tolerance
appointment_dur_min,1.039547,0.961958
creation_to_start_hrs,1.235599,0.809324
confirm_to_start_hrs,1.029169,0.971658
patient_age,1.016509,0.983759
appointment_weekday_Friday,inf,0.0
appointment_weekday_Monday,inf,0.0
appointment_weekday_Saturday,inf,0.0
appointment_weekday_Sunday,inf,0.0
appointment_weekday_Thursday,inf,0.0
appointment_weekday_Tuesday,inf,0.0
