# Importing Libraries


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Importing cleaned data

In [4]:
# Path of the cleaned dataset that this notebook works on.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
CLEAN_DATA_PATH = 'D:/healthplusclinic/data/02-clean_data/cleaned_data.parquet'

# Load the parquet file into the working DataFrame used by every cell below.
df = pd.read_parquet(CLEAN_DATA_PATH)

In [5]:
# Preview the first five rows to sanity-check the load.
df.head(n=5)

Unnamed: 0,appointment_id,patient_id,provider_id,appointment_date,appointment_time,lead_time_days,wait_time_minutes,is_no_show_0_1,age,insurance_type,specialty,provider_clinic_id,clinic_assignment,clinic_name,city,hours_start,hours_end
0,a0071090,p001027,pr00198,2024-01-07,12:15:00,45,33.0,0,82,private,oncology,c007,full-time,oncology clinic,chicago,10:00:00,19:00:00
1,a0074955,p000716,pr00002,2024-07-08,15:45:00,21,19.0,0,19,private,neurology,c006,full-time,neurology clinic,chicago,08:00:00,18:00:00
2,a0097350,p000507,pr00101,2024-01-16,10:15:00,31,18.0,0,30,private,pulmonology,c015,full-time,pulmonology clinic,houston,08:00:00,16:00:00
3,a0036456,p001164,pr00010,2024-05-19,15:45:00,12,29.0,0,18,public,pulmonology,c015,full-time,pulmonology clinic,houston,08:00:00,16:00:00
4,a0023637,p003837,pr00066,2025-03-26,15:15:00,7,19.0,0,56,public,orthopedics,c003,full-time,orthopedic clinic,houston,09:00:00,17:00:00


In [7]:
# List the column labels available for feature engineering.
column_index = df.columns
print(column_index)

Index(['appointment_id', 'patient_id', 'provider_id', 'appointment_date',
       'appointment_time', 'lead_time_days', 'wait_time_minutes',
       'is_no_show_0_1', 'age', 'insurance_type', 'specialty',
       'provider_clinic_id', 'clinic_assignment', 'clinic_name', 'city',
       'hours_start', 'hours_end'],
      dtype='str')


In [8]:
# Print a summary of the frame: row count, per-column non-null counts and dtypes.
df.info()

<class 'pandas.DataFrame'>
RangeIndex: 120000 entries, 0 to 119999
Data columns (total 17 columns):
 #   Column              Non-Null Count   Dtype         
---  ------              --------------   -----         
 0   appointment_id      120000 non-null  str           
 1   patient_id          120000 non-null  str           
 2   provider_id         120000 non-null  str           
 3   appointment_date    120000 non-null  datetime64[us]
 4   appointment_time    120000 non-null  object        
 5   lead_time_days      120000 non-null  int64         
 6   wait_time_minutes   120000 non-null  float64       
 7   is_no_show_0_1      120000 non-null  int64         
 8   age                 120000 non-null  int64         
 9   insurance_type      120000 non-null  str           
 10  specialty           120000 non-null  str           
 11  provider_clinic_id  120000 non-null  str           
 12  clinic_assignment   120000 non-null  str           
 13  clinic_name         120000 non-null  str

# Feature Engineering


#### Creating a new column indicating whether the appointment falls on a weekend or a weekday

In [9]:
# Derive the ISO day of the week (1 = Monday ... 7 = Sunday) from appointment_date.
iso_calendar = df['appointment_date'].dt.isocalendar()
df['days_week'] = iso_calendar['day']

In [10]:
# Check the newly added days_week column on the first five rows.
df.head(n=5)

Unnamed: 0,appointment_id,patient_id,provider_id,appointment_date,appointment_time,lead_time_days,wait_time_minutes,is_no_show_0_1,age,insurance_type,specialty,provider_clinic_id,clinic_assignment,clinic_name,city,hours_start,hours_end,days_week
0,a0071090,p001027,pr00198,2024-01-07,12:15:00,45,33.0,0,82,private,oncology,c007,full-time,oncology clinic,chicago,10:00:00,19:00:00,7
1,a0074955,p000716,pr00002,2024-07-08,15:45:00,21,19.0,0,19,private,neurology,c006,full-time,neurology clinic,chicago,08:00:00,18:00:00,1
2,a0097350,p000507,pr00101,2024-01-16,10:15:00,31,18.0,0,30,private,pulmonology,c015,full-time,pulmonology clinic,houston,08:00:00,16:00:00,2
3,a0036456,p001164,pr00010,2024-05-19,15:45:00,12,29.0,0,18,public,pulmonology,c015,full-time,pulmonology clinic,houston,08:00:00,16:00:00,7
4,a0023637,p003837,pr00066,2025-03-26,15:15:00,7,19.0,0,56,public,orthopedics,c003,full-time,orthopedic clinic,houston,09:00:00,17:00:00,3


In [11]:
# Flag appointments whose ISO weekday is Saturday (6) or Sunday (7).
# `isin` is a vectorized membership test that replaces the per-row Python lambda
# (and its redundant `True if ... else False`); unlike the lambda it does not
# raise on a missing weekday, it simply yields False.
# The final cast keeps the column a plain NumPy bool, as it was before.
df['is_weekend'] = df['days_week'].isin([6, 7]).astype(bool)

In [17]:
# Extract the hour component of every appointment time into a helper column
# (used only for the time-of-day binning that follows).
appointment_hours = df['appointment_time'].map(lambda appt_time: appt_time.hour)
df['hour'] = appointment_hours

In [20]:
# Bucket the appointment hour into parts of the day.
# With right=False the intervals are closed on the left:
#   [0, 12) -> Morning, [12, 17) -> Afternoon, [17, 24) -> Evening
bins = [0, 12, 17, 24]
labels = ['Morning', 'Afternoon', 'Evening']
time_of_day = pd.cut(df['hour'], bins, right=False, labels=labels)
df['time_of_day'] = time_of_day


In [21]:
# The helper 'hour' column is no longer needed once time_of_day exists; remove it in place.
del df['hour']

In [31]:
# Compute how many hours each clinic stays open per day.
# hours_start / hours_end are cast to str first so that pd.to_timedelta can parse
# them (they display as HH:MM:SS).
closing_offset = pd.to_timedelta(df['hours_end'].astype(str))
opening_offset = pd.to_timedelta(df['hours_start'].astype(str))
open_duration = closing_offset - opening_offset

# Convert the duration to hours; astype(int) truncates any fractional hour.
# NOTE(review): confirm truncation (rather than rounding) is intended for half-hour schedules.
df['open_hours'] = (open_duration.dt.total_seconds() / 3600).astype(int)

In [33]:
# Drop the start-hour and end-hour columns; open_hours now carries that information.
df.drop(columns=['hours_start', 'hours_end'], inplace=True)

In [36]:
# Inspect the range of patient ages before choosing the age-group boundaries.
oldest_age = df['age'].max()
youngest_age = df['age'].min()
print('Max Age: ', oldest_age)
print('Min Age: ', youngest_age)

Max Age:  90
Min Age:  0


In [38]:
# Assign every patient to an age group using left-closed intervals (right=False):
#   [0, 3) -> babies, [3, 17) -> childern, [17, 31) -> young,
#   [31, 45) -> middle_age, [45, inf) -> old
# NOTE(review): the label 'childern' is misspelled; it is kept unchanged here because
# other cells may already reference this exact category value.
bins = [0, 3, 17, 31, 45, np.inf]
labels = ['babies', 'childern', 'young', 'middle_age', 'old']
age_group = pd.cut(df['age'], bins, right=False, labels=labels)
df['age_group'] = age_group

In [39]:
# Review the first five rows with all engineered features in place.
df.head(n=5)

Unnamed: 0,appointment_id,patient_id,provider_id,appointment_date,appointment_time,lead_time_days,wait_time_minutes,is_no_show_0_1,age,insurance_type,specialty,provider_clinic_id,clinic_assignment,clinic_name,city,days_week,is_weekend,time_of_day,open_hours,age_group
0,a0071090,p001027,pr00198,2024-01-07,12:15:00,45,33.0,0,82,private,oncology,c007,full-time,oncology clinic,chicago,7,True,Afternoon,9,old
1,a0074955,p000716,pr00002,2024-07-08,15:45:00,21,19.0,0,19,private,neurology,c006,full-time,neurology clinic,chicago,1,False,Afternoon,10,young
2,a0097350,p000507,pr00101,2024-01-16,10:15:00,31,18.0,0,30,private,pulmonology,c015,full-time,pulmonology clinic,houston,2,False,Morning,8,young
3,a0036456,p001164,pr00010,2024-05-19,15:45:00,12,29.0,0,18,public,pulmonology,c015,full-time,pulmonology clinic,houston,7,True,Afternoon,8,young
4,a0023637,p003837,pr00066,2025-03-26,15:15:00,7,19.0,0,56,public,orthopedics,c003,full-time,orthopedic clinic,houston,3,False,Afternoon,8,old
