# Libraries

In [None]:
import kagglehub
import pandas as pd
import numpy as np

# Download the Kaggle dataset via kagglehub and keep the directory it reports;
# the data-loading cell builds the CSV path from `path`.
dataset_handle = "joniarroba/noshowappointments"
path = kagglehub.dataset_download(dataset_handle)
print(f"Dataset downloaded to: {path}")

Downloading from https://www.kaggle.com/api/v1/datasets/download/joniarroba/noshowappointments?dataset_version_number=5...


100%|██████████| 2.40M/2.40M [00:00<00:00, 65.8MB/s]

Extracting files...
Dataset downloaded to: /root/.cache/kagglehub/datasets/joniarroba/noshowappointments/versions/5





# Data Loading

In [None]:
# Read the appointments CSV from the directory returned by the download step.
csv_path = f"{path}/KaggleV2-May-2016.csv"
df = pd.read_csv(csv_path)

In [None]:
# Preview the first rows to check column names and raw value formats.
df.head()

Unnamed: 0,PatientId,AppointmentID,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,No-show
0,29872500000000.0,5642903,F,2016-04-29T18:38:08Z,2016-04-29T00:00:00Z,62,JARDIM DA PENHA,0,1,0,0,0,0,No
1,558997800000000.0,5642503,M,2016-04-29T16:08:27Z,2016-04-29T00:00:00Z,56,JARDIM DA PENHA,0,0,0,0,0,0,No
2,4262962000000.0,5642549,F,2016-04-29T16:19:04Z,2016-04-29T00:00:00Z,62,MATA DA PRAIA,0,0,0,0,0,0,No
3,867951200000.0,5642828,F,2016-04-29T17:29:31Z,2016-04-29T00:00:00Z,8,PONTAL DE CAMBURI,0,0,0,0,0,0,No
4,8841186000000.0,5642494,F,2016-04-29T16:07:23Z,2016-04-29T00:00:00Z,56,JARDIM DA PENHA,0,1,1,0,0,0,No


In [None]:
# Print column dtypes and non-null counts to spot missing values and columns needing conversion.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110527 entries, 0 to 110526
Data columns (total 14 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   PatientId       110527 non-null  float64
 1   AppointmentID   110527 non-null  int64  
 2   Gender          110527 non-null  object 
 3   ScheduledDay    110527 non-null  object 
 4   AppointmentDay  110527 non-null  object 
 5   Age             110527 non-null  int64  
 6   Neighbourhood   110527 non-null  object 
 7   Scholarship     110527 non-null  int64  
 8   Hipertension    110527 non-null  int64  
 9   Diabetes        110527 non-null  int64  
 10  Alcoholism      110527 non-null  int64  
 11  Handcap         110527 non-null  int64  
 12  SMS_received    110527 non-null  int64  
 13  No-show         110527 non-null  object 
dtypes: float64(1), int64(8), object(5)
memory usage: 11.8+ MB


In [None]:
# Summary statistics for the numeric columns.
# NOTE(review): the output saved with this cell shows lowercased column names, a
# numeric no_show column and 110521 rows, while df.info() above reports 110527 rows
# with the original column names. The saved output therefore appears to have been
# produced after the "Data Preprocessing" cell had already run. A fresh
# Restart & Run All will display different statistics here; consider moving this
# cell below preprocessing or re-executing the notebook in order.
df.describe()

Unnamed: 0,patientid,appointmentid,age,scholarship,hipertension,diabetes,alcoholism,handcap,sms_received,no_show
count,110521.0,110521.0,110521.0,110521.0,110521.0,110521.0,110521.0,110521.0,110521.0,110521.0
mean,147492100000000.0,5675304.0,37.085694,0.098271,0.197248,0.071869,0.030401,0.022213,0.321034,0.201916
std,256092800000000.0,71295.76,23.104606,0.297682,0.397923,0.258272,0.17169,0.16144,0.466876,0.401432
min,39217.84,5030230.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,4172457000000.0,5640285.0,18.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,31725980000000.0,5680569.0,37.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,94389630000000.0,5725523.0,55.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
max,999981600000000.0,5790484.0,102.0,1.0,1.0,1.0,1.0,4.0,1.0,1.0


# Data Preprocessing

In [None]:
# Lowercase the column names and replace hyphens so every column is a plain
# identifier (e.g. 'No-show' -> 'no_show', 'PatientId' -> 'patientid').
# regex=False makes the literal replacement explicit.
df.columns = df.columns.str.lower().str.replace('-', '_', regex=False)

# Parse the timestamp strings into datetimes so the .dt accessor works later.
df['scheduledday'] = pd.to_datetime(df['scheduledday'])
df['appointmentday'] = pd.to_datetime(df['appointmentday'])

# Encode the target: 1 = patient did not show up, 0 = patient attended.
df['no_show'] = df['no_show'].map({'Yes': 1, 'No': 0})
# .map() silently yields NaN for any label other than 'Yes'/'No' -- which is also
# what happens if this cell is executed a second time on already-encoded data.
# Fail loudly instead of propagating NaN targets into the features.
assert df['no_show'].notna().all(), (
    "no_show contains values other than 'Yes'/'No' (was this cell run twice?)"
)

# Keep rows with a plausible age (0..110 inclusive). The explicit .copy() makes
# the result an independent frame, so the column assignments in the feature
# engineering cells do not operate on a slice of the original
# (avoids SettingWithCopyWarning).
df = df[(df['age'] >= 0) & (df['age'] <= 110)].copy()

# Feature Engineering

In [None]:
# Lead time in whole calendar days between booking and appointment.
# In the sample rows appointmentday is always midnight while scheduledday carries
# a time of day, so a same-day booking would give a negative timedelta and
# .dt.days == -1. Both timestamps are truncated to midnight before subtracting.
df['waiting_days'] = (
    df['appointmentday'].dt.normalize() - df['scheduledday'].dt.normalize()
).dt.days
df['appointment_dow'] = df['appointmentday'].dt.dayofweek  # Monday=0 ... Sunday=6
df['scheduled_hour'] = df['scheduledday'].dt.hour

# Per-patient history features. The rows of the file are not in chronological
# order, so the history is computed on a chronologically sorted view; assigning
# the results back aligns on the index, which leaves df's row order untouched.
chronological = df.sort_values(['patientid', 'appointmentday', 'scheduledday'])
patient_history = chronological.groupby('patientid', sort=False)
# Outcome (0/1) of the patient's previous appointment; 0 when there is none.
# NOTE(review): the plural name suggests a running count of earlier no-shows --
# confirm whether a cumulative sum was intended instead of a one-step shift.
df['prev_no_shows'] = patient_history['no_show'].shift().fillna(0)
# 1-based ordinal of this appointment within the patient's history.
df['appointment_count'] = patient_history.cumcount() + 1

conditions = ['hipertension', 'diabetes', 'alcoholism', 'handcap']
# handcap is not a 0/1 flag (describe() reports a maximum of 4), so count the
# conditions that are present rather than summing the raw values.
df['condition_count'] = df[conditions].gt(0).sum(axis=1)

In [None]:
# Per-neighbourhood no-show rate ('mean') and number of appointments ('count').
# merge() only applies `suffixes` to column names that collide, and 'mean'/'count'
# collide with nothing in df, so the columns are renamed explicitly here to
# mean_neighborhood / count_neighborhood instead of relying on suffixes.
# NOTE(review): the rate is computed over every row, including each row's own
# label; if this feeds a model, compute it on the training split only.
neighborhood_stats = (
    df.groupby('neighbourhood')['no_show']
    .agg(['mean', 'count'])
    .add_suffix('_neighborhood')
)
# how='left' keeps every appointment row, in its original order.
df = df.merge(neighborhood_stats, on='neighbourhood', how='left')

# Days from the appointment's weekday to the next Friday (dayofweek 4):
# 0 on Friday, 6 on Saturday, 5 on Sunday.
# NOTE(review): the name says "weekend" -- confirm Friday is the intended anchor.
df['days_until_weekend'] = (4 - df['appointment_dow']) % 7
# 1 when scheduled_hour is before noon. scheduled_hour comes from scheduledday,
# so this flags bookings made in the morning, not the time of the visit.
df['morning_appointment'] = (df['scheduled_hour'] < 12).astype(int)

In [None]:
# Remove the identifier columns before encoding.
identifier_cols = ['patientid', 'appointmentid']
df = df.drop(columns=identifier_cols)

# One-hot encode the categorical columns.
categorical_cols = ['gender', 'neighbourhood']
df = pd.get_dummies(df, columns=categorical_cols)

# Write the processed table to disk without the index column.
df.to_csv('processed_no_shows.csv', index=False)