In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset
df = pd.read_csv(r"C:\Users\saman\Desktop\healthcare_noshows.csv")

In [3]:
df

Unnamed: 0,PatientId,AppointmentID,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,Showed_up,Date.diff
0,2.987250e+13,5642903,F,2016-04-29,2016-04-29,62,JARDIM DA PENHA,False,True,False,False,False,False,True,0
1,5.589978e+14,5642503,M,2016-04-29,2016-04-29,56,JARDIM DA PENHA,False,False,False,False,False,False,True,0
2,4.262962e+12,5642549,F,2016-04-29,2016-04-29,62,MATA DA PRAIA,False,False,False,False,False,False,True,0
3,8.679512e+11,5642828,F,2016-04-29,2016-04-29,8,PONTAL DE CAMBURI,False,False,False,False,False,False,True,0
4,8.841186e+12,5642494,F,2016-04-29,2016-04-29,56,JARDIM DA PENHA,False,True,True,False,False,False,True,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106982,2.572134e+12,5651768,F,2016-05-03,2016-06-07,56,MARIA ORTIZ,False,False,False,False,False,True,True,35
106983,3.596266e+12,5650093,F,2016-05-03,2016-06-07,51,MARIA ORTIZ,False,False,False,False,False,True,True,35
106984,1.557663e+13,5630692,F,2016-04-27,2016-06-07,21,MARIA ORTIZ,False,False,False,False,False,True,True,41
106985,9.213493e+13,5630323,F,2016-04-27,2016-06-07,38,MARIA ORTIZ,False,False,False,False,False,True,True,41


In [5]:
# Check for missing values
print(df.isnull().sum())

# Handle missing data (example)
df.fillna({'Gender': 'Unknown', 'Age': df['Age'].median()}, inplace=True)


PatientId         0
AppointmentID     0
Gender            0
ScheduledDay      0
AppointmentDay    0
Age               0
Neighbourhood     0
Scholarship       0
Hipertension      0
Diabetes          0
Alcoholism        0
Handcap           0
SMS_received      0
Showed_up         0
Date.diff         0
dtype: int64


In [6]:
# Convert date columns to datetime
df['ScheduledDay'] = pd.to_datetime(df['ScheduledDay'])
df['AppointmentDay'] = pd.to_datetime(df['AppointmentDay'])

In [7]:
# Extract useful datetime features
df['ScheduledDay_DOW'] = df['ScheduledDay'].dt.day_name()
df['AppointmentDay_DOW'] = df['AppointmentDay'].dt.day_name()
df['DaysBetween'] = (df['AppointmentDay'] - df['ScheduledDay']).dt.days


In [8]:
# Drop unnecessary columns
df.drop(['PatientId', 'AppointmentID', 'ScheduledDay', 'AppointmentDay'], axis=1, inplace=True)

In [12]:

# Convert boolean columns to integers
bool_cols = ['Scholarship', 'Hipertension', 'Diabetes', 'Alcoholism', 'Handcap', 'SMS_received']
for col in bool_cols:
    df[col] = df[col].astype(int)

# Convert Showed_up to NoShow (1 for no-show, 0 for attended)
df['NoShow'] = (~df['Showed_up']).astype(int)
df = df.drop('Showed_up', axis=1)  # Remove original Showed_up column

# Encode categorical variables
le_gender = LabelEncoder()
df['Gender'] = le_gender.fit_transform(df['Gender'])

le_neighbourhood = LabelEncoder()
df['Neighbourhood'] = le_neighbourhood.fit_transform(df['Neighbourhood'])

le_scheduled_dow = LabelEncoder()
df['ScheduledDay_DOW'] = le_scheduled_dow.fit_transform(df['ScheduledDay_DOW'])

le_appointment_dow = LabelEncoder()
df['AppointmentDay_DOW'] = le_appointment_dow.fit_transform(df['AppointmentDay_DOW'])



In [13]:
# Save cleaned data
df.to_csv('cleaned_appointments.csv', index=False)

print("Data cleaning complete. Cleaned data saved as 'cleaned_appointments.csv'.")
print(df.head())

Data cleaning complete. Cleaned data saved as 'cleaned_appointments.csv'.
   Gender  Age  Neighbourhood  Scholarship  Hipertension  Diabetes  \
0       0   62             39            0             1         0   
1       1   56             39            0             0         0   
2       0   62             45            0             0         0   
3       0    8             54            0             0         0   
4       0   56             39            0             1         1   

   Alcoholism  Handcap  SMS_received  Date.diff  ScheduledDay_DOW  \
0           0        0             0          0                 0   
1           0        0             0          0                 0   
2           0        0             0          0                 0   
3           0        0             0          0                 0   
4           0        0             0          0                 0   

   AppointmentDay_DOW  DaysBetween  NoShow  
0                   0            0       0  


Exploratory Data analysis


In [14]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load cleaned data
df = pd.read_csv('cleaned_appointments.csv')

# No-show distribution
plt.figure(figsize=(8,6))
sns.countplot(x='NoShow', data=df)
plt.title('No-show Distribution')
plt.xlabel('No-show (0 = Attended, 1 = No-show)')
plt.ylabel('Count')
plt.savefig('noshow_distribution.png')
plt.close()

# No-show rate by SMS_received
sms_noshow = df.groupby('SMS_received')['NoShow'].mean()
print("No-show rate by SMS_received:\n", sms_noshow)

# No-show rate by age group
df['AgeGroup'] = pd.cut(df['Age'], bins=[0, 30, 50, 80, 120], labels=['Young', 'Middle', 'Senior', 'Very Senior'])
age_noshow = df.groupby('AgeGroup')['NoShow'].mean()
print("No-show rate by AgeGroup:\n", age_noshow)

# No-show rate by AppointmentDay_DOW
weekday_noshow = df.groupby('AppointmentDay_DOW')['NoShow'].mean()
print("No-show rate by AppointmentDay_DOW:\n", weekday_noshow)

# Visualizations
plt.figure(figsize=(15, 5))

# SMS_received
plt.subplot(1, 3, 1)
sms_noshow.plot(kind='bar')
plt.title('No-show Rate by SMS Received')
plt.ylabel('No-show Rate')

# Age Group
plt.subplot(1, 3, 2)
age_noshow.plot(kind='bar')
plt.title('No-show Rate by Age Group')
plt.ylabel('No-show Rate')

# AppointmentDay_DOW
plt.subplot(1, 3, 3)
weekday_noshow.plot(kind='bar')
plt.title('No-show Rate by Weekday')
plt.ylabel('No-show Rate')

plt.tight_layout()
plt.savefig('eda_plots.png')
plt.close()

print("EDA complete. Plots saved as 'noshow_distribution.png' and 'eda_plots.png'.")

No-show rate by SMS_received:
 SMS_received
0    0.167288
1    0.276652
Name: NoShow, dtype: float64
No-show rate by AgeGroup:
 AgeGroup
Young          0.234035
Middle         0.207044
Senior         0.161432
Very Senior    0.164446
Name: NoShow, dtype: float64
No-show rate by AppointmentDay_DOW:
 AppointmentDay_DOW
0    0.213377
1    0.206136
2    0.230769
3    0.194730
4    0.202328
5    0.197210
Name: NoShow, dtype: float64


  age_noshow = df.groupby('AgeGroup')['NoShow'].mean()


EDA complete. Plots saved as 'noshow_distribution.png' and 'eda_plots.png'.


Model Training for Prediction

In [16]:
import pandas as pd
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report, confusion_matrix

# Load cleaned data
df = pd.read_csv('cleaned_appointments.csv')

# Features and target
features = ['Gender', 'Age', 'Neighbourhood', 'Scholarship', 'Hipertension', 'Diabetes', 
            'Alcoholism', 'Handcap', 'SMS_received', 'Date.diff', 'ScheduledDay_DOW', 
            'AppointmentDay_DOW', 'DaysBetween']
X = df[features]
y = df['NoShow']

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define class weights to handle imbalance
class_weights = {0: 1, 1: 4}  # Weight no-show class 4x more to prioritize recall

# Initialize decision tree with class weights
base_model = DecisionTreeClassifier(class_weight=class_weights, random_state=42)

# Define parameter grid for tuning
param_grid = {
    'max_depth': [3, 5, 7, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Perform grid search
grid_search = GridSearchCV(base_model, param_grid, cv=5, scoring='recall_macro', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best model
model = grid_search.best_estimator_
print("Best Parameters:", grid_search.best_params_)

# Predictions
y_pred = model.predict(X_test)

# Evaluate
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, zero_division=0)
recall = recall_score(y_test, y_pred, zero_division=0)

print("\nModel Performance:")
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred, zero_division=0))

# Confusion matrix
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Attended', 'No-show'], 
            yticklabels=['Attended', 'No-show'])
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.savefig('confusion_matrix.png')
plt.close()

# Save model
with open('noshow_model.pkl', 'wb') as f:
    pickle.dump(model, f)

print("Model saved as 'noshow_model.pkl'. Confusion matrix saved as 'confusion_matrix.png'.")

Best Parameters: {'max_depth': 7, 'min_samples_leaf': 2, 'min_samples_split': 10}

Model Performance:
Accuracy: 0.58
Precision: 0.30
Recall: 0.81

Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.52      0.66     17073
           1       0.30      0.81      0.44      4325

    accuracy                           0.58     21398
   macro avg       0.61      0.66      0.55     21398
weighted avg       0.79      0.58      0.62     21398

Model saved as 'noshow_model.pkl'. Confusion matrix saved as 'confusion_matrix.png'.
