In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report


In [4]:
# Load the data from the database
# data = pd.DataFrame.from_records(Appointment.objects.values(
#     'doctor', 'appointment_date', 'start_time', 'end_time', 'status', 'reason_for_visit'
# ))
data = pd.read_csv('appointments_data.csv')
# Feature engineering
data['appointment_date'] = pd.to_datetime(data['appointment_date'])
data['weekday'] = data['appointment_date'].dt.day_name()
data['hour'] = pd.to_datetime(data['start_time'], format='%H:%M:%S').dt.hour

# Map status to binary
data['is_completed'] = data['status'].map({'COMPLETED': 1, 'SCHEDULED': 1, 'CANCELLED': 0})


In [5]:
# Features and target
X = data[['doctor', 'weekday', 'hour']]
y = data['is_completed']

# Encode categorical features
X = pd.get_dummies(X, columns=['doctor', 'weekday'], drop_first=True)

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
model = RandomForestClassifier()
model.fit(X_train, y_train)

# Evaluate
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.62
              precision    recall  f1-score   support

           0       0.26      0.31      0.28        48
           1       0.77      0.72      0.74       152

    accuracy                           0.62       200
   macro avg       0.51      0.51      0.51       200
weighted avg       0.65      0.62      0.63       200



In [6]:
# Aggregate daily appointment counts
daily_data = data.groupby(['doctor', 'appointment_date']).size().reset_index(name='appointment_count')

# Add weekday information
daily_data['weekday'] = daily_data['appointment_date'].dt.day_name()

# Set demand threshold (e.g., top 25% of appointments)
threshold = daily_data['appointment_count'].quantile(0.75)
daily_data['high_demand'] = (daily_data['appointment_count'] >= threshold).astype(int)

# Features and target
X = pd.get_dummies(daily_data[['doctor', 'weekday']], drop_first=True)
y = daily_data['high_demand']

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
model = RandomForestClassifier()
model.fit(X_train, y_train)

# Evaluate
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 1.0
              precision    recall  f1-score   support

           1       1.00      1.00      1.00       190

    accuracy                           1.00       190
   macro avg       1.00      1.00      1.00       190
weighted avg       1.00      1.00      1.00       190



In [8]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

grid_search = GridSearchCV(RandomForestClassifier(), param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)
model = grid_search.best_estimator_

# Use the best parameters from GridSearchCV
model = RandomForestClassifier(
    n_estimators=100, 
    max_depth=10, 
    min_samples_split=2, 
    class_weight='balanced', 
    random_state=42
)

# Train the model again with the best parameters
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate
from sklearn.metrics import accuracy_score, classification_report
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))



Best Parameters: {'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 100}
Accuracy: 1.0
              precision    recall  f1-score   support

           1       1.00      1.00      1.00       190

    accuracy                           1.00       190
   macro avg       1.00      1.00      1.00       190
weighted avg       1.00      1.00      1.00       190



In [11]:
from sklearn.model_selection import cross_val_score
cross_val_score(model, X, y, cv=5, scoring='accuracy').mean()

np.float64(1.0)

In [12]:
# Aggregate daily appointment counts for each doctor
daily_counts = data.groupby(['doctor', 'appointment_date']).size().reset_index(name='appointment_count')

# Check the aggregated data
print(daily_counts.head())


   doctor appointment_date  appointment_count
0       1       2020-01-04                  1
1       1       2020-01-17                  1
2       1       2020-01-21                  1
3       1       2020-01-25                  1
4       1       2020-01-31                  1


In [13]:
# Set a threshold for high demand (e.g., 5 appointments per day)
threshold = 5

# Add a new column for high-demand days
daily_counts['is_high_demand'] = daily_counts['appointment_count'] > threshold

# Merge this information back to your original data
data = pd.merge(data, daily_counts[['doctor', 'appointment_date', 'is_high_demand']], on=['doctor', 'appointment_date'], how='left')

# Check the updated data
print(data.head())


   doctor  patient appointment_date start_time  end_time     status  \
0       1       54       2021-03-10   08:59:13  14:23:36  COMPLETED   
1       4       52       2024-08-29   10:07:47  18:52:30  CANCELLED   
2       3       28       2024-11-20   00:13:55  21:30:46  CANCELLED   
3       3       16       2024-11-16   03:53:35  19:54:46  CANCELLED   
4       1        7       2020-01-17   11:27:06  17:19:36  COMPLETED   

                                    reason_for_visit  created_at  updated_at  \
0   However ability many pick. Life bad smile large.  2025-01-24  2025-01-14   
1  Various why paper describe understand. Floor m...  2025-01-26  2025-01-24   
2  Least his rule concern lose yes traditional. S...  2025-01-11  2025-01-01   
3  Occur fall who morning plant truth perform. Be...  2025-01-11  2025-01-20   
4  Probably thank audience weight establish docto...  2025-01-15  2025-01-01   

     weekday  hour  is_completed  is_high_demand  
0  Wednesday     8             1         