In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Load the dummy data
data = pd.read_csv('appointments_data.csv')

# Convert 'appointment_date' and 'start_time' to datetime types
data['appointment_date'] = pd.to_datetime(data['appointment_date'])
data['start_time'] = pd.to_datetime(data['start_time'], format='%H:%M:%S').dt.time

# Extract additional features
data['day_of_week'] = data['appointment_date'].dt.dayofweek
data['time_of_day'] = pd.to_datetime(data['start_time'], format='%H:%M:%S').dt.hour

# Encode categorical features (doctor, patient, status)
encoder = LabelEncoder()
data['doctor'] = encoder.fit_transform(data['doctor'])
data['patient'] = encoder.fit_transform(data['patient'])
data['status'] = encoder.fit_transform(data['status'])

# Label (target): Is the slot available (binary classification)
data['is_slot_available'] = data['status'].apply(lambda x: 1 if x == 'SCHEDULED' else 0)

# Features and target
features = ['doctor', 'day_of_week', 'time_of_day']
target = 'is_slot_available'

X = data[features]
y = data[target]


# Aggregate appointments by doctor and day of the week
appointments_per_day = data.groupby(['doctor', 'day_of_week']).size().reset_index(name='appointment_count')

# Define high-demand days (e.g., days with top 25% of appointment count)
threshold = appointments_per_day['appointment_count'].quantile(0.75)
appointments_per_day['high_demand'] = (appointments_per_day['appointment_count'] > threshold).astype(int)

# Features and target variable
X_demand = appointments_per_day[['doctor', 'day_of_week']]
y_demand = appointments_per_day['high_demand']

# Train a model to predict high-demand days
model_demand = RandomForestClassifier(n_estimators=100, random_state=42)
model_demand.fit(X_demand, y_demand)

# Predictions for high-demand days
appointments_per_day['predicted_high_demand'] = model_demand.predict(X_demand)


In [13]:
from sklearn.ensemble import RandomForestClassifier
# Aggregate appointments by doctor and day of the week
appointments_per_day = data.groupby(['doctor', 'day_of_week']).size().reset_index(name='appointment_count')

# Define high-demand days (e.g., days with top 25% of appointment count)
threshold = appointments_per_day['appointment_count'].quantile(0.75)
appointments_per_day['high_demand'] = (appointments_per_day['appointment_count'] > threshold).astype(int)

# Features and target variable
X_demand = appointments_per_day[['doctor', 'day_of_week']]
y_demand = appointments_per_day['high_demand']

# Train a model to predict high-demand days
model_demand = RandomForestClassifier(n_estimators=100, random_state=42)
model_demand.fit(X_demand, y_demand)

# Predictions for high-demand days
appointments_per_day['predicted_high_demand'] = model_demand.predict(X_demand)


In [14]:
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_demand, y_demand, test_size=0.2, random_state=42)

# Train the model
model_demand = RandomForestClassifier(n_estimators=100, random_state=42)
model_demand.fit(X_train, y_train)

# Make predictions
y_pred = model_demand.predict(X_test)

# Evaluate the model
from sklearn.metrics import accuracy_score
print(f'Accuracy: {accuracy_score(y_test, y_pred)}')

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Now, the 'y_test' variable will be defined

print("Accuracy for Slot Availability Prediction:", accuracy_score(y_test, y_pred))
print("Confusion Matrix for Slot Availability Prediction:")
print(confusion_matrix(y_test, y_pred))


Accuracy: 0.6666666666666666


ValueError: Found input variables with inconsistent numbers of samples: [200, 6]

In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder

# Load the data
data = pd.read_csv('appointments_data.csv')

# Extract features
data['appointment_date'] = pd.to_datetime(data['appointment_date'])
data['start_time'] = pd.to_datetime(data['start_time'], format='%H:%M:%S').dt.time

# Extract time of day
data['hour_of_day'] = pd.to_datetime(data['start_time'], format='%H:%M:%S').dt.hour
data['time_of_day'] = pd.cut(data['hour_of_day'], bins=[0, 6, 12, 18, 24], labels=['night', 'morning', 'afternoon', 'evening'])

# Extract other features
data['day_of_week'] = data['appointment_date'].dt.dayofweek
data['doctor_availability'] = data.groupby(['doctor', 'day_of_week', 'time_of_day'])['start_time'].transform('count')

# Encode categorical features
encoder = LabelEncoder()
data['doctor'] = encoder.fit_transform(data['doctor'])
data['patient'] = encoder.fit_transform(data['patient'])
data['status'] = encoder.fit_transform(data['status'])
data['time_of_day'] = encoder.fit_transform(data['time_of_day'])

# Define the target: "Best Slot" based on availability and doctor trends
data['best_slot'] = (data['doctor_availability'] > data['doctor_availability'].median()).astype(int)

# Features and target
features = ['doctor', 'day_of_week', 'time_of_day', 'doctor_availability']
target = 'best_slot'

X = data[features]
y = data[target]

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Random Forest model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Evaluate the model
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))


  data['doctor_availability'] = data.groupby(['doctor', 'day_of_week', 'time_of_day'])['start_time'].transform('count')


              precision    recall  f1-score   support

           0       1.00      1.00      1.00       124
           1       1.00      1.00      1.00        76

    accuracy                           1.00       200
   macro avg       1.00      1.00      1.00       200
weighted avg       1.00      1.00      1.00       200



In [16]:
# Predict for a specific doctor on a specific day
doctor_id = 1  # Example doctor
day_of_week = 2  # Example day (e.g., Monday)
time_of_day = 1  # Example time of day (e.g., morning)

# Prepare the data for prediction
input_data = pd.DataFrame([[doctor_id, day_of_week, time_of_day, 3]], columns=features)  # Example availability count = 3

# Make prediction
best_slot_prediction = model.predict(input_data)

if best_slot_prediction == 1:
    print("This is a good time slot.")
else:
    print("This time slot is not optimal.")


This time slot is not optimal.


In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder

# Load data
data = pd.read_csv('appointments_data.csv')

# Feature extraction
data['appointment_date'] = pd.to_datetime(data['appointment_date'])
data['start_time'] = pd.to_datetime(data['start_time'], format='%H:%M:%S').dt.time
data['hour_of_day'] = pd.to_datetime(data['start_time'], format='%H:%M:%S').dt.hour
data['time_of_day'] = pd.cut(data['hour_of_day'], bins=[0, 6, 12, 18, 24], labels=['night', 'morning', 'afternoon', 'evening'])
data['day_of_week'] = data['appointment_date'].dt.dayofweek
data['doctor_availability'] = data.groupby(['doctor', 'day_of_week', 'time_of_day'])['start_time'].transform('count')

# Ranking the slots based on availability
data['slot_score'] = data['doctor_availability'] / data['doctor_availability'].max()

# Encode categorical features
encoder = LabelEncoder()
data['doctor'] = encoder.fit_transform(data['doctor'])
data['patient'] = encoder.fit_transform(data['patient'])
data['status'] = encoder.fit_transform(data['status'])
data['time_of_day'] = encoder.fit_transform(data['time_of_day'])

# Target: Predicting the best slots (higher score indicates better slots)
data['best_slot'] = (data['slot_score'] > 0.8).astype(int)  # Classifying based on the top 20% availability

# Define features and target
features = ['doctor', 'day_of_week', 'time_of_day', 'doctor_availability']
target = 'best_slot'

X = data[features]
y = data[target]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predict on test data
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

# Rank the slots based on predicted best slots
data['predicted_best_slot'] = model.predict(data[features])

# Let's recommend the top N slots for a specific doctor
top_recommended_slots = data[data['doctor'] == 1].sort_values(by='predicted_best_slot', ascending=False).head(10)

print("Top Recommended Time Slots for Doctor 1:")
print(top_recommended_slots[['doctor', 'day_of_week', 'time_of_day', 'start_time', 'predicted_best_slot']])


  data['doctor_availability'] = data.groupby(['doctor', 'day_of_week', 'time_of_day'])['start_time'].transform('count')


              precision    recall  f1-score   support

           0       1.00      1.00      1.00       166
           1       1.00      1.00      1.00        34

    accuracy                           1.00       200
   macro avg       1.00      1.00      1.00       200
weighted avg       1.00      1.00      1.00       200

Top Recommended Time Slots for Doctor 1:
     doctor  day_of_week  time_of_day start_time  predicted_best_slot
66        1            0            3   04:41:11                    1
48        1            0            3   01:06:15                    1
274       1            0            3   06:51:03                    1
301       1            0            3   06:40:16                    1
171       1            0            3   05:15:11                    1
566       1            0            3   04:33:30                    1
559       1            0            3   05:36:55                    1
906       1            0            3   05:25:29                    1
97

In [18]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Generate Dummy Data
# Generate Dummy Data
np.random.seed(42)
data = pd.DataFrame({
    'doctor': np.random.randint(1, 5, 1000),  # 4 doctors (IDs 1 to 4)
    'day_of_week': np.random.randint(0, 7, 1000),  # Days (0=Monday, 6=Sunday)
    'time_of_day': np.random.randint(0, 4, 1000),  # 4 slots (morning, afternoon, evening, night)
    'start_time': pd.date_range("2025-01-01", periods=1000, freq="min").time,  # Random times
    'status': np.random.choice(['SCHEDULED', 'COMPLETED', 'CANCELLED'], 1000),
    'patient_demand': np.random.randint(1, 20, 1000),  # Random demand score
})

# Create a target variable: 1=Best Slot, 0=Not a Best Slot
data['slot_score'] = data['patient_demand'] / data.groupby(['doctor', 'day_of_week'])['patient_demand'].transform('sum')
threshold = 0.1  # Reduced threshold
data['best_slot'] = (data['slot_score'] > threshold).astype(int)

# Debugging: Check slot_score distribution and number of best slots
print(f"Slot Score Distribution:\n{data['slot_score'].describe()}")
print(f"Number of Best Slots: {data['best_slot'].sum()}")

# Features and Target
features = ['doctor', 'day_of_week', 'time_of_day', 'patient_demand']
target = 'best_slot'

X = data[features]
y = data[target]

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Random Forest Classifier
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predictions and Evaluation
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

# Predict on Full Dataset
data['predicted_best_slot'] = model.predict(data[features])

# Automatically Recommend Top Slots for Each Doctor
top_slots_per_doctor = (
    data[data['predicted_best_slot'] == 1]
    .sort_values(by=['doctor', 'slot_score'], ascending=[True, False])  # Rank by score
    .groupby('doctor')
    .head(5)  # Top 5 slots per doctor
)

print("Top Recommended Slots for All Doctors:")
print(top_slots_per_doctor[['doctor', 'day_of_week', 'time_of_day', 'start_time', 'slot_score']])


Slot Score Distribution:
count    1000.000000
mean        0.028000
std         0.016157
min         0.001901
25%         0.014165
50%         0.027460
75%         0.040886
max         0.083770
Name: slot_score, dtype: float64
Number of Best Slots: 0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       200

    accuracy                           1.00       200
   macro avg       1.00      1.00      1.00       200
weighted avg       1.00      1.00      1.00       200

Top Recommended Slots for All Doctors:
Empty DataFrame
Columns: [doctor, day_of_week, time_of_day, start_time, slot_score]
Index: []


In [19]:
# Generate Dummy Data with More Variance
np.random.seed(42)
data = pd.DataFrame({
    'doctor': np.random.randint(1, 5, 1000),  # 4 doctors (IDs 1 to 4)
    'day_of_week': np.random.randint(0, 7, 1000),  # Days (0=Monday, 6=Sunday)
    'time_of_day': np.random.randint(0, 4, 1000),  # 4 slots (morning, afternoon, evening, night)
    'start_time': pd.date_range("2025-01-01", periods=1000, freq="min").time,  # Random times
    'status': np.random.choice(['SCHEDULED', 'COMPLETED', 'CANCELLED'], 1000),
    'patient_demand': np.random.randint(5, 100, 1000),  # Increased demand range
})

# Normalize Slot Score
data['slot_score'] = data['patient_demand'] / data.groupby(['doctor', 'day_of_week'])['patient_demand'].transform('sum')

# Calculate Dynamic Threshold: Top 10% Slots
threshold = data['slot_score'].quantile(0.9)  # Top 10% cutoff
data['best_slot'] = (data['slot_score'] >= threshold).astype(int)

# Debugging: Check slot_score distribution and best slots count
print(f"Slot Score Distribution:\n{data['slot_score'].describe()}")
print(f"Dynamic Threshold for Best Slots: {threshold}")
print(f"Number of Best Slots: {data['best_slot'].sum()}")

# Features and Target
features = ['doctor', 'day_of_week', 'time_of_day', 'patient_demand']
target = 'best_slot'

X = data[features]
y = data[target]

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Random Forest Classifier
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predictions and Evaluation
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

# Predict on Full Dataset
data['predicted_best_slot'] = model.predict(data[features])

# Recommend Top Slots for Each Doctor
top_slots_per_doctor = (
    data[data['predicted_best_slot'] == 1]
    .sort_values(by=['doctor', 'slot_score'], ascending=[True, False])  # Rank by score
    .groupby('doctor')
    .head(5)  # Top 5 slots per doctor
)

print("Top Recommended Slots for All Doctors:")
print(top_slots_per_doctor[['doctor', 'day_of_week', 'time_of_day', 'start_time', 'slot_score']])


Slot Score Distribution:
count    1000.000000
mean        0.028000
std         0.015399
min         0.002176
25%         0.015174
50%         0.027050
75%         0.038974
max         0.073991
Name: slot_score, dtype: float64
Dynamic Threshold for Best Slots: 0.04774756163734571
Number of Best Slots: 100
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       182
           1       1.00      0.83      0.91        18

    accuracy                           0.98       200
   macro avg       0.99      0.92      0.95       200
weighted avg       0.99      0.98      0.98       200

Top Recommended Slots for All Doctors:
     doctor  day_of_week  time_of_day start_time  slot_score
180       1            3            0   03:00:00    0.063554
845       1            3            0   14:05:00    0.063554
127       1            3            2   02:07:00    0.062257
995       1            3            3   16:35:00    0.062257
732       1            

In [6]:
import joblib

# Save the model
joblib.dump(model, "slot_recommendation_model.pkl")
print("Model saved!")


Model saved!


In [12]:
# Load required libraries
import joblib
import pandas as pd

# Load the saved model
model = joblib.load("slot_recommendation_model.pkl")
print("Model loaded!")

# Define the new input
# Define the new input with all required features
new_input = {
    "doctor": 50,
    "day_of_week": 3,   # Thursday
    "time_of_day": 2,   # Afternoon slot
    "patient_demand": 0.5  # Example value, replace with actual demand if available
}

# Convert input into a DataFrame
input_df = pd.DataFrame([new_input])

# Make prediction
prediction = model.predict(input_df)

# Interpret the prediction
if prediction[0] == 1:
    print("This is a recommended slot!")
else:
    print("This slot is not recommended.")



Model loaded!
This slot is not recommended.
