In [49]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib


In [51]:
def load_and_preprocess_data(filepath='OPD manage/opd_hospital_data (2).csv'):
    # Load data
    df = pd.read_csv(filepath)
    
    # Calculate waiting time
    df['Consultation_Start_Time'] = pd.to_datetime(df['Consultation_Start_Time'], format='%H:%M:%S')
    df['Waiting_Time'] = (df['Consultation_Start_Time'] - pd.to_datetime('00:00:00', format='%H:%M:%S')).dt.total_seconds() / 60
    
    return df

In [53]:
def train_model(df):
    # Prepare features and target
    X = df.drop(['Waiting_Time', 'Consultation_Start_Time', 'Consultation_End_Time', 'Date', 'Patient_ID', 'Doctor_ID'], axis=1)
    y = df['Waiting_Time']
    
    # Identify numeric and categorical columns
    numeric_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
    categorical_features = X.select_dtypes(include=['object']).columns.tolist()
    
    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Create preprocessor
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', Pipeline([
                ('imputer', SimpleImputer(strategy='median')),
                ('scaler', StandardScaler())
            ]), numeric_features),
            ('cat', Pipeline([
                ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
                ('onehot', OneHotEncoder(handle_unknown='ignore'))
            ]), categorical_features)
        ])
    
    # Create model pipeline
    model = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))
    ])
    
    # Train the model
    model.fit(X_train, y_train)
    
    # Evaluate the model
    y_pred = model.predict(X_test)
    
    print("\nModel Performance Metrics:")
    print(f"Mean Absolute Error: {mean_absolute_error(y_test, y_pred):.2f} minutes")
    print(f"Mean Squared Error: {mean_squared_error(y_test, y_pred):.2f}")
    print(f"Root Mean Squared Error: {np.sqrt(mean_squared_error(y_test, y_pred)):.2f} minutes")
    print(f"R-squared Score: {r2_score(y_test, y_pred):.2f}")
    
    # Perform cross-validation
    cv_scores = cross_val_score(model, X, y, cv=5, scoring='neg_mean_absolute_error')
    print("\nCross-Validation Scores:")
    print("MAE Scores:", -cv_scores)
    print(f"Mean CV MAE: {-cv_scores.mean():.2f}")
    print(f"Standard Deviation: {cv_scores.std():.2f}")
    
    # Feature Importance
    feature_names = (
        model.named_steps['preprocessor']
        .named_transformers_['num'].get_feature_names_out(numeric_features).tolist() + 
        model.named_steps['preprocessor']
        .named_transformers_['cat'].get_feature_names_out(categorical_features).tolist()
    )
    
    importances = model.named_steps['regressor'].feature_importances_
    feature_importance = pd.DataFrame({
        'feature': feature_names,
        'importance': importances
    }).sort_values('importance', ascending=False)
    
    print("\nTop 10 Most Important Features:")
    print(feature_importance.head(10))
    
    # Save the model
    joblib.dump(model, 'opd_queue_prediction_model.joblib')
    
    return model, X_test, y_test

In [55]:
def test_model_with_sample_patients(model, X_test, y_test):
    print("\n--- Model Testing ---")
    
    # Test with a few samples from the test set
    num_samples_to_test = min(5, len(X_test))
    test_samples = X_test.head(num_samples_to_test)
    
    predictions = model.predict(test_samples)
    
    print("\nSample Predictions:")
    for i, (index, sample) in enumerate(test_samples.iterrows(), 1):
        print(f"\nSample Patient {i}:")
        # Print unique identifiable features
        for col in sample.index:
            print(f"{col}: {sample[col]}")
        print(f"Predicted Waiting Time: {predictions[i-1]:.2f} minutes")

In [57]:
# ipython-input-11-fb745421c5b3
def test_model_with_sample_patients(model, X_test, y_test):
    print("\n--- Model Testing ---")
    
    # Test with a few samples from the test set
    num_samples_to_test = min(5, len(X_test))
    test_samples = X_test.head(num_samples_to_test)
    
    predictions = model.predict(test_samples)
    
    print("\nSample Predictions:")
    for i, (index, sample) in enumerate(test_samples.iterrows(), 1):
        print(f"\nSample Patient {i}:")
        # Print unique identifiable features
        for col in sample.index:
            print(f"{col}: {sample[col]}")
        print(f"Predicted Waiting Time: {predictions[i-1]:.2f} minutes")

    # Provide a custom sample prediction method
    def predict_waiting_time(patient_data, model):  # Pass model as an argument
        """
        Helper function to predict waiting time for a new patient
        
        Parameters:
        patient_data (dict): Dictionary with patient features
        model: The trained model to use for prediction
        
        Returns:
        float: Predicted waiting time in minutes
        """
        sample_patient = pd.DataFrame([patient_data])
        prediction = model.predict(sample_patient)
        return prediction[0]
    
    print("\nExample of predicting for a custom patient:")
    sample_patient_1 = {
        'Age': 35,
        'Patient_Type': 'New',
        'Patient_Condition_Severity': 3,
        'Total_Patients_in_Queue': 15,
        'Staff_Availability': 3,
        'Day_of_the_Week': 'Tuesday',
        'Time_of_Day': 'Second Half',
        'Consultation_Duration': 30,
        'medical problem': 'Back Pain'
    }
    
    custom_prediction = predict_waiting_time(sample_patient_1, model)  # Pass model to the function
    print("Custom Patient Predicted Waiting Time:", custom_prediction)
    
    return predictions

In [61]:
def main():
    # Load data
    file_path = 'OPD manage/opd_hospital_data (2).csv'  # Replace with your actual file path
    df = load_and_preprocess_data(file_path)
    
    # Train model
    model, X_test, y_test = train_model(df)
    
    # Test model
    test_model_with_sample_patients(model, X_test, y_test)

if __name__ == "__main__":
    main()


Model Performance Metrics:
Mean Absolute Error: 357.74 minutes
Mean Squared Error: 188044.68
Root Mean Squared Error: 433.64 minutes
R-squared Score: -0.08

Cross-Validation Scores:
MAE Scores: [345.71992171 358.19375642 348.26994092 351.57373608 358.39593475]
Mean CV MAE: 352.43
Standard Deviation: 5.14

Top 10 Most Important Features:
                        feature  importance
0                           Age    0.101141
3         Consultation_Duration    0.089061
1       Total_Patients_in_Queue    0.066089
2            Staff_Availability    0.038734
7641     Day_of_the_Week_Monday    0.008926
7644   Day_of_the_Week_Thursday    0.008787
7645    Day_of_the_Week_Tuesday    0.008499
7646  Day_of_the_Week_Wednesday    0.008360
7643     Day_of_the_Week_Sunday    0.008249
7642   Day_of_the_Week_Saturday    0.008140

--- Model Testing ---

Sample Predictions:

Sample Patient 1:
Age: 63
Appointment_Time: 12:09:34
Patient_Type: New
Patient_Condition_Severity: Routine
Total_Patients_in_Queue:

ValueError: columns are missing: {'Breaks_Lunch_Schedules', 'Appointment_Time'}