The main purpose of this project was to develop and deploy an interactive machine learning application capable of predicting the probability of an Emergency Room (ER) visit for elderly pneumonia patients based on various health indicators. The dataset used here is synthetic.

In [20]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    mean_absolute_error, mean_squared_error, r2_score
)

from sklearn.linear_model import LinearRegression, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor

import warnings
warnings.filterwarnings("ignore")


In [21]:
# Synthetic patient records, stored column-wise (15 values per column).
# The last column, "er_visit_chance", is the regression target.
data = {
    "age": [70, 72, 88, 65, 90, 68, 75, 82, 79, 69, 85, 73, 80, 91, 77],
    "heart_rate": [110, 118, 124, 98, 130, 105, 115, 122, 119, 101, 125, 117, 121, 128, 113],
    "resp_rate": [24, 28, 30, 22, 32, 25, 27, 29, 28, 23, 31, 26, 30, 33, 27],
    "oxygen_sat": [92, 88, 85, 95, 82, 90, 89, 86, 87, 93, 84, 91, 88, 83, 89],
    "temperature": [
        98.2, 98.6, 97.0, 97.3, 98.5, 99.0, 99.1, 99.3,
        97.5, 98.1, 98.0, 99.0, 100.0, 100.5, 100.6,
    ],
    "dry_cough": [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0],
    "days_since_discharge": [2, 3, 1, 4, 1, 3, 2, 2, 1, 4, 1, 3, 1, 2, 3],
    "er_visit_chance": [
        0.50, 0.65, 0.90, 0.25, 0.95, 0.40, 0.60, 0.85,
        0.70, 0.30, 0.88, 0.55, 0.78, 0.98, 0.68,
    ],
}

# Build the modelling frame from the column dictionary and display it.
df = pd.DataFrame(data=data)
df

Unnamed: 0,age,heart_rate,resp_rate,oxygen_sat,temperature,dry_cough,days_since_discharge,er_visit_chance
0,70,110,24,92,98.2,0,2,0.5
1,72,118,28,88,98.6,1,3,0.65
2,88,124,30,85,97.0,0,1,0.9
3,65,98,22,95,97.3,1,4,0.25
4,90,130,32,82,98.5,0,1,0.95
5,68,105,25,90,99.0,1,3,0.4
6,75,115,27,89,99.1,0,2,0.6
7,82,122,29,86,99.3,1,2,0.85
8,79,119,28,87,97.5,0,1,0.7
9,69,101,23,93,98.1,1,4,0.3


In [22]:
# Separate the regression target from the predictor columns.
y = df["er_visit_chance"]
X = df.drop(columns=["er_visit_chance"])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [23]:
# Learn the standardization statistics from the training rows only,
# then apply the same transformation to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [24]:
def evaluate_model(model, name, X_eval=None, y_true=None):
    """Print MAE, MSE, RMSE and R² for a fitted model.

    Parameters
    ----------
    model : object
        Fitted estimator (or fitted search object) exposing ``predict``.
    name : str
        Label shown in the report header.
    X_eval : array-like, optional
        Features to predict on. Defaults to the notebook-level
        ``X_test_scaled`` so existing two-argument calls keep working.
    y_true : array-like, optional
        Ground-truth targets matching ``X_eval``. Defaults to the
        notebook-level ``y_test``.

    Returns
    -------
    None
        The metrics are printed, not returned.
    """
    # Fall back to the held-out split only when the caller did not supply data,
    # so the helper can also score the training split or any other data.
    if X_eval is None:
        X_eval = X_test_scaled
    if y_true is None:
        y_true = y_test

    preds = model.predict(X_eval)
    mae = mean_absolute_error(y_true, preds)
    mse = mean_squared_error(y_true, preds)
    rmse = np.sqrt(mse)  # RMSE is reported in the same units as the target
    r2 = r2_score(y_true, preds)

    print(f"\n Model: {name}")
    print("MAE:", mae)
    print("MSE:", mse)
    print("RMSE:", rmse)
    print("R² Score:", r2)

Linear Regression

In [25]:
# Baseline: ordinary least squares on the standardized training features.
lr = LinearRegression().fit(X_train_scaled, y_train)
evaluate_model(model=lr, name="Linear Regression")


 Model: Linear Regression
MAE: 0.07499582064124684
MSE: 0.007874169547453987
RMSE: 0.0887365175531133
R² Score: 0.3250711816468014


Ridge Regression (with tuning)

In [26]:
# Candidate regularization strengths, compared with 3-fold cross-validation.
ridge_params = {"alpha": [0.1, 1, 10, 50]}
ridge = GridSearchCV(estimator=Ridge(), param_grid=ridge_params, cv=3).fit(
    X_train_scaled, y_train
)

evaluate_model(ridge, "Ridge Regression (Tuned)")


 Model: Ridge Regression (Tuned)
MAE: 0.03900409812893026
MSE: 0.001719828966379652
RMSE: 0.04147082066199862
R² Score: 0.8525860885960299


Decision Tree

In [27]:
# Hyper-parameter grid for the decision tree, searched with 3-fold CV.
dt_params = {
    "max_depth": [2,3,4,5],
    "min_samples_split": [2,3,4]
}
# Seed the tree so a re-run reproduces the same result: without it the
# choice between equally good splits can differ from run to run.
# The seed matches the one used for the train/test split.
dt = GridSearchCV(DecisionTreeRegressor(random_state=42), dt_params, cv=3)
dt.fit(X_train_scaled, y_train)

evaluate_model(dt, "Decision Tree (Tuned)")


 Model: Decision Tree (Tuned)
MAE: 0.09999999999999999
MSE: 0.011666666666666667
RMSE: 0.10801234497346433
R² Score: 4.440892098500626e-16


Random Forest (Tuned)

In [28]:
# Hyper-parameter grid for the random forest, searched with 3-fold CV.
rf_params = {
    "n_estimators": [50,100,200],
    "max_depth": [3,5,7],
    "min_samples_split": [2,3]
}
# Seed the forest: bootstrap sampling and feature selection are random, so an
# unseeded model gives different scores (and possibly a different best
# parameter set) on every run. The seed matches the train/test split.
rf = GridSearchCV(RandomForestRegressor(random_state=42), rf_params, cv=3)
rf.fit(X_train_scaled, y_train)

evaluate_model(rf, "Random Forest (Tuned)")


 Model: Random Forest (Tuned)
MAE: 0.04366666666666671
MSE: 0.0038242799999999993
RMSE: 0.06184076325531566
R² Score: 0.6722045714285716


Gradient Boosting (Tuned)

In [29]:
# Hyper-parameter grid for gradient boosting, searched with 3-fold CV.
gb_params = {
    "n_estimators": [50,100,200],
    "learning_rate": [0.01, 0.05, 0.1],
    "max_depth": [2,3,4]
}

# Seed the booster so the model saved for deployment can be reproduced
# exactly from this notebook. The seed matches the train/test split.
gb = GridSearchCV(GradientBoostingRegressor(random_state=42), gb_params, cv=3)
gb.fit(X_train_scaled, y_train)

evaluate_model(gb, "Gradient Boosting (Tuned)")


 Model: Gradient Boosting (Tuned)
MAE: 0.027897009080193064
MSE: 0.0009444121920273904
RMSE: 0.030731290113293168
R² Score: 0.9190503835405095


XGBoost (Tuned)

In [30]:
# Hyper-parameter grid for XGBoost (same shape as the gradient boosting grid),
# searched with 3-fold cross-validation.
xgb_params = {
    "n_estimators": [50, 100, 200],
    "learning_rate": [0.01, 0.05, 0.1],
    "max_depth": [2, 3, 4],
}

xgb = GridSearchCV(
    estimator=XGBRegressor(objective='reg:squarederror'),
    param_grid=xgb_params,
    cv=3,
).fit(X_train_scaled, y_train)

evaluate_model(xgb, "XGBoost (Tuned)")


 Model: XGBoost (Tuned)
MAE: 0.09155391454696654
MSE: 0.00930446994203308
RMSE: 0.09645968039566107
R² Score: 0.20247400496859358


In [31]:
import joblib

# Persist only the refit best estimator, not the whole GridSearchCV wrapper:
# predictions are identical, and the artifact no longer carries the search
# bookkeeping (parameter grid, cross-validation results).
# NOTE(review): the file name says "best", but in the recorded run XGBoost
# scored well below Gradient Boosting, and app.py loads best_model.pkl rather
# than this file. Confirm whether this artifact is still needed.
joblib.dump(xgb.best_estimator_, "best_er_model.pkl")
# The scaler must ship with the model so inference inputs are standardized
# exactly as the training data was.
joblib.dump(scaler, "scaler.pkl")


['scaler.pkl']

In [32]:
import joblib

# Persist only the refit best Gradient Boosting estimator, not the whole
# GridSearchCV wrapper: app.py only calls .predict(), which gives the same
# result, and the artifact no longer carries the search bookkeeping.
joblib.dump(gb.best_estimator_, "best_model.pkl")
# The scaler must ship with the model so app.py standardizes user inputs
# exactly as the training data was.
joblib.dump(scaler, "scaler.pkl")

['scaler.pkl']

In [33]:
%%writefile app.py
import streamlit as st
import joblib
import pandas as pd
import numpy as np

# Column order must match the order used when the scaler and model were fitted.
FEATURE_COLUMNS = ['age', 'heart_rate', 'resp_rate', 'oxygen_sat',
                   'temperature', 'dry_cough', 'days_since_discharge']


@st.cache_resource
def load_artifacts():
    """Load the persisted model and scaler once and reuse them across reruns.

    Streamlit re-executes this script on every widget interaction; without
    caching, both pickle files would be read from disk on each rerun.

    Returns
    -------
    tuple
        ``(model, scaler)`` as stored in best_model.pkl and scaler.pkl.
    """
    # Load the best model (gradient boosting) and the fitted scaler.
    return joblib.load('best_model.pkl'), joblib.load('scaler.pkl')


model, scaler = load_artifacts()

st.title('ER Visit Chance Prediction')
st.write('Enter patient details to predict the chance of an ER visit.')

# Input fields for features
age = st.slider('Age', 60, 100, 75)
heart_rate = st.slider('Heart Rate', 80, 140, 110)
resp_rate = st.slider('Respiratory Rate', 20, 35, 27)
oxygen_sat = st.slider('Oxygen Saturation (%)', 80, 100, 90)
temperature = st.slider('Temperature (°F)', 96.0, 102.0, 98.6)
dry_cough = st.checkbox('Dry Cough (1 for Yes, 0 for No)')
dry_cough_val = 1 if dry_cough else 0
days_since_discharge = st.slider('Days Since Discharge', 1, 7, 3)

# Create a single-row DataFrame from the user inputs.
input_data = pd.DataFrame(
    [[age, heart_rate, resp_rate, oxygen_sat, temperature, dry_cough_val, days_since_discharge]],
    columns=FEATURE_COLUMNS,
)

# Scale the input with the same scaler that was fitted on the training data.
scaled_input = scaler.transform(input_data)

# Predict, then clip to [0, 1]: the model is a regressor, so its raw output is
# not guaranteed to be a valid probability.
raw_prediction = model.predict(scaled_input)[0]
prediction = float(np.clip(raw_prediction, 0.0, 1.0))

st.subheader('Prediction Result:')
st.write(f'The predicted chance of an ER visit is: {prediction:.2f}')


Overwriting app.py


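Overall Conclusion: For this synthetic dataset and problem, Gradient Boosting proved to be the most effective model for predicting the ER visit chance, with the highest R² and the lowest error on the held-out rows. Note that the test set contains only 3 of the 15 rows (a 20% split), so these metrics are indicative rather than conclusive.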

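Gradient Boosting was the best model, with a high test R² (≈ 0.92) and the lowest error (MAE ≈ 0.03).
Ridge Regression also performed surprisingly well (R² ≈ 0.85), whereas plain Linear Regression reached only R² ≈ 0.33.
XGBoost performed poorly (R² ≈ 0.20).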

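 Download app.py, scaler.pkl, and best_model.pkl, and create a requirements.txt listing streamlit, pandas, numpy, scikit-learn, and joblib. Pin scikit-learn to the version used in this notebook, because pickled estimators are not guaranteed to load under a different version.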


 Place these files in a new folder, initialize Git, and push it to a public GitHub repository.

 Go to share.streamlit.io, connect your GitHub repository, and deploy the application.