This project involved building a predictive model for the probability of ER visits for elderly pneumonia patients. Here's a summary of what has been done:

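Data Preparation: We started by creating a small, hand-entered dataset of 15 patients with features such as age, heart rate, respiratory rate, oxygen saturation, WBC count, CRP, and days since discharge, plus the target er_visit_chance (the probability of an ER visit within 7 days). The data was then split into training and testing sets (80/20), and the features were standardized with a StandardScaler fitted on the training set only.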

Model Training and Evaluation: Several regression models were trained and evaluated to predict the er_visit_chance:

LinearRegression
Ridge (ridge regression, tuned using GridSearchCV)
DecisionTreeRegressor (tuned using GridSearchCV)
RandomForestRegressor (tuned using GridSearchCV)
GradientBoostingRegressor (tuned using GridSearchCV)
XGBRegressor (tuned using GridSearchCV)
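The models were evaluated on the held-out test set using MAE, MSE, RMSE, and R² score. The Random Forest Regressor demonstrated the best performance with an R² score of approximately 0.94. Note that the dataset contains only 15 patients, so the 20% test split holds just 3 samples; these scores are illustrative rather than conclusive.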

Model Saving: The best-performing Random Forest model (rf) and the StandardScaler were saved using joblib for later use.

Streamlit Application Deployment: A Streamlit web application (streamlit_app.py) was created to provide an interactive interface for predicting the ER visit probability. This application loads the saved Random Forest model and scaler, takes patient inputs, and displays the predicted probability.

Public Access with ngrok: The Streamlit application was deployed locally and then exposed to the internet using ngrok, providing a public URL for access.

In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    mean_absolute_error, mean_squared_error, r2_score
)

from sklearn.linear_model import LinearRegression, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor

import warnings
warnings.filterwarnings("ignore")


In [None]:
# Patient features, one list entry per patient (15 patients in total).
features = {
    "age": [70, 72, 88, 65, 90, 68, 75, 82, 79, 69, 85, 73, 80, 91, 77],
    "heart_rate": [110, 118, 124, 98, 130, 105, 115, 122, 119, 101, 125, 117, 121, 128, 113],
    "resp_rate": [24, 28, 30, 22, 32, 25, 27, 29, 28, 23, 31, 26, 30, 33, 27],
    "oxygen_sat": [92, 88, 85, 95, 82, 90, 89, 86, 87, 93, 84, 91, 88, 83, 89],
    "wbc_count": [12, 15, 17, 10, 18, 13, 14, 16, 15, 11, 17, 14, 16, 18, 13],
    "crp": [60, 74, 85, 50, 91, 67, 72, 80, 78, 55, 88, 73, 82, 95, 70],
    "days_since_discharge": [2, 3, 1, 4, 1, 3, 2, 2, 1, 4, 1, 3, 1, 2, 3],
}

# Target: probability of visiting ER within 7 days (0–1)
target = [
    0.65, 0.72, 0.91, 0.32, 0.95,
    0.58, 0.70, 0.88, 0.75, 0.41,
    0.90, 0.66, 0.81, 0.96, 0.73,
]

# Features first, target last, so the resulting column order is unchanged.
data = {**features, "er_visit_chance": target}

df = pd.DataFrame(data)
df


Unnamed: 0,age,heart_rate,resp_rate,oxygen_sat,wbc_count,crp,days_since_discharge,er_visit_chance
0,70,110,24,92,12,60,2,0.65
1,72,118,28,88,15,74,3,0.72
2,88,124,30,85,17,85,1,0.91
3,65,98,22,95,10,50,4,0.32
4,90,130,32,82,18,91,1,0.95
5,68,105,25,90,13,67,3,0.58
6,75,115,27,89,14,72,2,0.7
7,82,122,29,86,16,80,2,0.88
8,79,119,28,87,15,78,1,0.75
9,69,101,23,93,11,55,4,0.41


In [None]:
# Separate the regression target from the predictor columns.
y = df["er_visit_chance"]
X = df.drop(columns=["er_visit_chance"])

# Hold out 20% of the rows for testing; the fixed seed keeps the split stable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts


In [None]:
# Learn the scaling statistics from the training rows only, then apply that
# same fitted transform to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [None]:
def evaluate_model(model, name):
    """Print MAE, MSE, RMSE and R² for ``model`` on the held-out test split.

    Predictions are made on the notebook-level ``X_test_scaled`` and compared
    against ``y_test``. The metrics are only printed; nothing is returned.

    Args:
        model: Fitted estimator exposing ``predict``.
        name: Label shown in the report header.
    """
    y_pred = model.predict(X_test_scaled)
    squared_error = mean_squared_error(y_test, y_pred)

    # (label, value) pairs in the order they are reported.
    report = (
        ("MAE:", mean_absolute_error(y_test, y_pred)),
        ("MSE:", squared_error),
        ("RMSE:", np.sqrt(squared_error)),
        ("R² Score:", r2_score(y_test, y_pred)),
    )

    print(f"\n Model: {name}")
    for label, value in report:
        print(label, value)


Linear Regression

In [None]:
# Untuned baseline: a plain linear regression on the standardized features.
lr = LinearRegression().fit(X_train_scaled, y_train)
evaluate_model(model=lr, name="Linear Regression")



 Model: Linear Regression
MAE: 0.10708143261067814
MSE: 0.01893730839036363
RMSE: 0.1376128932562775
R² Score: -0.41793490443654413


Ridge Regression (with tuning)

In [None]:
# Candidate regularization strengths, compared with 3-fold cross-validation.
ridge_params = dict(alpha=[0.1, 1, 10, 50])
ridge = GridSearchCV(estimator=Ridge(), param_grid=ridge_params, cv=3).fit(
    X_train_scaled, y_train
)

evaluate_model(model=ridge, name="Ridge Regression (Tuned)")



 Model: Ridge Regression (Tuned)
MAE: 0.05080436150052057
MSE: 0.0036951645638009785
RMSE: 0.060787865267674754
R² Score: 0.7233237847403595


Decision Tree

In [None]:
# Tree depth and minimum split size explored with 3-fold cross-validation.
dt_params = {
    "max_depth": [2, 3, 4, 5],
    "min_samples_split": [2, 3, 4],
}

# Seed the tree so that tie-breaking between equally good splits, and
# therefore the tuned result, is identical on every run. The seed matches the
# one already used for the train/test split.
dt = GridSearchCV(DecisionTreeRegressor(random_state=42), dt_params, cv=3)
dt.fit(X_train_scaled, y_train)

evaluate_model(dt, "Decision Tree (Tuned)")



 Model: Decision Tree (Tuned)
MAE: 0.049999999999999954
MSE: 0.0025666666666666615
RMSE: 0.050662280511902164
R² Score: 0.8078202995008323


Random Forest (Tuned)

In [None]:
# Forest size, tree depth and minimum split size explored with 3-fold CV.
rf_params = {
    "n_estimators": [50, 100, 200],
    "max_depth": [3, 5, 7],
    "min_samples_split": [2, 3],
}

# A random forest draws bootstrap samples at random, so without a seed the
# selected hyper-parameters and the reported metrics change from run to run.
# The seed matches the one already used for the train/test split.
rf = GridSearchCV(RandomForestRegressor(random_state=42), rf_params, cv=3)
rf.fit(X_train_scaled, y_train)

evaluate_model(rf, "Random Forest (Tuned)")



 Model: Random Forest (Tuned)
MAE: 0.019666666666667
MSE: 0.0007549466666666814
RMSE: 0.02747629281156178
R² Score: 0.9434732113144748


Gradient Boosting (Tuned)

In [None]:
# Number of boosting stages, shrinkage and tree depth explored with 3-fold CV.
gb_params = {
    "n_estimators": [50, 100, 200],
    "learning_rate": [0.01, 0.05, 0.1],
    "max_depth": [2, 3, 4],
}

# Seed the booster so its internal trees are built identically on every run.
# The seed matches the one already used for the train/test split.
gb = GridSearchCV(GradientBoostingRegressor(random_state=42), gb_params, cv=3)
gb.fit(X_train_scaled, y_train)

evaluate_model(gb, "Gradient Boosting (Tuned)")



 Model: Gradient Boosting (Tuned)
MAE: 0.02881842838593555
MSE: 0.001633548567863427
RMSE: 0.04041718159228111
R² Score: 0.8776877112248682


XGBoost (Tuned)

In [None]:
# Number of boosting rounds, shrinkage and tree depth explored with 3-fold CV.
xgb_params = dict(
    n_estimators=[50, 100, 200],
    learning_rate=[0.01, 0.05, 0.1],
    max_depth=[2, 3, 4],
)

# Squared-error objective, i.e. a plain regression on the target value.
xgb_base = XGBRegressor(objective='reg:squarederror')
xgb = GridSearchCV(xgb_base, xgb_params, cv=3).fit(X_train_scaled, y_train)

evaluate_model(model=xgb, name="XGBoost (Tuned)")



 Model: XGBoost (Tuned)
MAE: 0.10254947900772095
MSE: 0.01576207738568545
RMSE: 0.1255471122156358
R² Score: -0.18018882255548263


In [None]:
import joblib

# Persist the tuned Random Forest search object under the "best model" name:
# it had the highest test-set R² of the models evaluated in this notebook.
# The XGBoost search that used to be saved here had the lowest (negative) R².
joblib.dump(rf, "best_er_model.pkl")

# The fitted scaler must be saved too, so inference applies the same scaling.
joblib.dump(scaler, "scaler.pkl")


['scaler.pkl']

In [None]:
pip install streamlit



In [None]:
pip install pyngrok



In [None]:
import streamlit as st

# --------------------------
# Streamlit App (uses the in-memory `scaler` and `rf` from this notebook)
# --------------------------
st.title("ER Visit Probability Prediction for Elderly Pneumonia Patients")
st.write("Enter patient features to predict the probability of ER visit within 7 days.")

# One numeric widget per model feature, each with its own bounds and default.
age = st.number_input("Age", min_value=50, max_value=100, value=70)
heart_rate = st.number_input("Heart Rate", min_value=50, max_value=150, value=110)
resp_rate = st.number_input("Respiratory Rate", min_value=10, max_value=40, value=24)
oxygen_sat = st.number_input("Oxygen Saturation (%)", min_value=70, max_value=100, value=92)
wbc_count = st.number_input("WBC Count", min_value=0, max_value=50, value=12)
crp = st.number_input("CRP", min_value=0, max_value=200, value=60)
days_since_discharge = st.number_input("Days Since Discharge", min_value=0, max_value=30, value=2)

# Single-row frame whose columns follow the training feature order.
patient_record = {
    "age": age,
    "heart_rate": heart_rate,
    "resp_rate": resp_rate,
    "oxygen_sat": oxygen_sat,
    "wbc_count": wbc_count,
    "crp": crp,
    "days_since_discharge": days_since_discharge,
}
input_df = pd.DataFrame([patient_record])

# Apply the training-time scaling before handing the row to the model.
input_scaled = scaler.transform(input_df)

# The tuned Random Forest 'rf' returns one value per row; take the only one.
pred_prob = rf.predict(input_scaled)[0]

st.subheader("Predicted Probability of ER Visit within 7 Days")
st.write(f"{pred_prob:.2f} ({pred_prob*100:.1f}%)")



In [None]:
import os

from pyngrok import ngrok

# IMPORTANT: never commit a real ngrok authtoken to the notebook. Export it as
# the NGROK_AUTH_TOKEN environment variable before running this cell. The
# 'YOUR_AUTHTOKEN' placeholder is only a reminder and will not authenticate.
# Any token that was ever stored in this notebook should be treated as leaked
# and revoked in the ngrok dashboard.
NGROK_AUTH_TOKEN = os.environ.get("NGROK_AUTH_TOKEN", "YOUR_AUTHTOKEN")
ngrok.set_auth_token(NGROK_AUTH_TOKEN)

print("Ngrok authtoken set. Remember to replace 'YOUR_AUTHTOKEN' with your actual token.")

Ngrok authtoken set. Remember to replace 'YOUR_AUTHTOKEN' with your actual token.


In [None]:
from pyngrok import ngrok
import subprocess
import time

# Launch the Streamlit server as a background child process (headless, port 8501).
streamlit_cmd = [
    "streamlit", "run", "streamlit_app.py",
    "--server.port", "8501",
    "--server.headless", "true",
]
streamlit_process = subprocess.Popen(streamlit_cmd)

# Fixed pause so the server has a chance to come up before the tunnel opens.
time.sleep(5)

# Expose the local Streamlit port through an ngrok tunnel and report its address.
public_url = ngrok.connect(8501)
print(f"Streamlit App URL: {public_url}")

Streamlit App URL: NgrokTunnel: "https://negational-paulina-fastidiously.ngrok-free.dev" -> "http://localhost:8501"


In [None]:
import joblib

# Write the tuned Random Forest search object to disk so the app can load it.
rf_model_path = 'best_rf_model.pkl'
joblib.dump(rf, rf_model_path)
print(f"Random Forest model saved as '{rf_model_path}'")

Random Forest model saved as 'best_rf_model.pkl'


In [None]:
%%writefile streamlit_app.py
import streamlit as st
import pandas as pd
import joblib


@st.cache_resource
def load_artifacts():
    """Load the saved Random Forest model and the feature scaler once.

    Streamlit re-executes this script on every widget interaction. Caching
    keeps both objects in memory instead of unpickling the files on each
    rerun. After re-saving either file, restart the app (or clear the cache)
    to pick up the new version.

    Returns:
        Tuple ``(model, scaler)`` loaded from ``best_rf_model.pkl`` and
        ``scaler.pkl`` in the working directory.
    """
    # NOTE: joblib.load unpickles arbitrary objects; only load files that
    # were produced by a trusted training run.
    return joblib.load('best_rf_model.pkl'), joblib.load('scaler.pkl')


# Load the trained model and scaler
rf, scaler = load_artifacts()

# --------------------------
# Streamlit App
# --------------------------
st.title("ER Visit Probability Prediction for Elderly Pneumonia Patients")
st.write("Enter patient features to predict the probability of ER visit within 7 days.")

# Input widgets: one numeric field per model feature.
age = st.number_input("Age", min_value=50, max_value=100, value=70)
heart_rate = st.number_input("Heart Rate", min_value=50, max_value=150, value=110)
resp_rate = st.number_input("Respiratory Rate", min_value=10, max_value=40, value=24)
oxygen_sat = st.number_input("Oxygen Saturation (%)", min_value=70, max_value=100, value=92)
wbc_count = st.number_input("WBC Count", min_value=0, max_value=50, value=12)
crp = st.number_input("CRP", min_value=0, max_value=200, value=60)
days_since_discharge = st.number_input("Days Since Discharge", min_value=0, max_value=30, value=2)

# Prediction button: only predict when the user explicitly asks for it.
if st.button("Predict ER Visit Probability"):
    # Single-row frame whose columns follow the training feature order.
    input_df = pd.DataFrame({
        "age": [age],
        "heart_rate": [heart_rate],
        "resp_rate": [resp_rate],
        "oxygen_sat": [oxygen_sat],
        "wbc_count": [wbc_count],
        "crp": [crp],
        "days_since_discharge": [days_since_discharge]
    })

    # Scale the input features with the scaler fitted at training time.
    input_scaled = scaler.transform(input_df)

    # Use the trained Random Forest model 'rf' to predict (one value per row).
    pred_prob = rf.predict(input_scaled)[0]

    st.subheader("Predicted Probability of ER Visit within 7 Days")
    st.write(f"{pred_prob:.2f} ({pred_prob*100:.1f}%)")

Overwriting streamlit_app.py
