In [2]:
import socket
import subprocess
import time

import mlflow
import mlflow.pyfunc
import mlflow.sklearn
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

In [3]:
# Read the transformed electronic-health-record CSV and preview its first rows
DATASET_CSV = "../../../datasets/electronic-health-record/output/transformed_electronic_health_record_dataset.csv"
data = pd.read_csv(DATASET_CSV)
data.head()


Unnamed: 0,Hospital Service Area,Hospital County,Permanent Facility Id,Age Group,Zip Code - 3 digits,Length of Stay,Patient Disposition,CCSR Diagnosis Code,CCSR Procedure Code,APR DRG Code,...,Ethnicity_Spanish/Hispanic,Type of Admission_Elective,Type of Admission_Emergency,Type of Admission_Newborn,Type of Admission_Trauma,Type of Admission_Urgent,APR Medical Surgical Description_Medical,APR Medical Surgical Description_Surgical,Emergency Department Indicator_N,Emergency Department Indicator_Y
0,5.438521,6.293987,1,0,120,2,4.213964,3.540219,2.281583,640,...,True,False,False,True,False,False,True,False,True,False
1,5.032723,4.842077,245,0,117,2,4.213964,3.540219,2.32178,640,...,True,False,False,True,False,False,True,False,True,False
2,5.463602,5.37699,1637,0,113,2,4.213964,3.540219,2.32178,640,...,False,False,False,True,False,False,True,False,True,False
3,5.032723,5.211568,541,0,117,1,4.213964,3.540219,2.32178,640,...,False,False,False,True,False,False,True,False,True,False
4,5.463602,5.412338,1463,0,999,6,4.213964,8.35,4.212766,303,...,False,True,False,False,False,False,False,True,True,False


In [4]:
# Summarize the frame: column names, non-null counts and dtypes
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24359 entries, 0 to 24358
Data columns (total 34 columns):
 #   Column                                     Non-Null Count  Dtype  
---  ------                                     --------------  -----  
 0   Hospital Service Area                      24359 non-null  float64
 1   Hospital County                            24359 non-null  float64
 2   Permanent Facility Id                      24359 non-null  int64  
 3   Age Group                                  24359 non-null  int64  
 4   Zip Code - 3 digits                        24359 non-null  int64  
 5   Length of Stay                             24359 non-null  int64  
 6   Patient Disposition                        24359 non-null  float64
 7   CCSR Diagnosis Code                        24359 non-null  float64
 8   CCSR Procedure Code                        24359 non-null  float64
 9   APR DRG Code                               24359 non-null  int64  
 10  APR MDC Code          

In [5]:
# Pick the response column and the predictor columns used by the model
target = 'Length of Stay'
features = [
    'Total Charges',
    'Total Costs',
    'CCSR Procedure Code',
    'CCSR Diagnosis Code',
    'APR Severity of Illness Code',
]
# NOTE(review): 'Total Charges' / 'Total Costs' are presumably only known after
# discharge — confirm they are legitimately available when predicting the stay.

# Design matrix (predictors) and response vector
y = data[target]
X = data[features]

In [6]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Show the shape of each partition on one line
partitions = (X_train, X_test, y_train, y_test)
print(*(part.shape for part in partitions))

(19487, 5) (4872, 5) (19487,) (4872,)


In [7]:
# Address of the local MLflow tracking server (these match the values the
# tracking URI below has always pointed at).
MLFLOW_HOST = "127.0.0.1"
MLFLOW_PORT = 5000
MLFLOW_STARTUP_TIMEOUT_S = 30


def _tracking_server_is_up(host=MLFLOW_HOST, port=MLFLOW_PORT, timeout=1.0):
    """Return True if something accepts TCP connections on (host, port)."""
    try:
        with socket.create_connection((host, port), timeout=timeout):
            return True
    except OSError:
        return False


# Argument list instead of a shell string: no intermediate shell is spawned,
# and nothing in the command is subject to shell parsing.
command = ["mlflow", "ui", "--host", MLFLOW_HOST, "--port", str(MLFLOW_PORT)]

# Launch a server only when none is listening yet, so re-running this cell does
# not spawn a second instance that fails to bind the port. `process` stays None
# when an already-running server is reused.
process = None
if not _tracking_server_is_up():
    process = subprocess.Popen(command)

    # Popen returns immediately; wait until the server accepts connections so
    # the following cells do not race its startup.
    deadline = time.monotonic() + MLFLOW_STARTUP_TIMEOUT_S
    while not _tracking_server_is_up():
        if process.poll() is not None:
            raise RuntimeError(
                f"'mlflow ui' exited early with return code {process.returncode}"
            )
        if time.monotonic() > deadline:
            process.terminate()
            raise TimeoutError(
                f"MLflow UI did not start within {MLFLOW_STARTUP_TIMEOUT_S} seconds"
            )
        time.sleep(0.5)

# Set our tracking server uri for logging
mlflow.set_tracking_uri(uri=f"http://{MLFLOW_HOST}:{MLFLOW_PORT}")

In [8]:
# Select the MLflow experiment that subsequent runs are recorded under
EXPERIMENT_NAME = "Length of Stay"
mlflow.set_experiment(EXPERIMENT_NAME)

2024/04/26 14:45:04 INFO mlflow.tracking.fluent: Experiment with name 'Length of Stay' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/427006878424607100', creation_time=1714122904298, experiment_id='427006878424607100', last_update_time=1714122904298, lifecycle_stage='active', name='Length of Stay', tags={}>

In [9]:
# Hyperparameters for the Random Forest; they are passed to the estimator as
# keyword arguments and logged to MLflow as-is.
# random_state pins the forest's bootstrap/feature sampling so repeated runs
# produce the same metrics (same seed value as the train/test split).
params = {
    'n_estimators': 100,
    'min_samples_leaf': 10,
    'max_features': 'sqrt',
    'max_depth': 20,
    'random_state': 42,
}

In [10]:
# Train, evaluate and log the Random Forest inside one MLflow run named 'stay'
with mlflow.start_run(run_name='stay'):

    # Build the regressor from the hyperparameter dict and fit it on the training split
    model = RandomForestRegressor(**params)
    model.fit(X_train, y_train)

    # Predict on the held-out split
    y_pred = model.predict(X_test)

    # Score the predictions: MSE, its square root, and R^2
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)

    # Record the hyperparameters used for this run
    mlflow.log_params(params)

    # Record each metric under its name, in the order mse, rmse, r2
    for metric_name, metric_value in (("mse", mse), ("rmse", rmse), ("r2", r2)):
        mlflow.log_metric(metric_name, metric_value)

    # Print one markdown table row: model | R^2 | MSE | RMSE
    print(f"|  Random Forest  |  {r2!s}  |  {mse!s}  |  {rmse!s}  | ")

    # Log the fitted model to the run under "random_forest_model"
    mlflow.sklearn.log_model(model, "random_forest_model")


|  Random Forest  |  0.6780207901814971  |  23.143743763840337  |  4.810794504428592  | 


In [11]:
feature_names = X.columns.tolist()

# Build the results frame: the test features plus the actual and predicted
# length of stay. assign() returns a new frame, so X_test is left untouched.
# y_test is aligned on the index it shares with X_test; y_pred is a positional
# array in the same row order as X_test.
result = X_test[feature_names].assign(
    actual_Length_of_Stay=y_test,
    predicted_Length_of_Stay=y_pred,
)

# Display five random rows of the result (seeded so the same rows are shown on re-run)
print(result.sample(5, random_state=42))

       Total Charges  Total Costs  CCSR Procedure Code  CCSR Diagnosis Code  \
5359        6.997789     8.007747             2.651376             2.415954   
7665        8.213477     8.641590             2.276778             3.340426   
23810       8.293902     8.802095             3.600000            10.523810   
18140       7.502062     7.704284            11.280347             5.006849   
22381       8.961740     9.344106             4.946058             7.567460   

       APR Severity of Illness Code  actual_Length_of_Stay  \
5359                       0.000000                      2   
7665                       0.711242                      3   
23810                      1.144531                      1   
18140                      0.711242                      4   
22381                      0.711242                      8   

       predicted_Length_of_Stay  
5359                   2.045178  
7665                   2.909041  
23810                  4.320960  
18140           

***Comparison between different models***

|  Model  |  R-squared  |  Mean Squared Error  |  Root Mean Squared Error  |
| :-: | :-: | :-: | :-: |
|  Random Forest  |  0.6787462631550304  |  23.091597041023945  |  4.805371686043021  | 
|  XGBoost  |  0.6725437778951293  |  23.537429334466328  |  4.851538862512216  | 
|  Decision Tree  |  0.557420805506533  |  31.812425026875246  |  5.640250440084664  | 
|  Linear Regression  |  0.4070929204236463  |  42.61793652210357  |  6.5282414570926806  | 
|  K Neighbors  |  0.5964003358883267  |  29.010591133004926  |  5.386148079379635  | 