In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
import joblib
import warnings

# NOTE(review): with no category or module argument this suppresses every
# warning process-wide for all cells below, which also hides deprecation
# notices from pandas / scikit-learn. Consider narrowing the filter.
warnings.filterwarnings('ignore')

## 1. Data Loading

In [3]:
# Read the source CSV into `df`; fields in this file are separated by semicolons.
df = pd.read_csv(
    'A_dataset_pengajuan_bpjs_dengan_status.csv',
    sep=';',
)

## 2. Data Preprocessing

In [4]:
# NOTE: this cell rebinds `df` in place of the loaded frame, so it is not
# idempotent: re-running it without re-running the load cell raises a KeyError
# because the categorical columns have already been replaced by dummies.

# One-hot encode the categorical columns.
# The dummy dtype is pinned to an integer type on purpose: pandas >= 2.0 emits
# bool dummies by default, and the later `select_dtypes(include=np.number)`
# feature filter does not treat bool as numeric, so every encoded column would
# silently vanish from the feature matrix.
categorical_cols = ["KELAS_RS", "ADL1", "ADL2", "PAYOR_ID"]
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True, dtype="uint8")

# Parse the date columns and derive the extra features from them.
# NOTE(review): no explicit `format=` / `dayfirst=` is given, so ambiguous
# dates such as 03/04/2020 are resolved by pandas' inference — confirm this
# matches the layout used in the CSV.
df["ADMISSION_DATE"] = pd.to_datetime(df["ADMISSION_DATE"])
df["DISCHARGE_DATE"] = pd.to_datetime(df["DISCHARGE_DATE"])
df["BIRTH_DATE"] = pd.to_datetime(df["BIRTH_DATE"])

# Length of stay in whole days, and age in (fractional) years at admission.
df["LOS_CALCULATED"] = (df["DISCHARGE_DATE"] - df["ADMISSION_DATE"]).dt.days
df["AGE_AT_ADMISSION"] = (df["ADMISSION_DATE"] - df["BIRTH_DATE"]).dt.days / 365.25

# 'VERSI_INACBG' uses a decimal comma; normalise it to a decimal point and
# convert to float. Casting to str first keeps this working when pandas has
# already parsed some or all values as numbers (the `.str` accessor would
# otherwise raise on a numeric column or turn non-string cells into NaN).
df['VERSI_INACBG'] = (
    df['VERSI_INACBG']
    .astype(str)
    .str.replace(',', '.', regex=False)
    .astype(float)
)

# Drop the columns that are not relevant as user input.
columns_to_drop = [
    "KODE_RS", "KODE_TARIF", "PTD",  # dropped: unique identifiers
    "INACBG", "DESKRIPSI_INACBG", "DESKRIPSI_SP", "DESKRIPSI_SR", "DESKRIPSI_SI", "DESKRIPSI_SD",
    "NAMA_FILE", "ADMISSION_DATE", "DISCHARGE_DATE", "BIRTH_DATE",  # dropped: information already extracted
    "SEP", "C1", "C2", "C3", "C4", "DIAGLIST", "PROCLIST"
]

# errors="ignore" lets the drop succeed even if some listed columns are absent.
df = df.drop(columns=columns_to_drop, errors="ignore")

In [5]:
# Print the column names, non-null counts and dtypes of the preprocessed frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 44 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   KELAS_RAWAT         10000 non-null  int64  
 1   BIRTH_WEIGHT        10000 non-null  int64  
 2   SEX                 10000 non-null  int64  
 3   DISCHARGE_STATUS    10000 non-null  int64  
 4   TARIF_INACBG        10000 non-null  int64  
 5   TARIF_SUBACUTE      10000 non-null  int64  
 6   TARIF_CHRONIC       10000 non-null  int64  
 7   TARIF_SP            10000 non-null  int64  
 8   TARIF_SR            10000 non-null  int64  
 9   TARIF_SI            10000 non-null  int64  
 10  TARIF_SD            10000 non-null  int64  
 11  TOTAL_TARIF         10000 non-null  int64  
 12  TARIF_RS            10000 non-null  int64  
 13  TARIF_POLI_EKS      10000 non-null  int64  
 14  LOS                 10000 non-null  int64  
 15  ICU_INDIKATOR       10000 non-null  int64  
 16  ICU_L

## 3. Feature and Target Split

In [6]:
# Target: the TARIF_RS column.
y = df["TARIF_RS"]

# Features: everything except the target and the `status` column (if present),
# restricted to numeric dtypes so the regressor only receives numbers.
# NOTE(review): the other TARIF_* columns and TOTAL_TARIF remain in X — confirm
# they are really available at prediction time and do not leak the target.
X = (
    df
    .drop(columns=["TARIF_RS", "status"], errors='ignore')
    .select_dtypes(include=np.number)
)

print(f"Number of features (X): {X.shape[1]}")
print(f"Shape of target (y): {y.shape}")

Number of features (X): 41
Shape of target (y): (10000,)


## 4. Train-Test Split

In [7]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the shape of each partition, one per line.
print(
    f"X_train shape: {X_train.shape}",
    f"X_test shape: {X_test.shape}",
    f"y_train shape: {y_train.shape}",
    f"y_test shape: {y_test.shape}",
    sep="\n",
)

X_train shape: (8000, 41)
X_test shape: (2000, 41)
y_train shape: (8000,)
y_test shape: (2000,)


## 5. Model Training (Random Forest Regressor)

In [8]:
# Build a 100-tree random forest (seeded, using all CPU cores) and train it on
# the training split. `fit` returns the estimator itself, so construction and
# training are chained into one expression.
regressor_model = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
    n_jobs=-1,
).fit(X_train, y_train)

print("Random Forest Regressor model training complete.")

Random Forest Regressor model training complete.


## 6. Model Evaluation

In [9]:
# Predict on the held-out split and score the predictions.
y_pred = regressor_model.predict(X_test)

r2 = r2_score(y_true=y_test, y_pred=y_pred)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = np.sqrt(mse)  # square root of MSE, i.e. in the same units as the target

# One metric per line, each rounded to two decimals.
print(
    f"Mean Squared Error (MSE): {mse:.2f}",
    f"Root Mean Squared Error (RMSE): {rmse:.2f}",
    f"R-squared (R2): {r2:.2f}",
    sep="\n",
)

Mean Squared Error (MSE): 2123416750616.00
Root Mean Squared Error (RMSE): 1457194.82
R-squared (R2): 0.97


## 7. Save Model and Feature Columns

In [11]:
# Persist the fitted regressor.
joblib.dump(regressor_model, 'C_regressor_model.pkl')

# Persist the training feature columns as well, as this section's title
# promises. The features are one-hot encoded, so any consumer has to rebuild
# its input frame with exactly these columns (same names, same order) before
# calling `predict`; without this list the encoding cannot be reproduced.
joblib.dump(list(X.columns), 'C_feature_columns.pkl')

print("Model regressor disimpan.")

Model regressor disimpan.
