In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statistics


from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFECV
from sklearn.metrics import (
    mean_squared_error, r2_score, accuracy_score, auc, roc_curve, roc_auc_score, confusion_matrix
)
from sklearn.model_selection import (
    train_test_split,    
    StratifiedKFold,
    GridSearchCV, RandomizedSearchCV
)

# Loading Data

In [4]:
# Load the pre-processed table; the first CSV column is used as the row index.
data = pd.read_csv('./data_processed/data_first_record_with_commorbidities.csv', index_col=0)
print(data.shape)

# Fill missing values with each column's mean.
# - `numeric_only=True` keeps this working on pandas >= 2.0 even if a
#   non-numeric column is present (identical result for all-numeric frames).
# - Rebinding instead of `inplace=True` keeps the cell idempotent on re-run.
# NOTE(review): the means are computed over all rows, i.e. before any
# train/test split further down — confirm that this is acceptable for the
# evaluation, otherwise impute inside the training fold only.
print(data.isna().any().any())
data = data.fillna(data.mean(numeric_only=True))
print(data.isna().any().any())

# Summarise the length-of-stay (LOS) column: mean ± std and three quantiles.
los_mean = data.los.mean()
los_q20, los_q50, los_q80 = data.los.quantile([0.20, 0.50, 0.80])
print(f"Average days of LOS: {los_mean:.2f}±{data.los.std():.2f} days")
print(
    f"Quantile  0.20  0.50  0.80\n"
    f"          {los_q20:.2f}  {los_q50:.2f}  {los_q80:.2f}"
)

(652, 34)
True
False
Average days of LOS: 3.29±4.35 days
Quantile  0.20  0.50  0.80
          1.05  1.91  4.12


In [None]:
# Display the column labels of the loaded table.
data.columns

In [None]:
# Binary label: 1 when a stay is at least as long as the mean LOS computed
# over every row of `data` (not only the rows kept below), else 0.
long_stay_label = (data.los >= data.los.mean()).astype('int32')

# Keep only rows with die_in_icu == 1, drop that now-constant column, and
# replace the raw `los` values with the binary label (aligned on the index).
died_in_icu_mask = data.die_in_icu == 1
df = data.loc[died_in_icu_mask, :].drop(columns=['die_in_icu']).assign(los=long_stay_label)

df

In [None]:
# Separate the predictors from the binary `los` label; `y` stays a
# one-column DataFrame because it is selected with a boolean column mask.
is_label_column = df.columns == 'los'
X = df.loc[:, ~is_label_column]
y = df.loc[:, is_label_column]

# Hold out 20% of the rows for evaluation, using a fixed seed for the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=84,
)

In [None]:
# Logistic-regression classifier with a raised iteration cap.
model = LogisticRegression(max_iter=1000)

# The one-column label frame is flattened to 1-D before fitting.
model.fit(X=X_train, y=np.ravel(y_train))

In [None]:
# Hard class predictions for the held-out rows.
y_pred = model.predict(X_test)

# Fraction of held-out rows predicted correctly, printed as a percentage.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy * 100.0}")

# Confusion matrix (scikit-learn convention: rows = true class,
# columns = predicted class).
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

In [None]:
# Column 1 of predict_proba holds the probability of the second class
# (label 1 for the 0/1 labels used here); it drives the ROC curve.
y_scores = model.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_scores)
roc_auc = auc(fpr, tpr)

fig, ax = plt.subplots(figsize=(5, 5), dpi=300)

# ROC curve with the area underneath it shaded.
ax.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
ax.fill_between(fpr, tpr, color='darkorange', alpha=0.2)  # alpha controls the fill transparency

# Dashed diagonal for visual reference.
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')

ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.legend(loc="lower right")
plt.show()

In [None]:
def evaluate_seed(features, labels, seed, test_size=0.2):
    """Fit and score one logistic regression on a single random split.

    All intermediate objects stay local, so the notebook-level
    ``X_train`` / ``X_test`` / ``y_train`` / ``y_test`` / ``model`` created
    by earlier cells are not overwritten by the sweep.

    Parameters
    ----------
    features : DataFrame of predictors.
    labels : one-column DataFrame (or 1-D array-like) of binary labels.
    seed : value passed to ``train_test_split`` as ``random_state``.
    test_size : fraction of rows held out for scoring.

    Returns
    -------
    dict with keys ``seed``, ``acc`` (accuracy in percent) and ``auc``
    (area under the ROC curve on the held-out rows).
    """
    X_tr, X_te, y_tr, y_te = train_test_split(
        features, labels, test_size=test_size, random_state=seed
    )

    clf = LogisticRegression(max_iter=1000)
    clf.fit(X_tr, np.ravel(y_tr))

    y_true = np.ravel(y_te)
    acc = accuracy_score(y_true, clf.predict(X_te))

    split_fpr, split_tpr, _ = roc_curve(y_true, clf.predict_proba(X_te)[:, 1])
    return {"seed": seed, "acc": acc * 100.0, "auc": auc(split_fpr, split_tpr)}


# Repeat the split/fit/score cycle over 1000 seeds and keep the metrics in a
# table instead of printing one line per seed.
# NOTE(review): the spread across seeds is the informative result here;
# reporting only the best-scoring seed would overstate performance.
seed_results = pd.DataFrame(
    [evaluate_seed(X, y, seed) for seed in range(0, 1000)]
).set_index("seed")

seed_results.describe()