In [1]:
import pandas as pd
import numpy as np
import os
import json

# 读取数据
root_path = "/workspace/med_proj"
file_path = f"{root_path}/data/all_data.xlsx"
data = pd.read_excel(file_path)

id2cluster = pd.read_csv(f"{root_path}/data/48h/result.csv", index_col=None, header=0)
with open(f"{root_path}/results/dataprocess/48h/name_id_mapping.json", "r", encoding="utf-8") as f:
    name2id = json.load(f)
id2name = {v: k for k, v in name2id.items()}

name2cluster = {}
for index, row in id2cluster.iterrows():
    name2cluster[id2name[row['ID']]] = row['cluster']

# df_filtered = data[data['姓名'].isin(name2id.keys())]

# # 将 name2id 中的对应ID添加到新列中
# df_filtered['cluster'] = df_filtered['姓名'].map(name2cluster)
# df_filtered.rename(columns={'姓名': 'name'}, inplace=True)
# df_filtered['性别'] = df_filtered['性别'].replace({'男': 0, '女': 1})
# df_filtered['转归'] = df_filtered['转归'].replace({'存活': 1,'死亡': 0, '好转': 1, "未愈": 0, "其他":1})
# df_filtered['机械通气'] = df_filtered['机械通气'].replace({'无': 0, '有': 1})
# df_filtered['是否二感'] = df_filtered['是否二感'].replace({'否': 0, '是': 1, '疑似': 0})
# df_filtered['慢性病个数'] = df_filtered['慢性病个数'].replace({0: 0, 1: 1, 2: 1, 3: 2, 4: 2, 5: 2, 6: 2})


df_filtered = data[data['姓名'].isin(name2id.keys())].copy()
df_filtered.loc[:, 'cluster'] = df_filtered['姓名'].map(name2cluster)

df_filtered = df_filtered.rename(columns={'姓名': 'name'})

df_filtered.loc[:, '性别'] = df_filtered['性别'].map({'男': 0, '女': 1})
df_filtered['性别'] = df_filtered['性别'].astype(int)
df_filtered.loc[:, '转归'] = df_filtered['转归'].map({'存活': 0, '死亡': 1, '好转': 0, "未愈": 1, "其他": 0})
df_filtered['转归'] = df_filtered['转归'].astype(int)
df_filtered.loc[:, '机械通气'] = df_filtered['机械通气'].map({'无': 0, '有': 1})
df_filtered['机械通气'] = df_filtered['机械通气'].astype(int)
df_filtered.loc[:, '是否二感'] = df_filtered['是否二感'].map({'否': 0, '是': 1, '疑似': 0})
df_filtered['是否二感'] = df_filtered['是否二感'].astype(int)
df_filtered.loc[:, '慢性病个数'] = df_filtered['慢性病个数'].map({0: 0, 1: 1, 2: 1, 3: 2, 4: 2, 5: 2, 6: 2})
df_filtered['慢性病个数'] = df_filtered['慢性病个数'].astype(int)

df_filtered['血管活性药物'] = df_filtered['血管活性药物'].replace({'无': 0, '是': 1})


des_csv = f"{root_path}/results/multireg/48h/cluster_feature_adjust.csv"
if not os.path.exists(os.path.dirname(des_csv)):
    os.makedirs(os.path.dirname(des_csv), exist_ok=True)
df_filtered.to_csv(des_csv, index=False)

  df_filtered['血管活性药物'] = df_filtered['血管活性药物'].replace({'无': 0, '是': 1})


In [2]:
import pandas as pd
import statsmodels.api as sm
from sklearn.preprocessing import OneHotEncoder

df = df_filtered

# 生成 cluster 的虚拟变量，基准类别为 cluster=4
df = pd.get_dummies(df, columns=['cluster'], drop_first=False)
df = df.rename(columns={'cluster_2': 'cluster_2', 'cluster_3': 'cluster_3', 'cluster_4': 'cluster_4'})
# df = df.drop(columns=['cluster_4'])

# 生成慢性病个数的虚拟变量，基准类别为 0 (没有慢性病)
df = pd.get_dummies(df, columns=['慢性病个数'], drop_first=False)
df = df.rename(columns={'慢性病个数_1': '慢性病个数_1', '慢性病个数_2': '慢性病个数_2'})

In [3]:
df['cluster_1'] = df['cluster_1'].astype(int)
df['cluster_2'] = df['cluster_2'].astype(int)
df['cluster_3'] = df['cluster_3'].astype(int)
df['cluster_4'] = df['cluster_4'].astype(int)
df['慢性病个数_0'] = df['慢性病个数_0'].astype(int)
df['慢性病个数_1'] = df['慢性病个数_1'].astype(int)
df['慢性病个数_2'] = df['慢性病个数_2'].astype(int)

In [6]:
# 定义群组和阈值
group_conditions = [
    ("性别", 0, 1),
    ("SOFA", 8),
    ("APACHE", 15),
    ("年龄", 60)
]

# 遍历每个 cluster
for i in range(1, 4):
    print(f"\n--- Dealing with cluster {i} ---")

    # 遍历每组条件
    for condition in group_conditions:
        var, threshold = condition[0], condition[1:]

        print(f"\nEvaluating condition for {var}")

        if var == "性别":
            for gender in threshold:
                print(f"Filtering data where {var} = {gender}")
                df_filtered = df[(df['性别'] == gender)]
                df_filtered = df_filtered[(df_filtered[f'cluster_{i}'] == 1) | (df_filtered['cluster_4'] == 1)]

                print(len(df_filtered))

                X = df_filtered[[f'cluster_{i}', 'SOFA', 'APACHE', '机械通气', '血管活性药物', '年龄']]
                y = df_filtered['转归']
                print(f"Fitting logistic regression model for {var} = {gender}")

                X = sm.add_constant(X)
                logit_model = sm.Logit(y, X)
                result = logit_model.fit()

                print(f"Results for 性别 = {gender}")
                print(result.summary())

                OR_values = np.exp(result.params)
                conf = result.conf_int()
                conf['OR_lower'] = np.exp(conf[0])
                conf['OR_upper'] = np.exp(conf[1])

                OR_summary = pd.DataFrame({
                    'OR': OR_values,
                    'OR_lower': conf['OR_lower'],
                    'OR_upper': conf['OR_upper']
                })

                print("Odds Ratios:")
                print(OR_summary)

        else:
            for comparison in [(" <=", " <= "), (" >", " > ")]:
                operator, logic_op = comparison
                condition_desc = f"{var}{logic_op}{threshold[0]}"
                print(f"Filtering data where {condition_desc}")

                if operator == " <= ":
                    df_filtered = df[(df[var] <= threshold[0])]
                else:
                    df_filtered = df[(df[var] > threshold[0])]

                df_filtered = df_filtered[(df_filtered[f'cluster_{i}'] == 1) | (df_filtered['cluster_4'] == 1)]

                # 排除当前变量
                columns = [f'cluster_{i}', 'SOFA', 'APACHE', '机械通气', '血管活性药物', '年龄']
                if var in columns:
                    columns.remove(var)

                print(len(df_filtered))

                X = df_filtered[columns]
                y = df_filtered['转归']
                print(f"Fitting logistic regression model for {condition_desc}")

                X = sm.add_constant(X)
                logit_model = sm.Logit(y, X)
                result = logit_model.fit()

                print(f"Results for {condition_desc}")
                print(result.summary())

                OR_values = np.exp(result.params)
                conf = result.conf_int()
                conf['OR_lower'] = np.exp(conf[0])
                conf['OR_upper'] = np.exp(conf[1])

                OR_summary = pd.DataFrame({
                    'OR': OR_values,
                    'OR_lower': conf['OR_lower'],
                    'OR_upper': conf['OR_upper']
                })

                print("Odds Ratios:")
                print(OR_summary)


--- Dealing with cluster 1 ---

Evaluating condition for 性别
Filtering data where 性别 = 0
132
Fitting logistic regression model for 性别 = 0
Optimization terminated successfully.
         Current function value: 0.592796
         Iterations 6
Results for 性别 = 0
                           Logit Regression Results                           
Dep. Variable:                     转归   No. Observations:                  132
Model:                          Logit   Df Residuals:                      125
Method:                           MLE   Df Model:                            6
Date:                Wed, 25 Sep 2024   Pseudo R-squ.:                  0.1396
Time:                        14:55:44   Log-Likelihood:                -78.249
converged:                       True   LL-Null:                       -90.949
Covariance Type:            nonrobust   LLR p-value:                 0.0002878
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------

                           Logit Regression Results                           
Dep. Variable:                     转归   No. Observations:                  186
Model:                          Logit   Df Residuals:                      180
Method:                           MLE   Df Model:                            5
Date:                Wed, 25 Sep 2024   Pseudo R-squ.:                  0.1556
Time:                        14:55:45   Log-Likelihood:                -106.53
converged:                       True   LL-Null:                       -126.16
Covariance Type:            nonrobust   LLR p-value:                 2.105e-07
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -4.1401      1.466     -2.824      0.005      -7.013      -1.267
cluster_2      0.9374      0.454      2.064      0.039       0.047       1.828
SOFA           0.2372      0.058      4.089      0.0

In [5]:
# 定义自变量和因变量



X = df[['cluster_1', 'cluster_2', 'cluster_3', 'SOFA', 'APACHE', '机械通气', '血管活性药物']]
y = df['转归']

# 添加常数项
X = sm.add_constant(X)

# 建立逻辑回归模型
logit_model = sm.Logit(y, X)
result = logit_model.fit()

# 输出结果
print(result.summary())

# 计算 OR 值（即系数的指数）
OR_values = np.exp(result.params)

# 计算 OR 值的置信区间（系数置信区间的指数）
conf = result.conf_int()
conf['OR_lower'] = np.exp(conf[0])
conf['OR_upper'] = np.exp(conf[1])

# 创建包含 OR 值和置信区间的 DataFrame
OR_summary = pd.DataFrame({
    'OR': OR_values,
    'OR_lower': conf['OR_lower'],
    'OR_upper': conf['OR_upper']
})

# 打印 OR 值及其置信区间
print(OR_summary)

Optimization terminated successfully.
         Current function value: 0.596869
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:                     转归   No. Observations:                  313
Model:                          Logit   Df Residuals:                      305
Method:                           MLE   Df Model:                            7
Date:                Thu, 05 Sep 2024   Pseudo R-squ.:                  0.1110
Time:                        12:04:26   Log-Likelihood:                -186.82
converged:                       True   LL-Null:                       -210.16
Covariance Type:            nonrobust   LLR p-value:                 6.465e-08
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const          2.2879      0.495      4.621      0.000       1.317       3.258
cluster_1     -1.0436      0.