In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
import numpy as np

# 读取数据

In [2]:
# Load the preprocessed CSV; the file's first column becomes the row index.
data = pd.read_csv(filepath_or_buffer='./data_processed/data.csv', index_col=0)

# Preview the first five rows (the last expression is rendered as the cell output).
data.head(5)

Unnamed: 0,los,gender,age,heart_rate,respiratory_rate,hematocrit,rdw,platelet,mcv,mch,...,race_BLACK,race_HISPANIC/LATINO,race_OTHER,race_WHITE,language_ENGLISH,marital_status_MARRIED,marital_status_SINGLE,marital_status_WIDOWED,insurance_Medicare,insurance_Other
327,4.10571,1,0.684717,0.2875,0.190476,0.451327,0.484848,0.125445,0.803419,0.742015,...,0,0,0,1,1,1,0,0,1,0
1022,1.9681,1,0.83386,0.2875,0.261905,0.513274,0.579545,0.161922,0.700855,0.68059,...,0,0,0,1,1,1,0,0,1,0
896,4.87824,1,0.514176,0.4125,0.690476,0.716814,0.640152,0.005338,0.794872,0.7543,...,0,0,1,0,1,1,0,0,1,0
908,5.04106,1,0.393528,0.40625,0.285714,0.362832,0.583333,0.059609,0.735043,0.761671,...,0,1,0,0,0,0,1,0,0,0
559,1.37475,0,0.464178,0.11875,0.214286,0.466077,0.481061,0.234875,0.786325,0.783784,...,0,0,0,1,1,1,0,0,0,1


# 检验 los 是否服从正态分布

## Box-Cox

In [3]:
# Work on a deep copy so the loaded frame `data` keeps its original `los` values.
data_ = data.copy(deep=True)

# No lambda is passed, so stats.boxcox fits one itself and returns it alongside
# the transformed values; the transformed values overwrite `los` in the copy only.
data_['los'], fitted_lambda = stats.boxcox(data['los'])

print(f"找到的lambda值: {fitted_lambda}")

找到的lambda值: -0.2827695959609285


In [4]:
# Normality test on the `los` column of `data_` (the Box-Cox-transformed copy).
stat, p = stats.normaltest(data_['los'])
print('Statistics=%.3f, p=%.3f' % (stat, p))

# Interpret the result: p above 0.05 means normality is not rejected at the 5% level.
print('样本看起来是正态分布的' if p > 0.05 else '样本看起来不是正态分布的')

Statistics=1.268, p=0.531
样本看起来是正态分布的


# 组间差异

In [5]:
# `los` values of the rows flagged uc_only == 1 and cd_only == 1 respectively,
# taken from the loaded frame `data` and copied into plain NumPy arrays.
x1 = np.array(data.loc[data['uc_only'] == 1, 'los'])
x2 = np.array(data.loc[data['cd_only'] == 1, 'los'])


## Mann-Whitney U 检验

In [6]:
# Two-sided Mann-Whitney U test between the two groups x1 and x2.
u_statistic, p_value = stats.mannwhitneyu(x1, x2, alternative='two-sided')

print(f"U统计量: {u_statistic}")
print(f"P值: {p_value}")

# Judge significance at the 5% level.
# The null hypothesis of this test is that both samples come from the same
# distribution. It is a statement about medians only when the two distributions
# have the same shape, which is not checked here, so the conclusion is worded in
# terms of distributions rather than medians.
if p_value < 0.05:
    print("两组数据的分布存在显著差异。")
else:
    print("两组数据的分布不存在显著差异。")


U统计量: 44260.0
P值: 0.1193678776009166
两组数据的中位数不存在显著差异。


## t 检验

In [7]:
# Independent two-sample t-test; equal_var=False selects Welch's variant, which
# does not assume the two groups share the same variance.
# NOTE(review): x1/x2 are built from the raw `data` frame, not from the
# Box-Cox-transformed `data_` whose `los` passed the normality check — confirm
# that testing the untransformed values is intended.
t_statistic, p_value = stats.ttest_ind(x1, x2, equal_var=False)

print(f"t统计量: {t_statistic}", f"P值: {p_value}", sep="\n")

# Judge significance at the 5% level.
print(
    "两组数据的均值存在显著差异。"
    if p_value < 0.05
    else "两组数据的均值不存在显著差异。"
)


t统计量: 0.17597652267980374
P值: 0.8603730736172386
两组数据的均值不存在显著差异。


In [32]:
# Mean, standard deviation and count of `los` for every (uc_only, cd_only)
# combination present in the data, with one sub-column per `gender` value.
# reset_index() turns the two group keys back into ordinary columns.
data.pivot_table(
    values='los',
    index=['uc_only', 'cd_only'],
    columns='gender',
    aggfunc=['mean', 'std', 'count'],
).reset_index()

Unnamed: 0_level_0,uc_only,cd_only,mean,mean,std,std,count,count
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,0,1,0,1,0,1
0,0,0,3.785936,1.926811,3.585054,1.089389,20,14
1,0,1,3.318524,3.220325,3.839162,3.717169,163,139
2,1,0,3.500538,3.17474,5.240216,4.683568,156,160
