In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_selection import VarianceThreshold, SelectFromModel
from sklearn.linear_model import LassoCV

In [2]:
# Name of the target column that is separated from the feature matrix below.
TARGET_COL = 'los'

# Load the preprocessed table; the first CSV column becomes the row index.
data = pd.read_csv('./data_processed/data_norm_log.csv', index_col=0)

# Mean-impute missing values column by column. The result is re-bound rather
# than written with inplace=True, so the frame is never mutated in place.
# NOTE(review): imputation happens before the feature/target split, so missing
# values in the target column are also replaced by its mean - confirm intended.
data = data.fillna(data.mean())

# Split into features and target. Selecting by label (instead of a boolean
# column mask) raises KeyError when the target column is missing, rather than
# silently yielding an empty target frame.
X = data.drop(columns=TARGET_COL)
# Kept as a single-column DataFrame, the same shape the mask-based selection
# produced, so code that relies on that shape keeps working.
y = data[[TARGET_COL]]

In [3]:
# Feature selection by variance threshold: fit the selector on X and keep only
# the columns it retains (threshold set to 0.5).
selector = VarianceThreshold(threshold=0.5)
X_selected = selector.fit_transform(X)

# Report how many feature columns exist before and after the filter.
_, n_features_before = X.shape
_, n_features_after = X_selected.shape
print(f"原始特征数量: {n_features_before}")
print(f"方差阈值法选择后的特征数量: {n_features_after}")

原始特征数量: 22
方差阈值法选择后的特征数量: 9


In [5]:
# Feature selection with an L1-regularised linear regression (LassoCV, 5-fold
# cross-validation) - this is Lasso, not logistic regression.
# np.ravel flattens the target to 1-D: passing a column-shaped (n, 1) target
# makes scikit-learn emit a DataConversionWarning from column_or_1d.
lasso = LassoCV(cv=5, random_state=42).fit(X, np.ravel(y))

# Wrap the already-fitted model (prefit=True) so it is not refitted.
selector = SelectFromModel(lasso, prefit=True)

# Integer positions of the selected features (indices=True returns positions
# rather than a boolean mask).
selected_features = selector.get_support(indices=True)

# Show the indices of the selected features.
print(f"被选择的特征的索引: {selected_features}")

# Show the fitted coefficients of the selected features.
print(f"被选择的特征的系数: {lasso.coef_[selected_features]}")

被选择的特征的索引: [1 2 3 4 5 8]
被选择的特征的系数: [ 0.00585107  0.00989388  0.08942967 -0.02470985  0.02983032  0.01269654]


  y = column_or_1d(y, warn=True)


In [8]:
# Map the selected positional indices back to the column labels of X and
# display them as the cell's output.
selected_feature_names = X.columns[selected_features]
selected_feature_names

Index(['age', 'heart_rate', 'respiratory_rate', 'hematocrit', 'rdw', 'mch'], dtype='object')