Import the library and initialize

In [1]:
# 导入必要的库
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# 导入敏感性分析模块
from sensitivity_analyzer import SensitivityAnalyzer
from pipeline_config import DATA_PATHS

# 设置图表样式
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

print("Libraries imported successfully!")

Libraries imported successfully!


Initialize the sensitivity analyzer

In [None]:
# 创建敏感性分析器实例
analyzer = SensitivityAnalyzer()

# 查看可用的子组
available_subgroups = analyzer.get_available_subgroups()
print("Available subgroups:")
for i, subgroup in enumerate(available_subgroups, 1):
    info = analyzer.get_subgroup_info(subgroup)
    print(f"{i}. {subgroup}: {info['name']} - {info['description']}")

print(f"\nTotal available subgroups: {len(available_subgroups)}")

INFO:sensitivity_analyzer:Sensitivity Analyzer initialized
INFO:sensitivity_analyzer:Defined 8 subgroups for sensitivity analysis
INFO:sensitivity_analyzer:Defined 8 subgroups for sensitivity analysis
INFO:sensitivity_analyzer:Defined 8 subgroups for sensitivity analysis
INFO:sensitivity_analyzer:Defined 8 subgroups for sensitivity analysis
INFO:sensitivity_analyzer:Defined 8 subgroups for sensitivity analysis
INFO:sensitivity_analyzer:Defined 8 subgroups for sensitivity analysis
INFO:sensitivity_analyzer:Defined 8 subgroups for sensitivity analysis
INFO:sensitivity_analyzer:Defined 8 subgroups for sensitivity analysis
INFO:sensitivity_analyzer:Defined 8 subgroups for sensitivity analysis


Available subgroups:
1. comorbidity_1: Single Comorbidity (1) - Patients with exactly 1 comorbidity
2. comorbidity_2: Double Comorbidity (2) - Patients with exactly 2 comorbidities
3. diabetes_diag_all: Any Diabetes Diagnosis - Patients with diabetes in any of the three diagnoses
4. diabetes_diag1: Primary Diabetes Diagnosis - Patients with diabetes as primary diagnosis
5. elderly_patients: Elderly Patients (70+) - Patients aged 70 years or older
6. young_patients: Young Patients (<50) - Patients aged less than 50 years
7. female_patients: Female Patients - Female patients only
8. male_patients: Male Patients - Male patients only

Total available subgroups: 8


Analyze subgroup 1 of comorbidities

In [3]:
# 分析合并症1子组
print("=" * 60)
print("ANALYZING: Single Comorbidity (1)")
print("=" * 60)

result1 = analyzer.run_single_subgroup('comorbidity_1')

# 显示结果
if result1['status'] == 'success':
    print(f"✅ Analysis completed successfully!")
    print(f"�� Data size: {result1['data_size']} patients")
    print(f"🏆 Best model: {result1['best_model']}")
    
    # 显示测试结果
    print("\n📈 Test Results:")
    display(result1['test_results'])
    
    # 显示选择的特征数量
    print("\n🔍 Selected Features:")
    for method, features in result1['selected_features'].items():
        print(f"  {method}: {len(features)} features")
    
else:
    print(f"❌ Analysis failed: {result1['error']}")

INFO:sensitivity_analyzer:Running analysis for subgroup: comorbidity_1
INFO:sensitivity_analyzer:Loading and preprocessing data...
INFO:data_loader:Merging all data tables...
INFO:data_loader:Loading diabetic data...


ANALYZING: Single Comorbidity (1)


INFO:data_loader:Diabetic data loaded: (101766, 50)
INFO:data_loader:Loading ID mapping data...
INFO:data_loader:ID mapping data loaded: (67, 2)
INFO:data_loader:ID mapping data split into three tables
INFO:data_loader:Data merged successfully: (101766, 51)
INFO:data_preprocessor:Applying feature engineering...
INFO:data_preprocessor:Original number of encounters: 101766
INFO:data_preprocessor:Encounters after keeping first admission: 71518
INFO:data_preprocessor:Removed 1545 records for hospice/death dispositions.
INFO:data_preprocessor:Encounters after removing hospice/death: 69973
INFO:data_preprocessor:Handling missing values based on notebook's standard...
INFO:data_preprocessor:Replaced '?' with NaN.
INFO:data_preprocessor:🔴 Dropped columns with >50.0% missing: ['weight', 'max_glu_serum', 'A1Cresult']
INFO:data_preprocessor:🟡 Filling specified columns with 'Unknown': ['medical_specialty', 'payer_code', 'race', 'diag_1', 'diag_2', 'diag_3', 'admission_type_desc', 'discharge_dispos

✅ Analysis completed successfully!
�� Data size: 3238 patients
🏆 Best model: None

📈 Test Results:



🔍 Selected Features:
  L1: 15 features
  MutualInfo: 15 features
  TreeImportance: 15 features


Analyze two subgroups of comorbidities

In [4]:
# 分析合并症2子组
print("=" * 60)
print("ANALYZING: Double Comorbidity (2)")
print("=" * 60)

result2 = analyzer.run_single_subgroup('comorbidity_2')

# 显示结果
if result2['status'] == 'success':
    print(f"✅ Analysis completed successfully!")
    print(f"�� Data size: {result2['data_size']} patients")
    print(f"🏆 Best model: {result2['best_model']}")
    
    # 显示测试结果
    print("\n📈 Test Results:")
    display(result2['test_results'])
    
    # 显示选择的特征数量
    print("\n🔍 Selected Features:")
    for method, features in result2['selected_features'].items():
        print(f"  {method}: {len(features)} features")
    
else:
    print(f"❌ Analysis failed: {result2['error']}")

INFO:sensitivity_analyzer:Running analysis for subgroup: comorbidity_2
INFO:sensitivity_analyzer:Defined 8 subgroups for sensitivity analysis
INFO:sensitivity_analyzer:Subgroup comorbidity_2 size: 66542
INFO:data_preprocessor:Splitting data into train/validation/test sets...
INFO:data_preprocessor:Dropping unused ID and target columns before splitting: ['encounter_id', 'patient_nbr', 'discharge_disposition_id', 'readmitted']
INFO:data_preprocessor:Data split - Train: (42586, 51), Val: (10647, 51), Test: (13309, 51)
INFO:data_preprocessor:Encoding categorical features using label encoding...


ANALYZING: Double Comorbidity (2)


INFO:data_preprocessor:Scaling numerical features...
INFO:data_preprocessor:Applying SMOTE for class balancing...
INFO:data_preprocessor:Before SMOTE - Class distribution: {0: 38688, 1: 3898}
INFO:data_preprocessor:After SMOTE - Class distribution: {0: 38688, 1: 38688}
INFO:feature_selector:Running all feature selection methods with top_n=15...
INFO:feature_selector:Selecting top 15 features using L1 regularization...
INFO:feature_selector:L1 feature selection completed. Selected features: ['number_inpatient', 'tolazamide', 'diabetesMed', 'age_group', 'num_procedures', 'diag_1_category', 'payer_code', 'number_diagnoses', 'time_in_hospital', 'metformin', 'discharge_disposition_desc', 'admission_source_desc', 'glipizide-metformin', 'number_outpatient', 'pioglitazone']
INFO:feature_selector:Selecting top 15 features using Mutual Information...
INFO:feature_selector:Mutual Information feature selection completed. Selected features: ['num_lab_procedures', 'num_medications', 'diag_2', 'diag_

✅ Analysis completed successfully!
�� Data size: 66542 patients
🏆 Best model: None

📈 Test Results:



🔍 Selected Features:
  L1: 15 features
  MutualInfo: 15 features
  TreeImportance: 15 features


Analysis of Subgroups for Diabetes diagnosis

In [5]:
# 分析糖尿病诊断子组
print("=" * 60)
print("ANALYZING: Any Diabetes Diagnosis")
print("=" * 60)

result3 = analyzer.run_single_subgroup('diabetes_diag_all')

# 显示结果
if result3['status'] == 'success':
    print(f"✅ Analysis completed successfully!")
    print(f"�� Data size: {result3['data_size']} patients")
    print(f"🏆 Best model: {result3['best_model']}")
    
    # 显示测试结果
    print("\n📈 Test Results:")
    display(result3['test_results'])
    
    # 显示选择的特征数量
    print("\n🔍 Selected Features:")
    for method, features in result3['selected_features'].items():
        print(f"  {method}: {len(features)} features")
    
else:
    print(f"❌ Analysis failed: {result3['error']}")

INFO:sensitivity_analyzer:Running analysis for subgroup: diabetes_diag_all
INFO:sensitivity_analyzer:Defined 8 subgroups for sensitivity analysis


ANALYZING: Any Diabetes Diagnosis


INFO:sensitivity_analyzer:Subgroup diabetes_diag_all size: 27577
INFO:data_preprocessor:Splitting data into train/validation/test sets...
INFO:data_preprocessor:Dropping unused ID and target columns before splitting: ['encounter_id', 'patient_nbr', 'discharge_disposition_id', 'readmitted']
INFO:data_preprocessor:Data split - Train: (17648, 51), Val: (4413, 51), Test: (5516, 51)
INFO:data_preprocessor:Encoding categorical features using label encoding...
INFO:data_preprocessor:Scaling numerical features...
INFO:data_preprocessor:Applying SMOTE for class balancing...
INFO:data_preprocessor:Before SMOTE - Class distribution: {0: 16134, 1: 1514}
INFO:data_preprocessor:After SMOTE - Class distribution: {0: 16134, 1: 16134}
INFO:feature_selector:Running all feature selection methods with top_n=15...
INFO:feature_selector:Selecting top 15 features using L1 regularization...
INFO:feature_selector:L1 feature selection completed. Selected features: ['number_inpatient', 'diag_1_category', 'age_gr

KeyboardInterrupt: 

Analysis of the main diagnosed diabetes subgroup

In [None]:
# 分析主诊断糖尿病子组
print("=" * 60)
print("ANALYZING: Primary Diabetes Diagnosis")
print("=" * 60)

result4 = analyzer.run_single_subgroup('diabetes_diag1')

# 显示结果
if result4['status'] == 'success':
    print(f"✅ Analysis completed successfully!")
    print(f"�� Data size: {result4['data_size']} patients")
    print(f"🏆 Best model: {result4['best_model']}")
    
    # 显示测试结果
    print("\n📈 Test Results:")
    display(result4['test_results'])
    
    # 显示选择的特征数量
    print("\n🔍 Selected Features:")
    for method, features in result4['selected_features'].items():
        print(f"  {method}: {len(features)} features")
    
else:
    print(f"❌ Analysis failed: {result4['error']}")

INFO:sensitivity_analyzer:Running analysis for subgroup: diabetes_diag1
INFO:sensitivity_analyzer:Defined 8 subgroups for sensitivity analysis
INFO:sensitivity_analyzer:Subgroup diabetes_diag1 size: 5748
INFO:data_preprocessor:Splitting data into train/validation/test sets...
INFO:data_preprocessor:Dropping unused ID and target columns before splitting: ['encounter_id', 'patient_nbr', 'discharge_disposition_id', 'readmitted']
INFO:data_preprocessor:Data split - Train: (3678, 51), Val: (920, 51), Test: (1150, 51)
INFO:data_preprocessor:Encoding categorical features using label encoding...


ANALYZING: Primary Diabetes Diagnosis


INFO:data_preprocessor:Scaling numerical features...
INFO:data_preprocessor:Applying SMOTE for class balancing...
INFO:data_preprocessor:Before SMOTE - Class distribution: {0: 3343, 1: 335}
INFO:data_preprocessor:After SMOTE - Class distribution: {0: 3343, 1: 3343}
INFO:feature_selector:Running all feature selection methods with top_n=15...
INFO:feature_selector:Selecting top 15 features using L1 regularization...
INFO:feature_selector:L1 feature selection completed. Selected features: ['number_inpatient', 'num_medications', 'diag_1', 'number_diagnoses', 'time_in_hospital', 'payer_code', 'glipizide', 'diag_2', 'change', 'metformin', 'number_outpatient', 'num_procedures', 'admission_type_desc', 'comorbidity', 'diag_2_category']
INFO:feature_selector:Selecting top 15 features using Mutual Information...
INFO:feature_selector:Mutual Information feature selection completed. Selected features: ['num_medications', 'num_lab_procedures', 'time_in_hospital', 'diag_1', 'age_midpoint', 'age_group

❌ Analysis failed: 'FeatureSelector' object has no attribute 'get_feature_importance'


Compare the results of all subgroups

In [None]:
# 比较所有已分析的子组
print("=" * 60)
print("COMPARING ALL SUBGROUPS")
print("=" * 60)

# 获取所有已完成的子组
completed_subgroups = [name for name, result in analyzer.subgroup_results.items() 
                      if result['status'] == 'success']

if len(completed_subgroups) >= 2:
    comparison = analyzer.compare_subgroups(completed_subgroups)
    
    print(f"📊 Comparing {len(completed_subgroups)} subgroups:")
    for subgroup in completed_subgroups:
        print(f"  - {subgroup}")
    
    print("\n📈 Comparison Report:")
    display(comparison['comparison_report'])
    
    # 显示统计摘要
    comparison_df = comparison['comparison_report']
    print("\n�� Summary Statistics:")
    print(f"  Best AUC: {comparison_df['AUC'].max():.4f}")
    print(f"  Average AUC: {comparison_df['AUC'].mean():.4f}")
    print(f"  AUC Standard Deviation: {comparison_df['AUC'].std():.4f}")
    
else:
    print("❌ Need at least 2 successful analyses to compare")

COMPARING ALL SUBGROUPS
❌ Need at least 2 successful analyses to compare


In [None]:
# 示例：分析自定义子组（女性老年患者）
print("=" * 60)
print("ANALYZING: Custom Subgroup (Female Elderly Patients)")
print("=" * 60)

# 定义自定义筛选条件
def female_elderly_filter(df):
    return (df['gender'] == 'Female') & (df['age'] >= 70)

# 运行分析
custom_result = analyzer.run_single_subgroup(
    'custom_female_elderly', 
    custom_filter=female_elderly_filter
)

# 显示结果
if custom_result['status'] == 'success':
    print(f"✅ Analysis completed successfully!")
    print(f"�� Data size: {custom_result['data_size']} patients")
    print(f"🏆 Best model: {custom_result['best_model']}")
    
    # 显示测试结果
    print("\n📈 Test Results:")
    display(custom_result['test_results'])
    
else:
    print(f"❌ Analysis failed: {custom_result['error']}")

INFO:sensitivity_analyzer:Running analysis for subgroup: custom_female_elderly


ANALYZING: Custom Subgroup (Female Elderly Patients)


TypeError: '>=' not supported between instances of 'str' and 'int'

In [None]:
# 创建结果总结
print("=" * 60)
print("SENSITIVITY ANALYSIS SUMMARY")
print("=" * 60)

# 统计所有结果
all_results = analyzer.subgroup_results
successful_analyses = [name for name, result in all_results.items() 
                      if result['status'] == 'success']

print(f"📊 Total analyses: {len(all_results)}")
print(f"✅ Successful: {len(successful_analyses)}")
print(f"❌ Failed: {len(all_results) - len(successful_analyses)}")

# 显示每个子组的最佳AUC
print("\n🏆 Best AUC for each subgroup:")
for subgroup_name in successful_analyses:
    result = all_results[subgroup_name]
    best_auc = result['test_results']['AUC'].max()
    data_size = result['data_size']
    print(f"  {subgroup_name}: {best_auc:.4f} (n={data_size})")

# 创建AUC比较图
if len(successful_analyses) >= 2:
    plt.figure(figsize=(12, 6))
    
    # 提取AUC数据
    auc_data = []
    for subgroup_name in successful_analyses:
        result = all_results[subgroup_name]
        for model in result['test_results'].index:
            auc_data.append({
                'Subgroup': subgroup_name,
                'Model': model,
                'AUC': result['test_results'].loc[model, 'AUC']
            })
    
    auc_df = pd.DataFrame(auc_data)
    
    # 绘制AUC比较图
    pivot_auc = auc_df.pivot(index='Subgroup', columns='Model', values='AUC')
    pivot_auc.plot(kind='bar', ax=plt.gca())
    plt.title('AUC Comparison Across Subgroups and Models')
    plt.ylabel('AUC Score')
    plt.xlabel('Subgroup')
    plt.xticks(rotation=45)
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.tight_layout()
    plt.show()

print("\n🎉 Sensitivity analysis completed!")