In [127]:
import pandas as pd

# 1) Load the ver180 cohort table from its CSV file.
path180 = "/content/drive/MyDrive/Colab Notebooks/cohort_ver180_static_vars.csv"
df180 = pd.read_csv(path180)
print("ver180 shape:", df180.shape)

# 2) Delay columns that the later cells summarise and winsorize.
delay_cols = [
    "door_to_ecg",
    "door_to_trop",
    "door_to_anti",
    "door_to_pci",
]

# Summary of the untouched delay columns, including upper-tail percentiles,
# so the effect of the later clipping step can be compared against it.
print("\n===== 원본 delay 통계 (앞부분) =====")
raw_delay_stats = df180.loc[:, delay_cols].describe(
    percentiles=[0.5, 0.9, 0.95, 0.99]
)
print(raw_delay_stats)


ver180 shape: (1929, 12)

===== 원본 delay 통계 (앞부분) =====
        door_to_ecg  door_to_trop  door_to_anti   door_to_pci
count  1.929000e+03   1929.000000   1929.000000   1929.000000
mean   1.125436e+04    560.964749    572.434422    413.646967
std    4.572375e+05   2587.656220    720.428559   1636.466180
min   -1.000000e+00     -1.000000     -1.000000     -1.000000
50%   -1.000000e+00    352.000000    429.000000     -1.000000
90%    3.600000e+01    994.200000   1097.600000    547.400000
95%    1.112000e+02   1279.800000   1608.600000   1818.400000
99%    3.964800e+02   4598.720000   3477.320000   8006.760000
max    2.001780e+07  99390.000000  12805.000000  22383.000000


In [128]:
# 3) Per-column 99th-percentile cutoff, computed on the ver180 values.
# NOTE(review): in the describe() output printed for df180, every delay column
# has min == -1 and two of them have median == -1, so -1 looks like a
# "not recorded" placeholder — confirm. If it is, these quantiles are taken
# over the placeholder rows too, which puts each cutoff below the 99th
# percentile of the actually observed delays.
cutoffs = {}
for name in delay_cols:
    cutoffs[name] = df180[name].quantile(0.99)

print("\n===== 99th percentile cutoff =====")
for name, limit in cutoffs.items():
    print(f"{name}: {limit:.3f}")

# 4) Winsorization: cap each delay column at its 99th-percentile cutoff.
# Work on a copy so df180 keeps the original values.
df182 = df180.copy()

# Only the upper end is clipped; the lower end is left untouched
# (a lower bound such as lower=-1 could be added to guard against odd values).
for name, limit in cutoffs.items():
    df182[name] = df182[name].clip(upper=limit)

print("\n===== Winsorization 후 delay 통계 =====")
clipped_delay_stats = df182[delay_cols].describe(percentiles=[0.5, 0.9, 0.95, 0.99])
print(clipped_delay_stats)

# Clipping replaces values but drops no rows, so both shapes should match.
print("\n원본 shape:", df180.shape)
print("Winsorization 후 shape (row 수 유지):", df182.shape)



===== 99th percentile cutoff =====
door_to_ecg: 396.480
door_to_trop: 4598.720
door_to_anti: 3477.320
door_to_pci: 8006.760

===== Winsorization 후 delay 통계 =====
       door_to_ecg  door_to_trop  door_to_anti  door_to_pci
count  1929.000000   1929.000000   1929.000000  1929.000000
mean     18.200933    458.926076    554.263556   360.326179
std      57.327200    665.413536    580.584223  1192.110749
min      -1.000000     -1.000000     -1.000000    -1.000000
50%      -1.000000    352.000000    429.000000    -1.000000
90%      36.000000    994.200000   1097.600000   547.400000
95%     111.200000   1279.800000   1608.600000  1818.400000
99%     389.625600   4553.158400   3471.070400  7954.747200
max     396.480000   4598.720000   3477.320000  8006.760000

원본 shape: (1929, 12)
Winsorization 후 shape (row 수 유지): (1929, 12)


In [129]:
# 5) Persist the winsorized table as ver182 (row index is not written out).
path182 = "/content/drive/MyDrive/Colab Notebooks/cohort_ver182_winsor_99p.csv"
df182.to_csv(path182, index=False)

# Report where the file was written: message line, then the path on its own line.
print("\n[INFO] Winsorization 적용된 ver182 저장 완료:", path182, sep="\n")



[INFO] Winsorization 적용된 ver182 저장 완료:
/content/drive/MyDrive/Colab Notebooks/cohort_ver182_winsor_99p.csv
