In [1]:
!pip install pandas==2.0.3



In [2]:
import numpy as np
import pandas as pd

%load_ext memory_profiler

In [3]:
GJ_DTYPES = {
    "HCHK_YEAR": "int16",
    "IDV_ID": "int32",
    "SEX": "category",
    "AGE_GROUP": "category",
    "SIDO": "category",
    "HEIGHT": "float32",
    "WEIGHT": "float32",
    "SIGHT_LEFT": "float32",
    "SIGHT_RIGHT": "float32",
    "HEAR_LEFT": "category",
    "HEAR_RIGHT": "category",
    "BP_HIGH": "float32",
    "BP_LWST": "float32",
    "BLDS": "float32",
    "TOT_CHOLE": "float32",
    "HMG": "float32",
    "OLIG_PROTE_CD": "float32",
    "SGOT_AST": "float32",
    "SGPT_ALT": "float32",
    "GAMMA_GTP": "float32",
    "SMK_STAT_TYPE_CD": "category",
    "DRK_YN": "category",
}

CODEBOOK = {
    "SEX": {1: "남자", 2: "여자"},
    "SIDO": {11: "서울특별시", 26: "부산광역시", 27: "대구광역시", 28: "인천광역시", 29: "광주광역시", 30: "대전광역시",
             31: "울산광역시", 36: "세종특별자치시", 42: "강원도", 43: "충청북도", 44: "충청남도", 45: "전라북도",
             46: "전라남도", 47: "경상북도", 48: "경상남도", 49: "제주특별자치도"},
    "HEAR_LEFT": {1: "정상", 2: "비정상"},
    "HEAR_RIGHT": {1: "정상", 2: "비정상"},
    "SMK_STAT_TYPE_CD": {1: "비흡연", 2: "금연", 3: "흡연"},
    "DRK_YN": {0: "N", 1: "Y"},
}

CODEBOOK = {col: {v: str(k) for k, v in d.items()} for col, d in CODEBOOK.items()}
GJ_DTYPES_PA = {k: v + "[pyarrow]" if v.startswith(("int", "float")) else v for k, v in GJ_DTYPES.items()}


def scoring_health(patient):
    bmi = ((patient['WEIGHT'] / (patient['HEIGHT']/100)**2) >= 30) * 1  # 체질량(BMI)
    blds = (patient['BLDS'] >= 125) * 1  # 공복혈당 BLDS
    chole = (patient['TOT_CHOLE'] >= 130) * 1  # 총콜레스테롤 TOT_CHOLE
    hmg = (patient['HMG'] < 12) * 1  # 혈색소 HMG
    sg = ((patient['SGOT_AST'] >= 40) | (patient['SGPT_ALT'] >= 40)) * 1  # 혈청지오티 SGOT_AST, SGPT_ALT
    smoke = (patient['SMK_STAT_TYPE_CD'] == 3) * 1  # 흡연 SMOKE
    drink = (patient['DRK_YN'] == 1) * 1 # 음주 DRINK
    
    patient_score = np.sum([bmi, blds, chole, hmg, sg, smoke, drink], axis=0)
    return patient_score

In [4]:
%%time
%%memit

df = pd.read_csv('../data/open-nhis-gj-all.csv', engine='pyarrow')
df = df.pipe(lambda df: df[df["HCHK_YEAR"]==2018])
df.replace(CODEBOOK, inplace=True)
df = df.fillna(method='bfill').astype(GJ_DTYPES, errors='ignore')
df = df.assign(SCORE=lambda df: scoring_health(df))
df.query("SCORE >= 1", inplace=True)
df = df.groupby(["SEX"]).agg({"SCORE": "mean"})
df = df.rename(columns={"SCORE": "평균 건강 점수"})
print(df)

     평균 건강 점수
SEX          
1    1.395252
2    1.292970
peak memory: 10325.75 MiB, increment: 10196.98 MiB
CPU times: user 17.8 s, sys: 5.27 s, total: 23.1 s
Wall time: 6.39 s


In [5]:
%%time
%%memit
df = (pd
      .read_csv('../data/open-nhis-gj-small.csv', engine='pyarrow')
      .pipe(lambda df: df[df["HCHK_YEAR"]==2018])
      .replace(CODEBOOK)
      .fillna(method='bfill')
      .astype(GJ_DTYPES, errors='ignore')
      .assign(SCORE=lambda df: scoring_health(df))
      .query("SCORE >= 1")
      .groupby(["SEX"])
      .agg({"SCORE": "mean"})
      .rename(columns={"SCORE": "평균 건강 점수"})
      )
print(df)

     평균 건강 점수
SEX          
1    1.406731
2    1.296927
peak memory: 7657.24 MiB, increment: 29.63 MiB
CPU times: user 251 ms, sys: 318 ms, total: 569 ms
Wall time: 645 ms
