In [1]:
!pip install numpy datasets



In [2]:
import numpy as np
from datasets import load_dataset


def _risk_flag(is_at_risk, *measurements):
    """Evaluate a risk predicate, treating missing measurements as "not at risk".

    Args:
        is_at_risk: Callable applied to ``measurements``; returns a boolean
            (or a boolean array when the measurements are arrays).
        *measurements: Values passed to ``is_at_risk``.

    Returns:
        ``0`` if any measurement is ``None``; otherwise
        ``is_at_risk(*measurements) * 1`` (the boolean result as 0/1).
    """
    # Comparing None with a number raises TypeError, so short-circuit first.
    # The identity check does not iterate into array-like values.
    if any(value is None for value in measurements):
        return 0
    return is_at_risk(*measurements) * 1


def scoring_health(patient):
    """Add a ``SCORE`` entry counting how many of seven risk checks are met.

    Each check contributes 0 or 1 and the flags are summed with
    ``np.sum(..., axis=0)``. A measurement that is ``None`` contributes 0
    instead of raising ``TypeError``.

    Args:
        patient: Mapping that provides ``WEIGHT``, ``HEIGHT``, ``BLDS``,
            ``TOT_CHOLE``, ``HMG``, ``SGOT_AST``, ``SGPT_ALT``,
            ``SMK_STAT_TYPE_CD`` and ``DRK_YN``. It is modified in place.

    Returns:
        The same ``patient`` object with ``patient['SCORE']`` set.
    """
    # Body mass index: WEIGHT / (HEIGHT / 100) ** 2, flagged at >= 30.
    # HEIGHT is divided by 100 before squaring (presumably cm -> m; confirm).
    bmi = _risk_flag(
        lambda weight, height: weight / (height / 100) ** 2 >= 30,
        patient['WEIGHT'],
        patient['HEIGHT'],
    )
    # Fasting blood glucose (BLDS).
    blds = _risk_flag(lambda value: value >= 125, patient['BLDS'])
    # Total cholesterol (TOT_CHOLE).
    # NOTE(review): confirm that 130 is the intended total-cholesterol cutoff.
    chole = _risk_flag(lambda value: value >= 130, patient['TOT_CHOLE'])
    # Hemoglobin (HMG): flagged when below 12.
    hmg = _risk_flag(lambda value: value < 12, patient['HMG'])
    # Serum GOT/GPT (SGOT_AST, SGPT_ALT): flagged when either one is >= 40.
    # Each side is checked separately so one missing value does not hide the other.
    sg = (
        _risk_flag(lambda value: value >= 40, patient['SGOT_AST'])
        | _risk_flag(lambda value: value >= 40, patient['SGPT_ALT'])
    )
    # Smoking and drinking status. Equality against None is simply False,
    # so no missing-value guard is needed here.
    # NOTE(review): the sample rows printed in this notebook show
    # SMK_STAT_TYPE_CD as a text label and DRK_YN as None, so these two checks
    # never match for those rows -- confirm the dataset's actual encoding.
    smoke = (patient['SMK_STAT_TYPE_CD'] == 3) * 1
    drink = (patient['DRK_YN'] == 1) * 1

    # axis=0 keeps the sum element-wise when the fields are arrays.
    patient_score = np.sum([bmi, blds, chole, hmg, sg, smoke, drink], axis=0)
    patient['SCORE'] = patient_score

    return patient

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Open the train split in streaming mode.
dataset = load_dataset(
    path='sackoh/pycon-kr-2023-health-check',
    split='train',
    streaming=True,
)

# Apply scoring_health to each record and drop the HCHK_YEAR column.
updated_dataset = dataset.map(
    function=scoring_health,
    remove_columns=["HCHK_YEAR"],
)

# Materialize only the first five scored records and print them.
print([record for record in updated_dataset.take(5)])

[{'IDV_ID': 725578, 'SEX': '여자', 'AGE_GROUP': 1, 'SIDO': '부산광역시', 'HEIGHT': 140, 'WEIGHT': 45, 'SIGHT_LEFT': 0.9, 'SIGHT_RIGHT': 0.9, 'HEAR_LEFT': '정상', 'HEAR_RIGHT': '정상', 'BP_HIGH': 100, 'BP_LWST': 70, 'BLDS': 91, 'TOT_CHOLE': 216, 'HMG': 13.4, 'OLIG_PROTE_CD': 1.0, 'SGOT_AST': 20.0, 'SGPT_ALT': 11.0, 'GAMMA_GTP': 10, 'SMK_STAT_TYPE_CD': '비흡연', 'DRK_YN': None, 'SCORE': 1}, {'IDV_ID': 118183, 'SEX': '여자', 'AGE_GROUP': 1, 'SIDO': '경상남도', 'HEIGHT': 145, 'WEIGHT': 35, 'SIGHT_LEFT': 0.6, 'SIGHT_RIGHT': 0.6, 'HEAR_LEFT': '정상', 'HEAR_RIGHT': '정상', 'BP_HIGH': 120, 'BP_LWST': 90, 'BLDS': 75, 'TOT_CHOLE': 162, 'HMG': 13.0, 'OLIG_PROTE_CD': 1.0, 'SGOT_AST': 26.0, 'SGPT_ALT': 12.0, 'GAMMA_GTP': 19, 'SMK_STAT_TYPE_CD': '비흡연', 'DRK_YN': None, 'SCORE': 1}, {'IDV_ID': 667818, 'SEX': '여자', 'AGE_GROUP': 1, 'SIDO': '서울특별시', 'HEIGHT': 145, 'WEIGHT': 35, 'SIGHT_LEFT': 0.3, 'SIGHT_RIGHT': 1.0, 'HEAR_LEFT': '정상', 'HEAR_RIGHT': '정상', 'BP_HIGH': 111, 'BP_LWST': 74, 'BLDS': 85, 'TOT_CHOLE': 149, 'HMG': 12.7, 