# 라이브러리

In [3]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
import matplotlib.pyplot as plt
import seaborn as sns
import koreanize_matplotlib
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# 데이터 뽑아오는 함수

In [4]:
# 데이터 병합 및 year 열 추가 함수 정의의: SQL
def get_data_from_db(query):

    # mysql 접속 정보 정의
    username = "admin"
    password = "admin1234"
    host = "hk-toss-middle-project.cjkcuqkegqpx.eu-north-1.rds.amazonaws.com"
    database_name = "raw_data"

    # 데이터베이스 연결 문자열, connection string
    db_connection_str = f'mysql+pymysql://{username}:{password}@{host}/{database_name}'

    # 데이터 베이스 connector 얻어내기
    db_connection = create_engine(db_connection_str)
    db_connection

    df = pd.read_sql(query, con=db_connection)
    return df

# Transformer 정의

## 문자열 앞뒤 공백 제거 Transformer

In [4]:
# 앞뒤공백 제거 transformer
class StripTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, columns:list):
        # 입력한 columns가 문자열이면 리스트로 변환
        if isinstance(columns, str):
            self.columns = [columns]
        else:
            self.columns = columns

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # 선택한 열 중에 데이터프레임에 실제로 존재하는 열만 선택
        columns_y = [col for col in self.columns if col in X.columns]

        # 선택한 열의 값들의 앞뒤 공백을 제거
        for col in columns_y:

            # 문자열 형식인지 확인
            if X[col].dtype == 'object':
                try:
                    X[col] = X[col].str.strip()
                except:
                    pass

        return X

## 열 탈락 / 열 이름 변경 Transformer

In [5]:
class RenameDropColumnsTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, rename_map:dict = {}, drop_columns:list = []):
        self.rename_map = rename_map
        # drop_columns가 문자열이면 리스트로 변환
        if isinstance(drop_columns, str):
            self.drop_columns = [drop_columns]
        else:
            self.drop_columns = drop_columns

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # 실제로 데이터프레임에 존재하는 열만 drop
        self.drop_columns = [col for col in self.drop_columns if col in X.columns]
        # drop_columns가 존재하면 drop
        if self.drop_columns:
            X.drop(columns=self.drop_columns, inplace=True)

        # 실제로 데이터프레임에 존재하는 열만 선택
        rename_map = {key: value for key, value in self.rename_map.items() if key in X.columns}
        # rename_map이 존재하면 rename
        if self.rename_map:
            X.rename(columns=self.rename_map, inplace=True)
            
        return X

# PHI Table
- transform 내용 정리리리리리리리

## PHI Table 에만 필요한 Transformer
- 내용내용

In [6]:
class PHITableTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # 단독 가구인 경우만 뽑기기
        X = X.loc[X["PHI_PID"].str.len() < 9, :]

        return X

## 3개년 병합해서 불러오기

In [10]:
query = f"""SELECT *, 2019 AS D_YEAR FROM raw_a_phi
            UNION ALL
            SELECT *, 2020 AS D_YEAR FROM raw_b_phi
            UNION ALL
            SELECT *, 2021 AS D_YEAR FROM raw_c_phi"""
phi = get_data_from_db(query)

## PHI Table 전처리 파이프라인

In [11]:
rename_map_phi = {'PHI_PID1' : 'PIDWON',
                  'PHI1' : 'PHI_Y',
                  'PHI2' : 'PHI_M',
                  'PHI3' : 'PHI_FORM',
                  'PHI4' : 'PHI_TYPE',
                  'PHI4_1_D' : 'PHI_TYPE_D',
                  'PHI4_2_D' : 'PHI_TYPE_C',
                  'PHI4_3_D' : 'PHI_TYPE_A',
                  'PHI4_4_D' : 'PHI_TYPE_N',
                  'PHI4_5_D' : 'PHI_TYPE_T',
                  'PHI4_6_D' : 'PHI_TYPE_AL',
                  'PHI4_7_D' : 'PHI_TYPE_M',
                  'PHI4_8_D' : 'PHI_TYPE_O',
                  'PHI5' : 'PHI_PREMIUM_YN',
                  'PHI6' : 'PHI_PREMIUM',
                  'PHR1' : 'PHI_CLAIM_YN',
                  'PHR2' : 'PHI_BENEFIT_YN',
                  'PHR3' : 'PHI_BENEFIT'}
drop_columns_phi = ['PHI_PID', 'PHI_PID2', 'PHI_PID3', 'PHI_PID4', 'PHI_PID5']
phi_pipeline = Pipeline([
    ('strip', StripTransformer(columns=['PHI_PID'])),
    ('phi_table', PHITableTransformer()),
    ('rename_drop', RenameDropColumnsTransformer(rename_map=rename_map_phi, drop_columns=drop_columns_phi))
])
phi_striped = phi_pipeline.fit_transform(phi)
phi_striped

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.drop(columns=self.drop_columns, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.rename(columns=self.rename_map, inplace=True)


Unnamed: 0,HHID,PHI_N,PIDWON,PHI_Y,PHI_M,PHI_FORM,PHI_TYPE,PHI_TYPE_D,PHI_TYPE_C,PHI_TYPE_A,...,PHI_TYPE_T,PHI_TYPE_AL,PHI_TYPE_M,PHI_TYPE_O,PHI_PREMIUM_YN,PHI_PREMIUM,PHI_CLAIM_YN,PHI_BENEFIT_YN,PHI_BENEFIT,D_YEAR
0,112003011.0,101.0,11200301.0,2004.0,-9.0,3.0,1238,1.0,1.0,1.0,...,2.0,2.0,2.0,1.0,1.0,120000.0,1.0,1.0,60000.0,2019
1,112003011.0,102.0,11200302.0,2004.0,-9.0,3.0,1238,1.0,1.0,1.0,...,2.0,2.0,2.0,1.0,1.0,118000.0,2.0,,,2019
2,112010011.0,101.0,11201001.0,2015.0,3.0,2.0,,,,,...,,,,,1.0,48710.0,2.0,,,2019
3,112010011.0,102.0,11201001.0,2000.0,7.0,1.0,12,1.0,1.0,2.0,...,2.0,2.0,2.0,2.0,3.0,62800.0,2.0,,,2019
4,112010011.0,103.0,11201002.0,2012.0,2.0,3.0,1238,1.0,1.0,1.0,...,2.0,2.0,2.0,1.0,1.0,201717.0,2.0,,,2019
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64270,400007011.0,201.0,40000703.0,-9.0,-9.0,1.0,1,1.0,2.0,2.0,...,2.0,2.0,2.0,2.0,1.0,-9.0,2.0,,,2021
64271,400007011.0,202.0,40000704.0,-9.0,-9.0,1.0,1,1.0,2.0,2.0,...,2.0,2.0,2.0,2.0,1.0,-9.0,2.0,,,2021
64272,400008011.0,102.0,40000802.0,2014.0,12.0,3.0,13,1.0,2.0,1.0,...,2.0,2.0,2.0,2.0,1.0,71740.0,1.0,1.0,506871.0,2021
64273,400008011.0,401.0,40000802.0,2018.0,5.0,1.0,2,2.0,1.0,2.0,...,2.0,2.0,2.0,2.0,1.0,21550.0,2.0,,,2021


# IND Table

## 만성질환 분류표

- 고혈압과 당뇨병 (HTN):
  - 고혈압, 당뇨병 (HTN, DM)

- 심뇌혈관 질환 (CVD):
  - 협심증, 심근경색증, 뇌출혈, 뇌경색 (AP, MI, CH, CI)

- 간질환 (LIV):
  - 만성간염(B형, C형), 알코올성 간질환, 간경화증(간경변증) (CLD, ALD, LC)

- 만성 하기도 질환(Lower Respiratory Infection, LRI):
  - 천식, 폐기종, 만성폐쇄성폐질환(COPD), 기관지확장증 (AST, PEM, COPD, BPE)

- 근골격계 질환 Musculoskeletal Disorders (MSD):
  - 무릎골관절염(무릎퇴행성관절염), 무릎 외 골관절염(퇴행성관절염), 류마티스 관절염, 어깨관절질환, 추간판(디스크) 질환, 기타 척추 질환 (OAK, OAE, RA, OAS, VD, VD_OLD, VDE, VDE_OLD)

- 갑상선 기능 장애 hyperthyroidism (HPT):
  - 갑상선 기능저하증, 갑상선 기능항진증 (HPOT, HPT)

In [1]:
# 질병 코드 그룹화
D_HTN = ["HTN", "DM"]
D_CVD = ["AP", "MI", "CH", "CI"]
D_LIV = ["CLD", "ALD", "LC"]
D_LRI = ["AST", "PEM", "COPD", "BPE"]
D_MSD_OLD = ["OAK", "OAE", "RA", "VD_OLD", "VDE_OLD"]
D_MSD = ["OAK", "OAE", "RA", "OAS", "VD", "VDE"]
D_MSD_ALL = ["OAK", "OAE", "RA", "OAS", "VD", "VDE", "VD_OLD", "VDE_OLD"]
D_HPT = ["HPOT", "HPT"]

# 열 이름으로 리스트화 (CD1 - 질병유무, CD2 - 진단시기)
D_HTN_CD1 = ["CD1_"+d for d in D_HTN]
D_CVD_CD1 = ["CD1_"+d for d in D_CVD]
D_LIV_CD1 = ["CD1_"+d for d in D_LIV]
D_LRI_CD1 = ["CD1_"+d for d in D_LRI]
D_MSD_OLD_CD1 = ["CD1_"+d for d in D_MSD_OLD]
D_MSD_CD1 = ["CD1_"+d for d in D_MSD]
D_HPT_CD1 = ["CD1_"+d for d in D_HPT]
D_OLD_CD1 = ["CD1_"+d for d in D_HTN + D_CVD + D_LIV + D_LRI + D_MSD_OLD + D_HPT]
D_21_CD1 = ["CD1_"+d for d in D_HTN + D_CVD + D_LIV + D_LRI + D_MSD + D_HPT]

## 만성질환을 그룹화 시킨 후 Many-Hot Encoding 형태로 변환 Transformer

In [None]:
class DiseaseTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # 질병 분류 별 하나라도 해당하면 1, 아니면 0
        # 질병 분류 별 그룹화
        X["CD_HTN"] = X.apply(lambda x: 1 if any(x[col] == 1 for col in D_HTN_CD1) else 0, axis=1)
        X["CD_CVD"] = X.apply(lambda x: 1 if any(x[col] == 1 for col in D_CVD_CD1) else 0, axis=1)
        X["CD_LIV"] = X.apply(lambda x: 1 if any(x[col] == 1 for col in D_LIV_CD1) else 0, axis=1)
        X["CD_LRI"] = X.apply(lambda x: 1 if any(x[col] == 1 for col in D_LRI_CD1) else 0, axis=1)
        X["CD_MSD"] = X.apply(lambda x: 1 if any(x[col] == 1 for col in list(set(D_OLD_CD1 + D_21_CD1))) else 0, axis=1)
        X["CD_HPT"] = X.apply(lambda x: 1 if any(x[col] == 1 for col in D_HPT_CD1) else 0, axis=1)

        # 필요없는 행 탈락
        X.drop(X.iloc[:, 2:25], axis=1, inplace = True)

        return X

In [None]:
# 3개년 데이터의 질병 코드가 상이함. 따라서, 따로 불러온 후 그룹화 진행행
query = f"""
select PIDWON,2019 AS D_YEAR,CD1_HTN,CD1_DM,CD1_AP,CD1_MI,CD1_CH,CD1_CI,CD1_CLD,CD1_ALD,CD1_LC,CD1_AST,CD1_PEM,CD1_COPD,CD1_BPE,CD1_OAK,CD1_OAE,CD1_RA,CD1_VD_OLD,NULL AS CD1_VD,CD1_VDE_OLD,NULL AS CD1_VDE,CD1_HPOT,CD1_HPT,NULL AS CD1_OAS
from raw_a_ind
UNION ALL
select PIDWON,2020 AS D_YEAR,CD1_HTN,CD1_DM,CD1_AP,CD1_MI,CD1_CH,CD1_CI,CD1_CLD,CD1_ALD,CD1_LC,CD1_AST,CD1_PEM,CD1_COPD,CD1_BPE,CD1_OAK,CD1_OAE,CD1_RA,CD1_VD_OLD,NULL AS CD1_VD,CD1_VDE_OLD,NULL AS CD1_VDE,CD1_HPOT,CD1_HPT,NULL AS CD1_OAS
from raw_b_ind
UNION ALL
select PIDWON,2021 AS D_YEAR,CD1_HTN,CD1_DM,CD1_AP,CD1_MI,CD1_CH,CD1_CI,CD1_CLD,CD1_ALD,CD1_LC,CD1_AST,CD1_PEM,CD1_COPD,CD1_BPE,CD1_OAK,CD1_OAE,CD1_RA,NULL AS CD1_VD_OLD,CD1_VD,NULL AS CD1_VDE_OLD,CD1_VDE,CD1_HPOT,CD1_HPT,CD1_OAS
from raw_c_ind
"""

phi_disease = get_data_from_db(query)

disease_pipeline = Pipeline([
    ('disease', DiseaseTransformer())
])

phi_disease = disease_pipeline.fit_transform(phi_disease)
phi_disease.head()

Unnamed: 0,PIDWON,D_YEAR,CD_HTN,CD_CVD,CD_LIV,CD_LRI,CD_MSD,CD_HPT
0,11200101.0,2019,1,1,0,0,1,0
1,11200201.0,2019,0,0,0,0,1,0
2,11200202.0,2019,0,0,0,0,0,0
3,11200301.0,2019,0,0,0,0,0,0
4,11200302.0,2019,1,0,0,0,1,0


## 건강정보 이해능력 (C_IND) - 2021년 데이터에만 존재
- 변수 이름 변경

# MS Table