In [378]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
import matplotlib.pyplot as plt
import seaborn as sns
import koreanize_matplotlib
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn import set_config
set_config(transform_output='pandas')
from sklearn.pipeline import FunctionTransformer

# 데이터 뽑아오는 함수

In [379]:
# 데이터 병합 및 year 열 추가 함수 정의의: SQL
def get_data_from_db(query):

    # mysql 접속 정보 정의
    username = "admin"
    password = "admin1234"
    host = "hk-toss-middle-project.cjkcuqkegqpx.eu-north-1.rds.amazonaws.com"
    database_name = "raw_data"

    # 데이터베이스 연결 문자열, connection string
    db_connection_str = f'mysql+pymysql://{username}:{password}@{host}/{database_name}'

    # 데이터 베이스 connector 얻어내기
    db_connection = create_engine(db_connection_str)
    db_connection

    df = pd.read_sql(query, con=db_connection)
    return df

# Transformer 정의

## 문자열 앞뒤 공백 제거 Transformer

In [380]:
# 앞뒤공백 제거 transformer
class StripTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, columns:list):
        # 입력한 columns가 문자열이면 리스트로 변환
        if isinstance(columns, str):
            self.columns = [columns]
        else:
            self.columns = columns

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # 선택한 열 중에 데이터프레임에 실제로 존재하는 열만 선택
        columns_y = [col for col in self.columns if col in X.columns]

        # 선택한 열의 값들의 앞뒤 공백을 제거
        for col in columns_y:

            # 문자열 형식인지 확인
            if X[col].dtype == 'object':
                try:
                    X[col] = X[col].str.strip()
                except:
                    pass

        return X

## 열 탈락 / 열 이름 변경 Transformer

In [381]:
class RenameDropColumnsTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, rename_map:dict = {}, drop_columns:list = []):
        self.rename_map = rename_map
        # drop_columns가 문자열이면 리스트로 변환
        if isinstance(drop_columns, str):
            self.drop_columns = [drop_columns]
        else:
            self.drop_columns = drop_columns

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # 실제로 데이터프레임에 존재하는 열만 drop
        self.drop_columns = [col for col in self.drop_columns if col in X.columns]
        # drop_columns가 존재하면 drop
        if self.drop_columns:
            X.drop(columns=self.drop_columns, inplace=True)

        # 실제로 데이터프레임에 존재하는 열만 선택
        rename_map = {key: value for key, value in self.rename_map.items() if key in X.columns}
        # rename_map이 존재하면 rename
        if self.rename_map:
            X.rename(columns=self.rename_map, inplace=True)
            
        return X

## ManyHotEncoding을 위한 리스트 열 만들어주기

In [382]:
class ColumnsWithList(BaseEstimator, TransformerMixin):
    def __init__(self, from_columns, to_column):
        self.from_columns = from_columns
        self.to_column= to_column

    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X[self.to_column] = X[self.from_columns].apply(lambda row: ','.join([str(int(x)) for x in [row[col] for col in self.from_columns]]), axis=1)
        X.drop(columns=self.from_columns, inplace=True)
        return X

## ManyHotEncoding

In [383]:
class ManyHotEncoding(BaseEstimator, TransformerMixin):
    def __init__(self, columns, prefix = ""):
        self.columns = columns
        self.prefix = prefix
        self.mlb = MultiLabelBinarizer()

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # MultiLabelBinarizer를 사용하여 데이터를 변환합니다.
        encoded = self.mlb.fit_transform(X[self.columns])
        encoded = encoded[:, 1:]

        # 숫자 클래스와 name_map을 사용하여 열 이름 생성
        mapped_classes = [f"{self.prefix}_{cls}" for cls in self.mlb.classes_[1:]]

        # 변환된 데이터를 DataFrame으로 반환
        encoded_df = pd.DataFrame(encoded, columns=mapped_classes)
        X.drop(columns=self.columns, inplace=True)
        data = pd.concat([encoded_df, X], axis=1)
        return data

## 가중치는 존재하지만 조사표상 응답이 모두 결측인 경우 처리

In [384]:
class SameIdImpute(BaseEstimator, TransformerMixin):
    def __init__(self, id_cols = []):
        self.id_cols = id_cols
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        self.columns_to_fill = X.columns.difference(self.id_cols)
        X = X.sort_values(self.id_cols)
        X.loc[X[self.columns_to_fill].isnull().all(axis=1)] = X.ffill().bfill().loc[X[self.columns_to_fill].isnull().all(axis=1)]

        return X

In [385]:
class CustomFeatureCombination(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    
    def fit(self, X):
        return self

    def transform(self, X):
        X['DEBT_RATIO'] = np.where(
        X['PROP'] > 0,
        X['DEBT'] / X['PROP'],
        X['DEBT'])
        
        return X

In [386]:
columns_hh = "HHID,CUSTM_BENF_YN,CUSTM_BENF1_D,CUSTM_BENF2_D,CUSTM_BENF3_D,CUSTM_BENF4_D,CASE WHEN H_INC_TOT >= 0 THEN 10000 * H_INC_TOT ELSE H_INC_TOT END AS H_INC_TOT,10000*DEBT AS DEBT,10000*PROP AS PROP,OTC_MED,MED_SUP,H_OOP"

query_hh = f"""
SELECT 2019 AS D_YEAR, {columns_hh} from raw_a_hh
UNION ALL
SELECT 2020 AS D_YEAR, {columns_hh} from raw_b_hh
UNION ALL
SELECT 2021 AS D_YEAR, {columns_hh} from raw_c_hh
"""

# HH Table

In [387]:
hh = get_data_from_db(query=query_hh)
hh

Unnamed: 0,D_YEAR,HHID,CUSTM_BENF_YN,CUSTM_BENF1_D,CUSTM_BENF2_D,CUSTM_BENF3_D,CUSTM_BENF4_D,H_INC_TOT,DEBT,PROP,OTC_MED,MED_SUP,H_OOP
0,2019,112001011.0,1.0,1.0,1.0,2.0,2.0,6270000.0,0.0,4000000.0,61000.0,0.0,86060.0
1,2019,112002011.0,1.0,1.0,1.0,2.0,1.0,9790000.0,0.0,0.0,60000.0,0.0,8780.0
2,2019,112003011.0,2.0,,,,,44970000.0,7500000.0,76500000.0,74000.0,0.0,247670.0
3,2019,112010011.0,2.0,,,,,24160000.0,0.0,230000000.0,162600.0,3800.0,3558000.0
4,2019,112012011.0,2.0,,,,,16490000.0,0.0,520000000.0,0.0,0.0,2041164.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
18867,2021,400004011.0,2.0,,,,,29650000.0,80000000.0,110000000.0,500000.0,0.0,0.0
18868,2021,400005011.0,2.0,,,,,28250000.0,0.0,33000000.0,370000.0,0.0,123300.0
18869,2021,400007011.0,2.0,,,,,60800000.0,30000000.0,70000000.0,400000.0,0.0,4422620.0
18870,2021,400008011.0,2.0,,,,,77460000.0,70000000.0,195000000.0,143000.0,0.0,3600770.0


In [388]:
hh_nan_0 = ['CUSTM_BENF1_D','CUSTM_BENF2_D','CUSTM_BENF3_D','CUSTM_BENF4_D','H_OOP','H_INC_TOT']
hh_9_med = ['DEBT','PROP','MED_SUP','H_INC_TOT']
hh_2_0 = ['CUSTM_BENF_YN','CUSTM_BENF1_D','CUSTM_BENF2_D','CUSTM_BENF3_D','CUSTM_BENF4_D']

In [389]:
def log_transform(X):    
    # 배열 복사하여 원본 데이터 보존
    X_transformed = np.array(X, copy=True)
    
    # 양수인 값만 로그 변환 적용
    mask_positive = X_transformed > 0
    X_transformed[mask_positive] = np.log(X_transformed[mask_positive])
    
    # 0 이하인 값은 0으로 설정
    X_transformed[~mask_positive] = 0
    
    return X_transformed

In [390]:
imputer_1_hh = ColumnTransformer([
    ('Imputer_1', SimpleImputer(strategy='constant', missing_values=np.nan, fill_value=0.0), hh_nan_0),
    ],
    remainder='passthrough',
    verbose_feature_names_out=False
)

imputer_2_hh = ColumnTransformer([
    ('Imputer_2', SimpleImputer(strategy='median', missing_values=-9.0), hh_9_med)
    ],
    remainder='passthrough',
    verbose_feature_names_out=False
)

imputer_3_hh = ColumnTransformer([
    ('Imputer_3', SimpleImputer(strategy='constant', missing_values=2.0, fill_value=0.0), hh_2_0)
    ],
    remainder='passthrough',
    verbose_feature_names_out=False
)

# scale_1_hh = ColumnTransformer([
#     ('log', FunctionTransformer(func=np.log1p, inverse_func=np.expm1), ['H_INC_TOT','DEBT','PROP','H_OOP','OTC_MED'])
#     ],
#     remainder='passthrough',
#     verbose_feature_names_out=False
# )

pipeline_hh = Pipeline([
    ('Same_id', SameIdImpute(['HHID','D_YEAR'])),
    ('Imputer_1', imputer_1_hh),
    ('Imputer_2', imputer_2_hh),
    ('Imputer_3', imputer_3_hh),
    # ('Scale_1', scale_1_hh),
    ('Custom_feature', CustomFeatureCombination())
])

In [391]:
hh_piped = pipeline_hh.fit_transform(hh).fillna(0)
hh_piped[['CUSTM_BENF_YN','CUSTM_BENF1_D','CUSTM_BENF2_D','CUSTM_BENF3_D','CUSTM_BENF4_D']] = hh_piped[['CUSTM_BENF_YN','CUSTM_BENF1_D','CUSTM_BENF2_D','CUSTM_BENF3_D','CUSTM_BENF4_D']].astype(int)

In [392]:
hh_piped

Unnamed: 0,CUSTM_BENF_YN,CUSTM_BENF1_D,CUSTM_BENF2_D,CUSTM_BENF3_D,CUSTM_BENF4_D,DEBT,PROP,MED_SUP,H_INC_TOT,H_OOP,D_YEAR,HHID,OTC_MED,DEBT_RATIO
0,1,1,1,0,0,0.0,4000000.0,0.0,6270000.0,86060.0,2019,112001011.0,61000.0,0.000000
6748,1,1,1,1,0,0.0,4000000.0,35000.0,7320000.0,157444.0,2020,112001011.0,102000.0,0.000000
12965,1,1,1,0,0,0.0,4000000.0,15000.0,6680000.0,104520.0,2021,112001011.0,130000.0,0.000000
1,1,1,1,0,1,0.0,0.0,0.0,9790000.0,8780.0,2019,112002011.0,60000.0,0.000000
6749,1,1,1,0,1,0.0,200000.0,0.0,10970000.0,28770.0,2020,112002011.0,412000.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12963,0,0,0,0,0,80000000.0,175000000.0,0.0,74000000.0,885770.0,2020,400008011.0,660000.0,0.457143
18870,0,0,0,0,0,70000000.0,195000000.0,0.0,77460000.0,3600770.0,2021,400008011.0,143000.0,0.358974
6747,0,0,0,0,0,0.0,70000000.0,0.0,28000000.0,7843129.0,2019,400009011.0,0.0,0.000000
12964,0,0,0,0,0,0.0,100000000.0,0.0,36700000.0,390000.0,2020,400009011.0,130000.0,0.000000


# IND

## 만성질환 분류표

- 고혈압과 당뇨병 (HTN):
  - 고혈압, 당뇨병 (HTN, DM)

- 심뇌혈관 질환 (CVD):
  - 협심증, 심근경색증, 뇌출혈, 뇌경색 (AP, MI, CH, CI)

- 간질환 (LIV):
  - 만성간염(B형, C형), 알코올성 간질환, 간경화증(간경변증) (CLD, ALD, LC)

- 만성 하기도 질환(Lower Respiratory Infection, LRI):
  - 천식, 폐기종, 만성폐쇄성폐질환(COPD), 기관지확장증 (AST, PEM, COPD, BPE)

- 근골격계 질환 Musculoskeletal Disorders (MSD):
  - 무릎골관절염(무릎퇴행성관절염), 무릎 외 골관절염(퇴행성관절염), 류마티스 관절염, 어깨관절질환, 추간판(디스크) 질환, 기타 척추 질환 (OAK, OAE, RA, OAS, VD, VD_OLD, VDE, VDE_OLD)

- 갑상선 기능 장애 hyperthyroidism (HPT):
  - 갑상선 기능저하증, 갑상선 기능항진증 (HPOT, HPT)

In [393]:
# 질병 코드 그룹화
D_HTN = ["HTN", "DM"]
D_CVD = ["AP", "MI", "CH", "CI"]
D_LIV = ["CLD", "ALD", "LC"]
D_LRI = ["AST", "PEM", "COPD", "BPE"]
D_MSD_OLD = ["OAK", "OAE", "RA", "VD_OLD", "VDE_OLD"]
D_MSD = ["OAK", "OAE", "RA", "OAS", "VD", "VDE"]
D_MSD_ALL = ["OAK", "OAE", "RA", "OAS", "VD", "VDE", "VD_OLD", "VDE_OLD"]
D_HPT = ["HPOT", "HPT"]

# 열 이름으로 리스트화 (CD1 - 질병유무, CD2 - 진단시기)
D_HTN_CD1 = ["CD1_"+d for d in D_HTN]
D_CVD_CD1 = ["CD1_"+d for d in D_CVD]
D_LIV_CD1 = ["CD1_"+d for d in D_LIV]
D_LRI_CD1 = ["CD1_"+d for d in D_LRI]
D_MSD_OLD_CD1 = ["CD1_"+d for d in D_MSD_OLD]
D_MSD_CD1 = ["CD1_"+d for d in D_MSD]
D_HPT_CD1 = ["CD1_"+d for d in D_HPT]
D_OLD_CD1 = ["CD1_"+d for d in D_HTN + D_CVD + D_LIV + D_LRI + D_MSD_OLD + D_HPT]
D_21_CD1 = ["CD1_"+d for d in D_HTN + D_CVD + D_LIV + D_LRI + D_MSD + D_HPT]

## 만성질환을 그룹화 시킨 후 Many-Hot Encoding 형태로 변환 Transformer

In [394]:
class DiseaseTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # 질병 분류 별 하나라도 해당하면 1, 아니면 0
        # 질병 분류 별 그룹화
        X["CD_HTN"] = X.apply(lambda x: 1 if any(x[col] == 1 for col in D_HTN_CD1) else 0, axis=1)
        X["CD_CVD"] = X.apply(lambda x: 1 if any(x[col] == 1 for col in D_CVD_CD1) else 0, axis=1)
        X["CD_LIV"] = X.apply(lambda x: 1 if any(x[col] == 1 for col in D_LIV_CD1) else 0, axis=1)
        X["CD_LRI"] = X.apply(lambda x: 1 if any(x[col] == 1 for col in D_LRI_CD1) else 0, axis=1)
        X["CD_MSD"] = X.apply(lambda x: 1 if any(x[col] == 1 for col in list(set(D_OLD_CD1 + D_21_CD1))) else 0, axis=1)
        X["CD_HPT"] = X.apply(lambda x: 1 if any(x[col] == 1 for col in D_HPT_CD1) else 0, axis=1)

        # 필요없는 행 탈락
        X.drop(X.iloc[:, 2:25], axis=1, inplace = True)

        return X

In [395]:
# 3개년 데이터의 질병 코드가 상이함. 따라서, 따로 불러온 후 그룹화 진행행
query = f"""
select PIDWON,2019 AS D_YEAR,CD1_HTN,CD1_DM,CD1_AP,CD1_MI,CD1_CH,CD1_CI,CD1_CLD,CD1_ALD,CD1_LC,CD1_AST,CD1_PEM,CD1_COPD,CD1_BPE,CD1_OAK,CD1_OAE,CD1_RA,CD1_VD_OLD,NULL AS CD1_VD,CD1_VDE_OLD,NULL AS CD1_VDE,CD1_HPOT,CD1_HPT,NULL AS CD1_OAS
from raw_a_ind
UNION ALL
select PIDWON,2020 AS D_YEAR,CD1_HTN,CD1_DM,CD1_AP,CD1_MI,CD1_CH,CD1_CI,CD1_CLD,CD1_ALD,CD1_LC,CD1_AST,CD1_PEM,CD1_COPD,CD1_BPE,CD1_OAK,CD1_OAE,CD1_RA,CD1_VD_OLD,NULL AS CD1_VD,CD1_VDE_OLD,NULL AS CD1_VDE,CD1_HPOT,CD1_HPT,NULL AS CD1_OAS
from raw_b_ind
UNION ALL
select PIDWON,2021 AS D_YEAR,CD1_HTN,CD1_DM,CD1_AP,CD1_MI,CD1_CH,CD1_CI,CD1_CLD,CD1_ALD,CD1_LC,CD1_AST,CD1_PEM,CD1_COPD,CD1_BPE,CD1_OAK,CD1_OAE,CD1_RA,NULL AS CD1_VD_OLD,CD1_VD,NULL AS CD1_VDE_OLD,CD1_VDE,CD1_HPOT,CD1_HPT,CD1_OAS
from raw_c_ind
"""

phi_disease = get_data_from_db(query)

In [396]:
disease_pipeline = Pipeline([
    ('disease', DiseaseTransformer())
])

In [397]:
ind_disease_piped = disease_pipeline.fit_transform(phi_disease)
ind_disease_piped

Unnamed: 0,PIDWON,D_YEAR,CD_HTN,CD_CVD,CD_LIV,CD_LRI,CD_MSD,CD_HPT
0,11200101.0,2019,1,1,0,0,1,0
1,11200201.0,2019,0,0,0,0,1,0
2,11200202.0,2019,0,0,0,0,0,0
3,11200301.0,2019,0,0,0,0,0,0
4,11200302.0,2019,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...
45225,40000703.0,2021,1,0,0,0,1,0
45226,40000704.0,2021,0,0,0,0,0,0
45227,40000802.0,2021,0,0,0,0,0,0
45228,40000804.0,2021,0,0,0,0,0,0


# 나머지 IND

In [398]:
columns_ind = """
PIDWON, DEATH_I_YN, SEX, MARR, EDU, LIVE_T_YN,
HEALTH_INS, DISA_YN, DISA_TY,
ECO1,
CARE1, CARE4_2,
P1, P2, S1, D1, HT, WT, SE1,
HS1, HS_MED_YN, HS_SRH, HS6_YN, HS7_YN, HS8_YN,
I_PHI1_1, I_PHI1_2, I_PHI1_3, I_PHI5, I_PHI6,
(COALESCE(ERGUN, 0)+COALESCE(OUGUN, 0)+COALESCE(INGUN, 0)) AS MED_VISIT_TOT,
EROOP+INOOP+OUOOP_1 AS OOP_TOT, I_PHI_N, I_FFS_YN
"""

query_ind = f"""
SELECT 2019 AS D_YEAR, 2019-BIRTH_Y AS AGE, 
(COALESCE(CARE6_1_4, 0) + COALESCE(CARE6_2_4, 0) + COALESCE(CARE6_3_4, 0) + COALESCE(CARE6_4_4, 0)) AS CARE_TOT,
(P2_1 * 60 + P2_2) AS P2_2, CASE WHEN WTMG = 4 THEN 0 ELSE 1 END AS WT_MG,
{columns_ind}
FROM raw_a_ind
UNION ALL
SELECT 2020 AS D_YEAR, 2020-BIRTH_Y AS AGE, 
(COALESCE(CARE6_1_4, 0) + COALESCE(CARE6_2_4, 0) + COALESCE(CARE6_3_4, 0) + COALESCE(CARE6_4_4, 0)) AS CARE_TOT,
P2_2, CASE WHEN WTMG = 4 THEN 0 ELSE 1 END AS WT_MG,
{columns_ind}
FROM raw_b_ind
UNION ALL
SELECT 2021 AS D_YEAR, 2021-BIRTH_Y AS AGE, 
(COALESCE(CARE6_1A_4, 0) + COALESCE(CARE6_1B_4, 0) + COALESCE(CARE6_2_4, 0) + COALESCE(CARE6_3_4, 0) + COALESCE(CARE6_4_4, 0)) AS CARE_TOT,
P2_2, CASE WHEN WTMG = 4 THEN 0 ELSE 1 END AS WT_MG,
{columns_ind}
FROM raw_c_ind
"""

In [399]:
ind = get_data_from_db(query_ind)
ind

Unnamed: 0,D_YEAR,AGE,CARE_TOT,P2_2,WT_MG,PIDWON,DEATH_I_YN,SEX,MARR,EDU,...,HS8_YN,I_PHI1_1,I_PHI1_2,I_PHI1_3,I_PHI5,I_PHI6,MED_VISIT_TOT,OOP_TOT,I_PHI_N,I_FFS_YN
0,2019,81.0,0.0,30.0,0,11200101.0,2.0,2.0,3.0,1.0,...,4.0,,,,,,36.0,,,
1,2019,73.0,0.0,,0,11200201.0,2.0,2.0,3.0,1.0,...,4.0,,,,,,5.0,,,
2,2019,,0.0,,1,11200202.0,,,,,...,,,,,,,0.0,,,
3,2019,44.0,0.0,40.0,0,11200301.0,2.0,1.0,1.0,4.0,...,2.0,2.0,0.0,,2.0,2.0,12.0,,1.0,1.0
4,2019,41.0,0.0,20.0,1,11200302.0,2.0,2.0,1.0,4.0,...,4.0,2.0,0.0,,2.0,2.0,0.0,,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45225,2021,41.0,0.0,20.0,1,40000703.0,2.0,2.0,1.0,4.0,...,2.0,1.0,2.0,0.0,2.0,-1.0,27.0,,1.0,2.0
45226,2021,43.0,0.0,60.0,1,40000704.0,2.0,1.0,1.0,5.0,...,2.0,,,,2.0,-1.0,24.0,,1.0,2.0
45227,2021,41.0,0.0,40.0,0,40000802.0,2.0,2.0,1.0,5.0,...,2.0,2.0,1.0,4.0,2.0,2.0,28.0,,2.0,1.0
45228,2021,41.0,0.0,40.0,1,40000804.0,2.0,1.0,1.0,5.0,...,1.0,2.0,1.0,4.0,2.0,2.0,15.0,,1.0,1.0


In [400]:
ind_nan_0 = ['DEATH_I_YN','DISA_TY','ECO1','CARE1','CARE4_2','P1','P2','P2_2','D1','SE1','HS_MED_YN','HS6_YN','HS7_YN','HS8_YN','I_PHI1_1','I_PHI1_2','I_PHI1_3','I_PHI5','I_PHI6','OOP_TOT','I_PHI_N','I_FFS_YN','S1']
ind_nan_mean = ['HT']
ind_nan_mdn = ['AGE','WT','HS1','HS_SRH']
ind_3_0 = ['S1']
ind_2_0 = ['DEATH_I_YN','SEX','DISA_YN','CARE1','P1','SE1','I_FFS_YN','I_PHI5','I_PHI6','HS_MED_YN','HS6_YN','HS7_YN','HS8_YN']
ind_9_0 = ['P2_2','D1','I_PHI1_1','I_PHI1_2','I_PHI1_3','I_PHI5','I_PHI6']
ind_8_0 = ['P2','HS_MED_YN']
ind_1_0 = ['I_PHI6']
ind_onehot = ['MARR','EDU','LIVE_T_YN','HEALTH_INS','DISA_TY','ECO1','CARE4_2','P1','D1']
ind_manyhot = ['I_PHI1_1','I_PHI1_2','I_PHI1_3']

In [401]:
imputer_1_ind = ColumnTransformer([
    ('Imputer_1', SimpleImputer(strategy='constant', missing_values=np.nan, fill_value=0.0), ind_nan_0),
    ],
    remainder='passthrough',
    verbose_feature_names_out=False
)

imputer_2_ind = ColumnTransformer([
    ('Imputer_2', SimpleImputer(strategy='mean', missing_values=np.nan), ind_nan_mean),
    ],
    remainder='passthrough',
    verbose_feature_names_out=False
)

imputer_3_ind = ColumnTransformer([
    ('Imputer_3', SimpleImputer(strategy='median', missing_values=np.nan), ind_nan_mdn),
    ],
    remainder='passthrough',
    verbose_feature_names_out=False
)

imputer_4_ind = ColumnTransformer([
    ('Imputer_4', SimpleImputer(strategy='constant', missing_values=2.0, fill_value=0.0), ind_2_0),
    ],
    remainder='passthrough',
    verbose_feature_names_out=False
)

imputer_5_ind = ColumnTransformer([
    ('Imputer_5', SimpleImputer(strategy='constant', missing_values=-9.0, fill_value=0.0), ind_9_0),
    ],
    remainder='passthrough',
    verbose_feature_names_out=False
)

imputer_6_ind = ColumnTransformer([
    ('Imputer_6', SimpleImputer(strategy='constant', missing_values=8.0, fill_value=0.0), ind_8_0),
    ],
    remainder='passthrough',
    verbose_feature_names_out=False
)

imputer_7_ind = ColumnTransformer([
    ('Imputer_7', SimpleImputer(strategy='constant', missing_values=-1.0, fill_value=0.0), ind_1_0),
    ],
    remainder='passthrough',
    verbose_feature_names_out=False
)

imputer_8_ind = ColumnTransformer([
    ('Imputer_8', SimpleImputer(strategy='constant', missing_values=3.0, fill_value=0.0), ind_3_0),
    ],
    remainder='passthrough',
    verbose_feature_names_out=False
)

pipeline_ind = Pipeline([
    ('Same_id', SameIdImpute(['D_YEAR','PIDWON','WT_MG','CARE_TOT','P2_2','MED_VISIT_TOT'])),
    ('Imputer_1', imputer_1_ind),
    ('Imputer_2', imputer_2_ind),
    ('Imputer_3', imputer_3_ind),
    ('Imputer_4', imputer_4_ind),
    ('Imputer_5', imputer_5_ind),
    ('Imputer_6', imputer_6_ind),
    ('Imputer_7', imputer_7_ind)
])

In [402]:
ind_piped = pipeline_ind.fit_transform(ind)

In [403]:
ind_piped[['WT_MG','DEATH_I_YN','SEX', 'MARR', 'EDU', 'LIVE_T_YN', 'HEALTH_INS', 'DISA_YN', 'DISA_TY','ECO1','CARE1', 'CARE4_2','P1','S1','D1','SE1','HS1','HS_MED_YN', 'HS6_YN', 'HS7_YN', 'HS8_YN','I_PHI1_1','I_PHI1_2', 'I_PHI1_3','I_PHI5', 'I_PHI6','I_FFS_YN']] = ind_piped[['WT_MG','DEATH_I_YN','SEX', 'MARR', 'EDU', 'LIVE_T_YN', 'HEALTH_INS', 'DISA_YN', 'DISA_TY','ECO1','CARE1', 'CARE4_2','P1','S1','D1','SE1','HS1','HS_MED_YN', 'HS6_YN', 'HS7_YN', 'HS8_YN','I_PHI1_1','I_PHI1_2', 'I_PHI1_3','I_PHI5', 'I_PHI6','I_FFS_YN']].astype(int)

In [404]:
ind_piped

Unnamed: 0,I_PHI6,P2,HS_MED_YN,P2_2,D1,I_PHI1_1,I_PHI1_2,I_PHI1_3,I_PHI5,DEATH_I_YN,...,S1,D_YEAR,CARE_TOT,WT_MG,PIDWON,MARR,EDU,LIVE_T_YN,HEALTH_INS,MED_VISIT_TOT
0,0,7.0,1,30.0,1,0,0,0,0,0,...,3,2019,0.0,0,11200101.0,3,1,1,5,36.0
1,0,0.0,0,0.0,1,0,0,0,0,0,...,3,2019,0.0,0,11200201.0,3,1,1,5,5.0
2,0,0.0,0,30.0,1,2,0,4,0,0,...,3,2019,0.0,1,11200202.0,3,1,1,5,0.0
3,0,7.0,0,40.0,5,2,0,0,0,0,...,2,2019,0.0,0,11200301.0,1,4,1,1,12.0
4,0,3.0,0,20.0,3,2,0,0,0,0,...,2,2019,0.0,1,11200302.0,1,4,1,2,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45225,0,2.0,0,20.0,3,1,2,0,0,0,...,2,2021,0.0,1,40000703.0,1,4,1,2,27.0
45226,0,7.0,0,60.0,4,0,0,0,0,0,...,2,2021,0.0,1,40000704.0,1,5,1,1,24.0
45227,0,7.0,0,40.0,3,2,1,4,0,0,...,3,2021,0.0,0,40000802.0,1,5,1,1,28.0
45228,0,5.0,0,40.0,3,2,1,4,0,0,...,2,2021,0.0,1,40000804.0,1,5,1,1,15.0


In [405]:
final_ind = pd.merge(ind_piped, ind_disease_piped, on=['PIDWON','D_YEAR'], how='left')
final_ind

Unnamed: 0,I_PHI6,P2,HS_MED_YN,P2_2,D1,I_PHI1_1,I_PHI1_2,I_PHI1_3,I_PHI5,DEATH_I_YN,...,EDU,LIVE_T_YN,HEALTH_INS,MED_VISIT_TOT,CD_HTN,CD_CVD,CD_LIV,CD_LRI,CD_MSD,CD_HPT
0,0,7.0,1,30.0,1,0,0,0,0,0,...,1,1,5,36.0,1,1,0,0,1,0
1,0,0.0,0,0.0,1,0,0,0,0,0,...,1,1,5,5.0,0,0,0,0,1,0
2,0,0.0,0,30.0,1,2,0,4,0,0,...,1,1,5,0.0,0,0,0,0,0,0
3,0,7.0,0,40.0,5,2,0,0,0,0,...,4,1,1,12.0,0,0,0,0,0,0
4,0,3.0,0,20.0,3,2,0,0,0,0,...,4,1,2,0.0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45225,0,2.0,0,20.0,3,1,2,0,0,0,...,4,1,2,27.0,1,0,0,0,1,0
45226,0,7.0,0,60.0,4,0,0,0,0,0,...,5,1,1,24.0,0,0,0,0,0,0
45227,0,7.0,0,40.0,3,2,1,4,0,0,...,5,1,1,28.0,0,0,0,0,0,0
45228,0,5.0,0,40.0,3,2,1,4,0,0,...,5,1,1,15.0,0,0,0,0,0,0


# MS

In [None]:
query_ms = """
select
    PIDWON, 2019 AS D_YEAR,
	CASE
		WHEN INMED_DZ1_OLD > 10 THEN INMED_DZ1_OLD + 1
		WHEN INMED_DZ1_OLD IS NULL THEN 0
		ELSE INMED_DZ1_OLD
	END AS INMED_DZ1,
    CASE
		WHEN INMED_DZ2_OLD > 10 THEN INMED_DZ2_OLD + 1
		WHEN INMED_DZ2_OLD IS NULL THEN 0
		ELSE INMED_DZ2_OLD
	END AS INMED_DZ2,
    CASE
		WHEN INMED_DZ3_OLD > 10 THEN INMED_DZ3_OLD + 1
		WHEN INMED_DZ3_OLD IS NULL THEN 0
		ELSE INMED_DZ3_OLD
	END AS INMED_DZ3,
	CASE 
		WHEN OUMED_DZ1_OLD > 10 THEN OUMED_DZ1_OLD + 1
		WHEN OUMED_DZ1_OLD IS NULL THEN 0
		ELSE OUMED_DZ1_OLD
	END AS OUMED_DZ1,
	CASE 
		WHEN OUMED_DZ2_OLD > 10 THEN OUMED_DZ2_OLD + 1
		WHEN OUMED_DZ2_OLD IS NULL THEN 0
		ELSE OUMED_DZ2_OLD
	END AS OUMED_DZ2,
	CASE 
		WHEN OUMED_DZ3_OLD > 10 THEN OUMED_DZ3_OLD + 1
		WHEN OUMED_DZ3_OLD IS NULL THEN 0
		ELSE OUMED_DZ3_OLD
	END AS OUMED_DZ3,
	COALESCE(OUDENT_DZ, 0) AS OUDENT_DZ, COALESCE(OUORT_DZ, 0) AS OUORT_DZ
FROM raw_a_ms
UNION ALL
SELECT 
    PIDWON, 2020 AS D_YEAR,
	COALESCE(INMED_DZ1, 0), COALESCE(INMED_DZ2, 0), COALESCE(INMED_DZ3, 0),
	COALESCE(OUMED_DZ1, 0), COALESCE(OUMED_DZ2, 0), COALESCE(OUMED_DZ3, 0),
	COALESCE(OUDENT_DZ, 0) AS OUDENT_DZ,
	CASE 
		WHEN OUORT_DZ_Y2020 = 1 THEN 3
		WHEN OUORT_DZ_Y2020 = 2 THEN 7
		WHEN OUORT_DZ_Y2020 = 3 THEN 4
		WHEN OUORT_DZ_Y2020 = 4 THEN 6
		WHEN OUORT_DZ_Y2020 = 5 THEN 9
		WHEN OUORT_DZ_Y2020 = 6 THEN 8
		WHEN OUORT_DZ_Y2020 = 7 THEN 2
		WHEN OUORT_DZ_Y2020 = 8 THEN 1
		WHEN OUORT_DZ_Y2020 = 9 THEN 5
		WHEN OUORT_DZ_Y2020 = 11 THEN 5
		WHEN OUORT_DZ_Y2020 = 12 THEN 11
        ELSE 0
	END AS OUORT_DZ
FROM raw_b_ms
UNION ALL
SELECT 
    PIDWON, 2021 AS D_YEAR,
	COALESCE(INMED_DZ1, 0), COALESCE(INMED_DZ2, 0), COALESCE(INMED_DZ3, 0),
	COALESCE(OUMED_DZ1, 0), COALESCE(OUMED_DZ2, 0), COALESCE(OUMED_DZ3, 0),
	COALESCE(OUDENT_DZ, 0) AS OUDENT_DZ, COALESCE(OUORT_DZ, 0) AS OUORT_DZ
FROM raw_c_ms
"""

In [None]:
ms = get_data_from_db(query_ms)


In [None]:
ms

In [None]:
HTN = list(range(1,3))
HPT = list(range(3,5))
CVD = list(range(5,9))
LIV = list(range(9,13))
LRI = list(range(13,17))
MSD = list(range(17,27))
CNR = list(range(27,34))
OBG = list(range(34,36))
ETC = list(range(36,52))

list_mappings = {
    'HTN': HTN,
    'HPT': HPT,
    'CVD': CVD,
    'LIV': LIV,
    'LRI': LRI,
    'MSD': MSD,
    'CNR': CNR,
    'OBG': OBG,
    'ETC': ETC
}

In [None]:
ms = ms.drop(columns=['INMED_DZ1','INMED_DZ2','INMED_DZ3','OUMED_DZ1','OUMED_DZ2','OUMED_DZ3']).groupby(['PIDWON','D_YEAR']).agg({
    'OUDENT_DZ': lambda x: (x != 0).sum(),  # 0이 아닌 값의 개수 계산
    'OUORT_DZ': lambda x: (x != 0).sum(),   # 0이 아닌 값의 개수 계산
    'MED_HTN': lambda x: (x != 0).sum(),
    'MED_HPT': lambda x: (x != 0).sum(),
    'MED_CVD': lambda x: (x != 0).sum(),
    'MED_LIV': lambda x: (x != 0).sum(),
    'MED_LRI': lambda x: (x != 0).sum(),
    'MED_MSD': lambda x: (x != 0).sum(),
    'MED_CNR': lambda x: (x != 0).sum(),
    'MED_OBG': lambda x: (x != 0).sum(),
    'MED_ETC': lambda x: (x != 0).sum()
}).reset_index()

In [None]:
ms

In [None]:
ms.columns = ['PIDWON','D_YEAR','OUDENT_DZ_CTN','OUORT_DZ_CTN','INMED_HTN_CTN','INMED_HPT_CTN','INMED_CVD_CTN','INMED_LIV_CTN','INMED_LRI_CTN','INMED_MSD_CTN','INMED_CNR_CTN','INMED_OBG_CTN','INMED_ETC_CTN']

In [None]:
ms

In [414]:
class PHITableTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # 단독 가구인 경우만 뽑기기
        X = X.loc[X["PHI_PID"].str.len() < 9, :]

        return X

In [415]:
columns_phi = "HHID,PHI_PID,PHI_PID1,PHI3,PHI4_1_D,PHI4_2_D,PHI4_3_D,PHI4_4_D,PHI4_5_D,PHI4_6_D,PHI4_7_D,PHI4_8_D,PHI5,PHI6,PHR3"

query = f"""SELECT 2019 AS D_YEAR, {columns_phi} FROM raw_a_phi
            WHERE PHR2 = 1
            UNION ALL
            SELECT 2020 AS D_YEAR, {columns_phi} FROM raw_b_phi
            WHERE PHR2 = 1
            UNION ALL
            SELECT 2021 AS D_YEAR, {columns_phi} FROM raw_c_phi
            WHERE PHR2 = 1"""
phi = get_data_from_db(query)
phi

Unnamed: 0,D_YEAR,HHID,PHI_PID,PHI_PID1,PHI3,PHI4_1_D,PHI4_2_D,PHI4_3_D,PHI4_4_D,PHI4_5_D,PHI4_6_D,PHI4_7_D,PHI4_8_D,PHI5,PHI6,PHR3
0,2019,112003011.0,11200301,11200301.0,3.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0,1.0,1.0,120000.0,60000.0
1,2019,112033011.0,11203301,11203301.0,2.0,,,,,,,,,1.0,87000.0,120000.0
2,2019,112049011.0,11204901,11204901.0,2.0,,,,,,,,,1.0,-9.0,300000.0
3,2019,112058011.0,11205801,11205801.0,1.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,3.0,60770.0,1000000.0
4,2019,112064011.0,11206402,11206402.0,3.0,-9.0,-9.0,-9.0,-9.0,-9.0,-9.0,-9.0,-9.0,1.0,75000.0,3800000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5000,2021,300070011.0,30007001,30007001.0,3.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,121260.0,545760.0
5001,2021,300070011.0,30007002,30007002.0,3.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,144168.0,1730000.0
5002,2021,300081011.0,30008104,30008104.0,3.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,39000.0,52600.0
5003,2021,300085011.0,30008504,30008504.0,3.0,1.0,1.0,1.0,2.0,2.0,2.0,1.0,2.0,1.0,109058.0,134379.0


In [416]:
phi_nan0 = ['PHI4_1_D','PHI4_2_D','PHI4_3_D','PHI4_4_D','PHI4_5_D','PHI4_6_D','PHI4_7_D','PHI4_8_D','PHR3']
phi_90 = ['PHI3', 'PHI4_1_D','PHI4_2_D','PHI4_3_D','PHI4_4_D','PHI4_5_D','PHI4_6_D','PHI4_7_D','PHI4_8_D','PHI5']
phi_9med = ['PHI6','PHR3']
phi_20 = ['PHI4_1_D','PHI4_2_D','PHI4_3_D','PHI4_4_D','PHI4_5_D','PHI4_6_D','PHI4_7_D','PHI4_8_D']
phi_onehot = ['PHI3']
rename_map_phi = {'PHI_PID1' : 'PIDWON',
                  'PHI3' : 'PHI_FORM',
                  'PHI5' : 'PHI_PREMIUM_YN',
                  'PHI6' : 'PHI_PREMIUM',
                  'PHR3' : 'PHI_BENEFIT'}
drop_columns_phi = ['PHI_PID']


In [None]:
imputer_1_phi = ColumnTransformer([
    ('Imputer_1', SimpleImputer(strategy='constant', missing_values=np.nan, fill_value=0.0), phi_nan0)
    ],
    remainder='passthrough',
    verbose_feature_names_out=False
)

imputer_2_phi = ColumnTransformer([
    ('Imputer_2', SimpleImputer(strategy='median', missing_values=-9.0, fill_value=None), phi_9med)
    ],
    remainder='passthrough',
    verbose_feature_names_out=False
)

imputer_3_phi = ColumnTransformer([
    ('Imputer_3', SimpleImputer(strategy='constant', missing_values=-9.0, fill_value=0.0), phi_90)
    ],
    remainder='passthrough',
    verbose_feature_names_out=False
)

imputer_4_phi = ColumnTransformer([
    ('Imputer_4', SimpleImputer(strategy='constant', missing_values=2.0, fill_value=0.0), phi_20)
    ],
    remainder='passthrough',
    verbose_feature_names_out=False
)

scale_1_phi = ColumnTransformer([
    ('log', FunctionTransformer(func=np.log1p, inverse_func=np.expm1), ['PHI6','PHR3'])
    ],
    remainder='passthrough',
    verbose_feature_names_out=False
)

# onehot_phi = ColumnTransformer([
#     ('OneHot', OneHotEncoder(sparse_output=False), phi_onehot)
#     ],
#     remainder='passthrough',
#     verbose_feature_names_out=False
# )

In [None]:
phi_pipeline = Pipeline([
    ('Same_id', SameIdImpute(id_cols=['PHI_PID','PHI_PID1'])),
    ('strip', StripTransformer(columns=['PHI_PID'])),
    ('phi',PHITableTransformer()),
    ('Imputer_1', imputer_1_phi),
    ('Imputer_2', imputer_2_phi),
    ('Imputer_3', imputer_3_phi),
    ('Imputer_4', imputer_4_phi),
    # ('Encoding', onehot_phi),
    ('Scale_1', scale_1_phi),
    ('rename_drop', RenameDropColumnsTransformer(rename_map=rename_map_phi, drop_columns=drop_columns_phi))
])

In [419]:
phi_piped = phi_pipeline.fit_transform(phi)
phi_piped[['PHI4_1_D','PHI4_2_D','PHI4_3_D','PHI4_4_D','PHI4_5_D','PHI4_6_D','PHI4_7_D','PHI4_8_D','PHI_FORM','PHI_PREMIUM_YN']] = phi_piped[['PHI4_1_D','PHI4_2_D','PHI4_3_D','PHI4_4_D','PHI4_5_D','PHI4_6_D','PHI4_7_D','PHI4_8_D','PHI_FORM','PHI_PREMIUM_YN']].astype(int)

In [420]:
phi_piped

Unnamed: 0,PHI4_1_D,PHI4_2_D,PHI4_3_D,PHI4_4_D,PHI4_5_D,PHI4_6_D,PHI4_7_D,PHI4_8_D,PHI_FORM,PHI_PREMIUM_YN,PHI_PREMIUM,PHI_BENEFIT,D_YEAR,HHID,PIDWON
1205,1,1,1,0,0,0,0,1,3,1,121650.0,186070.0,2020,112003011.0,11200301.0
1206,1,1,1,0,0,0,0,1,3,1,117430.0,841460.0,2020,112003011.0,11200302.0
1207,0,0,0,0,0,0,0,0,2,1,48710.0,350000.0,2020,112010011.0,11201001.0
1208,1,1,1,0,0,0,0,1,3,1,201717.0,3500000.0,2020,112010011.0,11201002.0
1209,0,1,0,0,0,0,0,0,1,1,25700.0,90000.0,2020,112010011.0,11201002.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5002,1,0,0,0,0,0,0,0,3,1,39000.0,52600.0,2021,300081011.0,30008104.0
1203,1,0,1,0,0,0,0,0,1,1,120000.0,1200000.0,2019,300085011.0,30008504.0
5003,1,1,1,0,0,0,1,0,3,1,109058.0,134379.0,2021,300085011.0,30008504.0
1204,1,1,1,0,0,0,0,0,3,1,59850.0,450000.0,2019,400001011.0,40000101.0


In [421]:
# 가구 데이터 추가
phi_hh = pd.merge(phi_piped, hh_piped, on=['HHID','D_YEAR'], how='left')

In [422]:
phi_hh.shape

(4934, 27)

In [423]:
# 개인 데이터 추가
phi_ind = pd.merge(phi_hh, final_ind, on=['PIDWON','D_YEAR'], how='left')

In [424]:
phi_ind.shape

(4934, 70)

In [None]:
model_data = pd.merge(phi_ind, ms, on=['PIDWON','D_YEAR'], how='left')

In [None]:
model_data = model_data.fillna(0)

In [428]:
phi_ind.to_csv("C:/Users/user/Documents/tmp_cat.csv", encoding='utf-8', index=False)