# 라이브러리

In [357]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
import matplotlib.pyplot as plt
import seaborn as sns
import koreanize_matplotlib
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn import set_config
set_config(transform_output='pandas')
from sklearn.pipeline import FunctionTransformer

# 데이터 뽑아오는 함수

In [358]:
# 데이터 병합 및 year 열 추가 함수 정의의: SQL
def get_data_from_db(query):

    # mysql 접속 정보 정의
    username = "admin"
    password = "admin1234"
    host = "hk-toss-middle-project.cjkcuqkegqpx.eu-north-1.rds.amazonaws.com"
    database_name = "raw_data"

    # 데이터베이스 연결 문자열, connection string
    db_connection_str = f'mysql+pymysql://{username}:{password}@{host}/{database_name}'

    # 데이터 베이스 connector 얻어내기
    db_connection = create_engine(db_connection_str)
    db_connection

    df = pd.read_sql(query, con=db_connection)
    return df

# Transformer 정의

## 문자열 앞뒤 공백 제거 Transformer

In [359]:
# 앞뒤공백 제거 transformer
class StripTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, columns:list):
        # 입력한 columns가 문자열이면 리스트로 변환
        if isinstance(columns, str):
            self.columns = [columns]
        else:
            self.columns = columns

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # 선택한 열 중에 데이터프레임에 실제로 존재하는 열만 선택
        columns_y = [col for col in self.columns if col in X.columns]

        # 선택한 열의 값들의 앞뒤 공백을 제거
        for col in columns_y:

            # 문자열 형식인지 확인
            if X[col].dtype == 'object':
                try:
                    X[col] = X[col].str.strip()
                except:
                    pass

        return X

## 열 탈락 / 열 이름 변경 Transformer

In [360]:
class RenameDropColumnsTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, rename_map:dict = {}, drop_columns:list = []):
        self.rename_map = rename_map
        # drop_columns가 문자열이면 리스트로 변환
        if isinstance(drop_columns, str):
            self.drop_columns = [drop_columns]
        else:
            self.drop_columns = drop_columns

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # 실제로 데이터프레임에 존재하는 열만 drop
        self.drop_columns = [col for col in self.drop_columns if col in X.columns]
        # drop_columns가 존재하면 drop
        if self.drop_columns:
            X.drop(columns=self.drop_columns, inplace=True)

        # 실제로 데이터프레임에 존재하는 열만 선택
        rename_map = {key: value for key, value in self.rename_map.items() if key in X.columns}
        # rename_map이 존재하면 rename
        if self.rename_map:
            X.rename(columns=self.rename_map, inplace=True)
            
        return X

## ManyHotEncoding을 위한 리스트 열 만들어주기

In [361]:
class ColumnsWithList(BaseEstimator, TransformerMixin):
    def __init__(self, from_columns, to_column):
        self.from_columns = from_columns
        self.to_column= to_column

    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X[self.to_column] = X[self.from_columns].apply(lambda row: ','.join([str(int(x)) for x in [row[col] for col in self.from_columns]]), axis=1)
        X.drop(columns=self.from_columns, inplace=True)
        return X

## ManyHotEncoding

In [362]:
class ManyHotEncoding(BaseEstimator, TransformerMixin):
    def __init__(self, columns, prefix = ""):
        self.columns = columns
        self.prefix = prefix
        self.mlb = MultiLabelBinarizer()

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # MultiLabelBinarizer를 사용하여 데이터를 변환합니다.
        encoded = self.mlb.fit_transform(X[self.columns])
        encoded = encoded[:, 1:]

        # 숫자 클래스와 name_map을 사용하여 열 이름 생성
        mapped_classes = [f"{self.prefix}_{cls}" for cls in self.mlb.classes_[1:]]

        # 변환된 데이터를 DataFrame으로 반환
        encoded_df = pd.DataFrame(encoded, columns=mapped_classes)
        X.drop(columns=self.columns, inplace=True)
        data = pd.concat([encoded_df, X], axis=1)
        return data

## 가중치는 존재하지만 조사표상 응답이 모두 결측인 경우 처리

In [363]:
class SameIdImpute(BaseEstimator, TransformerMixin):
    def __init__(self, id_cols = []):
        self.id_cols = id_cols
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        self.columns_to_fill = X.columns.difference(self.id_cols)
        X = X.sort_values(self.id_cols)
        X.loc[X[self.columns_to_fill].isnull().all(axis=1)] = X.ffill().bfill().loc[X[self.columns_to_fill].isnull().all(axis=1)]

        return X

In [364]:
class CustomFeatureCombination(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    
    def fit(self, X):
        return self

    def transform(self, X):
        X['DEBT_RATIO'] = np.where(
        X['PROP'] > 0,
        X['DEBT'] / X['PROP'],
        X['DEBT'])
        
        return X

# HH Table

In [365]:
columns_hh = "HHID,CUSTM_BENF_YN,CUSTM_BENF1_D,CUSTM_BENF2_D,CUSTM_BENF3_D,CUSTM_BENF4_D,CASE WHEN H_INC_TOT >= 0 THEN 10000 * H_INC_TOT ELSE H_INC_TOT END AS H_INC_TOT,DEBT,PROP,OTC_MED,MED_SUP,H_OOP"

query_hh = f"""
SELECT 2019 AS D_YEAR, {columns_hh} from raw_a_hh
UNION ALL
SELECT 2020 AS D_YEAR, {columns_hh} from raw_b_hh
UNION ALL
SELECT 2021 AS D_YEAR, {columns_hh} from raw_c_hh
"""

In [366]:
hh = get_data_from_db(query=query_hh)
hh

Unnamed: 0,D_YEAR,HHID,CUSTM_BENF_YN,CUSTM_BENF1_D,CUSTM_BENF2_D,CUSTM_BENF3_D,CUSTM_BENF4_D,H_INC_TOT,DEBT,PROP,OTC_MED,MED_SUP,H_OOP
0,2019,112001011.0,1.0,1.0,1.0,2.0,2.0,6270000.0,0.0,400.0,61000.0,0.0,86060.0
1,2019,112002011.0,1.0,1.0,1.0,2.0,1.0,9790000.0,0.0,0.0,60000.0,0.0,8780.0
2,2019,112003011.0,2.0,,,,,44970000.0,750.0,7650.0,74000.0,0.0,247670.0
3,2019,112010011.0,2.0,,,,,24160000.0,0.0,23000.0,162600.0,3800.0,3558000.0
4,2019,112012011.0,2.0,,,,,16490000.0,0.0,52000.0,0.0,0.0,2041164.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
18867,2021,400004011.0,2.0,,,,,29650000.0,8000.0,11000.0,500000.0,0.0,0.0
18868,2021,400005011.0,2.0,,,,,28250000.0,0.0,3300.0,370000.0,0.0,123300.0
18869,2021,400007011.0,2.0,,,,,60800000.0,3000.0,7000.0,400000.0,0.0,4422620.0
18870,2021,400008011.0,2.0,,,,,77460000.0,7000.0,19500.0,143000.0,0.0,3600770.0


In [367]:
hh_nan_0 = ['CUSTM_BENF1_D','CUSTM_BENF2_D','CUSTM_BENF3_D','CUSTM_BENF4_D','H_OOP','H_INC_TOT']
hh_9_med = ['DEBT','PROP','MED_SUP','H_INC_TOT']
hh_2_0 = ['CUSTM_BENF_YN','CUSTM_BENF1_D','CUSTM_BENF2_D','CUSTM_BENF3_D','CUSTM_BENF4_D']

In [368]:
imputer_1_hh = ColumnTransformer([
    ('Imputer_1', SimpleImputer(strategy='constant', missing_values=np.nan, fill_value=0.0), hh_nan_0),
    ],
    remainder='passthrough',
    verbose_feature_names_out=False
)

imputer_2_hh = ColumnTransformer([
    ('Imputer_2', SimpleImputer(strategy='median', missing_values=-9.0), hh_9_med)
    ],
    remainder='passthrough',
    verbose_feature_names_out=False
)

imputer_3_hh = ColumnTransformer([
    ('Imputer_3', SimpleImputer(strategy='constant', missing_values=2.0, fill_value=0.0), hh_2_0)
    ],
    remainder='passthrough',
    verbose_feature_names_out=False
)

scale_1_hh = ColumnTransformer([
    ('log', FunctionTransformer(func=np.log1p, inverse_func=np.expm1), ['H_INC_TOT','DEBT','PROP','H_OOP','OTC_MED'])
    ],
    remainder='passthrough',
    verbose_feature_names_out=False
)

pipeline_hh = Pipeline([
    ('Same_id', SameIdImpute(['HHID','D_YEAR'])),
    ('Imputer_1', imputer_1_hh),
    ('Imputer_2', imputer_2_hh),
    ('Imputer_3', imputer_3_hh),
    ('Scale_1', scale_1_hh),
    ('Custom_feature', CustomFeatureCombination())
])

In [369]:
hh_piped = pipeline_hh.fit_transform(hh)
hh_piped

  result = func(self.values, **kwargs)


Unnamed: 0,H_INC_TOT,DEBT,PROP,H_OOP,OTC_MED,CUSTM_BENF_YN,CUSTM_BENF1_D,CUSTM_BENF2_D,CUSTM_BENF3_D,CUSTM_BENF4_D,MED_SUP,D_YEAR,HHID,DEBT_RATIO
0,15.651287,0.000000,5.993961,11.362812,11.018646,1.0,1.0,1.0,0.0,0.0,0.0,2019,112001011.0,0.000000
6748,15.806121,0.000000,5.993961,11.966831,11.532738,1.0,1.0,1.0,1.0,0.0,35000.0,2020,112001011.0,0.000000
12965,15.714629,0.000000,5.993961,11.557143,11.775297,1.0,1.0,1.0,0.0,0.0,15000.0,2021,112001011.0,0.000000
1,16.096872,0.000000,0.000000,9.080346,11.002117,1.0,1.0,1.0,0.0,1.0,0.0,2019,112002011.0,0.000000
6749,16.210675,0.000000,3.044522,10.267123,12.928781,1.0,1.0,1.0,0.0,1.0,0.0,2020,112002011.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12963,18.119576,8.987322,9.770013,13.694214,13.399997,0.0,0.0,0.0,0.0,0.0,0.0,2020,400008011.0,0.919888
18870,18.165272,8.853808,9.878221,15.096659,11.870607,0.0,0.0,0.0,0.0,0.0,0.0,2021,400008011.0,0.896296
6747,17.147715,0.000000,8.853808,15.875149,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,2019,400009011.0,0.000000
12964,17.418287,0.000000,9.210440,12.873905,11.775297,0.0,0.0,0.0,0.0,0.0,0.0,2020,400009011.0,0.000000


In [370]:
hh_piped['H_INC_TOT'].sort_values()

3411      0.000000
5551      0.000000
5543      0.000000
6645      0.000000
1787      0.000000
           ...    
12781    21.095519
9449           NaN
10434          NaN
12234          NaN
12556          NaN
Name: H_INC_TOT, Length: 18872, dtype: float64

# IND Table

## 만성질환 분류표

- 고혈압과 당뇨병 (HTN):
  - 고혈압, 당뇨병 (HTN, DM)

- 심뇌혈관 질환 (CVD):
  - 협심증, 심근경색증, 뇌출혈, 뇌경색 (AP, MI, CH, CI)

- 간질환 (LIV):
  - 만성간염(B형, C형), 알코올성 간질환, 간경화증(간경변증) (CLD, ALD, LC)

- 만성 하기도 질환(Lower Respiratory Infection, LRI):
  - 천식, 폐기종, 만성폐쇄성폐질환(COPD), 기관지확장증 (AST, PEM, COPD, BPE)

- 근골격계 질환 Musculoskeletal Disorders (MSD):
  - 무릎골관절염(무릎퇴행성관절염), 무릎 외 골관절염(퇴행성관절염), 류마티스 관절염, 어깨관절질환, 추간판(디스크) 질환, 기타 척추 질환 (OAK, OAE, RA, OAS, VD, VD_OLD, VDE, VDE_OLD)

- 갑상선 기능 장애 hyperthyroidism (HPT):
  - 갑상선 기능저하증, 갑상선 기능항진증 (HPOT, HPT)

In [371]:
# 질병 코드 그룹화
D_HTN = ["HTN", "DM"]
D_CVD = ["AP", "MI", "CH", "CI"]
D_LIV = ["CLD", "ALD", "LC"]
D_LRI = ["AST", "PEM", "COPD", "BPE"]
D_MSD_OLD = ["OAK", "OAE", "RA", "VD_OLD", "VDE_OLD"]
D_MSD = ["OAK", "OAE", "RA", "OAS", "VD", "VDE"]
D_MSD_ALL = ["OAK", "OAE", "RA", "OAS", "VD", "VDE", "VD_OLD", "VDE_OLD"]
D_HPT = ["HPOT", "HPT"]

# 열 이름으로 리스트화 (CD1 - 질병유무, CD2 - 진단시기)
D_HTN_CD1 = ["CD1_"+d for d in D_HTN]
D_CVD_CD1 = ["CD1_"+d for d in D_CVD]
D_LIV_CD1 = ["CD1_"+d for d in D_LIV]
D_LRI_CD1 = ["CD1_"+d for d in D_LRI]
D_MSD_OLD_CD1 = ["CD1_"+d for d in D_MSD_OLD]
D_MSD_CD1 = ["CD1_"+d for d in D_MSD]
D_HPT_CD1 = ["CD1_"+d for d in D_HPT]
D_OLD_CD1 = ["CD1_"+d for d in D_HTN + D_CVD + D_LIV + D_LRI + D_MSD_OLD + D_HPT]
D_21_CD1 = ["CD1_"+d for d in D_HTN + D_CVD + D_LIV + D_LRI + D_MSD + D_HPT]

## 만성질환을 그룹화 시킨 후 Many-Hot Encoding 형태로 변환 Transformer

In [372]:
class DiseaseTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # 질병 분류 별 하나라도 해당하면 1, 아니면 0
        # 질병 분류 별 그룹화
        X["CD_HTN"] = X.apply(lambda x: 1 if any(x[col] == 1 for col in D_HTN_CD1) else 0, axis=1)
        X["CD_CVD"] = X.apply(lambda x: 1 if any(x[col] == 1 for col in D_CVD_CD1) else 0, axis=1)
        X["CD_LIV"] = X.apply(lambda x: 1 if any(x[col] == 1 for col in D_LIV_CD1) else 0, axis=1)
        X["CD_LRI"] = X.apply(lambda x: 1 if any(x[col] == 1 for col in D_LRI_CD1) else 0, axis=1)
        X["CD_MSD"] = X.apply(lambda x: 1 if any(x[col] == 1 for col in list(set(D_OLD_CD1 + D_21_CD1))) else 0, axis=1)
        X["CD_HPT"] = X.apply(lambda x: 1 if any(x[col] == 1 for col in D_HPT_CD1) else 0, axis=1)

        # 필요없는 행 탈락
        X.drop(X.iloc[:, 2:25], axis=1, inplace = True)

        return X

In [373]:
# 3개년 데이터의 질병 코드가 상이함. 따라서, 따로 불러온 후 그룹화 진행행
query = f"""
select PIDWON,2019 AS D_YEAR,CD1_HTN,CD1_DM,CD1_AP,CD1_MI,CD1_CH,CD1_CI,CD1_CLD,CD1_ALD,CD1_LC,CD1_AST,CD1_PEM,CD1_COPD,CD1_BPE,CD1_OAK,CD1_OAE,CD1_RA,CD1_VD_OLD,NULL AS CD1_VD,CD1_VDE_OLD,NULL AS CD1_VDE,CD1_HPOT,CD1_HPT,NULL AS CD1_OAS
from raw_a_ind
UNION ALL
select PIDWON,2020 AS D_YEAR,CD1_HTN,CD1_DM,CD1_AP,CD1_MI,CD1_CH,CD1_CI,CD1_CLD,CD1_ALD,CD1_LC,CD1_AST,CD1_PEM,CD1_COPD,CD1_BPE,CD1_OAK,CD1_OAE,CD1_RA,CD1_VD_OLD,NULL AS CD1_VD,CD1_VDE_OLD,NULL AS CD1_VDE,CD1_HPOT,CD1_HPT,NULL AS CD1_OAS
from raw_b_ind
UNION ALL
select PIDWON,2021 AS D_YEAR,CD1_HTN,CD1_DM,CD1_AP,CD1_MI,CD1_CH,CD1_CI,CD1_CLD,CD1_ALD,CD1_LC,CD1_AST,CD1_PEM,CD1_COPD,CD1_BPE,CD1_OAK,CD1_OAE,CD1_RA,NULL AS CD1_VD_OLD,CD1_VD,NULL AS CD1_VDE_OLD,CD1_VDE,CD1_HPOT,CD1_HPT,CD1_OAS
from raw_c_ind
"""

phi_disease = get_data_from_db(query)

In [374]:
disease_pipeline = Pipeline([
    ('disease', DiseaseTransformer())
])

In [375]:
ind_disease_piped = disease_pipeline.fit_transform(phi_disease)
ind_disease_piped

Unnamed: 0,PIDWON,D_YEAR,CD_HTN,CD_CVD,CD_LIV,CD_LRI,CD_MSD,CD_HPT
0,11200101.0,2019,1,1,0,0,1,0
1,11200201.0,2019,0,0,0,0,1,0
2,11200202.0,2019,0,0,0,0,0,0
3,11200301.0,2019,0,0,0,0,0,0
4,11200302.0,2019,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...
45225,40000703.0,2021,1,0,0,0,1,0
45226,40000704.0,2021,0,0,0,0,0,0
45227,40000802.0,2021,0,0,0,0,0,0
45228,40000804.0,2021,0,0,0,0,0,0


## 나머지 IND Table

In [376]:
columns_ind = """
PIDWON, DEATH_I_YN, SEX, MARR, EDU, LIVE_T_YN,
HEALTH_INS, DISA_YN, DISA_TY,
ECO1,
CARE1, CARE4_2,
P1, P2, S1, D1, HT, WT, SE1,
HS1, HS_MED_YN, HS_SRH, HS6_YN, HS7_YN, HS8_YN,
I_PHI1_1, I_PHI1_2, I_PHI1_3, I_PHI5, I_PHI6,
EROOP+INOOP+OUOOP_1 AS OOP_TOT, I_PHI_N, I_FFS_YN
"""

query_ind = f"""
SELECT 2019 AS D_YEAR, 2019-BIRTH_Y AS AGE, 
(COALESCE(CARE6_1_4, 0) + COALESCE(CARE6_2_4, 0) + COALESCE(CARE6_3_4, 0) + COALESCE(CARE6_4_4, 0)) AS CARE_TOT,
(P2_1 * 60 + P2_2) AS P2_2, CASE WHEN WTMG = 4 THEN 0 ELSE 1 END AS WT_MG,
{columns_ind}
FROM raw_a_ind
UNION ALL
SELECT 2020 AS D_YEAR, 2020-BIRTH_Y AS AGE, 
(COALESCE(CARE6_1_4, 0) + COALESCE(CARE6_2_4, 0) + COALESCE(CARE6_3_4, 0) + COALESCE(CARE6_4_4, 0)) AS CARE_TOT,
P2_2, CASE WHEN WTMG = 4 THEN 0 ELSE 1 END AS WT_MG,
{columns_ind}
FROM raw_b_ind
UNION ALL
SELECT 2021 AS D_YEAR, 2021-BIRTH_Y AS AGE, 
(COALESCE(CARE6_1A_4, 0) + COALESCE(CARE6_1B_4, 0) + COALESCE(CARE6_2_4, 0) + COALESCE(CARE6_3_4, 0) + COALESCE(CARE6_4_4, 0)) AS CARE_TOT,
P2_2, CASE WHEN WTMG = 4 THEN 0 ELSE 1 END AS WT_MG,
{columns_ind}
FROM raw_c_ind
"""

In [377]:
ind = get_data_from_db(query_ind)
ind

Unnamed: 0,D_YEAR,AGE,CARE_TOT,P2_2,WT_MG,PIDWON,DEATH_I_YN,SEX,MARR,EDU,...,HS7_YN,HS8_YN,I_PHI1_1,I_PHI1_2,I_PHI1_3,I_PHI5,I_PHI6,OOP_TOT,I_PHI_N,I_FFS_YN
0,2019,81.0,0.0,30.0,0,11200101.0,2.0,2.0,3.0,1.0,...,3.0,4.0,,,,,,,,
1,2019,73.0,0.0,,0,11200201.0,2.0,2.0,3.0,1.0,...,3.0,4.0,,,,,,,,
2,2019,,0.0,,1,11200202.0,,,,,...,,,,,,,,,,
3,2019,44.0,0.0,40.0,0,11200301.0,2.0,1.0,1.0,4.0,...,2.0,2.0,2.0,0.0,,2.0,2.0,,1.0,1.0
4,2019,41.0,0.0,20.0,1,11200302.0,2.0,2.0,1.0,4.0,...,3.0,4.0,2.0,0.0,,2.0,2.0,,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45225,2021,41.0,0.0,20.0,1,40000703.0,2.0,2.0,1.0,4.0,...,2.0,2.0,1.0,2.0,0.0,2.0,-1.0,,1.0,2.0
45226,2021,43.0,0.0,60.0,1,40000704.0,2.0,1.0,1.0,5.0,...,2.0,2.0,,,,2.0,-1.0,,1.0,2.0
45227,2021,41.0,0.0,40.0,0,40000802.0,2.0,2.0,1.0,5.0,...,2.0,2.0,2.0,1.0,4.0,2.0,2.0,,2.0,1.0
45228,2021,41.0,0.0,40.0,1,40000804.0,2.0,1.0,1.0,5.0,...,2.0,1.0,2.0,1.0,4.0,2.0,2.0,,1.0,1.0


In [378]:
ind_nan_0 = ['DEATH_I_YN','DISA_TY','ECO1','CARE1','CARE4_2','P1','P2','P2_2','D1','SE1','HS_MED_YN','HS6_YN','HS7_YN','HS8_YN','I_PHI1_1','I_PHI1_2','I_PHI1_3','I_PHI5','I_PHI6','OOP_TOT','I_PHI_N','I_FFS_YN']
ind_nan_mean = ['HT']
ind_nan_mdn = ['AGE','WT','HS1','HS_SRH']
ind_3_0 = ['S1']
ind_2_0 = ['DEATH_I_YN','SEX','DISA_YN','CARE1','P1','SE1','I_FFS_YN','I_PHI5','I_PHI6','HS_MED_YN','HS6_YN','HS7_YN','HS8_YN']
ind_9_0 = ['P2_2','D1','I_PHI1_1','I_PHI1_2','I_PHI1_3','I_PHI5','I_PHI6']
ind_8_0 = ['P2','HS_MED_YN']
ind_1_0 = ['I_PHI6']
ind_onehot = ['MARR','EDU','LIVE_T_YN','HEALTH_INS','DISA_TY','ECO1','CARE4_2','P1','D1']
ind_manyhot = ['I_PHI1_1','I_PHI1_2','I_PHI1_3']

In [379]:
imputer_1_ind = ColumnTransformer([
    ('Imputer_1', SimpleImputer(strategy='constant', missing_values=np.nan, fill_value=0.0), ind_nan_0),
    ],
    remainder='passthrough',
    verbose_feature_names_out=False
)

imputer_2_ind = ColumnTransformer([
    ('Imputer_2', SimpleImputer(strategy='mean', missing_values=np.nan), ind_nan_mean),
    ],
    remainder='passthrough',
    verbose_feature_names_out=False
)

imputer_3_ind = ColumnTransformer([
    ('Imputer_3', SimpleImputer(strategy='median', missing_values=np.nan), ind_nan_mdn),
    ],
    remainder='passthrough',
    verbose_feature_names_out=False
)

imputer_4_ind = ColumnTransformer([
    ('Imputer_4', SimpleImputer(strategy='constant', missing_values=2.0, fill_value=0.0), ind_2_0),
    ],
    remainder='passthrough',
    verbose_feature_names_out=False
)

imputer_5_ind = ColumnTransformer([
    ('Imputer_5', SimpleImputer(strategy='constant', missing_values=-9.0, fill_value=0.0), ind_9_0),
    ],
    remainder='passthrough',
    verbose_feature_names_out=False
)

imputer_6_ind = ColumnTransformer([
    ('Imputer_6', SimpleImputer(strategy='constant', missing_values=8.0, fill_value=0.0), ind_8_0),
    ],
    remainder='passthrough',
    verbose_feature_names_out=False
)

imputer_7_ind = ColumnTransformer([
    ('Imputer_7', SimpleImputer(strategy='constant', missing_values=-1.0, fill_value=0.0), ind_1_0),
    ],
    remainder='passthrough',
    verbose_feature_names_out=False
)

imputer_8_ind = ColumnTransformer([
    ('Imputer_8', SimpleImputer(strategy='constant', missing_values=3.0, fill_value=0.0), ind_3_0),
    ],
    remainder='passthrough',
    verbose_feature_names_out=False
)

onehot_ind = ColumnTransformer([
    ('OneHot', OneHotEncoder(sparse_output=False), ind_onehot)
    ],
    remainder='passthrough',
    verbose_feature_names_out=False
)

pipeline_ind = Pipeline([
    ('Same_id', SameIdImpute(['D_YEAR','PIDWON','WT_MG','CARE_TOT','P2_2'])),
    ('Imputer_1', imputer_1_ind),
    ('Imputer_2', imputer_2_ind),
    ('Imputer_3', imputer_3_ind),
    ('Imputer_4', imputer_4_ind),
    ('Imputer_5', imputer_5_ind),
    ('Imputer_6', imputer_6_ind),
    ('Imputer_7', imputer_7_ind),
    ('OneHot', onehot_ind),
    ('Column', ColumnsWithList(from_columns=ind_manyhot, to_column='I_PHI1')),
    ('Encoding', ManyHotEncoding(columns='I_PHI1', prefix='I_PHI'))
])

In [380]:
ind_piped = pipeline_ind.fit_transform(ind)
ind_piped

Unnamed: 0,I_PHI_0,I_PHI_1,I_PHI_2,I_PHI_3,I_PHI_4,I_PHI_5,MARR_1.0,MARR_2.0,MARR_3.0,MARR_4.0,...,HS1,HS_SRH,HT,OOP_TOT,I_PHI_N,D_YEAR,CARE_TOT,WT_MG,PIDWON,S1
0,1,0,0,0,0,0,0.0,0.0,1.0,0.0,...,4.0,4.0,145.0,0.0,0.0,2019,0.0,0,11200101.0,3.0
1,1,0,0,0,0,0,0.0,0.0,1.0,0.0,...,2.0,5.0,147.0,0.0,0.0,2019,0.0,0,11200201.0,3.0
2,1,0,1,0,1,0,0.0,0.0,1.0,0.0,...,2.0,5.0,147.0,6696310.0,1.0,2019,0.0,1,11200202.0,3.0
3,1,0,1,0,0,0,1.0,0.0,0.0,0.0,...,3.0,3.0,175.0,0.0,1.0,2019,0.0,0,11200301.0,2.0
4,1,0,1,0,0,0,1.0,0.0,0.0,0.0,...,3.0,4.0,160.0,0.0,1.0,2019,0.0,1,11200302.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45225,1,1,1,0,0,0,1.0,0.0,0.0,0.0,...,1.0,3.0,173.0,0.0,1.0,2021,0.0,1,40000703.0,2.0
45226,1,0,0,0,0,0,1.0,0.0,0.0,0.0,...,3.0,3.0,178.0,0.0,1.0,2021,0.0,1,40000704.0,2.0
45227,0,1,1,0,1,0,1.0,0.0,0.0,0.0,...,3.0,4.0,158.0,0.0,2.0,2021,0.0,0,40000802.0,3.0
45228,0,1,1,0,1,0,1.0,0.0,0.0,0.0,...,1.0,2.0,175.0,0.0,1.0,2021,0.0,1,40000804.0,2.0


## 최종 IND Table

In [381]:
ind_merged = pd.merge(ind_piped, ind_disease_piped, on=['PIDWON','D_YEAR'], how='outer')
ind_merged

Unnamed: 0,I_PHI_0,I_PHI_1,I_PHI_2,I_PHI_3,I_PHI_4,I_PHI_5,MARR_1.0,MARR_2.0,MARR_3.0,MARR_4.0,...,CARE_TOT,WT_MG,PIDWON,S1,CD_HTN,CD_CVD,CD_LIV,CD_LRI,CD_MSD,CD_HPT
0,1,0,0,0,0,0,0.0,0.0,1.0,0.0,...,0.0,0,11200101.0,3.0,1,1,0,0,1,0
1,1,0,0,0,0,0,0.0,0.0,1.0,0.0,...,0.0,0,11200101.0,3.0,1,0,0,1,1,0
2,1,0,0,0,0,0,0.0,0.0,1.0,0.0,...,0.0,0,11200101.0,3.0,1,0,0,1,1,0
3,1,0,0,0,0,0,0.0,0.0,1.0,0.0,...,0.0,0,11200201.0,3.0,0,0,0,0,1,0
4,1,0,0,0,0,0,0.0,0.0,1.0,0.0,...,0.0,0,11200201.0,3.0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45225,1,0,0,0,0,0,1.0,0.0,0.0,0.0,...,0.0,1,40000804.0,,0,0,0,0,0,0
45226,0,1,1,0,1,0,1.0,0.0,0.0,0.0,...,0.0,1,40000804.0,2.0,0,0,0,0,0,0
45227,1,0,0,0,0,0,0.0,0.0,0.0,1.0,...,0.0,1,40000901.0,2.0,0,0,0,1,1,0
45228,1,0,0,0,0,0,0.0,0.0,0.0,1.0,...,0.0,1,40000901.0,2.0,0,0,0,0,1,1


# MS Table

In [382]:
query_ms = """
select
    PIDWON, 2019 AS D_YEAR,
	CASE
		WHEN INMED_DZ1_OLD > 10 THEN INMED_DZ1_OLD + 1
		WHEN INMED_DZ1_OLD IS NULL THEN 0
		ELSE INMED_DZ1_OLD
	END AS INMED_DZ1,
    CASE
		WHEN INMED_DZ2_OLD > 10 THEN INMED_DZ2_OLD + 1
		WHEN INMED_DZ2_OLD IS NULL THEN 0
		ELSE INMED_DZ2_OLD
	END AS INMED_DZ2,
    CASE
		WHEN INMED_DZ3_OLD > 10 THEN INMED_DZ3_OLD + 1
		WHEN INMED_DZ3_OLD IS NULL THEN 0
		ELSE INMED_DZ3_OLD
	END AS INMED_DZ3,
	CASE 
		WHEN OUMED_DZ1_OLD > 10 THEN OUMED_DZ1_OLD + 1
		WHEN OUMED_DZ1_OLD IS NULL THEN 0
		ELSE OUMED_DZ1_OLD
	END AS OUMED_DZ1,
	CASE 
		WHEN OUMED_DZ2_OLD > 10 THEN OUMED_DZ2_OLD + 1
		WHEN OUMED_DZ2_OLD IS NULL THEN 0
		ELSE OUMED_DZ2_OLD
	END AS OUMED_DZ2,
	CASE 
		WHEN OUMED_DZ3_OLD > 10 THEN OUMED_DZ3_OLD + 1
		WHEN OUMED_DZ3_OLD IS NULL THEN 0
		ELSE OUMED_DZ3_OLD
	END AS OUMED_DZ3,
	COALESCE(OUDENT_DZ, 0) AS OUDENT_DZ, COALESCE(OUORT_DZ, 0) AS OUORT_DZ
FROM raw_a_ms
UNION ALL
SELECT 
    PIDWON, 2020 AS D_YEAR,
	COALESCE(INMED_DZ1, 0), COALESCE(INMED_DZ2, 0), COALESCE(INMED_DZ3, 0),
	COALESCE(OUMED_DZ1, 0), COALESCE(OUMED_DZ2, 0), COALESCE(OUMED_DZ3, 0),
	COALESCE(OUDENT_DZ, 0) AS OUDENT_DZ,
	CASE 
		WHEN OUORT_DZ_Y2020 = 1 THEN 3
		WHEN OUORT_DZ_Y2020 = 2 THEN 7
		WHEN OUORT_DZ_Y2020 = 3 THEN 4
		WHEN OUORT_DZ_Y2020 = 4 THEN 6
		WHEN OUORT_DZ_Y2020 = 5 THEN 9
		WHEN OUORT_DZ_Y2020 = 6 THEN 8
		WHEN OUORT_DZ_Y2020 = 7 THEN 2
		WHEN OUORT_DZ_Y2020 = 8 THEN 1
		WHEN OUORT_DZ_Y2020 = 9 THEN 5
		WHEN OUORT_DZ_Y2020 = 11 THEN 5
		WHEN OUORT_DZ_Y2020 = 12 THEN 11
        ELSE 0
	END AS OUORT_DZ
FROM raw_b_ms
UNION ALL
SELECT 
    PIDWON, 2021 AS D_YEAR,
	COALESCE(INMED_DZ1, 0), COALESCE(INMED_DZ2, 0), COALESCE(INMED_DZ3, 0),
	COALESCE(OUMED_DZ1, 0), COALESCE(OUMED_DZ2, 0), COALESCE(OUMED_DZ3, 0),
	COALESCE(OUDENT_DZ, 0) AS OUDENT_DZ, COALESCE(OUORT_DZ, 0) AS OUORT_DZ
FROM raw_c_ms
"""

In [383]:
ms = get_data_from_db(query_ms)
ms

Unnamed: 0,PIDWON,D_YEAR,INMED_DZ1,INMED_DZ2,INMED_DZ3,OUMED_DZ1,OUMED_DZ2,OUMED_DZ3,OUDENT_DZ,OUORT_DZ
0,11200101.0,2019,0.0,0.0,0.0,13.0,37.0,51.0,0.0,0.0
1,11336101.0,2019,0.0,0.0,0.0,32.0,0.0,0.0,0.0,0.0
2,11336101.0,2019,0.0,0.0,0.0,50.0,0.0,0.0,0.0,0.0
3,11200101.0,2019,0.0,0.0,0.0,1.0,18.0,25.0,0.0,0.0
4,11336101.0,2019,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0
...,...,...,...,...,...,...,...,...,...,...
776149,25406801.0,2021,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
776150,25406802.0,2021,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
776151,25406802.0,2021,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
776152,25406802.0,2021,0.0,0.0,0.0,51.0,0.0,0.0,0.0,0.0


In [384]:
HTN = list(range(1,3))
HPT = list(range(3,5))
CVD = list(range(5,9))
LIV = list(range(9,13))
LRI = list(range(13,17))
MSD = list(range(17,27))
CNR = list(range(27,34))
OBG = list(range(34,36))
ETC = list(range(36,52))

list_mappings = {
    'HTN': HTN,
    'HPT': HPT,
    'CVD': CVD,
    'LIV': LIV,
    'LRI': LRI,
    'MSD': MSD,
    'CNR': CNR,
    'OBG': OBG,
    'ETC': ETC
}

In [385]:
for list_name, values in list_mappings.items():
    # 모든 관련 컬럼에서 값 확인
    condition = False
    for col in ['INMED_DZ1', 'INMED_DZ2', 'INMED_DZ3', 'OUMED_DZ1', 'OUMED_DZ2', 'OUMED_DZ3']:
        condition = condition | ms[col].isin(values)
    
    # 조건에 따라 새 컬럼 생성
    ms[list_name] = np.where(condition, 1, 0)

In [386]:
tmp = ms.drop(columns=['INMED_DZ1','INMED_DZ2','INMED_DZ3','OUMED_DZ1','OUMED_DZ2','OUMED_DZ3'])

In [387]:
ms_onehot = ['OUDENT_DZ','OUORT_DZ']
onehot_ms = ColumnTransformer([
    ('OneHot', OneHotEncoder(sparse_output=False), ms_onehot)
    ],
    remainder='passthrough',
    verbose_feature_names_out=False
)

pipeline_ms = Pipeline([
    ('OneHot', onehot_ms)
])

In [388]:
tmp1 = pipeline_ms.fit_transform(tmp)
tmp1

Unnamed: 0,OUDENT_DZ_0.0,OUDENT_DZ_1.0,OUDENT_DZ_2.0,OUDENT_DZ_3.0,OUDENT_DZ_4.0,OUDENT_DZ_5.0,OUDENT_DZ_6.0,OUDENT_DZ_7.0,OUORT_DZ_0.0,OUORT_DZ_1.0,...,D_YEAR,HTN,HPT,CVD,LIV,LRI,MSD,CNR,OBG,ETC
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,2019,0,0,0,0,1,0,0,0,1
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,2019,0,0,0,0,0,0,1,0,0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,2019,0,0,0,0,0,0,0,0,1
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,2019,1,0,0,0,0,1,0,0,0
4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,2019,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
776149,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,2021,0,0,0,0,0,0,0,0,0
776150,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,2021,0,0,0,0,0,0,0,0,0
776151,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,2021,0,0,0,0,0,0,0,0,0
776152,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,2021,0,0,0,0,0,0,0,0,1


In [389]:
cols = list(tmp1.columns.difference(['PIDWON','D_YEAR']))

In [390]:
grouped_df = tmp1.groupby(['PIDWON', 'D_YEAR'])[cols].agg(lambda x: 0 if x.sum() == 0 else 1)

In [391]:
ms_final = grouped_df.reset_index().rename(mapper={'HTN':'INMED_HTN',
                                                    'HPT':'INMED_HPT',
                                                    'CVD':'INMED_CVD',
                                                    'LIV':'INMED_LIV',
                                                    'LRI':'INMED_LRI',
                                                    'MSD':'INMED_MSD',
                                                    'CNR':'INMED_CNR',
                                                    'OBG':'INMED_OBG',
                                                    'ETC':'INMED_ETC'}, axis=1)

In [392]:
ms_final

Unnamed: 0,PIDWON,D_YEAR,INMED_CNR,INMED_CVD,INMED_ETC,INMED_HPT,INMED_HTN,INMED_LIV,INMED_LRI,INMED_MSD,...,OUORT_DZ_10.0,OUORT_DZ_11.0,OUORT_DZ_2.0,OUORT_DZ_3.0,OUORT_DZ_4.0,OUORT_DZ_5.0,OUORT_DZ_6.0,OUORT_DZ_7.0,OUORT_DZ_8.0,OUORT_DZ_9.0
0,11200101.0,2019,0,0,1,0,1,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,11200101.0,2020,0,0,1,0,1,0,1,1,...,0,0,0,0,0,0,0,0,0,0
2,11200101.0,2021,0,0,1,0,1,0,1,1,...,0,0,0,0,0,0,0,0,0,0
3,11200201.0,2019,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,11200201.0,2020,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36780,40000804.0,2020,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
36781,40000804.0,2021,0,0,1,0,0,0,0,1,...,0,1,0,0,0,0,0,0,0,0
36782,40000901.0,2019,0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
36783,40000901.0,2020,0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# PHI Table
- transform 내용 정리리리리리리리

## PHI Table 에만 필요한 Transformer
- 내용내용

In [393]:
class PHITableTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # 단독 가구인 경우만 뽑기기
        X = X.loc[X["PHI_PID"].str.len() < 9, :]

        return X

In [394]:
columns_phi = "HHID,PHI_PID,PHI_PID1,PHI3,PHI4_1_D,PHI4_2_D,PHI4_3_D,PHI4_4_D,PHI4_5_D,PHI4_6_D,PHI4_7_D,PHI4_8_D,PHI5,PHI6,PHR3"

query = f"""SELECT 2019 AS D_YEAR, {columns_phi} FROM raw_a_phi
            WHERE PHR2 = 1
            UNION ALL
            SELECT 2020 AS D_YEAR, {columns_phi} FROM raw_b_phi
            WHERE PHR2 = 1
            UNION ALL
            SELECT 2021 AS D_YEAR, {columns_phi} FROM raw_c_phi
            WHERE PHR2 = 1"""
phi = get_data_from_db(query)
phi

Unnamed: 0,D_YEAR,HHID,PHI_PID,PHI_PID1,PHI3,PHI4_1_D,PHI4_2_D,PHI4_3_D,PHI4_4_D,PHI4_5_D,PHI4_6_D,PHI4_7_D,PHI4_8_D,PHI5,PHI6,PHR3
0,2019,112003011.0,11200301,11200301.0,3.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0,1.0,1.0,120000.0,60000.0
1,2019,112033011.0,11203301,11203301.0,2.0,,,,,,,,,1.0,87000.0,120000.0
2,2019,112049011.0,11204901,11204901.0,2.0,,,,,,,,,1.0,-9.0,300000.0
3,2019,112058011.0,11205801,11205801.0,1.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,3.0,60770.0,1000000.0
4,2019,112064011.0,11206402,11206402.0,3.0,-9.0,-9.0,-9.0,-9.0,-9.0,-9.0,-9.0,-9.0,1.0,75000.0,3800000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5000,2021,300070011.0,30007001,30007001.0,3.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,121260.0,545760.0
5001,2021,300070011.0,30007002,30007002.0,3.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,144168.0,1730000.0
5002,2021,300081011.0,30008104,30008104.0,3.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,39000.0,52600.0
5003,2021,300085011.0,30008504,30008504.0,3.0,1.0,1.0,1.0,2.0,2.0,2.0,1.0,2.0,1.0,109058.0,134379.0


## PHI Table 전처리 파이프라인

In [395]:
phi_nan0 = ['PHI4_1_D','PHI4_2_D','PHI4_3_D','PHI4_4_D','PHI4_5_D','PHI4_6_D','PHI4_7_D','PHI4_8_D','PHR3']
phi_90 = ['PHI3', 'PHI4_1_D','PHI4_2_D','PHI4_3_D','PHI4_4_D','PHI4_5_D','PHI4_6_D','PHI4_7_D','PHI4_8_D','PHI5']
phi_9med = ['PHI6','PHR3']
phi_20 = ['PHI4_1_D','PHI4_2_D','PHI4_3_D','PHI4_4_D','PHI4_5_D','PHI4_6_D','PHI4_7_D','PHI4_8_D']
phi_onehot = ['PHI3']
rename_map_phi = {'PHI_PID1' : 'PIDWON',
                  'PHI3' : 'PHI_FORM',
                  'PHI5' : 'PHI_PREMIUM_YN',
                  'PHI6' : 'PHI_PREMIUM',
                  'PHR3' : 'PHI_BENEFIT'}
drop_columns_phi = ['PHI_PID']


In [396]:
imputer_1_phi = ColumnTransformer([
    ('Imputer_1', SimpleImputer(strategy='constant', missing_values=np.nan, fill_value=0.0), phi_nan0)
    ],
    remainder='passthrough',
    verbose_feature_names_out=False
)

imputer_2_phi = ColumnTransformer([
    ('Imputer_2', SimpleImputer(strategy='median', missing_values=-9.0, fill_value=None), phi_9med)
    ],
    remainder='passthrough',
    verbose_feature_names_out=False
)

imputer_3_phi = ColumnTransformer([
    ('Imputer_3', SimpleImputer(strategy='constant', missing_values=-9.0, fill_value=0.0), phi_90)
    ],
    remainder='passthrough',
    verbose_feature_names_out=False
)

imputer_4_phi = ColumnTransformer([
    ('Imputer_4', SimpleImputer(strategy='constant', missing_values=2.0, fill_value=0.0), phi_20)
    ],
    remainder='passthrough',
    verbose_feature_names_out=False
)

scale_1_phi = ColumnTransformer([
    ('log', FunctionTransformer(func=np.log1p, inverse_func=np.expm1), ['PHI6','PHR3'])
    ],
    remainder='passthrough',
    verbose_feature_names_out=False
)

onehot_phi = ColumnTransformer([
    ('OneHot', OneHotEncoder(sparse_output=False), phi_onehot)
    ],
    remainder='passthrough',
    verbose_feature_names_out=False
)

In [397]:
phi_pipeline = Pipeline([
    ('Same_id', SameIdImpute(id_cols=['PHI_PID','PHI_PID1'])),
    ('strip', StripTransformer(columns=['PHI_PID'])),
    ('phi',PHITableTransformer()),
    ('Imputer_1', imputer_1_phi),
    ('Imputer_2', imputer_2_phi),
    ('Imputer_3', imputer_3_phi),
    ('Imputer_4', imputer_4_phi),
    ('Encoding', onehot_phi),
    ('Scale_1', scale_1_phi),
    ('rename_drop', RenameDropColumnsTransformer(rename_map=rename_map_phi, drop_columns=drop_columns_phi))
])

In [398]:
phi_piped = phi_pipeline.fit_transform(phi)
phi_piped

Unnamed: 0,PHI_PREMIUM,PHI_BENEFIT,PHI3_0.0,PHI3_1.0,PHI3_2.0,PHI3_3.0,PHI4_1_D,PHI4_2_D,PHI4_3_D,PHI4_4_D,PHI4_5_D,PHI4_6_D,PHI4_7_D,PHI4_8_D,PHI_PREMIUM_YN,D_YEAR,HHID,PIDWON
1205,11.708912,12.133884,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,2020,112003011.0,11200301.0
1206,11.673606,13.642895,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,2020,112003011.0,11200302.0
1207,10.793660,12.765691,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2020,112010011.0,11201001.0
1208,12.214626,15.068274,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,2020,112010011.0,11201002.0
1209,10.154285,11.407576,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2020,112010011.0,11201002.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5002,10.571343,10.870490,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2021,300081011.0,30008104.0
1203,11.695255,13.997833,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,2019,300085011.0,30008504.0
5003,11.599644,11.808427,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,2021,300085011.0,30008504.0
1204,10.999613,13.017005,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,2019,400001011.0,40000101.0


# PHI 기준으로 합치기

In [399]:
# 가구 데이터 추가
phi_hh = pd.merge(phi_piped, hh_piped, on=['HHID','D_YEAR'], how='left')

In [400]:
phi_hh.shape

(4934, 30)

In [401]:
# 개인 데이터 추가
phi_ind = pd.merge(phi_hh, ind_merged, on=['PIDWON','D_YEAR'], how='left')

In [402]:
phi_ind.shape

(4934, 130)

In [403]:
model_data = pd.merge(phi_ind, ms_final, on=['PIDWON','D_YEAR'], how='left')

In [404]:
model_data = model_data.fillna(0)

In [405]:
model_data.shape

(4934, 159)

In [406]:
model_data.to_csv("C:/Users/user/Documents/final_model_data_encoded.csv", encoding='utf-8', index=False)