# 라이브러리

In [296]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
import matplotlib.pyplot as plt
import seaborn as sns
import koreanize_matplotlib
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn import set_config
set_config(transform_output='pandas')

# 데이터 뽑아오는 함수

In [271]:
# Helper that runs a SQL query (e.g. the per-year UNION ALL queries below) on MySQL
def get_data_from_db(query):
    """Run ``query`` against the project's MySQL database and return a DataFrame.

    Connection settings default to the values this notebook has always used,
    but each one can be overridden through an environment variable so that
    credentials do not have to live in the source file:

    - ``RAW_DATA_DB_USER``
    - ``RAW_DATA_DB_PASSWORD``
    - ``RAW_DATA_DB_HOST``
    - ``RAW_DATA_DB_NAME``

    Parameters
    ----------
    query : str
        SQL text handed to ``pandas.read_sql``.

    Returns
    -------
    pandas.DataFrame
        The result set of ``query``.
    """
    import os
    from urllib.parse import quote_plus

    # NOTE(security): the literal fallbacks are credentials committed to the
    # source. Prefer setting the environment variables above and rotating
    # the password.
    username = os.environ.get("RAW_DATA_DB_USER", "admin")
    password = os.environ.get("RAW_DATA_DB_PASSWORD", "admin1234")
    host = os.environ.get(
        "RAW_DATA_DB_HOST",
        "hk-toss-middle-project.cjkcuqkegqpx.eu-north-1.rds.amazonaws.com",
    )
    database_name = os.environ.get("RAW_DATA_DB_NAME", "raw_data")

    # Percent-encode user and password so special characters cannot break the
    # URL (a no-op for the default values).
    db_connection_str = (
        f'mysql+pymysql://{quote_plus(username)}:{quote_plus(password)}'
        f'@{host}/{database_name}'
    )

    # A fresh engine is created per call, so release its connection pool once
    # the query has been read instead of leaving it to garbage collection.
    db_connection = create_engine(db_connection_str)
    try:
        return pd.read_sql(query, con=db_connection)
    finally:
        db_connection.dispose()

# Transformer 정의

## 문자열 앞뒤 공백 제거 Transformer

In [80]:
# Transformer that strips leading/trailing whitespace
class StripTransformer(BaseEstimator, TransformerMixin):
    """Strip leading/trailing whitespace from the string values of selected columns.

    Parameters
    ----------
    columns : list or str
        Column name(s) to clean. A single string is wrapped in a list.
        Names that are not present in the frame are ignored.

    Notes
    -----
    ``transform`` returns a new DataFrame and leaves its input untouched.
    Only columns whose dtype is ``object`` are processed, and within those
    only ``str`` values are stripped; any other value (numbers, ``None``,
    NaN) is passed through unchanged rather than being turned into NaN.
    """

    def __init__(self, columns:list):
        # Accept a single column name as well as a list of names
        if isinstance(columns, str):
            self.columns = [columns]
        else:
            self.columns = columns

    def fit(self, X, y=None):
        """Nothing to learn; present for pipeline compatibility."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the selected object columns stripped."""
        stripped = X.copy()

        for col in self.columns:
            # Skip columns that are absent or not object-typed
            if col not in stripped.columns or stripped[col].dtype != 'object':
                continue

            # Element-wise strip: cannot raise on mixed-type columns, so no
            # blanket try/except is needed, and non-strings are preserved.
            stripped[col] = stripped[col].map(
                lambda value: value.strip() if isinstance(value, str) else value
            )

        return stripped

## 열 탈락 / 열 이름 변경 Transformer

In [81]:
class RenameDropColumnsTransformer(BaseEstimator, TransformerMixin):
    """Drop a set of columns and then rename others.

    Parameters
    ----------
    rename_map : dict, optional
        ``{old_name: new_name}``. Keys that are not columns of the frame are
        ignored. Defaults to an empty mapping.
    drop_columns : list or str, optional
        Column name(s) to remove. A single string is wrapped in a list.
        Names that are not columns of the frame are ignored. Defaults to an
        empty list.

    Notes
    -----
    Dropping happens before renaming, so ``rename_map`` keys refer to the
    original column names that survive the drop. ``transform`` returns a new
    DataFrame; neither the input frame nor the transformer's own parameters
    are modified.
    """

    def __init__(self, rename_map:dict = None, drop_columns:list = None):
        # None stands for "empty"; a fresh container per instance avoids the
        # shared-mutable-default pitfall.
        self.rename_map = {} if rename_map is None else rename_map
        if drop_columns is None:
            self.drop_columns = []
        elif isinstance(drop_columns, str):
            self.drop_columns = [drop_columns]
        else:
            self.drop_columns = drop_columns

    def fit(self, X, y=None):
        """Nothing to learn; present for pipeline compatibility."""
        return self

    def transform(self, X):
        """Return ``X`` without ``drop_columns`` and with ``rename_map`` applied."""
        # Only drop columns that actually exist. Kept in a local variable so
        # the configured parameter is not narrowed by the first frame seen.
        present = [col for col in self.drop_columns if col in X.columns]
        result = X.drop(columns=present)

        # Only rename columns that actually exist after the drop
        applicable = {
            old: new for old, new in self.rename_map.items() if old in result.columns
        }
        if applicable:
            result = result.rename(columns=applicable)

        return result

## NaN 및 값 대치 처리

In [82]:
class ValueImputer(BaseEstimator, TransformerMixin):
    """Replace one sentinel value with a constant in the given columns.

    Parameters
    ----------
    columns : list or str
        Column name(s) to process; a single string is wrapped in a list.
    missing_value
        The value treated as missing (passed to ``SimpleImputer`` as
        ``missing_values``).
    fill_value
        The constant written in place of ``missing_value``.
    """

    def __init__(self, columns, missing_value, fill_value):
        self.columns = [columns] if isinstance(columns, str) else columns
        self.missing_value = missing_value
        self.fill_value = fill_value

        # Constant-strategy imputer configured once from the two values above
        self.imputer = SimpleImputer(
            missing_values=self.missing_value,
            strategy='constant',
            fill_value=self.fill_value,
        )

    def fit(self, X, y=None):
        """No-op; the wrapped imputer is fitted inside ``transform``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` whose target columns have been imputed."""
        result = X.copy()
        target = self.columns
        # The imputer is (re)fitted on the frame being transformed
        result[target] = self.imputer.fit_transform(result[target])
        return result

## ManyHotEncoding을 위한 리스트 열 만들어주기

In [83]:
class ColumnsWithList(BaseEstimator, TransformerMixin):
    """Collapse several numeric code columns into one comma-separated string column.

    Each row's values in ``from_columns`` are converted with ``str(int(value))``
    and joined with ``','`` (for example ``0.0, 2.0, 4.0`` becomes ``"0,2,4"``).
    The result is stored in ``to_column`` and the source columns are removed.

    Parameters
    ----------
    from_columns : list
        Names of the columns to combine, in the order they should be joined.
    to_column : str
        Name of the column that receives the joined string.

    Notes
    -----
    ``transform`` returns a new DataFrame and does not modify its input, so
    it can be re-run on the same frame. ``int()`` raises ``ValueError`` on
    NaN, so missing values must be imputed beforehand.
    """

    def __init__(self, from_columns, to_column):
        self.from_columns = from_columns
        self.to_column= to_column

    def fit(self, X, y=None):
        """Nothing to learn; present for pipeline compatibility."""
        return self

    def transform(self, X):
        """Return ``X`` with ``from_columns`` replaced by the joined ``to_column``."""
        # Row-wise join of the integer codes, in from_columns order
        joined = X[self.from_columns].apply(
            lambda row: ','.join(str(int(value)) for value in row), axis=1
        )
        # drop() returns a new frame, so the caller's data is left intact
        result = X.drop(columns=self.from_columns)
        result[self.to_column] = joined
        return result

## ManyHotEncoding

In [None]:
class ManyHotEncoding(BaseEstimator, TransformerMixin):
    """Multi-hot encode a column of comma-separated labels.

    Parameters
    ----------
    columns : str
        Name of the single column to encode. Each cell is either a string
        such as ``"0,2,4"`` or an iterable of labels.
    prefix : str, default ""
        Output columns are named ``f"{prefix}_{label}"``.

    Notes
    -----
    * String cells are split on ``','`` so every label is one token. Iterating
      the raw string instead would make each character a class (including
      ``','``) and break multi-character labels such as ``"10"`` or ``"-9"``.
    * The encoded block reuses ``X.index``; with a fresh RangeIndex the
      column-wise concat would pair encoded rows with the wrong input rows
      whenever ``X`` has been sorted or filtered.
    * ``transform`` returns a new DataFrame and does not modify its input.
    * NOTE(review): the binarizer is still fitted inside ``transform`` (as
      before), so the set of output columns depends on the frame being
      transformed.
    """

    def __init__(self, columns, prefix = ""):
        self.columns = columns
        self.prefix = prefix
        self.mlb = MultiLabelBinarizer()

    def fit(self, X, y=None):
        """No-op; the binarizer is fitted inside ``transform``."""
        return self

    @staticmethod
    def _to_labels(cell):
        """Turn one cell into a list of labels (comma-split for strings)."""
        if isinstance(cell, str):
            tokens = (token.strip() for token in cell.split(','))
            return [token for token in tokens if token]
        return list(cell)

    def transform(self, X):
        """Return the multi-hot columns followed by the remaining columns of ``X``."""
        label_sets = [self._to_labels(cell) for cell in X[self.columns]]
        encoded = self.mlb.fit_transform(label_sets)

        # One output column per label seen in the data
        mapped_classes = [f"{self.prefix}_{cls}" for cls in self.mlb.classes_]

        # Keep X's index so the column-wise concat aligns row for row
        encoded_df = pd.DataFrame(encoded, columns=mapped_classes, index=X.index)
        remaining = X.drop(columns=self.columns)
        return pd.concat([encoded_df, remaining], axis=1)

In [280]:
class SameIdImpute(BaseEstimator, TransformerMixin):
    """Fill rows that are empty outside the id columns from neighbouring rows.

    The frame is sorted by ``id_cols``. Every row in which *all* non-id
    columns are null is then replaced by the corresponding row of the
    forward-filled, then backward-filled, frame.

    Parameters
    ----------
    id_cols : list, optional
        Columns used as the sort key and excluded from the "is this row
        empty" test. Defaults to an empty list.

    Notes
    -----
    * The whole row is replaced, so a null inside an id column of such a row
      is filled as well.
    * NOTE(review): the ffill/bfill is not grouped by any key, so an empty
      row takes its values from the adjacent rows in sort order, whichever
      entity they belong to. Whether those neighbours share an id depends
      entirely on the order of ``id_cols`` — confirm this is intended.
    * The returned frame is sorted by ``id_cols``; the input frame is not
      modified.
    """

    def __init__(self, id_cols = None):
        # None stands for "no id columns"; avoids a shared mutable default
        self.id_cols = [] if id_cols is None else id_cols

    def fit(self, X, y=None):
        """Nothing to learn; present for pipeline compatibility."""
        return self

    def transform(self, X):
        """Return ``X`` sorted by ``id_cols`` with the empty rows filled in."""
        self.columns_to_fill = X.columns.difference(self.id_cols)
        X = X.sort_values(self.id_cols)

        # Rows whose non-id columns are all null (computed once, used twice)
        empty_rows = X[self.columns_to_fill].isnull().all(axis=1)
        X.loc[empty_rows] = X.ffill().bfill().loc[empty_rows]

        return X

In [331]:
# Ad-hoc check of SameIdImpute on the IND frame, using the same id columns as pipeline_ind.
# NOTE: `ind` is assigned by a later cell in this file (ind = get_data_from_db(query_ind)),
# so that cell has to be executed before this one.
SameIdImpute(id_cols=['D_YEAR','PIDWON','WT_MG','CARE_TOT','P2_2']).fit_transform(ind)

Unnamed: 0,D_YEAR,AGE,CARE_TOT,P2_2,WT_MG,PIDWON,DEATH_I_YN,SEX,MARR,EDU,...,OUGUN_ORT,EROOP,INOOP,OUOOP_1,OUOOP_1_MED,OUOOP_1_DENT,OUOOP_1_ORT,OUOOP_2,I_PHI_N,I_FFS_YN
0,2019,81.0,0.0,30.0,0,11200101.0,2.0,2.0,3.0,1.0,...,,,,5000.0,5000.0,,,81060.0,,
1,2019,73.0,0.0,,0,11200201.0,2.0,2.0,3.0,1.0,...,,,,8200.0,8200.0,,,580.0,,
2,2019,73.0,0.0,30.0,1,11200202.0,2.0,2.0,3.0,1.0,...,22.0,90930.0,783050.0,8200.0,8200.0,22800.0,44300.0,580.0,1.0,1.0
3,2019,44.0,0.0,40.0,0,11200301.0,2.0,1.0,1.0,4.0,...,,90930.0,,67400.0,44600.0,22800.0,,4200.0,1.0,1.0
4,2019,41.0,0.0,20.0,1,11200302.0,2.0,2.0,1.0,4.0,...,,,,,,,,,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45225,2021,41.0,0.0,20.0,1,40000703.0,2.0,2.0,1.0,4.0,...,4.0,,,1091000.0,1030800.0,,60200.0,307540.0,1.0,2.0
45226,2021,43.0,0.0,60.0,1,40000704.0,2.0,1.0,1.0,5.0,...,2.0,,2311380.0,669100.0,618600.0,32100.0,18400.0,43600.0,1.0,2.0
45227,2021,41.0,0.0,40.0,0,40000802.0,2.0,2.0,1.0,5.0,...,1.0,,,1834470.0,1503870.0,130600.0,200000.0,79000.0,2.0,1.0
45228,2021,41.0,0.0,40.0,1,40000804.0,2.0,1.0,1.0,5.0,...,5.0,,,1683300.0,132900.0,130400.0,1420000.0,4000.0,1.0,1.0


# HH Table

In [276]:
# Columns selected from each yearly household (HH) table
columns_hh = "HHID,CUSTM_BENF_YN,CUSTM_BENF1_D,CUSTM_BENF2_D,CUSTM_BENF3_D,CUSTM_BENF4_D,H_INC7,DEBT,PROP,OTC_MED,HLT_SUP1,MED_SUP,H_OOP,HEXP3"

# Stack the three yearly tables (raw_a_hh / raw_b_hh / raw_c_hh) with UNION ALL,
# tagging each row with a literal D_YEAR of 2019 / 2020 / 2021 respectively.
query_hh = f"""
SELECT 2019 AS D_YEAR, {columns_hh} from raw_a_hh
UNION ALL
SELECT 2020 AS D_YEAR, {columns_hh} from raw_b_hh
UNION ALL
SELECT 2021 AS D_YEAR, {columns_hh} from raw_c_hh
"""

In [277]:
# Load the stacked household table from the database and display it
hh = get_data_from_db(query=query_hh)
hh

Unnamed: 0,D_YEAR,HHID,CUSTM_BENF_YN,CUSTM_BENF1_D,CUSTM_BENF2_D,CUSTM_BENF3_D,CUSTM_BENF4_D,H_INC7,DEBT,PROP,OTC_MED,HLT_SUP1,MED_SUP,H_OOP,HEXP3
0,2019,112001011.0,1.0,1.0,1.0,2.0,2.0,9.0,0.0,400.0,61000.0,0.0,0.0,86060.0,
1,2019,112002011.0,1.0,1.0,1.0,2.0,1.0,0.0,0.0,0.0,60000.0,0.0,0.0,8780.0,
2,2019,112003011.0,2.0,,,,,91.0,750.0,7650.0,74000.0,0.0,0.0,247670.0,
3,2019,112010011.0,2.0,,,,,0.0,0.0,23000.0,162600.0,450000.0,3800.0,3558000.0,
4,2019,112012011.0,2.0,,,,,232.0,0.0,52000.0,0.0,0.0,0.0,2041164.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18867,2021,400004011.0,2.0,,,,,0.0,8000.0,11000.0,500000.0,0.0,0.0,0.0,
18868,2021,400005011.0,2.0,,,,,0.0,0.0,3300.0,370000.0,0.0,0.0,123300.0,
18869,2021,400007011.0,2.0,,,,,0.0,3000.0,7000.0,400000.0,200000.0,0.0,4422620.0,
18870,2021,400008011.0,2.0,,,,,51.0,7000.0,19500.0,143000.0,500000.0,0.0,3600770.0,


In [288]:
# Column groups consumed by the HH imputers defined in the next cell.
# hh_nan_0: NaN -> 0.0 (used by imputer_1_hh)
hh_nan_0 = ['CUSTM_BENF1_D','CUSTM_BENF2_D','CUSTM_BENF3_D','CUSTM_BENF4_D','H_OOP','HEXP3']
# hh_9_med: NaN -> column median (used by imputer_2_hh).
# NOTE(review): despite the "9" in the name, the imputer that uses this list targets NaN, not 9/-9.
# HEXP3 also appears in hh_nan_0, which runs first in pipeline_hh.
hh_9_med = ['H_INC7','DEBT','PROP','MED_SUP','HEXP3']
# hh_2_0: value 2.0 -> 0.0 (used by imputer_3_hh)
hh_2_0 = ['CUSTM_BENF_YN','CUSTM_BENF1_D','CUSTM_BENF2_D','CUSTM_BENF3_D','CUSTM_BENF4_D']

In [None]:
# Options shared by every HH ColumnTransformer: columns that are not listed
# are passed through, and output columns keep their plain names.
_hh_ct_options = dict(remainder='passthrough', verbose_feature_names_out=False)

# Step 1: NaN -> 0.0 on hh_nan_0
imputer_1_hh = ColumnTransformer(
    transformers=[
        ('Imputer_1', SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0.0), hh_nan_0),
    ],
    **_hh_ct_options,
)

# Step 2: NaN -> column median on hh_9_med
imputer_2_hh = ColumnTransformer(
    transformers=[
        ('Imputer_2', SimpleImputer(missing_values=np.nan, strategy='median'), hh_9_med),
    ],
    **_hh_ct_options,
)

# Step 3: the value 2.0 -> 0.0 on hh_2_0
imputer_3_hh = ColumnTransformer(
    transformers=[
        ('Imputer_3', SimpleImputer(missing_values=2.0, strategy='constant', fill_value=0.0), hh_2_0),
    ],
    **_hh_ct_options,
)

# Full HH preprocessing: fill empty rows first, then the three imputers in order
pipeline_hh = Pipeline(steps=[
    ('Same_id', SameIdImpute(['HHID', 'D_YEAR'])),
    ('Imputer_1', imputer_1_hh),
    ('Imputer_2', imputer_2_hh),
    ('Imputer_3', imputer_3_hh),
])

In [292]:
# Run the HH preprocessing pipeline on the raw household frame and display the result
hh_piped = pipeline_hh.fit_transform(hh)
hh_piped

Unnamed: 0,CUSTM_BENF_YN,CUSTM_BENF1_D,CUSTM_BENF2_D,CUSTM_BENF3_D,CUSTM_BENF4_D,H_INC7,DEBT,PROP,MED_SUP,HEXP3,H_OOP,D_YEAR,HHID,OTC_MED,HLT_SUP1
0,1.0,1.0,1.0,0.0,0.0,9.0,0.0,400.0,0.0,0.0,86060.0,2019,112001011.0,61000.0,0.0
6748,1.0,1.0,1.0,1.0,0.0,0.0,0.0,400.0,35000.0,0.0,157444.0,2020,112001011.0,102000.0,0.0
12965,1.0,1.0,1.0,0.0,0.0,14.0,0.0,400.0,15000.0,0.0,104520.0,2021,112001011.0,130000.0,0.0
1,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,8780.0,2019,112002011.0,60000.0,0.0
6749,1.0,1.0,1.0,0.0,1.0,50.0,0.0,20.0,0.0,0.0,28770.0,2020,112002011.0,412000.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12963,0.0,0.0,0.0,0.0,0.0,140.0,8000.0,17500.0,0.0,0.0,885770.0,2020,400008011.0,660000.0,200000.0
18870,0.0,0.0,0.0,0.0,0.0,51.0,7000.0,19500.0,0.0,0.0,3600770.0,2021,400008011.0,143000.0,500000.0
6747,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7000.0,0.0,0.0,7843129.0,2019,400009011.0,0.0,0.0
12964,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10000.0,0.0,0.0,390000.0,2020,400009011.0,130000.0,0.0


# IND Table

## 만성질환 분류표

- 고혈압과 당뇨병 (HTN):
  - 고혈압, 당뇨병 (HTN, DM)

- 심뇌혈관 질환 (CVD):
  - 협심증, 심근경색증, 뇌출혈, 뇌경색 (AP, MI, CH, CI)

- 간질환 (LIV):
  - 만성간염(B형, C형), 알코올성 간질환, 간경화증(간경변증) (CLD, ALD, LC)

- 만성 하기도 질환(Lower Respiratory Infection, LRI):
  - 천식, 폐기종, 만성폐쇄성폐질환(COPD), 기관지확장증 (AST, PEM, COPD, BPE)

- 근골격계 질환 Musculoskeletal Disorders (MSD):
  - 무릎골관절염(무릎퇴행성관절염), 무릎 외 골관절염(퇴행성관절염), 류마티스 관절염, 어깨관절질환, 추간판(디스크) 질환, 기타 척추 질환 (OAK, OAE, RA, OAS, VD, VD_OLD, VDE, VDE_OLD)

- 갑상선 기능 장애 Thyroid Dysfunction (HPT):
  - 갑상선 기능저하증, 갑상선 기능항진증 (HPOT, HPT)

In [364]:
# Disease codes grouped by chronic-disease category
D_HTN = ["HTN", "DM"]
D_CVD = ["AP", "MI", "CH", "CI"]
D_LIV = ["CLD", "ALD", "LC"]
D_LRI = ["AST", "PEM", "COPD", "BPE"]
D_MSD_OLD = ["OAK", "OAE", "RA", "VD_OLD", "VDE_OLD"]
D_MSD = ["OAK", "OAE", "RA", "OAS", "VD", "VDE"]
D_MSD_ALL = D_MSD + ["VD_OLD", "VDE_OLD"]
D_HPT = ["HPOT", "HPT"]


def _cd1(codes):
    """Return the ``CD1_``-prefixed column name for every code in ``codes``."""
    return [f"CD1_{code}" for code in codes]


# Column-name lists per category (CD1 = disease present/absent, CD2 = time of diagnosis)
D_HTN_CD1 = _cd1(D_HTN)
D_CVD_CD1 = _cd1(D_CVD)
D_LIV_CD1 = _cd1(D_LIV)
D_LRI_CD1 = _cd1(D_LRI)
D_MSD_OLD_CD1 = _cd1(D_MSD_OLD)
D_MSD_CD1 = _cd1(D_MSD)
D_HPT_CD1 = _cd1(D_HPT)
# Every category's columns, using the *_OLD or the newer musculoskeletal code set
D_OLD_CD1 = _cd1(D_HTN + D_CVD + D_LIV + D_LRI + D_MSD_OLD + D_HPT)
D_21_CD1 = _cd1(D_HTN + D_CVD + D_LIV + D_LRI + D_MSD + D_HPT)

## 만성질환을 그룹화 시킨 후 Many-Hot Encoding 형태로 변환 Transformer

In [365]:
class DiseaseTransformer(BaseEstimator, TransformerMixin):
    """Collapse the per-disease ``CD1_*`` flags into one 0/1 flag per disease group.

    For each group the output column is 1 when at least one of the group's
    ``CD1_*`` columns equals 1 in that row, otherwise 0 (null counts as
    "not 1"). The raw ``CD1_*`` columns are then removed.

    Output columns: ``CD_HTN``, ``CD_CVD``, ``CD_LIV``, ``CD_LRI``,
    ``CD_MSD``, ``CD_HPT``.

    Notes
    -----
    * ``CD_MSD`` is built from the musculoskeletal codes only
      (``D_MSD_ALL``: both the newer and the ``*_OLD`` code sets). It was
      previously built from ``D_OLD_CD1 + D_21_CD1``, which contain every
      disease column, so it was 1 for a row with any disease at all.
    * The raw columns are dropped by name rather than by position, so the
      result does not depend on the column order of the input.
    * ``transform`` returns a new DataFrame and does not modify its input.
    * A ``KeyError`` is raised if any expected ``CD1_*`` column is missing.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Nothing to learn; present for pipeline compatibility."""
        return self

    def transform(self, X):
        """Return ``X`` with the group flags added and the raw ``CD1_*`` columns removed."""
        # Output flag -> the CD1 columns that belong to that group
        groups = {
            "CD_HTN": D_HTN_CD1,
            "CD_CVD": D_CVD_CD1,
            "CD_LIV": D_LIV_CD1,
            "CD_LRI": D_LRI_CD1,
            "CD_MSD": ["CD1_" + d for d in D_MSD_ALL],
            "CD_HPT": D_HPT_CD1,
        }

        result = X.copy()
        for flag, cols in groups.items():
            # Vectorised "any column in the group equals 1"
            result[flag] = X[cols].eq(1).any(axis=1).astype(int)

        # Remove the raw per-disease columns now that they are summarised
        raw_columns = [col for cols in groups.values() for col in cols]
        return result.drop(columns=raw_columns)

In [366]:
# The disease columns differ between the three survey years, so each year is selected
# separately and the missing columns are padded with NULL before grouping:
#   - 2019 / 2020 (raw_a_ind, raw_b_ind): have CD1_VD_OLD and CD1_VDE_OLD; CD1_VD, CD1_VDE and CD1_OAS are NULL
#   - 2021 (raw_c_ind): has CD1_VD, CD1_VDE and CD1_OAS; CD1_VD_OLD and CD1_VDE_OLD are NULL
# Every SELECT yields the same column layout: PIDWON, D_YEAR, then the 23 CD1_* columns.
query = f"""
select PIDWON,2019 AS D_YEAR,CD1_HTN,CD1_DM,CD1_AP,CD1_MI,CD1_CH,CD1_CI,CD1_CLD,CD1_ALD,CD1_LC,CD1_AST,CD1_PEM,CD1_COPD,CD1_BPE,CD1_OAK,CD1_OAE,CD1_RA,CD1_VD_OLD,NULL AS CD1_VD,CD1_VDE_OLD,NULL AS CD1_VDE,CD1_HPOT,CD1_HPT,NULL AS CD1_OAS
from raw_a_ind
UNION ALL
select PIDWON,2020 AS D_YEAR,CD1_HTN,CD1_DM,CD1_AP,CD1_MI,CD1_CH,CD1_CI,CD1_CLD,CD1_ALD,CD1_LC,CD1_AST,CD1_PEM,CD1_COPD,CD1_BPE,CD1_OAK,CD1_OAE,CD1_RA,CD1_VD_OLD,NULL AS CD1_VD,CD1_VDE_OLD,NULL AS CD1_VDE,CD1_HPOT,CD1_HPT,NULL AS CD1_OAS
from raw_b_ind
UNION ALL
select PIDWON,2021 AS D_YEAR,CD1_HTN,CD1_DM,CD1_AP,CD1_MI,CD1_CH,CD1_CI,CD1_CLD,CD1_ALD,CD1_LC,CD1_AST,CD1_PEM,CD1_COPD,CD1_BPE,CD1_OAK,CD1_OAE,CD1_RA,NULL AS CD1_VD_OLD,CD1_VD,NULL AS CD1_VDE_OLD,CD1_VDE,CD1_HPOT,CD1_HPT,CD1_OAS
from raw_c_ind
"""

# Load the stacked per-person disease flags
phi_disease = get_data_from_db(query)

In [367]:
# Single-step pipeline that turns the CD1_* flags into per-group CD_* flags
disease_pipeline = Pipeline([
    ('disease', DiseaseTransformer())
])

In [368]:
# Apply the disease grouping to the loaded flags and display the result
ind_disease_piped = disease_pipeline.fit_transform(phi_disease)
ind_disease_piped

Unnamed: 0,PIDWON,D_YEAR,CD_HTN,CD_CVD,CD_LIV,CD_LRI,CD_MSD,CD_HPT
0,11200101.0,2019,1,1,0,0,1,0
1,11200201.0,2019,0,0,0,0,1,0
2,11200202.0,2019,0,0,0,0,0,0
3,11200301.0,2019,0,0,0,0,0,0
4,11200302.0,2019,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...
45225,40000703.0,2021,1,0,0,0,1,0
45226,40000704.0,2021,0,0,0,0,0,0
45227,40000802.0,2021,0,0,0,0,0,0
45228,40000804.0,2021,0,0,0,0,0,0


## 나머지 IND Table

In [341]:
# Columns selected identically from all three yearly individual (IND) tables.
# S4 falls back to S5 / 30 * S6 when S4 itself is NULL.
columns_ind = """
PIDWON, DEATH_I_YN, SEX, MARR, EDU, LIVE_T_YN,
HEALTH_INS, DISA_YN, DISA_TY,
ECO1,
I_INC1, I_INC2,
CARE1, CARE4_2,
P1, P2, coalesce(S4, (S5 / 30 * S6)) as S4, D1, HT, WT, SE1,
HS1, HS_MED_YN, HS_SRH, HS6_1, HS7_1, HS8_1,
I_PHI1_1, I_PHI1_2, I_PHI1_3, I_PHI5, I_PHI6,
ERGUN, INGUN, OUGUN, OUGUN_MED, OUGUN_DENT, OUGUN_ORT, EROOP, INOOP, 
OUOOP_1, OUOOP_1_MED, OUOOP_1_DENT, OUOOP_1_ORT, OUOOP_2, I_PHI_N, I_FFS_YN
"""

# Stack the three yearly tables and add derived columns per year:
#   D_YEAR   - literal survey year
#   AGE      - survey year minus BIRTH_Y
#   CARE_TOT - sum of the CARE6_*_4 columns with NULL treated as 0
#              (2021 has CARE6_1A_4 and CARE6_1B_4 in place of CARE6_1_4)
#   P2_2     - 2019 uses P2_1 * 60 + P2_2; 2020 and 2021 use P2_2 as stored
#              NOTE(review): confirm that the 2020/2021 P2_2 is already in the same unit
#   WT_MG    - 0 when WTMG = 4, otherwise 1 (a NULL WTMG therefore also yields 1)
query_ind = f"""
SELECT 2019 AS D_YEAR, 2019-BIRTH_Y AS AGE, 
(COALESCE(CARE6_1_4, 0) + COALESCE(CARE6_2_4, 0) + COALESCE(CARE6_3_4, 0) + COALESCE(CARE6_4_4, 0)) AS CARE_TOT,
(P2_1 * 60 + P2_2) AS P2_2, CASE WHEN WTMG = 4 THEN 0 ELSE 1 END AS WT_MG,
{columns_ind}
FROM raw_a_ind
UNION ALL
SELECT 2020 AS D_YEAR, 2020-BIRTH_Y AS AGE, 
(COALESCE(CARE6_1_4, 0) + COALESCE(CARE6_2_4, 0) + COALESCE(CARE6_3_4, 0) + COALESCE(CARE6_4_4, 0)) AS CARE_TOT,
P2_2, CASE WHEN WTMG = 4 THEN 0 ELSE 1 END AS WT_MG,
{columns_ind}
FROM raw_b_ind
UNION ALL
SELECT 2021 AS D_YEAR, 2021-BIRTH_Y AS AGE, 
(COALESCE(CARE6_1A_4, 0) + COALESCE(CARE6_1B_4, 0) + COALESCE(CARE6_2_4, 0) + COALESCE(CARE6_3_4, 0) + COALESCE(CARE6_4_4, 0)) AS CARE_TOT,
P2_2, CASE WHEN WTMG = 4 THEN 0 ELSE 1 END AS WT_MG,
{columns_ind}
FROM raw_c_ind
"""

In [343]:
# Load the stacked individual table from the database and display it
ind = get_data_from_db(query_ind)
ind

Unnamed: 0,D_YEAR,AGE,CARE_TOT,P2_2,WT_MG,PIDWON,DEATH_I_YN,SEX,MARR,EDU,...,OUGUN_ORT,EROOP,INOOP,OUOOP_1,OUOOP_1_MED,OUOOP_1_DENT,OUOOP_1_ORT,OUOOP_2,I_PHI_N,I_FFS_YN
0,2019,81.0,0.0,30.0,0,11200101.0,2.0,2.0,3.0,1.0,...,,,,5000.0,5000.0,,,81060.0,,
1,2019,73.0,0.0,,0,11200201.0,2.0,2.0,3.0,1.0,...,,,,8200.0,8200.0,,,580.0,,
2,2019,,0.0,,1,11200202.0,,,,,...,,,,,,,,,,
3,2019,44.0,0.0,40.0,0,11200301.0,2.0,1.0,1.0,4.0,...,,90930.0,,67400.0,44600.0,22800.0,,4200.0,1.0,1.0
4,2019,41.0,0.0,20.0,1,11200302.0,2.0,2.0,1.0,4.0,...,,,,,,,,,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45225,2021,41.0,0.0,20.0,1,40000703.0,2.0,2.0,1.0,4.0,...,4.0,,,1091000.0,1030800.0,,60200.0,307540.0,1.0,2.0
45226,2021,43.0,0.0,60.0,1,40000704.0,2.0,1.0,1.0,5.0,...,2.0,,2311380.0,669100.0,618600.0,32100.0,18400.0,43600.0,1.0,2.0
45227,2021,41.0,0.0,40.0,0,40000802.0,2.0,2.0,1.0,5.0,...,1.0,,,1834470.0,1503870.0,130600.0,200000.0,79000.0,2.0,1.0
45228,2021,41.0,0.0,40.0,1,40000804.0,2.0,1.0,1.0,5.0,...,5.0,,,1683300.0,132900.0,130400.0,1420000.0,4000.0,1.0,1.0


In [352]:
# Column groups consumed by the IND imputers/encoders defined in the next cell.
# ind_nan_0: NaN -> 0.0 (imputer_1_ind)
ind_nan_0 = ['DEATH_I_YN','DISA_TY','ECO1','I_INC1','I_INC2','CARE1','CARE4_2','P1','P2','P2_2','S4','D1','SE1','HS_MED_YN','HS6_1','HS7_1','HS8_1','I_PHI1_1','I_PHI1_2','I_PHI1_3','I_PHI5','I_PHI6','ERGUN','INGUN','OUGUN','OUGUN_MED','OUGUN_DENT','OUGUN_ORT','EROOP','INOOP','OUOOP_1','OUOOP_1_MED','OUOOP_1_DENT','OUOOP_1_ORT','OUOOP_2','I_PHI_N','I_FFS_YN']
# ind_nan_mean: NaN -> column mean (imputer_2_ind)
ind_nan_mean = ['HT']
# ind_nan_mdn: NaN -> column median (imputer_3_ind)
ind_nan_mdn = ['AGE','WT','HS1','HS_SRH']
# ind_2_0: value 2.0 -> 0.0 (imputer_4_ind)
ind_2_0 = ['DEATH_I_YN','SEX','DISA_YN','CARE1','P1','SE1','I_FFS_YN']
# ind_9_0: value -9.0 -> 0.0 (imputer_5_ind)
ind_9_0 = ['P2_2','D1','I_PHI1_1','I_PHI1_2','I_PHI1_3','I_PHI5','I_PHI6']
# ind_8_0: value 8.0 -> 0.0 (imputer_6_ind)
ind_8_0 = ['P2','HS_MED_YN']
# ind_1_0: value 1.0 -> 0.0 (imputer_7_ind)
ind_1_0 = ['I_PHI5','I_PHI6']
# ind_onehot: one-hot encoded (onehot_ind)
ind_onehot = ['MARR','EDU','LIVE_T_YN','HEALTH_INS','DISA_TY','ECO1','CARE4_2','P1']
# ind_manyhot: joined into one list column and multi-hot encoded at the end of pipeline_ind
ind_manyhot = ['I_PHI1_1','I_PHI1_2','I_PHI1_3']

In [359]:
def _ind_column_step(step_name, transformer, columns):
    """Wrap ``transformer`` so it touches only ``columns`` and passes the rest through."""
    return ColumnTransformer(
        transformers=[(step_name, transformer, columns)],
        remainder='passthrough',
        verbose_feature_names_out=False,
    )


def _ind_to_zero(missing):
    """Constant imputer that replaces ``missing`` with 0.0."""
    return SimpleImputer(strategy='constant', missing_values=missing, fill_value=0.0)


# NaN handling: constant 0, mean, median
imputer_1_ind = _ind_column_step('Imputer_1', _ind_to_zero(np.nan), ind_nan_0)
imputer_2_ind = _ind_column_step(
    'Imputer_2', SimpleImputer(strategy='mean', missing_values=np.nan), ind_nan_mean
)
imputer_3_ind = _ind_column_step(
    'Imputer_3', SimpleImputer(strategy='median', missing_values=np.nan), ind_nan_mdn
)

# Recode specific values (2, -9, 8, 1) to 0 on their respective column groups
imputer_4_ind = _ind_column_step('Imputer_4', _ind_to_zero(2.0), ind_2_0)
imputer_5_ind = _ind_column_step('Imputer_5', _ind_to_zero(-9.0), ind_9_0)
imputer_6_ind = _ind_column_step('Imputer_6', _ind_to_zero(8.0), ind_8_0)
imputer_7_ind = _ind_column_step('Imputer_7', _ind_to_zero(1.0), ind_1_0)

# One-hot encode the categorical columns (dense output)
onehot_ind = _ind_column_step('OneHot', OneHotEncoder(sparse_output=False), ind_onehot)

# Full IND preprocessing, in execution order
pipeline_ind = Pipeline(steps=[
    ('Same_id', SameIdImpute(['D_YEAR', 'PIDWON', 'WT_MG', 'CARE_TOT', 'P2_2'])),
    ('Imputer_1', imputer_1_ind),
    ('Imputer_2', imputer_2_ind),
    ('Imputer_3', imputer_3_ind),
    ('Imputer_4', imputer_4_ind),
    ('Imputer_5', imputer_5_ind),
    ('Imputer_6', imputer_6_ind),
    ('Imputer_7', imputer_7_ind),
    ('OneHot', onehot_ind),
    ('Column', ColumnsWithList(from_columns=ind_manyhot, to_column='I_PHI1')),
    ('Encoding', ManyHotEncoding(columns='I_PHI1', prefix='I_PHI')),
])

In [360]:
# Run the IND preprocessing pipeline on the raw individual frame and display the result
ind_piped = pipeline_ind.fit_transform(ind)
ind_piped

Unnamed: 0,I_PHI_0,I_PHI_1,I_PHI_2,I_PHI_3,I_PHI_4,I_PHI_5,MARR_1.0,MARR_2.0,MARR_3.0,MARR_4.0,...,OUOOP_1,OUOOP_1_MED,OUOOP_1_DENT,OUOOP_1_ORT,OUOOP_2,I_PHI_N,D_YEAR,CARE_TOT,WT_MG,PIDWON
0,1,0,0,0,0,0,0.0,0.0,1.0,0.0,...,5000.0,5000.0,0.0,0.0,81060.0,0.0,2019,0.0,0,11200101.0
1,1,0,0,0,0,0,0.0,0.0,1.0,0.0,...,8200.0,8200.0,0.0,0.0,580.0,0.0,2019,0.0,0,11200201.0
2,1,0,1,0,1,0,0.0,0.0,1.0,0.0,...,8200.0,8200.0,22800.0,44300.0,580.0,1.0,2019,0.0,1,11200202.0
3,1,0,1,0,0,0,1.0,0.0,0.0,0.0,...,67400.0,44600.0,22800.0,0.0,4200.0,1.0,2019,0.0,0,11200301.0
4,1,0,1,0,0,0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,2019,0.0,1,11200302.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45225,1,1,1,0,0,0,1.0,0.0,0.0,0.0,...,1091000.0,1030800.0,0.0,60200.0,307540.0,1.0,2021,0.0,1,40000703.0
45226,1,0,0,0,0,0,1.0,0.0,0.0,0.0,...,669100.0,618600.0,32100.0,18400.0,43600.0,1.0,2021,0.0,1,40000704.0
45227,0,1,1,0,1,0,1.0,0.0,0.0,0.0,...,1834470.0,1503870.0,130600.0,200000.0,79000.0,2.0,2021,0.0,0,40000802.0
45228,0,1,1,0,1,0,1.0,0.0,0.0,0.0,...,1683300.0,132900.0,130400.0,1420000.0,4000.0,1.0,2021,0.0,1,40000804.0


## 최종 IND Table

In [371]:
# Combine the preprocessed IND frame with the grouped disease flags on person and year.
# how='outer' keeps (PIDWON, D_YEAR) pairs that appear in only one of the two frames.
ind_merged = pd.merge(ind_piped, ind_disease_piped, on=['PIDWON','D_YEAR'], how='outer')
ind_merged

Unnamed: 0,I_PHI_0,I_PHI_1,I_PHI_2,I_PHI_3,I_PHI_4,I_PHI_5,MARR_1.0,MARR_2.0,MARR_3.0,MARR_4.0,...,D_YEAR,CARE_TOT,WT_MG,PIDWON,CD_HTN,CD_CVD,CD_LIV,CD_LRI,CD_MSD,CD_HPT
0,1,0,0,0,0,0,0.0,0.0,1.0,0.0,...,2019,0.0,0,11200101.0,1,1,0,0,1,0
1,1,0,0,0,0,0,0.0,0.0,1.0,0.0,...,2020,0.0,0,11200101.0,1,0,0,1,1,0
2,1,0,0,0,0,0,0.0,0.0,1.0,0.0,...,2021,0.0,0,11200101.0,1,0,0,1,1,0
3,1,0,0,0,0,0,0.0,0.0,1.0,0.0,...,2019,0.0,0,11200201.0,0,0,0,0,1,0
4,1,0,0,0,0,0,0.0,0.0,1.0,0.0,...,2020,0.0,0,11200201.0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45225,1,0,0,0,0,0,1.0,0.0,0.0,0.0,...,2020,0.0,1,40000804.0,0,0,0,0,0,0
45226,0,1,1,0,1,0,1.0,0.0,0.0,0.0,...,2021,0.0,1,40000804.0,0,0,0,0,0,0
45227,1,0,0,0,0,0,0.0,0.0,0.0,1.0,...,2019,0.0,1,40000901.0,0,0,0,1,1,0
45228,1,0,0,0,0,0,0.0,0.0,0.0,1.0,...,2020,0.0,1,40000901.0,0,0,0,0,1,1


# MS Table

In [103]:
# Columns selected from each yearly MS table
columns_ms = 'HHID, PIDWON, MS1, DAYS, M_TYPE, MEXP1, MEXP2, MEXP3_1, MEXP3_1_1, MEXP3_1_2, MEXP3_2'

# Stack the three yearly tables (raw_a_ms / raw_b_ms / raw_c_ms), tagging each row with D_YEAR
query_ms = f"""
SELECT 2019 AS D_YEAR, {columns_ms} FROM raw_a_ms
UNION ALL
SELECT 2020 AS D_YEAR, {columns_ms} FROM raw_b_ms
UNION ALL 
SELECT 2021 AS D_YEAR, {columns_ms} FROM raw_c_ms
"""

# Load the stacked MS table and preview the first rows
ms = get_data_from_db(query_ms)
ms.head()

Unnamed: 0,D_YEAR,HHID,PIDWON,MS1,DAYS,M_TYPE,MEXP1,MEXP2,MEXP3_1,MEXP3_1_1,MEXP3_1_2,MEXP3_2
0,2019,112001011.0,11200101.0,3.0,1.0,2.0,2.0,2.0,0.0,-9.0,-9.0,-9.0
1,2019,112001011.0,11200101.0,3.0,1.0,2.0,2.0,2.0,0.0,-9.0,-9.0,-9.0
2,2019,112001011.0,11200101.0,3.0,1.0,2.0,2.0,2.0,0.0,-9.0,-9.0,-9.0
3,2019,112001011.0,11200101.0,3.0,1.0,2.0,2.0,2.0,0.0,-9.0,-9.0,-9.0
4,2019,112001011.0,11200101.0,3.0,1.0,2.0,2.0,2.0,0.0,-9.0,-9.0,-9.0


In [104]:
# MS columns in which -9 is replaced by 0 (see ms_pipeline)
impute_90_ms = ['MEXP3_1','MEXP3_1_1','MEXP3_1_2','MEXP3_2']

In [105]:
# MS preprocessing: a single step that replaces -9 with 0 in the impute_90_ms columns
ms_pipeline = Pipeline([
    ('Imputer', ValueImputer(columns=impute_90_ms, missing_value=-9, fill_value=0))
])

In [106]:
# Run the MS preprocessing pipeline and display the result
ms_piped = ms_pipeline.fit_transform(ms)
ms_piped

Unnamed: 0,D_YEAR,HHID,PIDWON,MS1,DAYS,M_TYPE,MEXP1,MEXP2,MEXP3_1,MEXP3_1_1,MEXP3_1_2,MEXP3_2
0,2019,112001011.0,11200101.0,3.0,1.0,2.0,2.0,2.0,0.0,0.0,0.0,0.0
1,2019,112001011.0,11200101.0,3.0,1.0,2.0,2.0,2.0,0.0,0.0,0.0,0.0
2,2019,112001011.0,11200101.0,3.0,1.0,2.0,2.0,2.0,0.0,0.0,0.0,0.0
3,2019,112001011.0,11200101.0,3.0,1.0,2.0,2.0,2.0,0.0,0.0,0.0,0.0
4,2019,112001011.0,11200101.0,3.0,1.0,2.0,2.0,2.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
776149,2021,400009011.0,40000901.0,3.0,1.0,4.0,1.0,2.0,0.0,0.0,0.0,0.0
776150,2021,400009011.0,40000901.0,3.0,1.0,1.0,1.0,1.0,50860.0,43300.0,7560.0,368412.0
776151,2021,400009011.0,40000901.0,3.0,1.0,1.0,1.0,1.0,11500.0,11500.0,0.0,104265.0
776152,2021,400009011.0,40000901.0,3.0,1.0,1.0,1.0,1.0,2100.0,2100.0,0.0,19940.0


In [107]:
# Count the remaining null values per column after preprocessing
ms_piped.isnull().sum()

D_YEAR       0
HHID         0
PIDWON       0
MS1          0
DAYS         0
M_TYPE       0
MEXP1        0
MEXP2        0
MEXP3_1      0
MEXP3_1_1    0
MEXP3_1_2    0
MEXP3_2      0
dtype: int64

In [108]:
# Aggregate the MS rows to one row per (D_YEAR, HHID):
#   MED_CTN   - number of MS rows in the group
#   *_MEAN    - mean of each MEXP3_* column within the group
_ms_mean_specs = {
    f"{column}_MEAN": (column, 'mean')
    for column in ('MEXP3_1', 'MEXP3_1_1', 'MEXP3_1_2', 'MEXP3_2')
}
_ms_by_household = ms_piped.groupby(['D_YEAR', 'HHID'])
tmp_ms = _ms_by_household.agg(MED_CTN=('HHID', 'count'), **_ms_mean_specs).reset_index()


In [117]:
# Display the per-household MS aggregates
tmp_ms

Unnamed: 0,D_YEAR,HHID,MED_CTN,MEXP3_1_MEAN,MEXP3_1_1_MEAN,MEXP3_1_2_MEAN,MEXP3_2_MEAN
0,2019,112001011.0,36,138.888889,0.000000,0.000000,0.000000
1,2019,112002011.0,5,1640.000000,0.000000,740.000000,5386.000000
2,2019,112003011.0,20,10700.500000,9918.800000,873.500000,18198.750000
3,2019,112010011.0,144,19209.722222,5261.805556,13814.583333,13294.097222
4,2019,112012011.0,37,38463.891892,36669.756757,12135.945946,63378.216216
...,...,...,...,...,...,...,...
18223,2021,400004011.0,2,0.000000,0.000000,0.000000,0.000000
18224,2021,400005011.0,11,10345.454545,0.000000,0.000000,0.000000
18225,2021,400007011.0,51,79832.941176,0.000000,0.000000,0.000000
18226,2021,400008011.0,43,81808.604651,2402.325581,3139.534884,5621.860465


# PHI Table
- transform 내용 정리

## PHI Table 에만 필요한 Transformer
- 내용내용

In [109]:
class PHITableTransformer(BaseEstimator, TransformerMixin):
    """Keep only the rows whose ``PHI_PID`` string is shorter than 9 characters.

    The original author's comment describes this as selecting the
    "single household" cases. NOTE(review): presumably a short ``PHI_PID``
    holds exactly one member id — confirm against the codebook.

    Notes
    -----
    * ``PHI_PID`` must be a string column (the ``.str`` accessor is used).
      Rows where it is null are dropped, because the length comparison is
      false for them.
    * The result is an independent copy, so later steps that modify it in
      place do not act on a selection of the input frame.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Nothing to learn; present for pipeline compatibility."""
        return self

    def transform(self, X):
        """Return the rows of ``X`` with ``len(PHI_PID) < 9`` as a new DataFrame."""
        short_pid = X["PHI_PID"].str.len() < 9
        return X.loc[short_pid, :].copy()

## 3개년 병합해서 불러오기

In [152]:
# Columns selected from each yearly PHI table
columns_phi = "HHID,PHI_N,PHI_PID,PHI_PID1,PHI1,PHI2,PHI3,PHI4_1_D,PHI4_2_D,PHI4_3_D,PHI4_4_D,PHI4_5_D,PHI4_6_D,PHI4_7_D,PHI4_8_D,PHI5,PHI6,PHR1,PHR2,PHR3"

# Stack the three yearly tables (raw_a_phi / raw_b_phi / raw_c_phi), keeping only rows with PHR1 = 1
# and tagging each row with a literal D_YEAR.
query = f"""SELECT 2019 AS D_YEAR, {columns_phi} FROM raw_a_phi
            WHERE PHR1 = 1
            UNION ALL
            SELECT 2020 AS D_YEAR, {columns_phi} FROM raw_b_phi
            WHERE PHR1 = 1
            UNION ALL
            SELECT 2021 AS D_YEAR, {columns_phi} FROM raw_c_phi
            WHERE PHR1 = 1"""
# Load the stacked PHI table and preview the first rows
phi = get_data_from_db(query)
phi.head()

Unnamed: 0,D_YEAR,HHID,PHI_N,PHI_PID,PHI_PID1,PHI1,PHI2,PHI3,PHI4_1_D,PHI4_2_D,...,PHI4_4_D,PHI4_5_D,PHI4_6_D,PHI4_7_D,PHI4_8_D,PHI5,PHI6,PHR1,PHR2,PHR3
0,2019,112003011.0,101.0,11200301,11200301.0,2004.0,-9.0,3.0,1.0,1.0,...,2.0,2.0,2.0,2.0,1.0,1.0,120000.0,1.0,1.0,60000.0
1,2019,112033011.0,101.0,11203301,11203301.0,2012.0,9.0,2.0,,,...,,,,,,1.0,87000.0,1.0,1.0,120000.0
2,2019,112049011.0,101.0,11204901,11204901.0,-9.0,-9.0,2.0,,,...,,,,,,1.0,-9.0,1.0,1.0,300000.0
3,2019,112058011.0,102.0,11205801,11205801.0,2005.0,9.0,1.0,1.0,1.0,...,2.0,2.0,2.0,2.0,2.0,3.0,60770.0,1.0,1.0,1000000.0
4,2019,112064011.0,104.0,11206402,11206402.0,-9.0,-9.0,3.0,-9.0,-9.0,...,-9.0,-9.0,-9.0,-9.0,-9.0,1.0,75000.0,1.0,1.0,3800000.0


In [153]:
# Number of PHI rows loaded
len(phi)

5133

## PHI Table 전처리 파이프라인

In [154]:
# Column groups and mappings consumed by phi_pipeline.
# impute_nan2_phi: NaN -> 2 (Imputer_1)
impute_nan2_phi = ['PHR2','PHI4_1_D','PHI4_2_D','PHI4_3_D','PHI4_4_D','PHI4_5_D','PHI4_6_D','PHI4_7_D','PHI4_8_D']
# impute_nan0_phi: NaN -> 0 (Imputer_2)
impute_nan0_phi = ['PHR3']
# impute_90_phi: -9 -> 0 (Imputer_3)
impute_90_phi = ['PHI1','PHI2','PHI3','PHI4_1_D','PHI4_2_D','PHI4_3_D','PHI4_4_D','PHI4_5_D','PHI4_6_D','PHI4_7_D','PHI4_8_D','PHI5','PHI6','PHR2','PHR3']
# Old column name -> new column name, applied by the rename_drop step.
# NOTE(review): 'PHI4' is not among the columns selected in columns_phi (only PHI4_1_D..PHI4_8_D are),
# so that entry does not match any column of the loaded frame.
rename_map_phi = {'PHI_PID1' : 'PIDWON',
                  'PHI1' : 'PHI_Y',
                  'PHI2' : 'PHI_M',
                  'PHI3' : 'PHI_FORM',
                  'PHI4' : 'PHI_TYPE',
                  'PHI5' : 'PHI_PREMIUM_YN',
                  'PHI6' : 'PHI_PREMIUM',
                  'PHR1' : 'PHI_CLAIM_YN',
                  'PHR2' : 'PHI_BENEFIT_YN',
                  'PHR3' : 'PHI_BENEFIT'}
# Columns removed by the rename_drop step
drop_columns_phi = ['PHI_PID']


In [156]:
# PHI preprocessing, in execution order:
#   Imputer_1   - NaN -> 2 on impute_nan2_phi
#   Imputer_2   - NaN -> 0 on impute_nan0_phi
#   Imputer_3   - -9  -> 0 on impute_90_phi
#   strip       - trim whitespace in PHI_PID (must run before the length filter below)
#   phi         - keep rows whose PHI_PID is shorter than 9 characters
#   rename_drop - drop PHI_PID and rename columns per rename_map_phi
phi_pipeline = Pipeline([
    ('Imputer_1', ValueImputer(columns=impute_nan2_phi, missing_value=np.nan, fill_value=2)),
    ('Imputer_2', ValueImputer(columns=impute_nan0_phi, missing_value=np.nan, fill_value=0)),
    ('Imputer_3', ValueImputer(columns=impute_90_phi, missing_value=-9, fill_value=0)),
    ('strip', StripTransformer(columns=['PHI_PID'])),
    ('phi',PHITableTransformer()),
    ('rename_drop', RenameDropColumnsTransformer(rename_map=rename_map_phi, drop_columns=drop_columns_phi))
])

In [157]:
# Run the PHI preprocessing pipeline and display the result
phi_piped = phi_pipeline.fit_transform(phi)
phi_piped

Unnamed: 0,D_YEAR,HHID,PHI_N,PIDWON,PHI_Y,PHI_M,PHI_FORM,PHI4_1_D,PHI4_2_D,PHI4_3_D,PHI4_4_D,PHI4_5_D,PHI4_6_D,PHI4_7_D,PHI4_8_D,PHI_PREMIUM_YN,PHI_PREMIUM,PHI_CLAIM_YN,PHI_BENEFIT_YN,PHI_BENEFIT
0,2019,112003011.0,101.0,11200301.0,2004.0,0.0,3.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0,1.0,1.0,120000.0,1.0,1.0,60000.0
1,2019,112033011.0,101.0,11203301.0,2012.0,9.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,87000.0,1.0,1.0,120000.0
2,2019,112049011.0,101.0,11204901.0,0.0,0.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,0.0,1.0,1.0,300000.0
3,2019,112058011.0,102.0,11205801.0,2005.0,9.0,1.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,3.0,60770.0,1.0,1.0,1000000.0
4,2019,112064011.0,104.0,11206402.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,75000.0,1.0,1.0,3800000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5128,2021,300070011.0,101.0,30007001.0,0.0,0.0,3.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,121260.0,1.0,1.0,545760.0
5129,2021,300070011.0,103.0,30007002.0,2013.0,0.0,3.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,144168.0,1.0,1.0,1730000.0
5130,2021,300081011.0,102.0,30008104.0,2013.0,1.0,3.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,39000.0,1.0,1.0,52600.0
5131,2021,300085011.0,201.0,30008504.0,2016.0,11.0,3.0,1.0,1.0,1.0,2.0,2.0,2.0,1.0,2.0,1.0,109058.0,1.0,1.0,134379.0


# PHI 기준으로 합치기

In [158]:
# Display the preprocessed PHI frame that serves as the left side of the merges below
phi_piped

Unnamed: 0,D_YEAR,HHID,PHI_N,PIDWON,PHI_Y,PHI_M,PHI_FORM,PHI4_1_D,PHI4_2_D,PHI4_3_D,PHI4_4_D,PHI4_5_D,PHI4_6_D,PHI4_7_D,PHI4_8_D,PHI_PREMIUM_YN,PHI_PREMIUM,PHI_CLAIM_YN,PHI_BENEFIT_YN,PHI_BENEFIT
0,2019,112003011.0,101.0,11200301.0,2004.0,0.0,3.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0,1.0,1.0,120000.0,1.0,1.0,60000.0
1,2019,112033011.0,101.0,11203301.0,2012.0,9.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,87000.0,1.0,1.0,120000.0
2,2019,112049011.0,101.0,11204901.0,0.0,0.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,0.0,1.0,1.0,300000.0
3,2019,112058011.0,102.0,11205801.0,2005.0,9.0,1.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,3.0,60770.0,1.0,1.0,1000000.0
4,2019,112064011.0,104.0,11206402.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,75000.0,1.0,1.0,3800000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5128,2021,300070011.0,101.0,30007001.0,0.0,0.0,3.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,121260.0,1.0,1.0,545760.0
5129,2021,300070011.0,103.0,30007002.0,2013.0,0.0,3.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,144168.0,1.0,1.0,1730000.0
5130,2021,300081011.0,102.0,30008104.0,2013.0,1.0,3.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,39000.0,1.0,1.0,52600.0
5131,2021,300085011.0,201.0,30008504.0,2016.0,11.0,3.0,1.0,1.0,1.0,2.0,2.0,2.0,1.0,2.0,1.0,109058.0,1.0,1.0,134379.0


In [159]:
# Attach the individual-level data to each PHI row (left join on person and year).
# The right-hand frame was previously `tmp`, which is not defined anywhere in this
# notebook and raises NameError on a clean run. `ind_merged` (the final IND table,
# keyed by PIDWON / D_YEAR) is used instead.
# NOTE(review): confirm that `tmp` was indeed the final IND table.
phi_ind = pd.merge(phi_piped, ind_merged, on=['PIDWON','D_YEAR'], how='left')

In [160]:
# Add the household-level data (left join on household and year), then display the result
except_ms = pd.merge(phi_ind, hh_piped, on=["HHID", "D_YEAR"], how='left')
except_ms

Unnamed: 0,D_YEAR,HHID,PHI_N,PIDWON,PHI_Y,PHI_M,PHI_FORM,PHI4_1_D,PHI4_2_D,PHI4_3_D,...,H_INC7,H_INC_TOT,H_INC_MON,OTC_MED,HLT_SUP1,HLT_SUP2,MED_SUP,H_OOP,HEXP2,HEXP3
0,2019,112003011.0,101.0,11200301.0,2004.0,0.0,3.0,1.0,1.0,1.0,...,91.0,4497.0,374.750000,74000.0,0.0,0.0,0.0,247670.0,2.0,0.0
1,2019,112033011.0,101.0,11203301.0,2012.0,9.0,2.0,2.0,2.0,2.0,...,80.0,4822.0,401.833333,371000.0,360000.0,0.0,0.0,1914610.0,2.0,0.0
2,2019,112049011.0,101.0,11204901.0,0.0,0.0,2.0,2.0,2.0,2.0,...,100.0,4812.0,401.000000,200000.0,0.0,0.0,0.0,530620.0,2.0,0.0
3,2019,112058011.0,102.0,11205801.0,2005.0,9.0,1.0,1.0,1.0,1.0,...,200.0,1477.0,123.083333,103000.0,0.0,0.0,0.0,951795.0,2.0,0.0
4,2019,112064011.0,104.0,11206402.0,0.0,0.0,3.0,0.0,0.0,0.0,...,380.0,2139.0,178.250000,198000.0,300000.0,0.0,0.0,2701840.0,2.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5055,2021,300070011.0,101.0,30007001.0,0.0,0.0,3.0,1.0,2.0,2.0,...,228.0,7333.0,611.083333,700000.0,840000.0,720000.0,0.0,5479160.0,2.0,0.0
5056,2021,300070011.0,103.0,30007002.0,2013.0,0.0,3.0,1.0,1.0,2.0,...,228.0,7333.0,611.083333,700000.0,840000.0,720000.0,0.0,5479160.0,2.0,0.0
5057,2021,300081011.0,102.0,30008104.0,2013.0,1.0,3.0,1.0,2.0,2.0,...,5.0,2835.0,236.250000,200000.0,100000.0,0.0,0.0,2654290.0,2.0,0.0
5058,2021,300085011.0,201.0,30008504.0,2016.0,11.0,3.0,1.0,1.0,1.0,...,15.0,4360.0,363.333333,150000.0,150000.0,0.0,0.0,807460.0,2.0,0.0


In [161]:
# Add the per-household MS aggregates (left join on household and year), then display the result
tmp_all = pd.merge(except_ms, tmp_ms, on=['HHID','D_YEAR'], how='left')
tmp_all

Unnamed: 0,D_YEAR,HHID,PHI_N,PIDWON,PHI_Y,PHI_M,PHI_FORM,PHI4_1_D,PHI4_2_D,PHI4_3_D,...,HLT_SUP2,MED_SUP,H_OOP,HEXP2,HEXP3,MED_CTN,MEXP3_1_MEAN,MEXP3_1_1_MEAN,MEXP3_1_2_MEAN,MEXP3_2_MEAN
0,2019,112003011.0,101.0,11200301.0,2004.0,0.0,3.0,1.0,1.0,1.0,...,0.0,0.0,247670.0,2.0,0.0,20.0,10700.500000,9918.800000,873.500000,18198.750000
1,2019,112033011.0,101.0,11203301.0,2012.0,9.0,2.0,2.0,2.0,2.0,...,0.0,0.0,1914610.0,2.0,0.0,48.0,27550.208333,0.000000,0.000000,0.000000
2,2019,112049011.0,101.0,11204901.0,0.0,0.0,2.0,2.0,2.0,2.0,...,0.0,0.0,530620.0,2.0,0.0,9.0,7011.111111,0.000000,0.000000,0.000000
3,2019,112058011.0,102.0,11205801.0,2005.0,9.0,1.0,1.0,1.0,1.0,...,0.0,0.0,951795.0,2.0,0.0,55.0,13425.363636,8669.309091,4442.545455,27729.527273
4,2019,112064011.0,104.0,11206402.0,0.0,0.0,3.0,0.0,0.0,0.0,...,0.0,0.0,2701840.0,2.0,0.0,41.0,56779.024390,29394.804878,26943.780488,38854.463415
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5055,2021,300070011.0,101.0,30007001.0,0.0,0.0,3.0,1.0,2.0,2.0,...,720000.0,0.0,5479160.0,2.0,0.0,76.0,70468.421053,0.000000,0.000000,0.000000
5056,2021,300070011.0,103.0,30007002.0,2013.0,0.0,3.0,1.0,1.0,2.0,...,720000.0,0.0,5479160.0,2.0,0.0,76.0,70468.421053,0.000000,0.000000,0.000000
5057,2021,300081011.0,102.0,30008104.0,2013.0,1.0,3.0,1.0,2.0,2.0,...,0.0,0.0,2654290.0,2.0,0.0,12.0,203395.833333,0.000000,0.000000,0.000000
5058,2021,300085011.0,201.0,30008504.0,2016.0,11.0,3.0,1.0,1.0,1.0,...,0.0,0.0,807460.0,2.0,0.0,15.0,51450.666667,6118.666667,6000.000000,13600.000000
