# 결손치 예측 MICE 알고리즘

#### (1) 데이터 로드하기

In [1]:
# 데이터 로드

import pandas as pd

data_path = "../src/heart_missing.csv"
data = pd.read_csv(data_path)
data

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52.0,1,0.0,125.0,212,0,1.0,168,0,1.0,2,2,3,0
1,53.0,1,0.0,140.0,203,1,0.0,155,1,3.1,0,0,3,0
2,70.0,1,,145.0,174,0,,125,1,2.6,0,0,3,0
3,61.0,1,0.0,148.0,203,0,1.0,161,0,0.0,2,1,3,0
4,62.0,0,0.0,,294,1,1.0,106,0,1.9,1,3,2,0
5,,0,0.0,100.0,248,0,0.0,122,0,1.0,1,0,2,1
6,58.0,1,0.0,114.0,318,0,2.0,140,0,4.4,0,3,1,0
7,55.0,1,0.0,160.0,289,0,0.0,145,1,0.8,1,1,3,0
8,46.0,1,0.0,120.0,249,0,0.0,144,0,0.8,2,0,3,0
9,54.0,1,0.0,122.0,286,0,0.0,116,1,3.2,1,2,2,0


#### (2) target column 제거하기

In [2]:
# target column 제거
data_without_target = data.drop('target', axis=1)
data_without_target

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,52.0,1,0.0,125.0,212,0,1.0,168,0,1.0,2,2,3
1,53.0,1,0.0,140.0,203,1,0.0,155,1,3.1,0,0,3
2,70.0,1,,145.0,174,0,,125,1,2.6,0,0,3
3,61.0,1,0.0,148.0,203,0,1.0,161,0,0.0,2,1,3
4,62.0,0,0.0,,294,1,1.0,106,0,1.9,1,3,2
5,,0,0.0,100.0,248,0,0.0,122,0,1.0,1,0,2
6,58.0,1,0.0,114.0,318,0,2.0,140,0,4.4,0,3,1
7,55.0,1,0.0,160.0,289,0,0.0,145,1,0.8,1,1,3
8,46.0,1,0.0,120.0,249,0,0.0,144,0,0.8,2,0,3
9,54.0,1,0.0,122.0,286,0,0.0,116,1,3.2,1,2,2


#### (3) MICE 알고리즘 선언하기

In [3]:
# MICE 알고리즘 적용하기 (sklearn IterativeImputer)
from sklearn.impute import IterativeImputer

ImportError: IterativeImputer is experimental and the API might change without any deprecation cycle. To use it, you need to explicitly import enable_iterative_imputer:
from sklearn.experimental import enable_iterative_imputer

In [4]:
# MICE 알고리즘 적용하기 (sklearn IterativeImputer)
from sklearn.linear_model import LinearRegression
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

lr = LinearRegression()
mice = IterativeImputer(estimator=lr, max_iter=100)

#### (4) MICE 학습하기

In [5]:
mice.fit(data_without_target)

#### (5) 결손치 예측데이터와 실제 비교하기

In [6]:
# 결손치 데이터
data

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52.0,1,0.0,125.0,212,0,1.0,168,0,1.0,2,2,3,0
1,53.0,1,0.0,140.0,203,1,0.0,155,1,3.1,0,0,3,0
2,70.0,1,,145.0,174,0,,125,1,2.6,0,0,3,0
3,61.0,1,0.0,148.0,203,0,1.0,161,0,0.0,2,1,3,0
4,62.0,0,0.0,,294,1,1.0,106,0,1.9,1,3,2,0
5,,0,0.0,100.0,248,0,0.0,122,0,1.0,1,0,2,1
6,58.0,1,0.0,114.0,318,0,2.0,140,0,4.4,0,3,1,0
7,55.0,1,0.0,160.0,289,0,0.0,145,1,0.8,1,1,3,0
8,46.0,1,0.0,120.0,249,0,0.0,144,0,0.8,2,0,3,0
9,54.0,1,0.0,122.0,286,0,0.0,116,1,3.2,1,2,2,0


In [7]:
# 실제 데이터
data_real_path = "../src/heart.csv"
data_real = pd.read_csv(data_real_path)
data_real.head(10)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0
5,58,0,0,100,248,0,0,122,0,1.0,1,0,2,1
6,58,1,0,114,318,0,2,140,0,4.4,0,3,1,0
7,55,1,0,160,289,0,0,145,1,0.8,1,1,3,0
8,46,1,0,120,249,0,0,144,0,0.8,2,0,3,0
9,54,1,0,122,286,0,0,116,1,3.2,1,2,2,0


In [8]:
# 결손치 예측 데이터
predicted_data = mice.transform(data_without_target)
pd.DataFrame(predicted_data)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,52.0,1.0,0.0,125.0,212.0,0.0,1.0,168.0,0.0,1.0,2.0,2.0,3.0
1,53.0,1.0,0.0,140.0,203.0,1.0,0.0,155.0,1.0,3.1,0.0,0.0,3.0
2,70.0,1.0,0.0,145.0,174.0,0.0,0.324475,125.0,1.0,2.6,0.0,0.0,3.0
3,61.0,1.0,0.0,148.0,203.0,0.0,1.0,161.0,0.0,0.0,2.0,1.0,3.0
4,62.0,0.0,0.0,121.083875,294.0,1.0,1.0,106.0,0.0,1.9,1.0,3.0,2.0
5,49.215246,0.0,0.0,100.0,248.0,0.0,0.0,122.0,0.0,1.0,1.0,0.0,2.0
6,58.0,1.0,0.0,114.0,318.0,0.0,2.0,140.0,0.0,4.4,0.0,3.0,1.0
7,55.0,1.0,0.0,160.0,289.0,0.0,0.0,145.0,1.0,0.8,1.0,1.0,3.0
8,46.0,1.0,0.0,120.0,249.0,0.0,0.0,144.0,0.0,0.8,2.0,0.0,3.0
9,54.0,1.0,0.0,122.0,286.0,0.0,0.0,116.0,1.0,3.2,1.0,2.0,2.0


#### (+) 더 많은 실제 데이터를 사용하여 동일한 과정 적용하기