In [2]:
import pandas as pd
import numpy as np

rawData_org = pd.read_csv('datasets/국민건강정보/NHIS_OPEN_GJ_2017.csv', encoding='CP949', engine='python')

In [3]:
rawData_org

Unnamed: 0,기준년도,가입자일련번호,성별코드,연령대코드(5세단위),시도코드,신장(5Cm단위),체중(5Kg 단위),허리둘레,시력(좌),시력(우),...,감마지티피,흡연상태,음주여부,구강검진 수검여부,치아우식증유무,결손치유무,치아마모증유무,제3대구치(사랑니)이상,치석,데이터공개일자
0,2017,1,1,13,46,170.0,65.0,91.0,1.0,1.2,...,25.0,3.0,0.0,1,,,,,1.0,20181126
1,2017,2,2,8,41,150.0,45.0,73.4,1.2,1.0,...,10.0,1.0,0.0,1,,,,,1.0,20181126
2,2017,3,1,8,45,175.0,75.0,94.0,1.0,0.8,...,136.0,1.0,0.0,1,,,,,0.0,20181126
3,2017,4,2,12,11,155.0,55.0,67.5,0.9,1.0,...,30.0,1.0,1.0,0,,,,,,20181126
4,2017,5,1,8,41,175.0,75.0,93.0,1.5,1.5,...,68.0,3.0,0.0,0,,,,,,20181126
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,2017,999996,2,9,41,165.0,55.0,70.0,1.5,1.5,...,11.0,1.0,1.0,0,,,,,,20181126
999996,2017,999997,2,9,11,165.0,50.0,68.0,1.2,1.5,...,11.0,1.0,0.0,1,,,,,0.0,20181126
999997,2017,999998,2,12,27,155.0,50.0,83.8,0.2,1.0,...,12.0,1.0,0.0,1,,,,,0.0,20181126
999998,2017,999999,1,11,47,160.0,70.0,99.0,0.8,0.9,...,35.0,2.0,1.0,0,,,,,,20181126


In [4]:
# 원본데이터의 튜플 수와 컬럼수 출력
rawData_org.shape

(1000000, 34)

In [5]:
# 의사결정트리에 사용할 속성리스트
feature_columns_to_use = ['성별코드', '연령대코드(5세단위)', '신장(5Cm단위)', '체중(5Kg 단위)', '허리둘레', '시력(좌)', '시력(우)',
                    '청력(좌)', '청력(우)', '수축기혈압', '이완기혈압', '식전혈당(공복혈당)', '트리글리세라이드', 'HDL콜레스테롤',
                    'LDL콜레스테롤', '요단백', '혈청크레아티닌', '(혈청지오티)AST', '(혈청지오티)ALT', '감마지티피', '흡연상태', '음주여부']

rawData = rawData_org[feature_columns_to_use]

In [6]:
# 속성값이 NULL인 튜플 제외
rawData = rawData.dropna()

# 인덱스 재설정
rawData.reset_index(inplace=True, drop=True)

# 원본데이터의 튜플 수와 컬럼수 출력
rawData.shape

(990910, 22)

In [7]:
import random as rd  # 샘플링을 위한 random 패키지 임포트

# 1) 샘플링을 통한 수량 축소
# 원본데이터 중 50,000개를 튜플만 샘플링
# len() : 데이터프레임의 크기를 구하는 함수
sample_idx = rd.sample(range(0, len(rawData)), 50000)

# 인덱스 값 정렬(오름차순)
sample_idx.sort()

sample_idx

[53,
 73,
 116,
 122,
 159,
 163,
 165,
 171,
 173,
 251,
 255,
 283,
 344,
 362,
 418,
 424,
 465,
 480,
 483,
 487,
 494,
 498,
 502,
 549,
 561,
 573,
 592,
 594,
 619,
 622,
 624,
 626,
 692,
 696,
 713,
 725,
 758,
 773,
 820,
 821,
 835,
 864,
 876,
 886,
 927,
 938,
 957,
 966,
 975,
 1016,
 1017,
 1023,
 1053,
 1073,
 1099,
 1100,
 1138,
 1142,
 1151,
 1191,
 1195,
 1197,
 1204,
 1209,
 1227,
 1230,
 1244,
 1313,
 1333,
 1339,
 1341,
 1370,
 1388,
 1400,
 1501,
 1507,
 1544,
 1545,
 1546,
 1577,
 1584,
 1662,
 1668,
 1681,
 1697,
 1703,
 1775,
 1778,
 1779,
 1793,
 1836,
 1863,
 1872,
 1881,
 1891,
 1949,
 1951,
 1961,
 1968,
 1980,
 1986,
 1992,
 1998,
 2040,
 2044,
 2057,
 2086,
 2090,
 2126,
 2151,
 2171,
 2201,
 2230,
 2237,
 2279,
 2301,
 2354,
 2371,
 2373,
 2392,
 2436,
 2453,
 2496,
 2510,
 2520,
 2522,
 2536,
 2560,
 2593,
 2621,
 2647,
 2687,
 2723,
 2729,
 2735,
 2766,
 2797,
 2807,
 2815,
 2823,
 2837,
 2853,
 2867,
 2888,
 2938,
 2976,
 2990,
 3007,
 3015,
 3026,
 

In [8]:
# 샘플링 된 인덱스로 구성된 샘플 데이터프레임 생성
rawData_sample = rawData.loc[sample_idx]

rawData_sample.reset_index(inplace=True, drop=True) #  인덱스 재설정


# 첫 10개의 행만 출력
rawData_sample.head(10)

Unnamed: 0,성별코드,연령대코드(5세단위),신장(5Cm단위),체중(5Kg 단위),허리둘레,시력(좌),시력(우),청력(좌),청력(우),수축기혈압,...,트리글리세라이드,HDL콜레스테롤,LDL콜레스테롤,요단백,혈청크레아티닌,(혈청지오티)AST,(혈청지오티)ALT,감마지티피,흡연상태,음주여부
0,2,10,160.0,60.0,80.0,0.7,0.8,1.0,1.0,120.0,...,105.0,62.0,86.0,1.0,0.9,17.0,11.0,13.0,1.0,0.0
1,1,10,175.0,60.0,82.0,0.9,1.2,1.0,1.0,120.0,...,186.0,62.0,62.0,1.0,0.7,22.0,12.0,98.0,1.0,1.0
2,1,9,175.0,85.0,93.0,1.5,1.5,1.0,1.0,128.0,...,268.0,54.0,141.0,1.0,1.2,24.0,28.0,28.0,2.0,1.0
3,1,12,170.0,70.0,93.0,0.9,1.0,1.0,1.0,106.0,...,94.0,50.0,132.0,1.0,0.8,18.0,13.0,25.0,1.0,1.0
4,1,13,175.0,80.0,95.0,1.5,1.2,1.0,1.0,129.0,...,117.0,62.0,123.0,1.0,1.1,25.0,30.0,21.0,2.0,0.0
5,2,15,140.0,55.0,77.0,0.5,0.8,1.0,1.0,130.0,...,92.0,52.0,183.0,1.0,1.1,25.0,15.0,15.0,1.0,0.0
6,2,11,150.0,65.0,84.0,1.0,1.0,1.0,1.0,137.0,...,110.0,50.0,94.0,1.0,0.6,27.0,32.0,20.0,1.0,0.0
7,1,9,180.0,75.0,81.1,1.5,1.5,1.0,1.0,134.0,...,369.0,55.0,160.0,3.0,1.2,29.0,46.0,89.0,3.0,1.0
8,1,6,170.0,70.0,86.0,1.5,2.0,1.0,1.0,118.0,...,69.0,57.0,119.0,1.0,0.9,38.0,57.0,24.0,2.0,1.0
9,2,13,160.0,75.0,91.2,0.5,0.6,1.0,1.0,150.0,...,78.0,52.0,99.0,1.0,0.9,17.0,14.0,17.0,1.0,0.0


In [9]:
rawData_sample.shape

(50000, 22)

In [10]:
feature_columns_to_use = ['성별코드', '연령대코드', '신장', '체중', '허리둘레', '시력_좌', '시력_우',
                    '청력_좌', '청력_우', '수축기혈압', '이완기혈압', '식전혈당', '트리글리세라이드', 'HDL콜레스테롤',
                    'LDL콜레스테롤', '요단백', '혈청크레아티닌', '혈청지오티_AST', '혈청지오티_ALT', '감마지티피', '흡연상태', '음주여부']

rawData_sample.rename(columns = {'연령대코드(5세단위)' : '연령대코드',
                                 '신장(5Cm단위)' : '신장',
                                 '체중(5Kg 단위)' : '체중',
                                 '시력(좌)' : '시력_좌',
                                 '시력(우)' : '시력_우',
                                 '청력(좌)' : '청력_좌',
                                 '청력(우)' : '청력_우',
                                 '식전혈당(공복혈당)' : '식전혈당',
                                 '(혈청지오티)AST' : '혈청지오티_AST',
                                 '(혈청지오티)ALT' : '혈청지오티_ALT'}, inplace = True)

from sklearn.preprocessing import StandardScaler  # sklearn.preprocessing의 StandardScaler 모듈 임포트

# StandardScaler() : Z-score 정규화 함수
stdscaler = StandardScaler()

rawData_sam_std = pd.DataFrame(stdscaler.fit_transform(rawData_sample.loc[:, '성별코드' : '음주여부']), columns = feature_columns_to_use)

# nonnumeric 속성을 categrical 데이터로 변환
nonnumeric_columns = ['성별코드','음주여부']

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
for feature in nonnumeric_columns:
    rawData_sam_std[feature] = le.fit_transform(rawData_sam_std[feature])
    
rawData_sam_std    

Unnamed: 0,성별코드,연령대코드,신장,체중,허리둘레,시력_좌,시력_우,청력_좌,청력_우,수축기혈압,...,트리글리세라이드,HDL콜레스테롤,LDL콜레스테롤,요단백,혈청크레아티닌,혈청지오티_AST,혈청지오티_ALT,감마지티피,흡연상태,음주여부
0,1,-0.195272,-0.232224,-0.266260,-0.120177,-0.457232,-0.289735,-0.182814,-0.181172,-0.177177,...,-0.270653,0.340417,-0.773167,-0.214334,0.079971,-0.301296,-0.563329,-0.477640,-0.741192,0
1,0,-0.195272,1.365796,-0.266260,0.055541,-0.130743,0.351548,-0.182814,-0.181172,-0.177177,...,0.522543,0.340417,-1.467225,-0.214334,-0.349264,-0.135175,-0.524952,1.223506,-0.741192,1
2,0,-0.546619,1.365796,1.681678,1.021993,0.848725,0.832510,-0.182814,-0.181172,0.369429,...,1.325532,-0.190738,0.817381,-0.214334,0.723822,-0.068727,0.089095,-0.177437,0.483595,1
3,0,0.507423,0.833123,0.512915,1.021993,-0.130743,0.030907,-0.182814,-0.181172,-1.133737,...,-0.378371,-0.456315,0.557110,-0.214334,-0.134646,-0.268072,-0.486574,-0.237478,-0.741192,1
4,0,0.858771,1.365796,1.292091,1.197711,0.848725,0.351548,-0.182814,-0.181172,0.437755,...,-0.153142,0.340417,0.296838,-0.214334,0.509205,-0.035503,0.165851,-0.317532,0.483595,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,0,0.156076,0.300449,-0.266260,-0.401327,-0.457232,0.030907,-0.182814,-0.181172,-0.587131,...,-0.456711,-0.589104,-0.744248,-0.214334,0.079971,-0.234848,-0.333062,0.002684,0.483595,1
49996,0,-0.897967,1.898470,2.071266,0.758415,0.358991,0.832510,-0.182814,-0.181172,-0.177177,...,-0.593806,3.460951,-1.929930,-0.214334,0.079971,-0.268072,-0.256306,-0.397586,1.708381,0
49997,1,1.561466,-1.830245,-1.435022,-0.559473,-0.293987,-0.931017,-0.182814,-0.181172,0.847709,...,0.140634,0.340417,-1.640740,-0.214334,0.079971,2.688882,5.116602,0.022697,-0.741192,0
49998,0,-1.600662,0.833123,0.512915,-0.295896,0.848725,0.832510,-0.182814,-0.181172,0.506081,...,-0.691732,-0.721892,-0.628572,-0.214334,-0.349264,-0.268072,-0.371440,-0.537680,0.483595,1


In [11]:
feature_columns_to_use[0:len(feature_columns_to_use)] #:8,10:21]

['성별코드',
 '연령대코드',
 '신장',
 '체중',
 '허리둘레',
 '시력_좌',
 '시력_우',
 '청력_좌',
 '청력_우',
 '수축기혈압',
 '이완기혈압',
 '식전혈당',
 '트리글리세라이드',
 'HDL콜레스테롤',
 'LDL콜레스테롤',
 '요단백',
 '혈청크레아티닌',
 '혈청지오티_AST',
 '혈청지오티_ALT',
 '감마지티피',
 '흡연상태',
 '음주여부']

In [12]:
# 소스 데이터프레임에서 예측(prediction)을 위한 속성 집합
# X = rawData.loc[:, feature_columns_to_use[0:8, 10:21]]  
y = rawData_sam_std.loc[:, feature_columns_to_use[9]]  # 종속변수(예측속성, 수축기혈압)
del feature_columns_to_use[9]
X = rawData_sam_std.loc[:, feature_columns_to_use]  # 독립변수

In [13]:
X

Unnamed: 0,성별코드,연령대코드,신장,체중,허리둘레,시력_좌,시력_우,청력_좌,청력_우,이완기혈압,...,트리글리세라이드,HDL콜레스테롤,LDL콜레스테롤,요단백,혈청크레아티닌,혈청지오티_AST,혈청지오티_ALT,감마지티피,흡연상태,음주여부
0,1,-0.195272,-0.232224,-0.266260,-0.120177,-0.457232,-0.289735,-0.182814,-0.181172,-0.619305,...,-0.270653,0.340417,-0.773167,-0.214334,0.079971,-0.301296,-0.563329,-0.477640,-0.741192,0
1,0,-0.195272,1.365796,-0.266260,0.055541,-0.130743,0.351548,-0.182814,-0.181172,0.391280,...,0.522543,0.340417,-1.467225,-0.214334,-0.349264,-0.135175,-0.524952,1.223506,-0.741192,1
2,0,-0.546619,1.365796,1.681678,1.021993,0.848725,0.832510,-0.182814,-0.181172,0.593397,...,1.325532,-0.190738,0.817381,-0.214334,0.723822,-0.068727,0.089095,-0.177437,0.483595,1
3,0,0.507423,0.833123,0.512915,1.021993,-0.130743,0.030907,-0.182814,-0.181172,-0.619305,...,-0.378371,-0.456315,0.557110,-0.214334,-0.134646,-0.268072,-0.486574,-0.237478,-0.741192,1
4,0,0.858771,1.365796,1.292091,1.197711,0.848725,0.351548,-0.182814,-0.181172,0.492339,...,-0.153142,0.340417,0.296838,-0.214334,0.509205,-0.035503,0.165851,-0.317532,0.483595,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,0,0.156076,0.300449,-0.266260,-0.401327,-0.457232,0.030907,-0.182814,-0.181172,-0.215071,...,-0.456711,-0.589104,-0.744248,-0.214334,0.079971,-0.234848,-0.333062,0.002684,0.483595,1
49996,0,-0.897967,1.898470,2.071266,0.758415,0.358991,0.832510,-0.182814,-0.181172,0.391280,...,-0.593806,3.460951,-1.929930,-0.214334,0.079971,-0.268072,-0.256306,-0.397586,1.708381,0
49997,1,1.561466,-1.830245,-1.435022,-0.559473,-0.293987,-0.931017,-0.182814,-0.181172,-1.225655,...,0.140634,0.340417,-1.640740,-0.214334,0.079971,2.688882,5.116602,0.022697,-0.741192,0
49998,0,-1.600662,0.833123,0.512915,-0.295896,0.848725,0.832510,-0.182814,-0.181172,1.300807,...,-0.691732,-0.721892,-0.628572,-0.214334,-0.349264,-0.268072,-0.371440,-0.537680,0.483595,1


In [14]:
y

0       -0.177177
1       -0.177177
2        0.369429
3       -1.133737
4        0.437755
           ...   
49995   -0.587131
49996   -0.177177
49997    0.847709
49998    0.506081
49999    0.506081
Name: 수축기혈압, Length: 50000, dtype: float64

In [15]:
from patsy import dmatrices
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [16]:
features = ""

for attr in feature_columns_to_use[:-1] :
    features = features + attr + "+"

features = features + feature_columns_to_use[len(feature_columns_to_use)-1]    
features

'성별코드+연령대코드+신장+체중+허리둘레+시력_좌+시력_우+청력_좌+청력_우+이완기혈압+식전혈당+트리글리세라이드+HDL콜레스테롤+LDL콜레스테롤+요단백+혈청크레아티닌+혈청지오티_AST+혈청지오티_ALT+감마지티피+흡연상태+음주여부'

In [17]:
# Break into left and right hand side; y and x
y, X = dmatrices('수축기혈압~'+features, data=rawData_sam_std, return_type="dataframe")

In [18]:
# For each Xi, calculate VIF
vif = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]

# Fit X to y
result = sm.OLS(y, X).fit()

In [19]:
result.summary()

0,1,2,3
Dep. Variable:,수축기혈압,R-squared:,0.596
Model:,OLS,Adj. R-squared:,0.596
Method:,Least Squares,F-statistic:,3517.0
Date:,"Wed, 07 Jul 2021",Prob (F-statistic):,0.0
Time:,19:45:31,Log-Likelihood:,-48262.0
No. Observations:,50000,AIC:,96570.0
Df Residuals:,49978,BIC:,96760.0
Df Model:,21,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.0363,0.007,5.405,0.000,0.023,0.050
성별코드,-0.0721,0.010,-7.219,0.000,-0.092,-0.053
연령대코드,0.1713,0.004,46.789,0.000,0.164,0.178
신장,-0.0627,0.005,-11.853,0.000,-0.073,-0.052
체중,0.1111,0.006,20.100,0.000,0.100,0.122
허리둘레,0.0187,0.004,4.328,0.000,0.010,0.027
시력_좌,-0.0049,0.003,-1.634,0.102,-0.011,0.001
시력_우,-0.0074,0.003,-2.462,0.014,-0.013,-0.002
청력_좌,0.0075,0.003,2.219,0.026,0.001,0.014

0,1,2,3
Omnibus:,3324.019,Durbin-Watson:,1.999
Prob(Omnibus):,0.0,Jarque-Bera (JB):,5857.541
Skew:,0.504,Prob(JB):,0.0
Kurtosis:,4.34,Cond. No.,7.93


In [20]:
# For each X, calculate VIF and save in dataframe
vif = pd.DataFrame()
vif["VIF Factor"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
vif["features"] = X.columns

In [21]:
vif.round(1)

Unnamed: 0,VIF Factor,features
0,5.6,Intercept
1,3.1,성별코드
2,1.7,연령대코드
3,3.5,신장
4,3.8,체중
5,2.3,허리둘레
6,1.1,시력_좌
7,1.1,시력_우
8,1.4,청력_좌
9,1.4,청력_우


In [22]:
float(np.mean((result.predict(X) - rawData_sam_std['수축기혈압'])**2))

0.4035743877040521

In [23]:
# RMSLE 계산하는 사용자정의 함수
from sklearn.metrics import make_scorer

def rmsle(predicted_values, actual_values) :
    # 넘파이로 배열 형태로 변환
    predicted_values = np.array(predicted_values)
    actual_values = np.array(actual_values)
    
    # 예측값과 실제값에 1을 더하고 로그를 씌움
    log_predict = np.log(predicted_values + 1)
    log_actual = np.log(actual_values + 1)
    
    # 위에서 계산한 예측값에서 실제값을 빼주고 제곱을 함
    difference = log_predict - log_actual
    # difference = (log_predict - log_actual) ** 2
    difference = np.square(difference)
    
    # 평균값 구함
    mean_difference = difference.mean()
    
    # 다시 루트를 씌움
    score = np.sqrt(mean_difference)
    
    return score

#rmsle_scorer = make_scorer(rmsle)
#rmsle_scorer

rmsle_score = rmsle(result.predict(X), rawData_sam_std['수축기혈압'])
rmsle_score

  log_predict = np.log(predicted_values + 1)
  log_actual = np.log(actual_values + 1)


nan

In [24]:
(result.predict(X) - rawData_sam_std['수축기혈압']).tail(100)

49900   -0.409239
49901   -0.176442
49902    0.493279
49903    0.649919
49904   -0.004507
           ...   
49995    0.414240
49996    0.491828
49997   -1.544373
49998    0.079259
49999   -0.166117
Length: 100, dtype: float64