In [1]:
import pandas as pd
import numpy as np

rawData_org = pd.read_csv('datasets/국민건강정보/NHIS_OPEN_GJ_2017.csv', encoding='CP949', engine='python')

In [2]:
rawData_org

Unnamed: 0,기준년도,가입자일련번호,성별코드,연령대코드(5세단위),시도코드,신장(5Cm단위),체중(5Kg 단위),허리둘레,시력(좌),시력(우),...,감마지티피,흡연상태,음주여부,구강검진 수검여부,치아우식증유무,결손치유무,치아마모증유무,제3대구치(사랑니)이상,치석,데이터공개일자
0,2017,1,1,13,46,170.0,65.0,91.0,1.0,1.2,...,25.0,3.0,0.0,1,,,,,1.0,20181126
1,2017,2,2,8,41,150.0,45.0,73.4,1.2,1.0,...,10.0,1.0,0.0,1,,,,,1.0,20181126
2,2017,3,1,8,45,175.0,75.0,94.0,1.0,0.8,...,136.0,1.0,0.0,1,,,,,0.0,20181126
3,2017,4,2,12,11,155.0,55.0,67.5,0.9,1.0,...,30.0,1.0,1.0,0,,,,,,20181126
4,2017,5,1,8,41,175.0,75.0,93.0,1.5,1.5,...,68.0,3.0,0.0,0,,,,,,20181126
5,2017,6,1,8,11,170.0,70.0,84.8,1.2,1.0,...,33.0,2.0,1.0,1,,,,,1.0,20181126
6,2017,7,1,8,48,175.0,110.0,111.5,1.0,1.0,...,45.0,3.0,1.0,0,,,,,,20181126
7,2017,8,2,6,26,170.0,70.0,78.0,0.8,0.7,...,13.0,1.0,0.0,0,,,,,,20181126
8,2017,9,1,8,41,170.0,80.0,93.0,0.7,0.9,...,51.0,2.0,0.0,1,,,,,1.0,20181126
9,2017,10,1,8,11,170.0,70.0,86.0,1.0,1.0,...,88.0,1.0,1.0,1,,,,,0.0,20181126


In [3]:
# 원본데이터의 튜플 수와 컬럼수 출력
rawData_org.shape

(1000000, 34)

In [4]:
# 의사결정트리에 사용할 속성리스트
feature_columns_to_use = ['성별코드', '연령대코드(5세단위)', '신장(5Cm단위)', '체중(5Kg 단위)', '허리둘레', '시력(좌)', '시력(우)',
                    '청력(좌)', '청력(우)', '수축기혈압', '이완기혈압', '식전혈당(공복혈당)', '트리글리세라이드', 'HDL콜레스테롤',
                    'LDL콜레스테롤', '요단백', '혈청크레아티닌', '(혈청지오티)AST', '(혈청지오티)ALT', '감마지티피', '흡연상태', '음주여부']

rawData = rawData_org[feature_columns_to_use]

In [5]:
# 속성값이 NULL인 튜플 제외
rawData = rawData.dropna()

# 인덱스 재설정
rawData.reset_index(inplace=True, drop=True)

# 원본데이터의 튜플 수와 컬럼수 출력
rawData.shape

(990910, 22)

In [6]:
import random as rd  # 샘플링을 위한 random 패키지 임포트

# 1) 샘플링을 통한 수량 축소
# 원본데이터 중 50,000개를 튜플만 샘플링
# len() : 데이터프레임의 크기를 구하는 함수
sample_idx = rd.sample(range(0, len(rawData)), 50000)

# 인덱스 값 정렬(오름차순)
sample_idx.sort()

sample_idx

[21,
 39,
 42,
 50,
 60,
 76,
 83,
 93,
 107,
 113,
 116,
 137,
 167,
 194,
 197,
 201,
 206,
 259,
 277,
 287,
 301,
 302,
 359,
 392,
 396,
 407,
 422,
 424,
 440,
 442,
 466,
 472,
 485,
 489,
 491,
 519,
 525,
 543,
 546,
 548,
 563,
 649,
 667,
 684,
 701,
 705,
 709,
 716,
 736,
 754,
 764,
 809,
 833,
 852,
 867,
 868,
 881,
 889,
 908,
 932,
 955,
 985,
 1030,
 1033,
 1034,
 1066,
 1069,
 1083,
 1084,
 1101,
 1105,
 1123,
 1208,
 1217,
 1238,
 1245,
 1264,
 1319,
 1329,
 1333,
 1368,
 1372,
 1373,
 1376,
 1385,
 1404,
 1414,
 1430,
 1433,
 1435,
 1442,
 1444,
 1464,
 1466,
 1495,
 1509,
 1521,
 1565,
 1585,
 1617,
 1635,
 1639,
 1660,
 1662,
 1678,
 1682,
 1683,
 1691,
 1702,
 1723,
 1726,
 1735,
 1743,
 1747,
 1761,
 1783,
 1786,
 1806,
 1812,
 1815,
 1839,
 1845,
 1864,
 1866,
 1892,
 1894,
 1917,
 1922,
 1941,
 1959,
 1985,
 2014,
 2026,
 2050,
 2056,
 2097,
 2100,
 2106,
 2108,
 2119,
 2126,
 2141,
 2152,
 2161,
 2169,
 2317,
 2339,
 2350,
 2371,
 2402,
 2409,
 2413,
 2480,

In [7]:
# 샘플링 된 인덱스로 구성된 샘플 데이터프레임 생성
rawData_sample = rawData.loc[sample_idx]

rawData_sample.reset_index(inplace=True, drop=True) #  인덱스 재설정


# 첫 10개의 행만 출력
rawData_sample.head(10)

Unnamed: 0,성별코드,연령대코드(5세단위),신장(5Cm단위),체중(5Kg 단위),허리둘레,시력(좌),시력(우),청력(좌),청력(우),수축기혈압,...,트리글리세라이드,HDL콜레스테롤,LDL콜레스테롤,요단백,혈청크레아티닌,(혈청지오티)AST,(혈청지오티)ALT,감마지티피,흡연상태,음주여부
0,1,7,170.0,65.0,83.0,1.5,1.2,1.0,1.0,118.0,...,132.0,42.0,106.0,1.0,0.8,21.0,34.0,35.0,3.0,1.0
1,2,10,155.0,60.0,88.0,1.0,1.0,1.0,1.0,120.0,...,131.0,65.0,114.0,1.0,0.6,25.0,15.0,12.0,1.0,0.0
2,2,13,155.0,50.0,72.0,1.0,1.0,1.0,1.0,110.0,...,52.0,70.0,115.0,1.0,0.5,20.0,11.0,24.0,1.0,0.0
3,2,13,155.0,40.0,64.0,0.9,0.8,1.0,1.0,96.0,...,86.0,84.0,127.0,1.0,0.9,16.0,16.0,12.0,1.0,0.0
4,1,15,160.0,65.0,93.0,0.6,0.6,1.0,1.0,140.0,...,115.0,58.0,130.0,1.0,1.2,22.0,17.0,65.0,2.0,0.0
5,1,7,175.0,70.0,80.0,1.2,1.5,1.0,1.0,122.0,...,35.0,73.0,87.0,1.0,1.1,36.0,31.0,13.0,3.0,0.0
6,1,8,170.0,60.0,78.0,1.5,1.5,1.0,1.0,110.0,...,117.0,45.0,104.0,1.0,1.0,16.0,17.0,16.0,1.0,0.0
7,1,10,165.0,65.0,85.0,0.8,1.0,1.0,1.0,120.0,...,161.0,81.0,240.0,1.0,0.9,21.0,41.0,29.0,3.0,0.0
8,1,8,165.0,60.0,82.0,0.5,0.3,1.0,1.0,134.0,...,144.0,42.0,127.0,1.0,0.8,22.0,35.0,55.0,3.0,1.0
9,1,13,160.0,65.0,88.0,1.2,1.2,1.0,1.0,130.0,...,135.0,53.0,151.0,1.0,1.2,23.0,14.0,45.0,2.0,1.0


In [8]:
rawData_sample.shape

(50000, 22)

In [9]:
feature_columns_to_use = ['성별코드', '연령대코드', '신장', '체중', '허리둘레', '시력_좌', '시력_우',
                    '청력_좌', '청력_우', '수축기혈압', '이완기혈압', '식전혈당', '트리글리세라이드', 'HDL콜레스테롤',
                    'LDL콜레스테롤', '요단백', '혈청크레아티닌', '혈청지오티_AST', '혈청지오티_ALT', '감마지티피', '흡연상태', '음주여부']

rawData_sample.rename(columns = {'연령대코드(5세단위)' : '연령대코드',
                                 '신장(5Cm단위)' : '신장',
                                 '체중(5Kg 단위)' : '체중',
                                 '시력(좌)' : '시력_좌',
                                 '시력(우)' : '시력_우',
                                 '청력(좌)' : '청력_좌',
                                 '청력(우)' : '청력_우',
                                 '식전혈당(공복혈당)' : '식전혈당',
                                 '(혈청지오티)AST' : '혈청지오티_AST',
                                 '(혈청지오티)ALT' : '혈청지오티_ALT'}, inplace = True)

# nonnumeric 속성을 categrical 데이터로 변환
nonnumeric_columns = ['성별코드','음주여부']

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
for feature in nonnumeric_columns:
    rawData_sample[feature] = le.fit_transform(rawData_sample[feature])
    
rawData_sample 

Unnamed: 0,성별코드,연령대코드,신장,체중,허리둘레,시력_좌,시력_우,청력_좌,청력_우,수축기혈압,...,트리글리세라이드,HDL콜레스테롤,LDL콜레스테롤,요단백,혈청크레아티닌,혈청지오티_AST,혈청지오티_ALT,감마지티피,흡연상태,음주여부
0,0,7,170.0,65.0,83.0,1.5,1.2,1.0,1.0,118.0,...,132.0,42.0,106.0,1.0,0.8,21.0,34.0,35.0,3.0,1
1,1,10,155.0,60.0,88.0,1.0,1.0,1.0,1.0,120.0,...,131.0,65.0,114.0,1.0,0.6,25.0,15.0,12.0,1.0,0
2,1,13,155.0,50.0,72.0,1.0,1.0,1.0,1.0,110.0,...,52.0,70.0,115.0,1.0,0.5,20.0,11.0,24.0,1.0,0
3,1,13,155.0,40.0,64.0,0.9,0.8,1.0,1.0,96.0,...,86.0,84.0,127.0,1.0,0.9,16.0,16.0,12.0,1.0,0
4,0,15,160.0,65.0,93.0,0.6,0.6,1.0,1.0,140.0,...,115.0,58.0,130.0,1.0,1.2,22.0,17.0,65.0,2.0,0
5,0,7,175.0,70.0,80.0,1.2,1.5,1.0,1.0,122.0,...,35.0,73.0,87.0,1.0,1.1,36.0,31.0,13.0,3.0,0
6,0,8,170.0,60.0,78.0,1.5,1.5,1.0,1.0,110.0,...,117.0,45.0,104.0,1.0,1.0,16.0,17.0,16.0,1.0,0
7,0,10,165.0,65.0,85.0,0.8,1.0,1.0,1.0,120.0,...,161.0,81.0,240.0,1.0,0.9,21.0,41.0,29.0,3.0,0
8,0,8,165.0,60.0,82.0,0.5,0.3,1.0,1.0,134.0,...,144.0,42.0,127.0,1.0,0.8,22.0,35.0,55.0,3.0,1
9,0,13,160.0,65.0,88.0,1.2,1.2,1.0,1.0,130.0,...,135.0,53.0,151.0,1.0,1.2,23.0,14.0,45.0,2.0,1


In [11]:
feature_columns_to_use[0:len(feature_columns_to_use)] 

['성별코드',
 '연령대코드',
 '신장',
 '체중',
 '허리둘레',
 '시력_좌',
 '시력_우',
 '청력_좌',
 '청력_우',
 '수축기혈압',
 '이완기혈압',
 '식전혈당',
 '트리글리세라이드',
 'HDL콜레스테롤',
 'LDL콜레스테롤',
 '요단백',
 '혈청크레아티닌',
 '혈청지오티_AST',
 '혈청지오티_ALT',
 '감마지티피',
 '흡연상태',
 '음주여부']

In [12]:
# 소스 데이터프레임에서 예측(prediction)을 위한 속성 집합
# X = rawData.loc[:, feature_columns_to_use[0:8, 10:21]]  
y = rawData_sample.loc[:, feature_columns_to_use[9]]  # 종속변수(예측속성, 수축기혈압)
del feature_columns_to_use[9]
X = rawData_sample.loc[:, feature_columns_to_use]  # 독립변수

In [13]:
X

Unnamed: 0,성별코드,연령대코드,신장,체중,허리둘레,시력_좌,시력_우,청력_좌,청력_우,이완기혈압,...,트리글리세라이드,HDL콜레스테롤,LDL콜레스테롤,요단백,혈청크레아티닌,혈청지오티_AST,혈청지오티_ALT,감마지티피,흡연상태,음주여부
0,0,7,170.0,65.0,83.0,1.5,1.2,1.0,1.0,76.0,...,132.0,42.0,106.0,1.0,0.8,21.0,34.0,35.0,3.0,1
1,1,10,155.0,60.0,88.0,1.0,1.0,1.0,1.0,70.0,...,131.0,65.0,114.0,1.0,0.6,25.0,15.0,12.0,1.0,0
2,1,13,155.0,50.0,72.0,1.0,1.0,1.0,1.0,70.0,...,52.0,70.0,115.0,1.0,0.5,20.0,11.0,24.0,1.0,0
3,1,13,155.0,40.0,64.0,0.9,0.8,1.0,1.0,67.0,...,86.0,84.0,127.0,1.0,0.9,16.0,16.0,12.0,1.0,0
4,0,15,160.0,65.0,93.0,0.6,0.6,1.0,1.0,80.0,...,115.0,58.0,130.0,1.0,1.2,22.0,17.0,65.0,2.0,0
5,0,7,175.0,70.0,80.0,1.2,1.5,1.0,1.0,78.0,...,35.0,73.0,87.0,1.0,1.1,36.0,31.0,13.0,3.0,0
6,0,8,170.0,60.0,78.0,1.5,1.5,1.0,1.0,70.0,...,117.0,45.0,104.0,1.0,1.0,16.0,17.0,16.0,1.0,0
7,0,10,165.0,65.0,85.0,0.8,1.0,1.0,1.0,80.0,...,161.0,81.0,240.0,1.0,0.9,21.0,41.0,29.0,3.0,0
8,0,8,165.0,60.0,82.0,0.5,0.3,1.0,1.0,77.0,...,144.0,42.0,127.0,1.0,0.8,22.0,35.0,55.0,3.0,1
9,0,13,160.0,65.0,88.0,1.2,1.2,1.0,1.0,80.0,...,135.0,53.0,151.0,1.0,1.2,23.0,14.0,45.0,2.0,1


In [14]:
y

0        118.0
1        120.0
2        110.0
3         96.0
4        140.0
5        122.0
6        110.0
7        120.0
8        134.0
9        130.0
10       128.0
11       125.0
12       120.0
13       127.0
14       128.0
15       120.0
16       101.0
17       121.0
18       121.0
19       110.0
20       128.0
21       118.0
22       130.0
23       126.0
24       142.0
25       147.0
26       122.0
27       134.0
28       114.0
29       130.0
         ...  
49970    132.0
49971    100.0
49972    120.0
49973    113.0
49974    123.0
49975    118.0
49976    146.0
49977    119.0
49978    124.0
49979    119.0
49980    112.0
49981    138.0
49982    135.0
49983    145.0
49984    111.0
49985    129.0
49986    127.0
49987    108.0
49988    117.0
49989    117.0
49990    134.0
49991    135.0
49992    129.0
49993    100.0
49994    120.0
49995    135.0
49996    109.0
49997    128.0
49998    135.0
49999    100.0
Name: 수축기혈압, Length: 50000, dtype: float64

In [15]:
from patsy import dmatrices
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [16]:
features = ""

for attr in feature_columns_to_use[:-1] :
    features = features + attr + "+"

features = features + feature_columns_to_use[len(feature_columns_to_use)-1]    
features

'성별코드+연령대코드+신장+체중+허리둘레+시력_좌+시력_우+청력_좌+청력_우+이완기혈압+식전혈당+트리글리세라이드+HDL콜레스테롤+LDL콜레스테롤+요단백+혈청크레아티닌+혈청지오티_AST+혈청지오티_ALT+감마지티피+흡연상태+음주여부'

In [17]:
# Break into left and right hand side; y and x
y, X = dmatrices('수축기혈압~'+features, data=rawData_sample, return_type="dataframe")

In [18]:
# For each Xi, calculate VIF
vif = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]

# Fit X to y
result = sm.OLS(y, X).fit()

In [19]:
result.summary()

0,1,2,3
Dep. Variable:,수축기혈압,R-squared:,0.595
Model:,OLS,Adj. R-squared:,0.595
Method:,Least Squares,F-statistic:,3499.0
Date:,"Tue, 09 Jul 2019",Prob (F-statistic):,0.0
Time:,21:20:20,Log-Likelihood:,-182600.0
No. Observations:,50000,AIC:,365200.0
Df Residuals:,49978,BIC:,365400.0
Df Model:,21,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,41.8478,1.498,27.938,0.000,38.912,44.784
성별코드,-1.3381,0.149,-8.978,0.000,-1.630,-1.046
연령대코드,0.8876,0.019,47.049,0.000,0.851,0.925
신장,-0.1056,0.008,-12.917,0.000,-0.122,-0.090
체중,0.1325,0.006,21.945,0.000,0.121,0.144
허리둘레,0.0233,0.005,4.782,0.000,0.014,0.033
시력_좌,-0.1143,0.076,-1.508,0.132,-0.263,0.034
시력_우,-0.1760,0.074,-2.383,0.017,-0.321,-0.031
청력_좌,0.6288,0.279,2.250,0.024,0.081,1.177

0,1,2,3
Omnibus:,3421.838,Durbin-Watson:,2.004
Prob(Omnibus):,0.0,Jarque-Bera (JB):,6348.022
Skew:,0.5,Prob(JB):,0.0
Kurtosis:,4.43,Cond. No.,11000.0


In [20]:
# For each X, calculate VIF and save in dataframe
vif = pd.DataFrame()
vif["VIF Factor"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
vif["features"] = X.columns

In [21]:
vif.round(1)

Unnamed: 0,VIF Factor,features
0,1288.8,Intercept
1,3.2,성별코드
2,1.7,연령대코드
3,3.4,신장
4,3.5,체중
5,2.0,허리둘레
6,1.1,시력_좌
7,1.1,시력_우
8,1.4,청력_좌
9,1.4,청력_우


In [22]:
# RMSE
float(np.mean((result.predict(X) - rawData_sample['수축기혈압'])**2))

87.00844542845782

In [23]:
# RMSLE 계산하는 사용자정의 함수
from sklearn.metrics import make_scorer

def rmsle(predicted_values, actual_values) :
    # 넘파이로 배열 형태로 변환
    predicted_values = np.array(predicted_values)
    actual_values = np.array(actual_values)
    
    # 예측값과 실제값에 1을 더하고 로그를 씌움
    log_predict = np.log(predicted_values + 1)
    log_actual = np.log(actual_values + 1)
    
    # 위에서 계산한 예측값에서 실제값을 빼주고 제곱을 함
    difference = log_predict - log_actual
    # difference = (log_predict - log_actual) ** 2
    difference = np.square(difference)
    
    # 평균값 구함
    mean_difference = difference.mean()
    
    # 다시 루트를 씌움
    score = np.sqrt(mean_difference)
    
    return score

#rmsle_scorer = make_scorer(rmsle)
#rmsle_scorer

rmsle_score = rmsle(result.predict(X), rawData_sample['수축기혈압'])
rmsle_score

0.07489368478127571

In [24]:
(result.predict(X) - rawData_sample['수축기혈압']).tail(100)

49900    10.145037
49901    -5.246303
49902     7.692787
49903    13.055612
49904    -0.920184
49905     3.499341
49906    14.446126
49907     5.807756
49908    -0.630227
49909    -0.252839
49910    10.205367
49911    -1.201675
49912     4.185996
49913    -4.406153
49914     8.166116
49915   -11.734364
49916    -0.463836
49917    -8.840876
49918    -0.261509
49919     0.155126
49920     0.629434
49921    -1.767475
49922     3.473310
49923     2.232253
49924    -9.316472
49925     6.275226
49926     7.815772
49927    -6.260566
49928    -5.062889
49929    -4.462314
           ...    
49970    -7.109388
49971    35.591200
49972    11.126913
49973    -8.302127
49974    10.111096
49975     2.146534
49976    -6.032903
49977    18.543496
49978     8.979228
49979    -0.100060
49980     0.621154
49981     0.209354
49982   -22.300620
49983   -38.855065
49984    -5.076192
49985    -3.270915
49986    -8.348601
49987    -5.986557
49988     0.383862
49989   -15.722756
49990    -8.007642
49991    -8.