In [1128]:
import pandas as pd

# Load the health-screening CSV; the file is decoded as cp949.
df = pd.read_csv(
    filepath_or_buffer="국민건강보험공단_건강검진정보_2023.CSV",
    encoding='cp949',
)

In [1129]:
# Replace every missing value in the frame with 0.
# NOTE(review): this zero-fills measurement columns as well; a height of 0
# would make the BMI division in the next cell produce inf/NaN - confirm
# that 0 is an acceptable placeholder for this data.
df = df.fillna(0)

# Preview goes last so that the notebook actually renders it (an expression
# that is not the final statement of a cell is silently discarded).
df.head()

In [1130]:
# BMI = weight / (height / 100) ** 2, computed from the binned weight and
# height columns (the /100 presumably converts cm to m - per column names).
df["BMI"] = df["체중(5kg단위)"].div(df["신장(5cm단위)"].div(100).pow(2))

In [1131]:
# Show the first two rows to check the newly added BMI column.
df.head(2)

Unnamed: 0,기준년도,가입자일련번호,시도코드,성별코드,연령대코드(5세단위),신장(5cm단위),체중(5kg단위),허리둘레,시력(좌),시력(우),...,감마지티피,흡연상태,음주여부,구강검진수검여부,치아우식증유무,결손치 유무,치아마모증유무,제3대구치(사랑니) 이상,치석,BMI
0,2023,34735,46,2,9,155,70,92.0,1.2,1.2,...,50.0,1.0,1.0,0,0.0,0.0,0.0,0.0,0.0,29.136316
1,2023,4105118,11,1,17,160,55,86.0,0.9,9.9,...,31.0,1.0,1.0,0,0.0,0.0,0.0,0.0,0.0,21.484375


In [1132]:
# Continuous systolic score: 120 maps to 0 and 140 maps to 100, linearly in
# between, with everything outside that range clamped to [0, 100].
df["SBP"] = (
    df["수축기혈압"]
    .sub(120)
    .div(140 - 120)
    .mul(100)
    .clip(lower=0, upper=100)
)

In [1133]:
# Continuous diastolic score: 80 maps to 0 and 90 maps to 100, clamped to
# the [0, 100] range.
df["DBP"] = ((df["이완기혈압"] - 80) / (90 - 80) * 100).clip(lower=0, upper=100)

In [1134]:
# First version of the hypertension score: the larger of the two continuous
# blood-pressure scores, weighted by 0.7. A later cell assigns this same
# column again with a different formula.
df["고혈압위험점수"] = df[["SBP", "DBP"]].max(axis="columns").mul(0.7)

In [1135]:
# Re-derives the continuous diastolic score with the same formula used two
# cells earlier (80 -> 0, 90 -> 100, clamped); the result is unchanged.
df["DBP"] = (
    df["이완기혈압"]
    .sub(80)
    .div(90 - 80)
    .mul(100)
    .clip(lower=0, upper=100)
)

In [1137]:
# 1. Continuous scoring (linear 0-100 normalisation).
#    Each entry: (score column, raw reading column, value scored 0, value scored 100).
for score_col, reading_col, zero_at, full_at in (
    ("SBP", "수축기혈압", 120, 140),
    ("DBP", "이완기혈압", 80, 90),
):
    scaled = (df[reading_col] - zero_at) / (full_at - zero_at) * 100
    df[score_col] = scaled.clip(lower=0, upper=100)

# 2. Banded scoring
def sbp_risk(sbp):
    """Map a systolic blood pressure reading to a banded risk score.

    Bands (lower bound inclusive, upper bound exclusive):
    below 120 -> 0, 120-130 -> 20, 130-140 -> 40, 140-160 -> 70,
    160-180 -> 90. Anything else returns 100: readings of 180 or more, and
    values such as NaN that compare false against every bound.
    """
    # (exclusive upper bound, score) pairs in ascending order; the first
    # bound that exceeds the reading decides the score.
    bands = ((120, 0), (130, 20), (140, 40), (160, 70), (180, 90))
    for upper_bound, score in bands:
        if sbp < upper_bound:
            return score
    return 100

def dbp_risk(dbp):
    """Map a diastolic blood pressure reading to a banded risk score.

    Bands (lower bound inclusive, upper bound exclusive):
    below 80 -> 0, 80-85 -> 20, 85-90 -> 40, 90-95 -> 50, 95-100 -> 70,
    100-110 -> 90. Anything else returns 100: readings of 110 or more, and
    values such as NaN that compare false against every bound.
    """
    # (exclusive upper bound, score) pairs in ascending order; the first
    # bound that exceeds the reading decides the score.
    bands = ((80, 0), (85, 20), (90, 40), (95, 50), (100, 70), (110, 90))
    for upper_bound, score in bands:
        if dbp < upper_bound:
            return score
    return 100

# Banded scores computed row by row from the raw blood-pressure readings.
df["SBP_Risk"] = df["수축기혈압"].apply(sbp_risk)
df["DBP_Risk"] = df["이완기혈압"].apply(dbp_risk)

# 3. Hypertension score: mean of the two continuous and the two banded
#    scores, boosted by 1.2 and capped at 100.
df["고혈압위험점수"] = (
    df[["SBP", "DBP", "SBP_Risk", "DBP_Risk"]].mean(axis=1) * 1.2
).clip(upper=100)

# The BMI / smoking / drinking components read below are otherwise only
# created by a cell that sits further down the notebook, so a top-to-bottom
# run failed here with a KeyError. They are derived in this cell, using the
# same formulas as that cell, so this step no longer depends on execution
# order; the later cell recomputing the same columns is harmless.
df["BMI_고혈압위험"] = ((df["BMI"] - 25) / (30 - 25) * 100).clip(lower=0, upper=100)
df["흡연위험"] = df["흡연상태"].apply(lambda x: 20 if x == 1 else 0)
df["음주위험"] = df["음주여부"].apply(lambda x: 15 if x == 1 else 0)

# 4. Final hypertension risk: weighted sum of the blood-pressure score (0.7),
#    the BMI component (0.2) and the smoking / drinking components (0.05
#    each), capped at 100.
df["최종_고혈압위험"] = (
    df["고혈압위험점수"] * 0.7
    + df["BMI_고혈압위험"] * 0.2
    + df["흡연위험"] * 0.05
    + df["음주위험"] * 0.05
).clip(upper=100)


In [1136]:
# BMI-based component scores. Each one rescales BMI linearly so that the
# lower threshold maps to 0 and BMI 30 maps to 100, clamped to [0, 100].
# Hypertension and hyperlipidemia start at BMI 25, diabetes at BMI 23.
df["BMI_고혈압위험"] = (
    df["BMI"].sub(25).div(30 - 25).mul(100).clip(lower=0, upper=100)
)
df["BMI_당뇨위험"] = (
    df["BMI"].sub(23).div(30 - 23).mul(100).clip(lower=0, upper=100)
)
df["BMI_고지혈증위험"] = (
    df["BMI"].sub(25).div(30 - 25).mul(100).clip(lower=0, upper=100)
)

# Lifestyle components: 20 points where the smoking code equals 1, and
# 15 points where the drinking code equals 1; 0 otherwise.
# NOTE(review): confirm against the dataset codebook that code 1 of the
# smoking column really denotes current smokers.
df["흡연위험"] = df["흡연상태"].eq(1).astype("int64") * 20
df["음주위험"] = df["음주여부"].eq(1).astype("int64") * 15


# Final diabetes / hyperlipidemia scores: BMI component plus weighted
# lifestyle components, capped at 100.
df["최종_당뇨위험"] = (
    df["BMI_당뇨위험"]
    + df["흡연위험"] * 1.25
    + df["음주위험"] * 0.66
).clip(upper=100)
df["최종_고지혈증위험"] = (
    df["BMI_고지혈증위험"]
    + df["흡연위험"]
    + df["음주위험"] * 1.33
).clip(upper=100)


In [1138]:

# Feature matrix: raw blood pressure, binned weight / height, and the
# smoking / drinking codes as they are at this point in the notebook.
X = df.loc[
    :,
    ["수축기혈압", "이완기혈압", "체중(5kg단위)", "신장(5cm단위)", "흡연상태", "음주여부"],
]


In [1139]:
# Cast the smoking code column to integers.
# NOTE(review): X was already extracted from df in the previous cell, so
# this reassignment of the df column does not reach the model features -
# confirm whether that is intended.
df["흡연상태"]=df["흡연상태"].astype(int)

In [1140]:
# Recode the smoking column in df: 3 becomes 0 and 4 becomes 1; every other
# code is left as is.
# NOTE(review): X was built from df before this recode, so the features
# used for training keep the original codes - confirm that is intended.
df["흡연상태"] = df["흡연상태"].replace({3: 0, 4: 1})

In [1141]:
# Multi-output target: the three final risk scores plus BMI.
target_columns = ["최종_고혈압위험", "최종_당뇨위험", "최종_고지혈증위험", "BMI"]
y = df[target_columns]

In [1170]:
from sklearn.model_selection import train_test_split

# Split into train / test sets. A fixed random_state makes the split - and
# therefore every metric reported below - reproducible across kernel
# restarts.
# NOTE(review): train_size=0.2 trains on 20% of the rows and evaluates on
# the remaining 80%; confirm this was not meant to be test_size=0.2.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.2, random_state=42
)

In [1171]:
from xgboost import XGBRegressor

# XGBoost regressor created with no arguments, i.e. library-default
# hyperparameters.
regressoer_XG=XGBRegressor()

In [1172]:
from sklearn.linear_model import LinearRegression

# Ordinary least-squares baseline created with default settings.
ligressor=LinearRegression()

In [1173]:
# Fit the linear baseline on the training split (all four targets at once).
ligressor.fit(X=X_train, y=y_train)

In [1174]:
# Linear-model predictions for the test split.
pred = ligressor.predict(X=X_test)

In [1175]:
# Fit the XGBoost regressor on the training split (all four targets at once).
regressoer_XG.fit(X=X_train, y=y_train)

In [1176]:
# XGBoost predictions for the test split.
y_pred_XG = regressoer_XG.predict(X=X_test)

In [1177]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np


# MAE of the XGBoost predictions. With four target columns scikit-learn
# returns one number, averaged uniformly over the targets by default.
mean_absolute_error(y_true=y_test, y_pred=y_pred_XG)


0.11447297380803025

In [1178]:
# MAE of the linear baseline, averaged uniformly over the four targets.
mean_absolute_error(y_true=y_test, y_pred=pred)

8.415559313097454

In [1179]:
# R^2 of the XGBoost predictions, averaged uniformly over the four targets.
r2_score(y_true=y_test, y_pred=y_pred_XG)

0.9998564720153809

In [1180]:
# R^2 of the linear baseline, averaged uniformly over the four targets.
r2_score(y_true=y_test, y_pred=pred)

0.7873511103368505

In [1181]:
# One hand-written sample to score, using the same feature columns (and
# column order) as the training matrix X.
new_data = pd.DataFrame(
    [
        {
            "수축기혈압": 120,
            "이완기혈압": 80,
            "체중(5kg단위)": 73,
            "신장(5cm단위)": 160,
            "흡연상태": 0,
            "음주여부": 0,
        }
    ]
)

In [1182]:
import numpy as np

# Score the sample with the XGBoost model.
new_pred = regressoer_XG.predict(new_data)

# Round every output to two decimals, then clamp the rounded values to the
# [0, 100] range.
new_pred_rounded = new_pred.round(2)
new_pred_clipped = new_pred_rounded.clip(0, 100)

# Print the (single) row of predicted scores.
print("예측된 건강 위험 점수:", new_pred_clipped[0])


예측된 건강 위험 점수: [19.36 72.86 49.58 27.34]


In [1184]:
import joblib

# Persist the fitted XGBoost model to the file 'regressor_xg' in the
# current working directory.
joblib.dump(value=regressoer_XG, filename='regressor_xg')

['regressor_xg']

In [1156]:
import numpy as np

# Score the sample with the linear model; the result has one row per input
# row and one column per target.
new_pred1 = ligressor.predict(new_data)

# Scale only the hypertension score (first output column) by 1.5.
# Indexing the column, rather than multiplying row 0 by [1.5, 1, 1, 1],
# applies the adjustment to every row, so multi-row inputs are handled
# consistently; for a single-row input the result is identical.
new_pred1[:, 0] *= 1.5

# Round every output to two decimals.
new_pred_rounded = np.round(new_pred1, 2)

# Clamp the rounded values to the [0, 100] range.
new_pred_clipped = np.clip(new_pred_rounded, 0, 100)

# Print the first row of adjusted scores.
print("예측된 건강 위험 점수 (고혈압 조정됨):", new_pred_clipped[0])


예측된 건강 위험 점수 (고혈압 조정됨): [44.92 80.78 64.37 27.24]


In [1185]:
# Display the frame with all derived score columns (pandas truncates the
# rendering of large frames).
df

Unnamed: 0,기준년도,가입자일련번호,시도코드,성별코드,연령대코드(5세단위),신장(5cm단위),체중(5kg단위),허리둘레,시력(좌),시력(우),...,BMI_고혈압위험,BMI_당뇨위험,BMI_고지혈증위험,흡연위험,음주위험,최종_당뇨위험,최종_고지혈증위험,SBP_Risk,DBP_Risk,최종_고혈압위험
0,2023,34735,46,2,9,155,70,92.0,1.2,1.2,...,82.726327,87.661662,82.726327,20,15,100.000000,100.000000,40,40,73.945265
1,2023,4105118,11,1,17,160,55,86.0,0.9,9.9,...,0.000000,0.000000,0.000000,20,15,34.900000,39.950000,20,0,9.100000
2,2023,362482,36,2,13,150,65,96.0,1.0,0.8,...,77.777778,84.126984,77.777778,20,15,100.000000,100.000000,0,0,17.305556
3,2023,653166,11,1,13,160,70,85.0,1.0,1.2,...,46.875000,62.053571,46.875000,0,15,71.953571,66.825000,0,0,10.125000
4,2023,4152237,41,1,12,165,65,84.5,1.0,1.2,...,0.000000,12.501640,0.000000,0,15,22.401640,19.950000,40,40,48.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,2023,3265034,26,1,8,170,65,78.0,1.2,1.0,...,0.000000,0.000000,0.000000,20,15,34.900000,39.950000,0,0,1.750000
999996,2023,1421865,41,1,10,165,80,96.1,0.9,1.2,...,87.695133,91.210809,87.695133,0,15,100.000000,100.000000,40,40,67.639027
999997,2023,3889375,41,2,11,155,65,87.0,0.5,0.7,...,41.103018,57.930727,41.103018,20,15,92.830727,81.053018,70,20,56.170604
999998,2023,2618086,41,2,7,160,55,69.0,1.0,1.2,...,0.000000,0.000000,0.000000,20,0,25.000000,20.000000,20,0,7.300000
