# Korean Medicine (KM) patient data analysis
19100720 Siheon Kim

20102115 Jeongmin Oh

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

In [2]:
# CSV 파일 로드
data = pd.read_csv('../data/patient_data.csv')

data

Unnamed: 0,patient_id,km_diagnosis,gender,height,weight,bmi,systolic_blood_coagulation,diastolic_blood_pressure,job,education,...,fatigue_all_day,hypertension_diagnosis,diabetes_diagnosis,hyperlipidemia_diagnosis,general_health,blood_sugar,total_cholesterol,triglycerides,high_density_cholesterol,low_density_cholesterol
0,KDCT00001,2,2,158.0,55.0,22.0,120,80,2,5,...,0,1,1,1,3,104,219,72,60,128
1,KDCT00002,3,2,155.0,63.0,26.2,124,65,14,3,...,0,1,1,1,3,110,199,97,46,114
2,KDCT00003,2,1,160.0,59.0,23.0,121,75,3,3,...,0,1,1,1,3,79,216,88,49,132
3,KDCT00004,2,2,165.0,76.0,27.9,113,81,14,4,...,0,1,1,1,4,100,218,136,44,136
4,KDCT00005,1,1,165.0,70.0,25.7,129,72,8,2,...,0,1,1,1,2,92,186,61,33,118
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,KDCT00496,1,1,164.5,57.9,21.4,120,80,3,5,...,0,2,1,1,3,71,190,75,73,101
496,KDCT00497,2,2,157.2,41.9,17.0,110,70,3,5,...,0,1,1,1,3,74,246,86,104,117
497,KDCT00498,3,1,165.0,82.0,30.1,150,100,7,5,...,0,2,1,2,4,125,246,220,74,154
498,KDCT00499,3,1,171.1,68.3,23.3,110,80,7,4,...,0,2,1,1,2,85,172,127,49,109


In [8]:
# 'patient_id' 열은 제외하고 feature와 target 변수 선택
X = data.drop(['patient_id', 'km_diagnosis'], axis=1)
y = data['km_diagnosis']

In [9]:
# 범주형 변수들에 대해 원-핫 인코딩 수행
X = pd.get_dummies(X)
X


Unnamed: 0,gender,height,weight,bmi,systolic_blood_coagulation,diastolic_blood_pressure,job,education,marital_status,age,...,fatigue_all_day,hypertension_diagnosis,diabetes_diagnosis,hyperlipidemia_diagnosis,general_health,blood_sugar,total_cholesterol,triglycerides,high_density_cholesterol,low_density_cholesterol
0,2,158.0,55.0,22.0,120,80,2,5,2,48.666,...,0,1,1,1,3,104,219,72,60,128
1,2,155.0,63.0,26.2,124,65,14,3,2,80.773,...,0,1,1,1,3,110,199,97,46,114
2,1,160.0,59.0,23.0,121,75,3,3,2,50.008,...,0,1,1,1,3,79,216,88,49,132
3,2,165.0,76.0,27.9,113,81,14,4,2,50.551,...,0,1,1,1,4,100,218,136,44,136
4,1,165.0,70.0,25.7,129,72,8,2,2,49.581,...,0,1,1,1,2,92,186,61,33,118
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,1,164.5,57.9,21.4,120,80,3,5,2,50.682,...,0,2,1,1,3,71,190,75,73,101
496,2,157.2,41.9,17.0,110,70,3,5,2,42.542,...,0,1,1,1,3,74,246,86,104,117
497,1,165.0,82.0,30.1,150,100,7,5,2,48.797,...,0,2,1,2,4,125,246,220,74,154
498,1,171.1,68.3,23.3,110,80,7,4,2,51.263,...,0,2,1,1,2,85,172,127,49,109


In [10]:
# 학습용 데이터와 테스트용 데이터로 분할
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [11]:
# Decision Tree 모델 생성 및 학습
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)

In [12]:
# 학습된 모델로 예측
y_pred = clf.predict(X_test)


In [13]:
# 모델 평가
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print(f'Classification Report:\n{report}')

Accuracy: 0.34
Classification Report:
              precision    recall  f1-score   support

           1       0.55      0.37      0.44        46
           2       0.23      0.26      0.24        23
           3       0.26      0.35      0.30        31

    accuracy                           0.34       100
   macro avg       0.34      0.33      0.33       100
weighted avg       0.38      0.34      0.35       100

