# Korean Medicine (KM) patient data analysis
19100720 Siheon Kim

20102115 Jeongmin Oh

In [6]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report

In [7]:
# Load the patient records from the CSV file next to the notebook.
csv_path = './patient_data.csv'
df = pd.read_csv(csv_path)

# Preview the first rows to sanity-check the columns that were read.
preview = df.head()
print(preview)

  patient_id  km_diagnosis  gender  height  weight   bmi  \
0  KDCT00001             2       2   158.0    55.0  22.0   
1  KDCT00002             3       2   155.0    63.0  26.2   
2  KDCT00003             2       1   160.0    59.0  23.0   
3  KDCT00004             2       2   165.0    76.0  27.9   
4  KDCT00005             1       1   165.0    70.0  25.7   

   systolic_blood_coagulation  diastolic_blood_pressure  job  education  ...  \
0                         120                        80    2          5  ...   
1                         124                        65   14          3  ...   
2                         121                        75    3          3  ...   
3                         113                        81   14          4  ...   
4                         129                        72    8          2  ...   

   fatigue_all_day  hypertension_diagnosis  diabetes_diagnosis  \
0                0                       1                   1   
1                0        

In [8]:
# Choose the dependent variable and the feature columns.
# Every column except the target and the patient identifier is used as a
# feature; the original column order of df is preserved.
target_variable = 'km_diagnosis'
non_feature_columns = {target_variable, 'patient_id'}
feature_columns = [name for name in df.columns if name not in non_feature_columns]

In [11]:
# Convert every 999 value in the frame to NaN so it is treated as missing.
df = df.replace(to_replace=999, value=np.nan)

# Show one affected column to confirm the conversion.
print(df['fouw_defecating'])


0      NaN
1      NaN
2      NaN
3      NaN
4      NaN
      ... 
495    3.0
496    3.0
497    3.0
498    2.0
499    3.0
Name: fouw_defecating, Length: 500, dtype: float64


In [12]:
# Fill missing values (NaN) in each numeric column with that column's mean.
#
# Only numeric columns are imputed: a mean is not defined for object columns
# such as patient_id, and calling .mean() on one would raise.
#
# The filled columns are assigned back to df rather than calling
# df[col].fillna(..., inplace=True). That chained in-place call operates on a
# temporary Series; it is deprecated in pandas 2.x and leaves df unchanged
# when Copy-on-Write is enabled.
#
# NOTE(review): the means are computed over the whole frame, before the
# train/test split performed later in the notebook, so test rows influence the
# imputed values. Consider computing the means on the training split only.
numeric_cols = df.select_dtypes(include='number').columns
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())

# Show one imputed column to confirm that no NaN remains in it.
print(df['fouw_defecating'])


0      2.490637
1      2.490637
2      2.490637
3      2.490637
4      2.490637
         ...   
495    3.000000
496    3.000000
497    3.000000
498    2.000000
499    3.000000
Name: fouw_defecating, Length: 500, dtype: float64


In [13]:
# Separate features from the target, then hold out 20% of the rows for testing.
# The fixed random_state keeps the split reproducible across runs.
X, y = df[feature_columns], df[target_variable]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [14]:
# Standardize the numeric features.
# The scaler is fitted on the training split only and then applied to both
# splits, so the test data never contributes to the scaling statistics.
numeric_train = X_train.select_dtypes(include=['number'])
numeric_test = X_test.select_dtypes(include=['number'])

scaler = StandardScaler()
scaler.fit(numeric_train)
X_train_scaled = scaler.transform(numeric_train)
X_test_scaled = scaler.transform(numeric_test)

In [15]:
# Train a logistic regression classifier on the standardized training data.
# max_iter is raised to 1000 iterations for the solver.
logreg_params = {'max_iter': 1000}
model = LogisticRegression(**logreg_params)
model.fit(X_train_scaled, y_train)

In [16]:
# Predict on the held-out test split and compute the evaluation metrics.
y_pred = model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Print the overall accuracy followed by the per-class report.
print(
    f'Accuracy: {accuracy}',
    f'Classification Report:\n{report}',
    sep='\n',
)

Accuracy: 0.45
Classification Report:
              precision    recall  f1-score   support

           1       0.58      0.61      0.60        46
           2       0.32      0.26      0.29        23
           3       0.33      0.35      0.34        31

    accuracy                           0.45       100
   macro avg       0.41      0.41      0.41       100
weighted avg       0.44      0.45      0.45       100

