# Korean Medicine (KM) patient data analysis
19100720 Siheon Kim

20102115 Jeongmin Oh

In [9]:
%pip install scikit-learn

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report

Note: you may need to restart the kernel to use updated packages.


In [10]:
# Load the raw patient table from CSV.
df = pd.read_csv(filepath_or_buffer='../data/patient_data_new.csv')

# Quick sanity check: show the first five rows.
print(df.head(n=5))

   km_diagnosis   bmi  forehead_circumference  neck_circumference  \
0             3  26.2                    59.0                35.0   
1             1  25.7                    60.0                40.0   
2             3  24.2                    57.0                41.0   
3             1  24.6                    56.0                36.0   
4             1  22.0                    56.0                36.5   

   armpit_circumference   bust  rib_cage  waist_circumference  \
0                  96.0  101.0      93.0                 92.0   
1                 106.0  102.0      99.0                 98.0   
2                  98.0   96.0      92.0                 91.0   
3                  93.0   97.0      91.0                 89.0   
4                  93.0   93.5      87.0                 86.5   

   iliac_circumference  femur_circumference  ...  personality_ee  \
0                 96.0                 99.0  ...               1   
1                 98.0                100.0  ...          

In [11]:
# Choose the dependent variable; every other column except the patient
# identifier is used as a feature.
target_variable = 'km_diagnosis'
feature_columns = [
    name
    for name in df.columns
    if name not in (target_variable, 'patient_id')
]

In [12]:
# Turn every 999 in the frame into NaN (the frame is modified in place).
df.replace(to_replace=999, value=np.nan, inplace=True)

# Inspect one affected column.
print(df.loc[:, 'fouw_defecating'])


0      NaN
1      NaN
2      NaN
3      NaN
4      NaN
      ... 
370    2.0
371    3.0
372    3.0
373    2.0
374    3.0
Name: fouw_defecating, Length: 375, dtype: float64


In [13]:
# Fill missing values (the NaNs that replaced the 999 sentinel) with the mean
# of their column.
# NOTE(review): columns such as 'fouw_defecating' appear to hold integer codes;
# mean imputation produces fractional codes (e.g. 2.484694) that the later
# `== 2` / `3 -> 0` recoding steps will not match — confirm this is intended.
for col in df.columns:
    if df[col].isnull().any():  # only touch columns that actually contain NaN
        col_mean = df[col].mean()
        # Assign the result back instead of `df[col].fillna(..., inplace=True)`:
        # the chained in-place form acts on a temporary Series, raises a
        # FutureWarning in pandas 2.x and leaves `df` unchanged once
        # Copy-on-Write is enabled.
        df[col] = df[col].fillna(col_mean)

print(df['fouw_defecating'])


0      2.484694
1      2.484694
2      2.484694
3      2.484694
4      2.484694
         ...   
370    2.000000
371    3.000000
372    3.000000
373    2.000000
374    3.000000
Name: fouw_defecating, Length: 375, dtype: float64


In [14]:
# Persist the imputed frame without writing the index column.
df.to_csv(path_or_buf='modified_file.csv', index=False)

In [15]:
# Read the CSV file to be recoded.
df_modified = pd.read_csv('modified_file.csv')


def change_value(column, value, old_value=2):
    """Recode one answer code of a column of ``df_modified`` in place.

    Args:
        column: Name of the column in ``df_modified`` to recode.
        value: Replacement written to every matching row.
        old_value: Code to be replaced; defaults to 2.

    The row mask is computed on ``df_modified`` itself. It was previously
    computed on the separate global ``df``, which only worked while both
    frames happened to share the same index and contents.
    """
    df_modified.loc[df_modified[column] == old_value, column] = value


# Replacement for the code 2, per column.
recode_of_two = {
    'personality_bs': 1,
    'personality_fasa': 3,
    'personality_ap': 3,
    'personality_di': 3,
    'personality_fgsg': 3,
    'personality_ei': 1,
    'personality_ds': 3,
    'personality_ed': 1,
    'personality_mf': 3,
    'personality_ifte': 1,
    'personality_bl': 3,
    'personality_es': 1,
    'personality_ee': 3,
    'personality_oxr': 3,
    'personality_po': 1,
    'sweating': 1,
    'sweatmood': 3,
    'stoolhabits': 3,
    'fecal_bulge': 1,
    'fouw_defecating': 1,
    'folsa_defecation': 1,
}

# Apply the recoding to every listed column.
for column_name, new_value in recode_of_two.items():
    change_value(column_name, new_value)

# Print the recoded frame for review.
print(df_modified.head())

# Save the recoded frame; this overwrites the file that was read above.
df_modified.to_csv('modified_file.csv', index=False)

   km_diagnosis   bmi  forehead_circumference  neck_circumference  \
0             3  26.2                    59.0                35.0   
1             1  25.7                    60.0                40.0   
2             3  24.2                    57.0                41.0   
3             1  24.6                    56.0                36.0   
4             1  22.0                    56.0                36.5   

   armpit_circumference   bust  rib_cage  waist_circumference  \
0                  96.0  101.0      93.0                 92.0   
1                 106.0  102.0      99.0                 98.0   
2                  98.0   96.0      92.0                 91.0   
3                  93.0   97.0      91.0                 89.0   
4                  93.0   93.5      87.0                 86.5   

   iliac_circumference  femur_circumference  ...  personality_ee  \
0                 96.0                 99.0  ...               1   
1                 98.0                100.0  ...          

In [17]:
# Recode 3 -> 0 in the listed personality / sweating / defecation columns.
# NOTE(review): 'mod_pre_data.csv' is not written by any cell visible in this
# notebook (the earlier cells write 'modified_file.csv') — confirm its origin.
preprocessed_df = pd.read_csv('mod_pre_data.csv')

# Columns whose code 3 becomes 0.
columns_to_modify = [
    'personality_bs',
    'personality_fasa',
    'personality_ap',
    'personality_di',
    'personality_fgsg',
    'personality_ei',
    'personality_ds',
    'personality_ed',
    'personality_mf',
    'personality_ifte',
    'personality_bl',
    'personality_es',
    'personality_ee',
    'personality_oxr',
    'personality_po',
    'sweating',
    'sweatmood',
    'stoolhabits',
    'fecal_bulge',
    'fouw_defecating',
    'folsa_defecation',
]

# Replace across the whole column subset in one vectorised call.
preprocessed_df[columns_to_modify] = preprocessed_df[columns_to_modify].replace(3, 0)

# Write the result without the index column.
preprocessed_df.to_csv('../data/preprocessed_data.csv', index=False)

In [13]:
# Split features and target into train (80%) and test (20%) sets with a fixed seed.
# NOTE(review): this reads `df` (the mean-imputed frame), not the recoded
# `preprocessed_df` built in the recoding cell — confirm which frame should be modelled.
X = df.loc[:, feature_columns]
y = df.loc[:, target_variable]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [14]:
# Standardise the numeric features; non-numeric columns are dropped by select_dtypes.
X_train_numeric = X_train.select_dtypes(include=['number'])
X_test_numeric = X_test.select_dtypes(include=['number'])

# The scaler is fitted on the training split only and then reused for the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_numeric)
X_test_scaled = scaler.transform(X_test_numeric)

In [15]:
# Train a logistic-regression classifier (up to 1000 solver iterations)
# on the standardised training features.
model = LogisticRegression(max_iter=1000)
model.fit(X=X_train_scaled, y=y_train)

In [16]:
# Predict on the held-out split and compute the evaluation metrics.
y_pred = model.predict(X_test_scaled)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
report = classification_report(y_true=y_test, y_pred=y_pred)

# Print the overall accuracy followed by the per-class report.
print(
    f'Accuracy: {accuracy}',
    f'Classification Report:\n{report}',
    sep='\n',
)

Accuracy: 0.45
Classification Report:
              precision    recall  f1-score   support

           1       0.58      0.61      0.60        46
           2       0.32      0.26      0.29        23
           3       0.33      0.35      0.34        31

    accuracy                           0.45       100
   macro avg       0.41      0.41      0.41       100
weighted avg       0.44      0.45      0.45       100

