In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

import requests
from io import BytesIO

# 파일 URL
url = "https://github.com/MyungKyuYi/AI-class/raw/d3cfa9110259a58e961f81dc6ef459768a1d2861/combined_dataset-1.xlsx"

# 파일 다운로드
response = requests.get(url)
response.raise_for_status()  # 요청 에러 확인

# BytesIO를 이용해 pandas로 읽기
df = pd.read_excel(BytesIO(response.content), index_col=[0, 1])

In [2]:
# 데이터프레임 확인
df.head()

Unnamed: 0,Num.,subject_ID,Sex(M/F),Age(year),Height(cm),Weight(kg),Systolic Blood Pressure(mmHg),Diastolic Blood Pressure(mmHg),Heart Rate(b/m),BMI(kg/m^2),...,2091,2092,2093,2094,2095,2096,2097,2098,2099,2100
0,1,2,Female,45,152,63,161,89,97,27.268006,...,1766,1766,1766,1833,1833,1827,1827,1827,1754,1754
1,1,2,Female,45,152,63,161,89,97,27.268006,...,1985,1985,2026,2026,2026,1977,1977,1997,1997,1997
2,1,2,Female,45,152,63,161,89,97,27.268006,...,1942,1900,1900,1938,1938,1938,1924,1924,1929,1929
3,2,3,Female,50,157,50,160,93,76,20.284799,...,2073,2072,2072,2072,2051,2051,2036,2036,2036,2045
4,2,3,Female,50,157,50,160,93,76,20.284799,...,2021,2010,2010,2010,2001,2001,2003,2003,2003,1989


In [3]:
df.columns

Index(['Num.', 'subject_ID', 'Sex(M/F)', 'Age(year)', 'Height(cm)',
       'Weight(kg)', 'Systolic Blood Pressure(mmHg)',
       'Diastolic Blood Pressure(mmHg)', 'Heart Rate(b/m)', 'BMI(kg/m^2)',
       ...
       '2091', '2092', '2093', '2094', '2095', '2096', '2097', '2098', '2099',
       '2100'],
      dtype='object', length=2114)

In [4]:
# 데이터 분할
X = df.drop("Sex(M/F)", axis=1)
y = df["Sex(M/F)"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [5]:
def evaluate(model, X_test, y_test):
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print(f"{model.__class__} Accuracy: {acc:.4f}")
    print(classification_report(y_test, y_pred))

In [6]:
# 결정 트리
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)
evaluate(dt, X_test, y_test)

ValueError: could not convert string to float: 'Normal'

In [None]:
# 랜덤 포레스트
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
evaluate(rf, X_test, y_test)

<class 'sklearn.ensemble._forest.RandomForestClassifier'> Accuracy: 0.8800
              precision    recall  f1-score   support

           0       0.95      0.96      0.96       100
           1       0.82      0.83      0.83       100
           2       0.81      0.80      0.80       100
           3       0.94      0.93      0.93       100

    accuracy                           0.88       400
   macro avg       0.88      0.88      0.88       400
weighted avg       0.88      0.88      0.88       400



In [None]:
# 서포트 벡터 머신
svm = SVC(kernel='linear', random_state=42)
svm.fit(X_train, y_train)
evaluate(svm, X_test, y_test)

<class 'sklearn.svm._classes.SVC'> Accuracy: 0.9825
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       100
           1       0.99      0.97      0.98       100
           2       0.95      0.99      0.97       100
           3       1.00      0.97      0.98       100

    accuracy                           0.98       400
   macro avg       0.98      0.98      0.98       400
weighted avg       0.98      0.98      0.98       400



In [None]:
# 로지스틱 회귀
lr = LogisticRegression(max_iter=2000, random_state=42)
lr.fit(X_train, y_train)
evaluate(lr, X_test, y_test)

<class 'sklearn.linear_model._logistic.LogisticRegression'> Accuracy: 0.6725
              precision    recall  f1-score   support

           0       0.90      0.85      0.88       100
           1       0.62      0.58      0.60       100
           2       0.48      0.58      0.52       100
           3       0.75      0.68      0.71       100

    accuracy                           0.67       400
   macro avg       0.69      0.67      0.68       400
weighted avg       0.69      0.67      0.68       400



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
