In [42]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier, export_text
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

import seaborn as sns
import matplotlib.font_manager as fm
plt.rc('font', family='AppleGothic')

# 웹에서 데이터 불러오기
file_path = "/Users/jangsohyun/Desktop/Introduction_to_AI/car_evaluation.csv"
df = pd.read_csv(file_path, header=None) # header=None => 칼럼명 없이 가져옴

# 칼럼명 지정하여 추가
df.columns = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'output']
print(df)

# 결측치 확인
print("\n결측치")
print(df.isnull().sum())

     buying  maint  doors persons lug_boot safety output
0     vhigh  vhigh      2       2    small    low  unacc
1     vhigh  vhigh      2       2    small    med  unacc
2     vhigh  vhigh      2       2    small   high  unacc
3     vhigh  vhigh      2       2      med    low  unacc
4     vhigh  vhigh      2       2      med    med  unacc
...     ...    ...    ...     ...      ...    ...    ...
1723    low    low  5more    more      med    med   good
1724    low    low  5more    more      med   high  vgood
1725    low    low  5more    more      big    low  unacc
1726    low    low  5more    more      big    med   good
1727    low    low  5more    more      big   high  vgood

[1728 rows x 7 columns]

결측치
buying      0
maint       0
doors       0
persons     0
lug_boot    0
safety      0
output      0
dtype: int64


In [55]:
columns_to_encode = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'output']

label_encoder = LabelEncoder()

for column in columns_to_encode:
    df[column] = label_encoder.fit_transform(df[column])
    
# 변환된 데이터 확인
print(df.head())

   buying  maint  doors  persons  lug_boot  safety  output
0       3      3      0        0         2       1       2
1       3      3      0        0         2       2       2
2       3      3      0        0         2       0       2
3       3      3      0        0         1       1       2
4       3      3      0        0         1       2       2


In [47]:
df['output'].value_counts()

output
2    1210
0     384
1      69
3      65
Name: count, dtype: int64

In [56]:
# 특성과 레이블 분리
X = df.iloc[:, :-1].values  # 마지막 열 제외
y = df.iloc[:, -1].values   # 마지막 열 (클래스)

# 데이터 분할 (훈련: 80%, 테스트: 20%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 데이터 정규화
scaler = StandardScaler()
X = scaler.fit_transform(X)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1382, 6), (346, 6), (1382,), (346,))

In [49]:
# 의사결정나무(DT) 학습 및 평가
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)
y_pred_dt = dt_model.predict(X_test)
dt_accuracy = accuracy_score(y_test, y_pred_dt)
print(f"Decision Tree Accuracy: {dt_accuracy:.4f}")

Decision Tree Accuracy: 0.9740


In [50]:
# 랜덤 포레스트(RF) 학습 및 평가
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)
rf_accuracy = accuracy_score(y_test, y_pred_rf)
print(f"Random Forest Accuracy: {rf_accuracy:.4f}")

Random Forest Accuracy: 0.9798


In [51]:
# 서포트 벡터 머신(SVM) 학습 및 평가
svm_model = SVC()
svm_model.fit(X_train, y_train)
y_pred_svm = svm_model.predict(X_test)
svm_accuracy = accuracy_score(y_test, y_pred_svm)
print(f"SVM Accuracy: {svm_accuracy:.4f}")

SVM Accuracy: 0.9017


In [52]:
# 로지스틱 회귀(LR) 학습 및 평가
lr_model = LogisticRegression(max_iter=200)
lr_model.fit(X_train, y_train)
y_pred_lr = lr_model.predict(X_test)
lr_accuracy = accuracy_score(y_test, y_pred_lr)
print(f"Logistic Regression Accuracy: {lr_accuracy:.4f}")

Logistic Regression Accuracy: 0.6590


In [28]:
# KNN(K=5) 학습 및 평가
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train, y_train)
y_pred_knn = knn_model.predict(X_test)
accuracy_knn = accuracy_score(y_test, y_pred_knn)
print(f"KNN (K=5) Accuracy: {accuracy_knn:.4f}")

KNN (K=5) Accuracy: 0.9133
