In [None]:
# winequality-white.csv 데이터셋으로 실습 (분류, 와인 품질 예측)
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix

In [None]:
# -----------------------------
# 1) 데이터 준비
# -----------------------------

# 구분자가 세미콜론(;)이므로 sep=';' 옵션을 추가
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/machine-learning-practice/Machine-Learning-Programming/week4/data/winequality-white.csv", sep=';')

# 결측치 확인
print("결측치 확인:\n", df.isnull().sum())

# 결측치가 있는 행이 있다면 제거
df = df.dropna()

# 타겟 변수인 'quality'의 분포를 확인
print("와인 품질 분포:\n", df["quality"].value_counts())

결측치 확인:
 fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64
와인 품질 분포:
 quality
6    2198
5    1457
7     880
8     175
4     163
3      20
9       5
Name: count, dtype: int64


In [None]:
# 모든 레이블을 숫자로 변형
from sklearn.preprocessing import LabelEncoder

label_encoders = {}
for column in df.columns:
    label_encoders[column] = LabelEncoder()
    df[column] = label_encoders[column].fit_transform(df[column])

df # 모든 컬럼 값이 숫자로 변경되었는지 확인

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,31,36,36,302,34,57,146,878,23,22,5,3
1,23,42,34,17,38,13,106,471,53,26,12,3
2,43,38,40,101,39,33,69,560,49,21,24,3
3,34,28,32,128,47,59,163,601,42,17,21,3
4,34,28,32,128,47,59,163,601,42,17,21,3
...,...,...,...,...,...,...,...,...,...,...,...,...
4893,22,24,29,17,28,26,64,224,50,27,47,3
4894,27,46,36,120,36,73,144,544,38,23,15,2
4895,26,30,19,9,30,33,83,347,22,23,11,3
4896,14,40,30,7,11,21,82,29,57,15,84,4


In [None]:
# 'quality' 열을 제외한 모든 열을 X로 사용
X = df.drop(columns=["quality"])
# 'quality' 열을 y로 사용
y = df["quality"]

In [None]:
#데이터 정규화
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

print("데이터 형태 확인:")
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

데이터 형태 확인:
(3918, 11) (980, 11) (3918,) (980,)


In [None]:
# -----------------------------
# 2) 모델 구성
# -----------------------------

# 의사결정나무 분류기
dt = DecisionTreeClassifier(random_state=42)
# 랜덤 포레스트 분류기 (200개의 트리 사용)
rf = RandomForestClassifier(n_estimators=200, random_state=42)
# 로지스틱 회귀 (다중 클래스 분류)
# max_iter를 늘려 모델이 충분히 학습되도록 합니다.
lr = LogisticRegression(max_iter=1000, random_state=42)
knn = KNeighborsClassifier()

In [None]:
# -----------------------------
# 3) 모델 학습
# -----------------------------

# fit 함수를 통해 학습
print("모델 학습을 시작합니다...")
dt.fit(X_train, y_train)
rf.fit(X_train, y_train)
lr.fit(X_train, y_train)
knn.fit(X_train, y_train)
print("모델 학습이 완료되었습니다.")

모델 학습을 시작합니다...
모델 학습이 완료되었습니다.


In [None]:
print(lr.coef_, lr.intercept_)

[[ 0.51677947  0.43667771 -0.22503853 -0.3776913   0.21344688  0.30549048
   0.07705279  0.45792979  0.07802597 -0.32400831 -0.11574998]
 [-0.28191781  0.65107862 -0.12379423 -1.30794207  0.3032995  -0.65496507
  -0.11968764  1.45370649 -0.37717281 -0.10036391 -0.30353277]
 [-0.48006217  0.30865657  0.02395182 -0.56375279  0.21203961 -0.04530358
   0.07229944  0.63580651 -0.50349356 -0.11884591 -0.89897936]
 [-0.49435426 -0.30675047  0.05237349 -0.056152    0.20939982  0.05517133
   0.06602953  0.15985514 -0.40100284  0.06240153 -0.01681512]
 [-0.027803   -0.45231211 -0.05327225  0.95503848 -0.04863851  0.16049742
   0.0385213  -1.23361882  0.05568977  0.23308183  0.01470102]
 [-0.19549837 -0.46906134  0.01486048  1.09164966  0.13609543  0.3660056
  -0.05249701 -1.01568236  0.07779386  0.14063151  0.60356212]
 [ 0.96285614 -0.16828897  0.31091921  0.25885003 -1.02564272 -0.18689618
  -0.08171841 -0.45799675  1.07015961  0.10710326  0.71681409]] [-2.06678119 -0.2890168   2.43856873  3.1

In [None]:
# -----------------------------
# 4) 예측결과 생성
# -----------------------------

# 학습된 모델로 테스트 데이터의 결과를 예측합니다.
dt_y_pred = dt.predict(X_test)
rf_y_pred = rf.predict(X_test)
lr_y_pred = lr.predict(X_test)
knn_pred = knn.predict(X_test)

In [None]:
# -----------------------------
# 5) 모델 평가 (정확도)
# -----------------------------

# 예측 결과와 실제 정답을 비교하여 정확도를 계산
dt_acc = accuracy_score(y_test, dt_y_pred)
rf_acc = accuracy_score(y_test, rf_y_pred)
lr_acc = accuracy_score(y_test, lr_y_pred)
knn_acc = accuracy_score(y_test, knn_pred)

print("\n=== 모델별 테스트 정확도 ===")
print(f"의사결정나무 (Decision Tree): {dt_acc:.4f}")
print(f"랜덤 포레스트 (Random Forest): {rf_acc:.4f}")
print(f"로지스틱 회귀 (Logistic Regression): {lr_acc:.4f}")
print(f"K-최근접 이웃 (KNN): {knn_acc:.4f}")


=== 모델별 테스트 정확도 ===
의사결정나무 (Decision Tree): 0.5949
랜덤 포레스트 (Random Forest): 0.6765
로지스틱 회귀 (Logistic Regression): 0.5469
K-최근접 이웃 (KNN): 0.5378


In [None]:
# 각 모델의 혼동 행렬(Confusion Matrix)을 출력하여 예측 성능을 더 자세히 확인합니다.
print("\n=== 의사결정나무 혼동 행렬 ===")
print(confusion_matrix(y_test, dt_y_pred))

print("\n=== 랜덤 포레스트 혼동 행렬 ===")
print(confusion_matrix(y_test, rf_y_pred))

print("\n=== 로지스틱 회귀 혼동 행렬 ===")
print(confusion_matrix(y_test, lr_y_pred))

print("\n=== K-means 혼동 행렬 ===")
print(confusion_matrix(y_test, knn_pred))


=== 의사결정나무 혼동 행렬 ===
[[  0   1   2   1   0   0   0]
 [  0   9  14   8   1   1   0]
 [  1  15 174  80  14   7   0]
 [  1  12  80 279  59   9   0]
 [  0   1  15  54 100   6   0]
 [  0   1   1   6   6  21   0]
 [  0   0   0   0   1   0   0]]

=== 랜덤 포레스트 혼동 행렬 ===
[[  0   0   1   3   0   0   0]
 [  0   7  18   8   0   0   0]
 [  0   3 188 100   0   0   0]
 [  0   0  50 363  27   0   0]
 [  0   0   4  86  86   0   0]
 [  0   0   1   8   7  19   0]
 [  0   0   0   0   1   0   0]]

=== 로지스틱 회귀 혼동 행렬 ===
[[  0   0   1   3   0   0   0]
 [  0   2  20  11   0   0   0]
 [  0   0 154 133   4   0   0]
 [  0   1  77 336  26   0   0]
 [  0   0   6 126  44   0   0]
 [  0   0   0  28   7   0   0]
 [  0   0   0   0   1   0   0]]

=== K-means 혼동 행렬 ===
[[  0   0   1   3   0   0   0]
 [  0   6  15  11   1   0   0]
 [  1   8 159 108  15   0   0]
 [  0   7 109 267  53   4   0]
 [  0   0   6  78  87   5   0]
 [  0   0   1  17   9   8   0]
 [  0   0   0   1   0   0   0]]


In [None]:
# winequality-white.csv 데이터셋으로 실습 (딥러닝, 와인 품질 예측)
!pip install tensorflow

In [None]:
import pandas as pd
import numpy as np
from tensorflow.keras import layers, models
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# -----------------------------
# 1) 데이터 준비
# -----------------------------

# 구분자가 세미콜론(;)이므로 sep=';' 옵션을 추가
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/machine-learning-practice/Machine-Learning-Programming/week4/data/winequality-white.csv", sep=';')

# 결측치 확인
print("결측치 확인:\n", df.isnull().sum())

# 결측치가 있는 행이 있다면 제거
df = df.dropna()

# 타겟 변수인 'quality'의 분포를 확인
print("와인 품질 분포:\n", df["quality"].value_counts())

In [None]:
# 'quality' 열을 제외한 모든 열을 X로 사용
X = df.drop(columns=["quality"])
# 'quality' 열을 y로 사용
y = df["quality"]

In [None]:
#데이터 정규화
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [None]:
# one-hot encoding
Y = pd.get_dummies(y).values
# X = X.values

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=0
)

print("데이터 형태 확인:")
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

In [None]:
# -----------------------------
# 2) 모델 구성
# -----------------------------

model = models.Sequential([
    # layers.Dense(64, activation="relu", input_shape=(X.shape[1],)),
    # layers.Dropout(0.3),
    layers.Dense(128, activation="relu"),
    layers.Dropout(0.3),
    layers.Dense(64, activation="relu"),
    layers.Dense(64, activation="relu"),
    layers.Dense(Y.shape[1], activation="softmax")   # 클래스 수 맞춤
])

In [None]:
model.compile(
    optimizer="adam",
    loss="categorical_crossentropy",   # one-hot이면 categorical_crossentropy
    metrics=["accuracy"]
)

In [None]:
# -----------------------------
# 3) 모델 학습
# -----------------------------
history = model.fit(
    X_train, y_train,
    validation_split=0.2,
    epochs=50,
    batch_size=16,
    verbose=1
)

In [None]:
# -----------------------------
# 4) 예측결과 생성
# -----------------------------
y_pred = model.predict(X_test)

In [None]:
y_test_class = np.argmax(y_test, axis=1)
y_pred_class = np.argmax(y_pred, axis=1)

In [None]:
# 학습 곡선
loss = history.history['loss']
val_loss = history.history['val_loss']
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']

In [None]:
# -----------------------------
# 5) 모델 평가 (손실도, 정확도)
# -----------------------------

loss = history.history['loss']
val_loss = history.history['val_loss']
epochs = range(1, len(loss) + 1)
plt.plot(epochs, loss, 'y', label='Training loss')
plt.plot(epochs, val_loss, 'r', label='Validation loss')
plt.title('Training and validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
plt.plot(epochs, acc, 'y', label='Training acc')
plt.plot(epochs, val_acc, 'r', label='Validation acc')
plt.title('Training and validation accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

In [None]:
#예측 값에 대한 정확도 확인
print(classification_report(y_test_class,y_pred_class))
print(confusion_matrix(y_test_class,y_pred_class))

In [None]:
# breast_cancer.csv 데이터셋으로 실습 (딥러닝)
!pip install tensorflow

In [3]:
import pandas as pd
import numpy as np
from tensorflow.keras import layers, models
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# -----------------------------
# 1) 데이터 준비
# -----------------------------

df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/machine-learning-practice/Machine-Learning-Programming/week4/data/breast_cancer.csv")

# 결측치 확인
print("결측치 확인:\n", df.isnull().sum())

# 결측치가 있는 행이 있다면 제거
df = df.dropna()

df

In [None]:
# 'label' 열을 제외한 모든 열을 X로 사용
X = df.drop(columns=["label"])
# 'label' 열을 y로 사용
y = df["label"]

In [None]:
#데이터 정규화
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [None]:
# one-hot encoding
Y = pd.get_dummies(y).values
# X = X.values

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=0
)

print("데이터 형태 확인:")
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

In [None]:
# -----------------------------
# 2) 모델 구성
# -----------------------------

model = models.Sequential([
    layers.Dense(64, activation="relu", input_shape=(X.shape[1],)),
    layers.Dropout(0.3),
    layers.Dense(32, activation="relu"),
    layers.Dense(32, activation="relu"),
    layers.Dense(Y.shape[1], activation="softmax")   # 클래스 수 맞춤
])

In [None]:
model.compile(
    optimizer="adam",
    loss="categorical_crossentropy",   # one-hot이면 categorical_crossentropy
    metrics=["accuracy"]
)

In [None]:
# -----------------------------
# 3) 모델 학습
# -----------------------------
history = model.fit(
    X_train, y_train,
    validation_split=0.2,
    epochs=50,
    batch_size=16,
    verbose=1
)

In [None]:
# -----------------------------
# 4) 예측결과 생성
# -----------------------------
y_pred = model.predict(X_test)

In [None]:
y_test_class = np.argmax(y_test, axis=1)
y_pred_class = np.argmax(y_pred, axis=1)

In [None]:
# 학습 곡선
loss = history.history['loss']
val_loss = history.history['val_loss']
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']

In [None]:
# -----------------------------
# 5) 모델 평가 (손실도, 정확도)
# -----------------------------

loss = history.history['loss']
val_loss = history.history['val_loss']
epochs = range(1, len(loss) + 1)
plt.plot(epochs, loss, 'y', label='Training loss')
plt.plot(epochs, val_loss, 'r', label='Validation loss')
plt.title('Training and validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
plt.plot(epochs, acc, 'y', label='Training acc')
plt.plot(epochs, val_acc, 'r', label='Validation acc')
plt.title('Training and validation accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

In [None]:
#예측 값에 대한 정확도 확인
print(classification_report(y_test_class,y_pred_class))
print(confusion_matrix(y_test_class,y_pred_class))