# EMNIST 머신러닝 프로젝트1 시작

## Pandas의 read_csv()를 이용해 훈련셋을 읽고, NumPy 배열 data_train으로 저장

import numpy as np
import pandas as pd

# Read the EMNIST digits training CSV. header=None means no row is consumed
# as a header, so every row of the file ends up in the DataFrame as data.
input_file = "emnist-digits-train.csv"
df_train = pd.read_csv(filepath_or_buffer=input_file, header=None)

# Convert the DataFrame into a plain NumPy array for the steps below.
data_train = df_train.to_numpy()
data_train

# (rows, columns) of the loaded array
data_train.shape

## 훈련셋을 타겟값과 데이터 배열로 분리

# Column 0 is taken as the target; all remaining columns are the feature values.
y_train = data_train[:, 0]
X_train = data_train[:, 1:]

X_train

y_train

## Min-Max 스케일링 적용한 훈련셋

# Min-max scaling using the global (whole-array) minimum and maximum:
# the smallest value maps to 0 and the largest to 1.
min_pixel_value, max_pixel_value = X_train.min(), X_train.max()

X_train_normalized = (
    (X_train - min_pixel_value)
    / (max_pixel_value - min_pixel_value)
)
X_train_normalized

## 표준화 스케일링 적용한 훈련셋

from sklearn.preprocessing import StandardScaler

# Standardization: fit the scaler on X_train, then transform that same data.
scaler = StandardScaler()
X_train_standardized = scaler.fit(X_train).transform(X_train)
X_train_standardized

## 확률적 경사 하강법 분류기 사용하여 모델 훈련

from sklearn.linear_model import SGDClassifier

# Per the course PDF: on scikit-learn >= 1.3.0, pass tol=1e-3 instead of -np.infty.
sgd_clf = SGDClassifier(random_state=42, tol=1e-3, max_iter=5)
sgd_clf.fit(X=X_train, y=y_train)

from sklearn.model_selection import cross_val_score

# Accuracy of the SGD classifier on each of the 10 cross-validation folds.
SGDC_cross_val_score = cross_val_score(
    estimator=sgd_clf,
    X=X_train,
    y=y_train,
    scoring="accuracy",
    cv=10,
)
SGDC_cross_val_score

# Average of the per-fold accuracies.
SGDC_mean = SGDC_cross_val_score.mean()
SGDC_mean

## Mean accuracy over the 10 CV folds for SGDC

# Human-readable summary (percentage with five decimals).
SGDC_result = f"10회 CV 평균 정확도: {SGDC_mean*100:.5f} %"
SGDC_result

## 오차 분석 방법: 오차 행렬

from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix

# Out-of-fold predictions from 10-fold CV, tabulated against the true labels.
y_train_pred = cross_val_predict(estimator=sgd_clf, X=X_train, y=y_train, cv=10)
conf_mx = confusion_matrix(y_true=y_train, y_pred=y_train_pred)
conf_mx

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)


# Raw confusion matrix rendered as a grayscale image.
plt.matshow(conf_mx, cmap=plt.cm.gray)
plt.show()

# Divide every row by its row total so entries become rates instead of counts.
row_sums = conf_mx.sum(keepdims=True, axis=1)
norm_conf_mx = conf_mx / row_sums

# Zero the diagonal so that only the misclassifications remain visible.
np.fill_diagonal(norm_conf_mx, val=0)
plt.matshow(norm_conf_mx, cmap=plt.cm.gray)
plt.show()

# 위의 오차 행렬을 통해 8을 2로, 8을 5로, 3을 5로 잘못 예측한 정도가 크다는 것을 알 수 있음. 이외에도 잘못 예측한 사례가 더 있다.

from sklearn.svm import SVC

# Support-vector classifier with gamma="auto": fit once on the full training
# set, then score it with 10-fold cross-validation.
svm_clf = SVC(random_state=42, gamma="auto")
svm_clf.fit(X=X_train, y=y_train)  # y_train, not y_train_5
SVC_cross_val_score = cross_val_score(
    estimator=svm_clf,
    X=X_train,
    y=y_train,
    scoring="accuracy",
    cv=10,
)
SVC_cross_val_score

# Average of the per-fold accuracies.
SVC_mean = SVC_cross_val_score.mean()
SVC_mean

## Mean accuracy over the 10 CV folds for SVC

# Human-readable summary (percentage with five decimals).
SVC_result = f"10회 CV 평균 정확도: {SVC_mean*100:.5f} %"
SVC_result

## RandomForestClassifier 사용 (일반 훈련셋, min-max 훈련셋, 표준화 훈련셋 순서)

from sklearn.ensemble import RandomForestClassifier

# Random forest (20 trees) on the raw, unscaled training features.
forest_clf = RandomForestClassifier(random_state=100, n_estimators=20)
forest_clf.fit(X=X_train, y=y_train)

# Per-fold accuracy from 10-fold cross-validation.
forest_clf_result = cross_val_score(
    estimator=forest_clf, X=X_train, y=y_train, scoring="accuracy", cv=10
)
forest_clf_result

forest_clf_result.mean()

# Same random forest configuration, trained on the min-max scaled features.
forest_clf2 = RandomForestClassifier(random_state=100, n_estimators=20)
forest_clf2.fit(X=X_train_normalized, y=y_train)

# Per-fold accuracy from 10-fold cross-validation.
forest_clf2_result = cross_val_score(
    estimator=forest_clf2, X=X_train_normalized, y=y_train, scoring="accuracy", cv=10
)
forest_clf2_result

forest_clf2_result.mean()

# Same random forest configuration, trained on the standardized features.
forest_clf3 = RandomForestClassifier(random_state=100, n_estimators=20)
forest_clf3.fit(X=X_train_standardized, y=y_train)

# Per-fold accuracy from 10-fold cross-validation.
forest_clf_result3 = cross_val_score(
    estimator=forest_clf3, X=X_train_standardized, y=y_train, scoring="accuracy", cv=10
)
forest_clf_result3

forest_clf_result3.mean()

# Out-of-fold predictions (10-fold CV) of the random forest on standardized features.
y_train_pred2 = cross_val_predict(forest_clf3, X_train_standardized, y_train, cv=10)
# The matrix must be built from y_train_pred2 (this model's predictions), not
# from y_train_pred, which holds the SGD classifier's predictions.
conf_mx2 = confusion_matrix(y_train, y_train_pred2)
conf_mx2

# Raw confusion matrix (conf_mx2) rendered as a grayscale image.
plt.matshow(conf_mx2, cmap=plt.cm.gray)
plt.show()

# Divide every row by its row total so entries become rates instead of counts.
row_sums2 = conf_mx2.sum(keepdims=True, axis=1)
norm_conf_mx2 = conf_mx2 / row_sums2

# Zero the diagonal so that only the misclassifications remain visible.
np.fill_diagonal(norm_conf_mx2, val=0)
plt.matshow(norm_conf_mx2, cmap=plt.cm.gray)
plt.show()

## KNN

from sklearn.neighbors import KNeighborsClassifier
# Initialize and train the KNN model
knn = KNeighborsClassifier(n_neighbors=5)  # K is set to 5
knn.fit(X=X_train, y=y_train)

from sklearn.model_selection import cross_val_score

# Per-fold accuracy from 10-fold cross-validation.
knn_result = cross_val_score(
    estimator=knn, X=X_train, y=y_train, scoring="accuracy", cv=10
)
knn_result

knn_result.mean()

# KNN with K set to 3
knn2 = KNeighborsClassifier(n_neighbors=3)
knn2.fit(X=X_train, y=y_train)

# Per-fold accuracy from 10-fold cross-validation.
knn_result2 = cross_val_score(
    estimator=knn2, X=X_train, y=y_train, scoring="accuracy", cv=10
)
knn_result2

knn_result2.mean()

# KNN with K set to 7
knn3 = KNeighborsClassifier(n_neighbors=7)
knn3.fit(X=X_train, y=y_train)

# Per-fold accuracy from 10-fold cross-validation.
knn_result3 = cross_val_score(
    estimator=knn3, X=X_train, y=y_train, scoring="accuracy", cv=10
)
knn_result3

knn_result3.mean()

# KNN with K set to 10
knn4 = KNeighborsClassifier(n_neighbors=10)
knn4.fit(X=X_train, y=y_train)

# Per-fold accuracy from 10-fold cross-validation.
knn_result4 = cross_val_score(
    estimator=knn4, X=X_train, y=y_train, scoring="accuracy", cv=10
)
knn_result4

knn_result4.mean()

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier

# Define the two base classifiers.
# Both are seeded: without random_state the forest and the boosting model draw
# fresh randomness on every run, so the CV scores below would not be repeatable.
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)
gb_clf = GradientBoostingClassifier(n_estimators=100, random_state=42)

# Build the VotingClassifier ensemble
voting_clf = VotingClassifier(
    estimators=[('random_forest', rf_clf), ('gradient_boosting', gb_clf)],
    voting='soft'  # soft voting: average the predicted class probabilities
)

# Train the VotingClassifier on the full training set
voting_clf.fit(X_train, y_train)

# Per-fold accuracy from 10-fold cross-validation.
voting_clf_result = cross_val_score(voting_clf, X_train, y_train, cv=10, scoring="accuracy")
voting_clf_result

np.mean(voting_clf_result)

from sklearn.ensemble import VotingClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
import numpy as np

# Model definitions
forest_clf = RandomForestClassifier(n_estimators=100, max_depth=30, n_jobs=-1, random_state=0)
# probability=True makes SVC shuffle the data for its probability estimates;
# random_state seeds that shuffling so the soft-voting scores are repeatable
# (same seed as the forest above).
poly_kernel_svm_clf = SVC(kernel="poly", degree=3, coef0=1, C=100, gamma='auto', max_iter=-1, probability=True, random_state=0)

# Build the ensemble (soft voting over the forest and the polynomial-kernel SVC)
voting_clf = VotingClassifier(estimators=[('fc', forest_clf), ('svc', poly_kernel_svm_clf)], voting='soft')

# Run 10-fold cross-validation, using 4 parallel jobs
cv_scores = cross_val_score(voting_clf, X_train, y_train, cv=10, scoring="accuracy", n_jobs=4)

# Print the accuracy of each fold
for i, score in enumerate(cv_scores, start=1):
    print(f"Cross Validation score {i}: {score}")

# Print the mean accuracy
print(f"Cross Validation Mean is {np.mean(cv_scores)}")


from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier

# Define the two base classifiers.
# Both are seeded: without random_state the forest and the boosting model draw
# fresh randomness on every run, so the CV scores below would not be repeatable.
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)
gb_clf = GradientBoostingClassifier(n_estimators=100, random_state=42)

# Build the VotingClassifier ensemble
voting_clf = VotingClassifier(
    estimators=[('random_forest', rf_clf), ('gradient_boosting', gb_clf)],
    voting='soft'  # soft voting: average the predicted class probabilities
)

# Train the VotingClassifier on the full training set
voting_clf.fit(X_train, y_train)

# Per-fold accuracy from 10-fold cross-validation.
voting_clf_result = cross_val_score(voting_clf, X_train, y_train, cv=10, scoring="accuracy")
voting_clf_result

np.mean(voting_clf_result)

from sklearn.ensemble import VotingClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
import numpy as np

# Model definitions
forest_clf = RandomForestClassifier(n_estimators=150, max_depth=30, n_jobs=-1, random_state=42)
# probability=True makes SVC shuffle the data for its probability estimates;
# random_state seeds that shuffling so the soft-voting scores are repeatable
# (same seed as the forest above).
poly_kernel_svm_clf = SVC(kernel="poly", degree=3, coef0=1, C=70, gamma='auto', max_iter=-1, probability=True, random_state=42)

# Build the ensemble (soft voting over the forest and the polynomial-kernel SVC)
voting_clf = VotingClassifier(estimators=[('fc', forest_clf), ('svc', poly_kernel_svm_clf)], voting='soft')

# Run 10-fold cross-validation, using 4 parallel jobs
cv_scores = cross_val_score(voting_clf, X_train, y_train, cv=10, scoring="accuracy", n_jobs=4)

# Print the accuracy of each fold
for i, score in enumerate(cv_scores, start=1):
    print(f"Cross Validation score {i}: {score}")

# Print the mean accuracy
print(f"Cross Validation Mean is {np.mean(cv_scores)}")


In [None]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
import numpy as np
import pandas as pd

# Reload the training CSV and split it again into target (column 0) and
# features (remaining columns), the same steps as at the top of the file.
input_file = "emnist-digits-train.csv"
df_train = pd.read_csv(filepath_or_buffer=input_file, header=None)

data_train = df_train.to_numpy()

y_train = data_train[:, 0]
X_train = data_train[:, 1:]

# Model definitions
# NOTE(review): SVC(probability=True) has no random_state here, so its
# probability estimates (and the soft-voting scores) can vary between runs.
extra_trees_clf = ExtraTreesClassifier(
    random_state=42, n_jobs=-1, max_depth=30, n_estimators=100
)
poly_kernel_svm_clf = SVC(
    C=100, kernel="poly", degree=3, coef0=1, gamma='auto', probability=True, max_iter=-1
)

# Build the ensemble (soft voting over extra-trees and the polynomial-kernel SVC)
voting_clf = VotingClassifier(
    estimators=[('etc', extra_trees_clf), ('svc', poly_kernel_svm_clf)],
    voting='soft',
)

# Run 10-fold cross-validation on all available cores
cv_scores = cross_val_score(
    estimator=voting_clf, X=X_train, y=y_train, scoring="accuracy", cv=10, n_jobs=-1
)

# Print the mean accuracy
print(f"Cross Validation Mean is {np.mean(cv_scores)}")


#

# 결과

In [10]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
import numpy as np
import pandas as pd

# Reload the training CSV and split it again into target (column 0) and
# features (remaining columns), the same steps as at the top of the file.
input_file = "emnist-digits-train.csv"
df_train = pd.read_csv(filepath_or_buffer=input_file, header=None)

data_train = df_train.to_numpy()

y_train = data_train[:, 0]
X_train = data_train[:, 1:]

# Model definitions
# NOTE(review): SVC(probability=True) has no random_state here, so a rerun may
# not reproduce the recorded fold scores exactly — confirm before seeding it.
extra_tree_clf = ExtraTreesClassifier(
    random_state=42, n_jobs=-1, max_depth=30, n_estimators=100
)
svm_clf = SVC(
    C=100, kernel="poly", degree=3, coef0=1, gamma='auto', probability=True, max_iter=-1
)

# Build the ensemble (soft voting over extra-trees and the polynomial-kernel SVC)
voting_clf = VotingClassifier(
    estimators=[('etc', extra_tree_clf), ('svc', svm_clf)],
    voting='soft',
)

# Run 10-fold cross-validation on all available cores
cv_scores = cross_val_score(
    estimator=voting_clf, X=X_train, y=y_train, scoring="accuracy", cv=10, n_jobs=-1
)

# Print the accuracy of each fold
for i, score in enumerate(cv_scores, start=1):
    print(f"Fold {i}: {score}")

# Print the mean accuracy
print(f"Cross Validation Mean is {np.mean(cv_scores)}")


Fold 1: 0.9901666666666666
Fold 2: 0.989
Fold 3: 0.9903333333333333
Fold 4: 0.9905416666666667
Fold 5: 0.9887916666666666
Fold 6: 0.9909166666666667
Fold 7: 0.99025
Fold 8: 0.990875
Fold 9: 0.9903333333333333
Fold 10: 0.9894583333333333
Cross Validation Mean is 0.9900666666666667
