In [1]:
# 성능 평가 계산 함수
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import f1_score

def get_clf_eval(y_test, pred):
    """Print accuracy and macro-averaged precision, recall and F1 for a set of predictions.

    Args:
        y_test: Ground-truth labels.
        pred: Predicted labels, aligned element-wise with ``y_test``.

    Returns:
        None. The four scores are written to stdout on a single line,
        each formatted with four decimal places.
    """
    # The three per-class metrics share the same averaging mode.
    macro = {'average': 'macro'}
    scores = (
        accuracy_score(y_test, pred),
        precision_score(y_test, pred, **macro),
        recall_score(y_test, pred, **macro),
        f1_score(y_test, pred, **macro),
    )

    print('정확도: {0:.4f}, 정밀도: {1:.4f}, 재현율: {2:.4f}, F1:{3:.4f}'.format(*scores))

# 의사 결정 트리 (DecisionTree Classifier)

In [2]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV

# Load the handwritten-digits dataset via scikit-learn's load_digits().
digits = load_digits()

# Hold out 20% of the samples as a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    digits.data,
    digits.target,
    test_size=0.2,
    random_state=0,
)

# Base decision tree whose hyper-parameters are tuned by the grid searches below.
dt_clf = DecisionTreeClassifier(random_state=0)

# Hyper-parameter grid for GridSearchCV: 4 depths x 5 split thresholds.
parameters = {
    'max_depth': [2, 3, 4, 5],
    'min_samples_split': [2, 4, 6, 8, 10],
}

#### 정확도

In [3]:
# Cross-validated hyper-parameter search for the decision tree, scored by accuracy
import pandas as pd

# Stratified 5-fold splitter; shuffling with a fixed seed keeps the folds reproducible.
cross_validation = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

# No `scoring` is given, so the search uses the estimator's default score.
# refit=True leaves a fitted best model available as best_estimator_.
grid_dtree = GridSearchCV(
    estimator=dt_clf,
    param_grid=parameters,
    cv=cross_validation,
    refit=True,
)

# Train and evaluate every parameter combination on the training split.
grid_dtree.fit(X_train, y_train)

# Tabulate the search results and show the mean, rank and per-fold test scores.
scores_df = pd.DataFrame(grid_dtree.cv_results_)
scores_df.loc[:, [
    'params',
    'mean_test_score',
    'rank_test_score',
    'split0_test_score',
    'split1_test_score',
    'split2_test_score',
    'split3_test_score',
    'split4_test_score',
]]

Unnamed: 0,params,mean_test_score,rank_test_score,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score
0,"{'max_depth': 2, 'min_samples_split': 2}",0.329152,16,0.333333,0.333333,0.324042,0.320557,0.334495
1,"{'max_depth': 2, 'min_samples_split': 4}",0.329152,16,0.333333,0.333333,0.324042,0.320557,0.334495
2,"{'max_depth': 2, 'min_samples_split': 6}",0.329152,16,0.333333,0.333333,0.324042,0.320557,0.334495
3,"{'max_depth': 2, 'min_samples_split': 8}",0.329152,16,0.333333,0.333333,0.324042,0.320557,0.334495
4,"{'max_depth': 2, 'min_samples_split': 10}",0.329152,16,0.333333,0.333333,0.324042,0.320557,0.334495
5,"{'max_depth': 3, 'min_samples_split': 2}",0.478777,11,0.461806,0.493056,0.494774,0.477352,0.466899
6,"{'max_depth': 3, 'min_samples_split': 4}",0.478777,11,0.461806,0.493056,0.494774,0.477352,0.466899
7,"{'max_depth': 3, 'min_samples_split': 6}",0.478777,11,0.461806,0.493056,0.494774,0.477352,0.466899
8,"{'max_depth': 3, 'min_samples_split': 8}",0.478777,11,0.461806,0.493056,0.494774,0.477352,0.466899
9,"{'max_depth': 3, 'min_samples_split': 10}",0.478777,11,0.461806,0.493056,0.494774,0.477352,0.466899


In [4]:
# Report the best parameter combination of the accuracy search and its best mean CV score.
print('GridSearchCV 최적 파라미터:',grid_dtree.best_params_)
print('GridSearchCV 최고 정확도: {0:.4f}'.format(grid_dtree.best_score_))

GridSearchCV 최적 파라미터: {'max_depth': 5, 'min_samples_split': 2}
GridSearchCV 최고 정확도: 0.7488


=> DecisionTree Classifier 모델에서 하이퍼 파라미터 최적화 결과 max_depth가 5, min_samples_split이 2일 때 평균 최고 정확도가 74.88%로
측정됐다.

#### 정밀도

In [5]:
# Cross-validated hyper-parameter search for the decision tree, scored by macro precision
import pandas as pd
from sklearn.metrics import make_scorer
import warnings

# NOTE(review): this silences every warning for the rest of the session, not just this cell.
warnings.filterwarnings('ignore')

# Stratified 5-fold splitter; shuffling with a fixed seed keeps the folds reproducible.
cross_validation = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

# Candidates are ranked by macro-averaged precision.
# refit=True leaves a fitted best model available as best_estimator_.
grid_dtree = GridSearchCV(
    estimator=dt_clf,
    param_grid=parameters,
    scoring=make_scorer(precision_score, average='macro'),
    cv=cross_validation,
    refit=True,
)

# Train and evaluate every parameter combination on the training split.
grid_dtree.fit(X_train, y_train)

# Tabulate the search results and show the mean, rank and per-fold test scores.
scores_df = pd.DataFrame(grid_dtree.cv_results_)
scores_df.loc[:, [
    'params',
    'mean_test_score',
    'rank_test_score',
    'split0_test_score',
    'split1_test_score',
    'split2_test_score',
    'split3_test_score',
    'split4_test_score',
]]

Unnamed: 0,params,mean_test_score,rank_test_score,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score
0,"{'max_depth': 2, 'min_samples_split': 2}",0.20368,16,0.221763,0.198119,0.207013,0.193911,0.197595
1,"{'max_depth': 2, 'min_samples_split': 4}",0.20368,16,0.221763,0.198119,0.207013,0.193911,0.197595
2,"{'max_depth': 2, 'min_samples_split': 6}",0.20368,16,0.221763,0.198119,0.207013,0.193911,0.197595
3,"{'max_depth': 2, 'min_samples_split': 8}",0.20368,16,0.221763,0.198119,0.207013,0.193911,0.197595
4,"{'max_depth': 2, 'min_samples_split': 10}",0.20368,16,0.221763,0.198119,0.207013,0.193911,0.197595
5,"{'max_depth': 3, 'min_samples_split': 2}",0.443988,11,0.383077,0.457094,0.467531,0.463977,0.448261
6,"{'max_depth': 3, 'min_samples_split': 4}",0.443988,11,0.383077,0.457094,0.467531,0.463977,0.448261
7,"{'max_depth': 3, 'min_samples_split': 6}",0.443988,11,0.383077,0.457094,0.467531,0.463977,0.448261
8,"{'max_depth': 3, 'min_samples_split': 8}",0.443988,11,0.383077,0.457094,0.467531,0.463977,0.448261
9,"{'max_depth': 3, 'min_samples_split': 10}",0.443988,11,0.383077,0.457094,0.467531,0.463977,0.448261


In [6]:
# Report the best parameter combination of the precision search and its best mean CV score.
print('GridSearchCV 최적 파라미터:',grid_dtree.best_params_)
print('GridSearchCV 최고 정밀도: {0:.4f}'.format(grid_dtree.best_score_))

GridSearchCV 최적 파라미터: {'max_depth': 5, 'min_samples_split': 2}
GridSearchCV 최고 정밀도: 0.7499


=> DecisionTree Classifier 모델에서 하이퍼 파라미터 최적화 결과 max_depth가 5, min_samples_split이 2일 때 평균 최고 정밀도가 74.99%로
측정됐다.

#### 재현율

In [7]:
# Cross-validated hyper-parameter search for the decision tree, scored by macro recall
import pandas as pd
from sklearn.metrics import make_scorer
import warnings

# NOTE(review): this silences every warning for the rest of the session, not just this cell.
warnings.filterwarnings('ignore')

# Stratified 5-fold splitter; shuffling with a fixed seed keeps the folds reproducible.
cross_validation = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

# Candidates are ranked by macro-averaged recall.
# refit=True leaves a fitted best model available as best_estimator_.
grid_dtree = GridSearchCV(
    estimator=dt_clf,
    param_grid=parameters,
    scoring=make_scorer(recall_score, average='macro'),
    cv=cross_validation,
    refit=True,
)

# Train and evaluate every parameter combination on the training split.
grid_dtree.fit(X_train, y_train)

# Tabulate the search results and show the mean, rank and per-fold test scores.
scores_df = pd.DataFrame(grid_dtree.cv_results_)
scores_df.loc[:, [
    'params',
    'mean_test_score',
    'rank_test_score',
    'split0_test_score',
    'split1_test_score',
    'split2_test_score',
    'split3_test_score',
    'split4_test_score',
]]

Unnamed: 0,params,mean_test_score,rank_test_score,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score
0,"{'max_depth': 2, 'min_samples_split': 2}",0.31375,16,0.31881,0.31881,0.310038,0.305323,0.315771
1,"{'max_depth': 2, 'min_samples_split': 4}",0.31375,16,0.31881,0.31881,0.310038,0.305323,0.315771
2,"{'max_depth': 2, 'min_samples_split': 6}",0.31375,16,0.31881,0.31881,0.310038,0.305323,0.315771
3,"{'max_depth': 2, 'min_samples_split': 8}",0.31375,16,0.31881,0.31881,0.310038,0.305323,0.315771
4,"{'max_depth': 2, 'min_samples_split': 10}",0.31375,16,0.31881,0.31881,0.310038,0.305323,0.315771
5,"{'max_depth': 3, 'min_samples_split': 2}",0.470694,11,0.452834,0.484631,0.489506,0.470568,0.455929
6,"{'max_depth': 3, 'min_samples_split': 4}",0.470694,11,0.452834,0.484631,0.489506,0.470568,0.455929
7,"{'max_depth': 3, 'min_samples_split': 6}",0.470694,11,0.452834,0.484631,0.489506,0.470568,0.455929
8,"{'max_depth': 3, 'min_samples_split': 8}",0.470694,11,0.452834,0.484631,0.489506,0.470568,0.455929
9,"{'max_depth': 3, 'min_samples_split': 10}",0.470694,11,0.452834,0.484631,0.489506,0.470568,0.455929


In [8]:
# Report the best parameter combination of the recall search and its best mean CV score.
print('GridSearchCV 최적 파라미터:',grid_dtree.best_params_)
print('GridSearchCV 최고 재현율: {0:.4f}'.format(grid_dtree.best_score_))

GridSearchCV 최적 파라미터: {'max_depth': 5, 'min_samples_split': 2}
GridSearchCV 최고 재현율: 0.7446


=> DecisionTree Classifier 모델에서 하이퍼 파라미터 최적화 결과 max_depth가 5, min_samples_split이 2일 때 평균 최고 재현율이 74.46%로
측정됐다.

#### F1 스코어

In [9]:
# Cross-validated hyper-parameter search for the decision tree, scored by macro F1
import pandas as pd
from sklearn.metrics import make_scorer
import warnings

# NOTE(review): this silences every warning for the rest of the session, not just this cell.
warnings.filterwarnings('ignore')

# Stratified 5-fold splitter; shuffling with a fixed seed keeps the folds reproducible.
cross_validation = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

# Candidates are ranked by macro-averaged F1.
# refit=True leaves a fitted best model available as best_estimator_.
grid_dtree = GridSearchCV(
    estimator=dt_clf,
    param_grid=parameters,
    scoring=make_scorer(f1_score, average='macro'),
    cv=cross_validation,
    refit=True,
)

# Train and evaluate every parameter combination on the training split.
grid_dtree.fit(X_train, y_train)

# Tabulate the search results and show the mean, rank and per-fold test scores.
scores_df = pd.DataFrame(grid_dtree.cv_results_)
scores_df.loc[:, [
    'params',
    'mean_test_score',
    'rank_test_score',
    'split0_test_score',
    'split1_test_score',
    'split2_test_score',
    'split3_test_score',
    'split4_test_score',
]]

Unnamed: 0,params,mean_test_score,rank_test_score,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score
0,"{'max_depth': 2, 'min_samples_split': 2}",0.209319,16,0.213329,0.209566,0.215841,0.202254,0.205608
1,"{'max_depth': 2, 'min_samples_split': 4}",0.209319,16,0.213329,0.209566,0.215841,0.202254,0.205608
2,"{'max_depth': 2, 'min_samples_split': 6}",0.209319,16,0.213329,0.209566,0.215841,0.202254,0.205608
3,"{'max_depth': 2, 'min_samples_split': 8}",0.209319,16,0.213329,0.209566,0.215841,0.202254,0.205608
4,"{'max_depth': 2, 'min_samples_split': 10}",0.209319,16,0.213329,0.209566,0.215841,0.202254,0.205608
5,"{'max_depth': 3, 'min_samples_split': 2}",0.383898,11,0.367253,0.390396,0.394844,0.390192,0.376807
6,"{'max_depth': 3, 'min_samples_split': 4}",0.383898,11,0.367253,0.390396,0.394844,0.390192,0.376807
7,"{'max_depth': 3, 'min_samples_split': 6}",0.383898,11,0.367253,0.390396,0.394844,0.390192,0.376807
8,"{'max_depth': 3, 'min_samples_split': 8}",0.383898,11,0.367253,0.390396,0.394844,0.390192,0.376807
9,"{'max_depth': 3, 'min_samples_split': 10}",0.383898,11,0.367253,0.390396,0.394844,0.390192,0.376807


In [10]:
# Report the best parameter combination of the F1 search and its best mean CV score.
print('GridSearchCV 최적 파라미터:',grid_dtree.best_params_)
print('GridSearchCV 최고 F1 스코어: {0:.4f}'.format(grid_dtree.best_score_))

GridSearchCV 최적 파라미터: {'max_depth': 5, 'min_samples_split': 2}
GridSearchCV 최고 F1 스코어: 0.7341


=> DecisionTree Classifier 모델에서 하이퍼 파라미터 최적화 결과 max_depth가 5, min_samples_split이 2일 때 평균 최고 F1 스코어가 73.41%로
측정됐다.

In [11]:
# The search was built with refit=True, so best_estimator_ is already a fitted model.
# grid_dtree is whichever search ran last: the F1-scored one when the notebook is run top to bottom.
estimator = grid_dtree.best_estimator_

# Evaluate on the held-out test split and print accuracy / precision / recall / F1.
pred = estimator.predict(X_test)
get_clf_eval(y_test,pred)

정확도: 0.7306, 정밀도: 0.7656, 재현율: 0.7368, F1:0.7409


=> 별도의 테스트 데이터 세트로 성능을 평가한 결과 정확도는 73.06%, 정밀도는 76.56%, 재현율은 73.68%, 그리고 F1 스코어는 74.09%의 결과가 도출됐다.
=> 위의 성능 평가를 종합한 결과, 탐색한 하이퍼 파라미터 범위 내에서는 max_depth가 5, min_samples_split이 2일 때 DecisionTree Classifier 모델이 최적의 성능을 보인다.

# 랜덤 포레스트 (Random Forest Classifier)

In [12]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV

# Reload the handwritten-digits dataset via scikit-learn's load_digits().
digits = load_digits()

# Hold out 20% of the samples as a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    digits.data,
    digits.target,
    test_size=0.2,
    random_state=0,
)

# Base random forest whose hyper-parameters are tuned by the grid searches below.
rf_clf = RandomForestClassifier(random_state=0)

# Hyper-parameter grid for GridSearchCV: 5 split thresholds x 3 forest sizes.
# This rebinds `parameters`, replacing the decision-tree grid defined earlier.
parameters = {
    'min_samples_split': [2, 4, 6, 8, 10],
    'n_estimators': [100, 300, 500],
}

#### 정확도

In [13]:
# Cross-validated hyper-parameter search for the random forest, scored by accuracy
import pandas as pd

# Stratified 5-fold splitter; shuffling with a fixed seed keeps the folds reproducible.
cross_validation = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

# No `scoring` is given, so the search uses the estimator's default score.
# refit=True leaves a fitted best model available as best_estimator_.
grid_rforest = GridSearchCV(
    estimator=rf_clf,
    param_grid=parameters,
    cv=cross_validation,
    refit=True,
)

# Train and evaluate every parameter combination on the training split.
grid_rforest.fit(X_train, y_train)

# Tabulate the search results and show the mean, rank and per-fold test scores.
scores_df = pd.DataFrame(grid_rforest.cv_results_)
scores_df.loc[:, [
    'params',
    'mean_test_score',
    'rank_test_score',
    'split0_test_score',
    'split1_test_score',
    'split2_test_score',
    'split3_test_score',
    'split4_test_score',
]]

Unnamed: 0,params,mean_test_score,rank_test_score,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score
0,"{'min_samples_split': 2, 'n_estimators': 100}",0.972849,1,0.986111,0.975694,0.961672,0.972125,0.968641
1,"{'min_samples_split': 2, 'n_estimators': 300}",0.971462,3,0.986111,0.965278,0.961672,0.97561,0.968641
2,"{'min_samples_split': 2, 'n_estimators': 500}",0.971465,2,0.982639,0.965278,0.968641,0.972125,0.968641
3,"{'min_samples_split': 4, 'n_estimators': 100}",0.966592,10,0.982639,0.958333,0.965157,0.965157,0.961672
4,"{'min_samples_split': 4, 'n_estimators': 300}",0.966589,11,0.982639,0.961806,0.958188,0.968641,0.961672
5,"{'min_samples_split': 4, 'n_estimators': 500}",0.968682,6,0.982639,0.958333,0.965157,0.968641,0.968641
6,"{'min_samples_split': 6, 'n_estimators': 100}",0.967289,8,0.975694,0.965278,0.961672,0.965157,0.968641
7,"{'min_samples_split': 6, 'n_estimators': 300}",0.970074,4,0.982639,0.961806,0.968641,0.968641,0.968641
8,"{'min_samples_split': 6, 'n_estimators': 500}",0.969377,5,0.982639,0.961806,0.968641,0.968641,0.965157
9,"{'min_samples_split': 8, 'n_estimators': 100}",0.965203,12,0.982639,0.951389,0.968641,0.965157,0.958188


In [14]:
# Report the best parameter combination of the accuracy search and its best mean CV score.
print('GridSearchCV 최적 파라미터:',grid_rforest.best_params_)
print('GridSearchCV 최고 정확도: {0:.4f}'.format(grid_rforest.best_score_))

GridSearchCV 최적 파라미터: {'min_samples_split': 2, 'n_estimators': 100}
GridSearchCV 최고 정확도: 0.9728


=> Random Forest Classifier 모델에서 하이퍼 파라미터 최적화 결과 n_estimators가 100, min_samples_split이 2일 때 평균 최고 정확도가 97.28%로
측정됐다.

#### 정밀도

In [15]:
# Cross-validated hyper-parameter search for the random forest, scored by macro precision
import pandas as pd
# Imported here so this cell does not rely on an earlier decision-tree cell having
# brought make_scorer into the kernel namespace.
from sklearn.metrics import make_scorer

# Stratified 5-fold splitter; shuffling with a fixed seed keeps the folds reproducible.
cross_validation = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

# Candidates are ranked by macro-averaged precision.
# refit=True leaves a fitted best model available as best_estimator_.
grid_rforest = GridSearchCV(
    rf_clf,
    param_grid=parameters,
    scoring=make_scorer(precision_score, average='macro'),
    cv=cross_validation,
    refit=True,
)

# Train and evaluate every parameter combination on the training split.
grid_rforest.fit(X_train, y_train)

# Tabulate the search results and show the mean, rank and per-fold test scores.
scores_df = pd.DataFrame(grid_rforest.cv_results_)
scores_df[[
    'params',
    'mean_test_score',
    'rank_test_score',
    'split0_test_score',
    'split1_test_score',
    'split2_test_score',
    'split3_test_score',
    'split4_test_score',
]]

Unnamed: 0,params,mean_test_score,rank_test_score,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score
0,"{'min_samples_split': 2, 'n_estimators': 100}",0.973813,1,0.985908,0.975757,0.963755,0.974267,0.969375
1,"{'min_samples_split': 2, 'n_estimators': 300}",0.972462,2,0.986273,0.966819,0.963243,0.977089,0.968883
2,"{'min_samples_split': 2, 'n_estimators': 500}",0.97231,3,0.983148,0.966328,0.969804,0.973386,0.968883
3,"{'min_samples_split': 4, 'n_estimators': 100}",0.967732,10,0.98257,0.960617,0.966947,0.965876,0.962653
4,"{'min_samples_split': 4, 'n_estimators': 300}",0.967739,9,0.98257,0.963115,0.960818,0.970563,0.96163
5,"{'min_samples_split': 4, 'n_estimators': 500}",0.969643,6,0.98257,0.959968,0.966815,0.969978,0.968883
6,"{'min_samples_split': 6, 'n_estimators': 100}",0.968248,7,0.975418,0.966623,0.962538,0.965876,0.970785
7,"{'min_samples_split': 6, 'n_estimators': 300}",0.970778,4,0.98257,0.962731,0.97005,0.969467,0.969071
8,"{'min_samples_split': 6, 'n_estimators': 500}",0.970298,5,0.98257,0.963672,0.97005,0.969467,0.965732
9,"{'min_samples_split': 8, 'n_estimators': 100}",0.965828,13,0.982683,0.952477,0.97005,0.965661,0.958267


In [16]:
# Report the best parameter combination of the precision search and its best mean CV score.
print('GridSearchCV 최적 파라미터:',grid_rforest.best_params_)
print('GridSearchCV 최고 정밀도: {0:.4f}'.format(grid_rforest.best_score_))

GridSearchCV 최적 파라미터: {'min_samples_split': 2, 'n_estimators': 100}
GridSearchCV 최고 정밀도: 0.9738


=> Random Forest Classifier 모델에서 하이퍼 파라미터 최적화 결과 n_estimators가 100, min_samples_split이 2일 때 평균 최고 정밀도가 97.38%로
측정됐다.

#### 재현율

In [17]:
# Cross-validated hyper-parameter search for the random forest, scored by macro recall
import pandas as pd
# Imported here so this cell does not rely on an earlier decision-tree cell having
# brought make_scorer into the kernel namespace.
from sklearn.metrics import make_scorer

# Stratified 5-fold splitter; shuffling with a fixed seed keeps the folds reproducible.
cross_validation = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

# Candidates are ranked by macro-averaged recall.
# refit=True leaves a fitted best model available as best_estimator_.
grid_rforest = GridSearchCV(
    rf_clf,
    param_grid=parameters,
    scoring=make_scorer(recall_score, average='macro'),
    cv=cross_validation,
    refit=True,
)

# Train and evaluate every parameter combination on the training split.
grid_rforest.fit(X_train, y_train)

# Tabulate the search results and show the mean, rank and per-fold test scores.
scores_df = pd.DataFrame(grid_rforest.cv_results_)
scores_df[[
    'params',
    'mean_test_score',
    'rank_test_score',
    'split0_test_score',
    'split1_test_score',
    'split2_test_score',
    'split3_test_score',
    'split4_test_score',
]]

Unnamed: 0,params,mean_test_score,rank_test_score,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score
0,"{'min_samples_split': 2, 'n_estimators': 100}",0.972493,1,0.98545,0.974471,0.962548,0.972463,0.967532
1,"{'min_samples_split': 2, 'n_estimators': 300}",0.971182,2,0.98582,0.96434,0.962786,0.975911,0.967054
2,"{'min_samples_split': 2, 'n_estimators': 500}",0.971108,3,0.982249,0.96397,0.969691,0.972578,0.967054
3,"{'min_samples_split': 4, 'n_estimators': 100}",0.966202,11,0.982249,0.957065,0.966012,0.965426,0.960257
4,"{'min_samples_split': 4, 'n_estimators': 300}",0.966271,9,0.982249,0.960637,0.959215,0.969129,0.960125
5,"{'min_samples_split': 4, 'n_estimators': 500}",0.968302,6,0.982249,0.957065,0.966012,0.969129,0.967054
6,"{'min_samples_split': 6, 'n_estimators': 100}",0.966888,8,0.975319,0.964102,0.962416,0.965426,0.967178
7,"{'min_samples_split': 6, 'n_estimators': 300}",0.969736,4,0.982249,0.960769,0.969345,0.969129,0.967187
8,"{'min_samples_split': 6, 'n_estimators': 500}",0.96902,5,0.982249,0.960769,0.969345,0.969129,0.963606
9,"{'min_samples_split': 8, 'n_estimators': 100}",0.964651,13,0.982116,0.950506,0.969345,0.96508,0.956208


In [18]:
# Report the best parameter combination of the recall search and its best mean CV score.
print('GridSearchCV 최적 파라미터:',grid_rforest.best_params_)
print('GridSearchCV 최고 재현율: {0:.4f}'.format(grid_rforest.best_score_))

GridSearchCV 최적 파라미터: {'min_samples_split': 2, 'n_estimators': 100}
GridSearchCV 최고 재현율: 0.9725


=> Random Forest Classifier 모델에서 하이퍼 파라미터 최적화 결과 n_estimators가 100, min_samples_split이 2일 때 평균 최고 재현율이 97.25%로
측정됐다.

#### F1 스코어

In [19]:
# Cross-validated hyper-parameter search for the random forest, scored by macro F1
import pandas as pd
# Imported here so this cell does not rely on an earlier decision-tree cell having
# brought make_scorer into the kernel namespace.
from sklearn.metrics import make_scorer

# Stratified 5-fold splitter; shuffling with a fixed seed keeps the folds reproducible.
cross_validation = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

# Candidates are ranked by macro-averaged F1.
# refit=True leaves a fitted best model available as best_estimator_.
grid_rforest = GridSearchCV(
    rf_clf,
    param_grid=parameters,
    scoring=make_scorer(f1_score, average='macro'),
    cv=cross_validation,
    refit=True,
)

# Train and evaluate every parameter combination on the training split.
grid_rforest.fit(X_train, y_train)

# Tabulate the search results and show the mean, rank and per-fold test scores.
scores_df = pd.DataFrame(grid_rforest.cv_results_)
scores_df[[
    'params',
    'mean_test_score',
    'rank_test_score',
    'split0_test_score',
    'split1_test_score',
    'split2_test_score',
    'split3_test_score',
    'split4_test_score',
]]

Unnamed: 0,params,mean_test_score,rank_test_score,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score
0,"{'min_samples_split': 2, 'n_estimators': 100}",0.972446,1,0.985556,0.974549,0.961666,0.972509,0.967949
1,"{'min_samples_split': 2, 'n_estimators': 300}",0.971087,2,0.985933,0.964465,0.961673,0.975974,0.967392
2,"{'min_samples_split': 2, 'n_estimators': 500}",0.970971,3,0.982528,0.963839,0.968634,0.972463,0.967392
3,"{'min_samples_split': 4, 'n_estimators': 100}",0.966077,10,0.982297,0.95703,0.965299,0.965075,0.960686
4,"{'min_samples_split': 4, 'n_estimators': 300}",0.966101,9,0.982297,0.96051,0.958346,0.968998,0.960356
5,"{'min_samples_split': 4, 'n_estimators': 500}",0.968189,6,0.982297,0.957107,0.965309,0.968843,0.967392
6,"{'min_samples_split': 6, 'n_estimators': 100}",0.966777,8,0.975199,0.964208,0.961423,0.965075,0.967981
7,"{'min_samples_split': 6, 'n_estimators': 300}",0.96965,4,0.982297,0.960844,0.968686,0.968786,0.967637
8,"{'min_samples_split': 6, 'n_estimators': 500}",0.968987,5,0.982297,0.961062,0.968686,0.968786,0.964103
9,"{'min_samples_split': 8, 'n_estimators': 100}",0.964478,13,0.982222,0.950045,0.968686,0.964956,0.956481


In [20]:
# Report the best parameter combination of the F1 search and its best mean CV score.
print('GridSearchCV 최적 파라미터:',grid_rforest.best_params_)
print('GridSearchCV 최고 F1 스코어: {0:.4f}'.format(grid_rforest.best_score_))

GridSearchCV 최적 파라미터: {'min_samples_split': 2, 'n_estimators': 100}
GridSearchCV 최고 F1 스코어: 0.9724


=> Random Forest Classifier 모델에서 하이퍼 파라미터 최적화 결과 n_estimators가 100, min_samples_split이 2일 때 평균 최고 F1 스코어가 97.24%로
측정됐다.

In [21]:
# The search was built with refit=True, so best_estimator_ is already a fitted model.
# grid_rforest is whichever search ran last: the F1-scored one when the notebook is run top to bottom.
estimator = grid_rforest.best_estimator_

# Predict labels for the held-out test split.
pred = estimator.predict(X_test)

# Print accuracy / precision / recall / F1 for the test-set predictions.
get_clf_eval(y_test,pred)

정확도: 0.9694, 정밀도: 0.9695, 재현율: 0.9709, F1:0.9698


=> 별도의 테스트 데이터 세트로 성능을 평가한 결과 정확도는 96.94%, 정밀도는 96.95%, 재현율은 97.09%, 그리고 F1 스코어는 96.98%의 결과가 도출됐다.
=> 위의 성능 평가를 종합한 결과, 탐색한 하이퍼 파라미터 범위 내에서는 n_estimators가 100, min_samples_split이 2일 때 Random Forest Classifier 모델이 최적의 성능을 보인다.

# 성능 레포트

의사 결정 트리(DecisionTree Classifier) 모델에서 하이퍼 파라미터 최적화를 수행하였을 때 정확도, 정밀도, 재현율, F1 스코어 각 성능 모두 max_depth가 5, min_samples_split이 2일 때 가장 최적 성능을 보였다. 각 성능은 정확도는 74.88%, 정밀도는 74.99%, 재현율은 74.46%, F1 스코어는 73.41%로 측정됐다. 가장 최적 성능을 보인 모델을 테스트 데이터 세트로 성능 평가를 한 결과 정확도는 73.06%, 정밀도는 76.56%, 재현율은 73.68%, 그리고 F1 스코어는 74.09%의 결과가 도출됐다.

랜덤 포레스트(Random Forest Classifier) 모델에서 하이퍼 파라미터 최적화를 수행하였을 때 정확도, 정밀도, 재현율, F1 스코어 각 성능 모두 n_estimators가 100, min_samples_split이 2일 때 가장 최적 성능을 보였다. 각 성능은 정확도는 97.28%, 정밀도는 97.38%, 재현율은 97.25%, F1 스코어는 97.24%로 측정됐다. 가장 최적 성능을 보인 모델을 테스트 데이터 세트로 성능 평가를 한 결과 정확도는 96.94%, 정밀도는 96.95%, 재현율은 97.09%, 그리고 F1 스코어는 96.98%의 결과가 도출됐다.

의사 결정 트리(DecisionTree Classifier)와 랜덤 포레스트(Random Forest Classifier)를 각각 최적 하이퍼 파라미터로 설정하여 비교하였을 때 정확도, 정밀도, 재현율, F1 스코어 모두 랜덤 포레스트의 성능이 훨씬 더 좋은 것을 확인할 수 있다. 따라서 두 모델 중 더 나은 성능을 보이는 모델은 랜덤 포레스트이다.