# 2. FER2013 RandomForest 분류

In [1]:
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

## 2.1 학습에 사용될 하이퍼파라미터 선언 및 Random Seed 설정

In [2]:
# Seed NumPy's legacy global RNG so any np.random draws made later in the
# notebook are repeatable across kernel restarts.
np.random.seed(seed=201)

In [3]:
# --- Reproducibility / split settings ---------------------------------------
# Seed passed as `random_state` to the splitters and the forests below.
RND_SEED = 0
# Hold-out fraction used by each train_test_split call.
TEST_SPLIT = 0.2

# --- Data location -----------------------------------------------------------
# Directory (trailing slash included, since the loader concatenates it with
# the file names) and the two .npy files holding the inputs and labels.
FER2013_DIR = "datasets/ferPlus/prep/"
INPUT_TRAIN_DATA = "ferPlus_X.npy"
LABEL_TRAIN_DATA = "ferPlus_y.npy"

In [4]:
# Load the input and label arrays from the configured directory.
# The path is handed to np.load directly (instead of an open() handle that is
# never closed) so that NumPy opens and closes the file itself.
input_data = np.load(os.path.join(FER2013_DIR, INPUT_TRAIN_DATA))
label_data = np.load(os.path.join(FER2013_DIR, LABEL_TRAIN_DATA))

In [5]:
# Flatten every sample from (nx, ny, nc) into one feature vector of length
# nx*ny*nc, giving the 2-D (samples, features) layout the estimator is fed.
# The ndim guard makes the cell safe to re-run: once input_data is already
# 2-D the 4-way unpack below would raise, so the reshape is skipped instead.
# Any other unexpected rank still fails loudly on the unpack, as before.
if input_data.ndim != 2:
    nsamples, nx, ny, nc = input_data.shape
    input_data = input_data.reshape((nsamples, nx * ny * nc))

In [6]:
# Sanity check: show the (samples, features) shape after flattening.
np.shape(input_data)

(11043, 2304)

In [7]:
# First split: carve off TEST_SPLIT of all samples as the held-out test set.
# The fixed random_state makes the partition repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    input_data,
    label_data,
    test_size=TEST_SPLIT,
    random_state=RND_SEED,
)

In [8]:
# Second split: take TEST_SPLIT of the remaining training data as a
# validation set.
# NOTE: this rebinds x_train / y_train to the smaller partition, so running
# the cell more than once keeps shrinking the training set -- run it exactly
# once after the train/test split above.
x_train, x_val, y_train, y_val = train_test_split(
    x_train,
    y_train,
    test_size=TEST_SPLIT,
    random_state=RND_SEED,
)

## 2.2 RandomForest를 이용하여 학습

In [9]:
# Train a 1000-tree random forest on the training partition.
# Each label row is collapsed to a single class index with argmax over axis 1.
clf = RandomForestClassifier(
    n_estimators=1000,
    oob_score=True,
    random_state=RND_SEED,
)
clf.fit(x_train, y_train.argmax(axis=1))

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=None, oob_score=True, random_state=0, verbose=0,
                       warm_start=False)

검증셋 정확도

In [10]:
# Mean accuracy on the validation partition (labels reduced via argmax).
clf.score(x_val, y_val.argmax(axis=1))

0.7628749292586304

테스트셋 정확도

In [11]:
# Mean accuracy on the held-out test partition (labels reduced via argmax).
clf.score(x_test, y_test.argmax(axis=1))

0.7618832050701675

## 2.3 K-fold Cross Validation 이용

In [12]:
# 10-fold cross-validation over the full dataset.
# Previously the loop only rebound the split variables, so nothing was trained
# or scored until after it ended and only the final fold was ever evaluated.
# Here a forest is fitted and scored inside every fold, and the per-fold
# accuracies are collected so their mean/std can be reported.
kf = KFold(n_splits=10, random_state=RND_SEED, shuffle=True)

fold_scores = []
for train_index, test_index in kf.split(input_data):
    X_train, X_test = input_data[train_index], input_data[test_index]
    Y_train, Y_test = label_data[train_index], label_data[test_index]

    # Same hyper-parameters as the other forests in this notebook, so the
    # fold results are comparable with the single-split scores.
    fold_clf = RandomForestClassifier(n_estimators=1000, oob_score=True,
                                      random_state=RND_SEED)
    fold_clf.fit(X_train, np.argmax(Y_train, axis=1))
    fold_scores.append(fold_clf.score(X_test, np.argmax(Y_test, axis=1)))

# X_train / X_test / Y_train / Y_test stay bound to the last fold, so the
# following cells that reference them keep working unchanged.
# Displayed result: (mean accuracy, standard deviation) across the folds.
float(np.mean(fold_scores)), float(np.std(fold_scores))

In [13]:
# Refit a fresh 1000-tree forest (this rebinds `clf`) on X_train / Y_train,
# which hold whatever the K-fold loop cell bound last, i.e. the final fold's
# training partition. Label rows are reduced to class indices via argmax.
clf = RandomForestClassifier(
    n_estimators=1000,
    oob_score=True,
    random_state=RND_SEED,
)
clf.fit(X_train, Y_train.argmax(axis=1))

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=None, oob_score=True, random_state=0, verbose=0,
                       warm_start=False)

테스트셋 정확도 (마지막 fold의 검증 분할 기준)

In [14]:
# Mean accuracy on X_test / Y_test, the held-out part of the final fold left
# bound by the K-fold loop cell.
clf.score(X_test, Y_test.argmax(axis=1))

0.7690217391304348