In [1]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.preprocessing import LabelEncoder

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import tensorflow as tf
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV


'''
- 데이터는 총 150개의 로우로 구성되어있으며, 컬럼은 5개가 존재한다.
- 각 컬럼의 결측치는 존재하지 않는다.

- sepal length : 꽃받침의 길이
- sepal width : 꽃받침의 너비
- petal length : 꽃잎의 길이
- petal width : 꽃잎의 너비
- label : 품종
'''
def loadData(name=True):
    """Load the scikit-learn iris dataset into a DataFrame.

    The frame holds the feature columns reported by ``load_iris`` plus a
    ``label`` column with the target.

    Args:
        name: When truthy, ``label`` holds the species names taken from
            ``target_names``; otherwise it keeps the integer targets.

    Returns:
        The assembled ``pandas.DataFrame``.
    """
    bunch = load_iris()
    frame = pd.DataFrame(data=bunch.data, columns=bunch.feature_names)
    frame['label'] = bunch.target

    # Integer targets requested: nothing more to do.
    if not name:
        return frame

    # Map every integer target to its species name.
    frame['label'] = frame['label'].apply(lambda idx: bunch.target_names[idx])
    return frame


def getOneHotEncodedLabel(y):
    """Convert a sequence of class labels into a one-hot encoded array.

    The labels are first mapped to integer codes with a fresh
    ``LabelEncoder`` and then expanded with
    ``tf.keras.utils.to_categorical``.

    Args:
        y: Class labels (for example species names).

    Returns:
        The array produced by ``to_categorical``. ``num_classes`` is not
        passed, so the width is inferred from the labels present in ``y``.
    """
    # Labels (e.g. strings) -> integer codes.
    integer_codes = LabelEncoder().fit_transform(y)
    # Integer codes -> one-hot rows.
    return tf.keras.utils.to_categorical(integer_codes)

def eda(df):
    """Print a short exploratory summary of ``df`` to standard output.

    Three sections are printed: the ``DataFrame.info()`` report, the
    ``describe()`` statistics, and the unique values of the ``label`` column.

    Args:
        df: DataFrame that contains a ``label`` column.
    """
    print('-------INFO-------')
    # DataFrame.info() writes its report itself and returns None, so it must
    # not be wrapped in print() (that would emit a stray "None" line).
    df.info()
    # Summary statistics of the DataFrame.
    print('\n-------DESCRIBE-------')
    print(df.describe())
    # Distinct values found in the label column.
    print('\n-------LABEL UNIQUE-------')
    print(df['label'].unique())
    # Optional visual overview: sns.pairplot(df, hue='label')

''' 
결측치 여부 확인
'''
def checkNull(df):
    """Print the number of missing values found in each column of ``df``.

    Args:
        df: DataFrame to inspect.
    """
    print('-------- Null Check --------')
    missing_per_column = df.isna().sum()
    print(missing_per_column)

def dataSplit(df):
    """Split ``df`` into train and test features and labels.

    The first four columns are used as features and the ``label`` column as
    the target. ``train_test_split`` is called with its library defaults.
    The sizes of the train and test feature sets are printed.

    Args:
        df: DataFrame whose first four columns are features and which has a
            ``label`` column.

    Returns:
        The tuple ``(X_train, X_test, y_train, y_test)``.
    """
    features = df.iloc[:, :4]
    target = df['label']

    # Divide into a training part and a test part.
    X_train, X_test, y_train, y_test = train_test_split(features, target)

    for subset in (X_train, X_test):
        print(len(subset))

    return X_train, X_test, y_train, y_test

def getTreeModel(X_train, y_train):
    """Fit a ``DecisionTreeClassifier`` (``random_state=11``) on the data.

    Args:
        X_train: Training features.
        y_train: Training labels.

    Returns:
        The fitted classifier.
    """
    # fit() returns the estimator itself, so build and train in one step.
    return DecisionTreeClassifier(random_state=11).fit(X_train, y_train)

def getBestTreeModel(X_train, y_train):
    """Grid-search a decision tree and return the best estimator found.

    The search covers ``max_depth`` in {1, 2, 3} and ``min_samples_split`` in
    {2, 3} with 3-fold cross-validation. The full ``cv_results_`` table, a
    condensed per-fold score table, the best parameters and the best mean
    cross-validation score are printed.

    Args:
        X_train: Training features.
        y_train: Training labels.

    Returns:
        ``best_estimator_`` of the fitted ``GridSearchCV``.
    """
    # Single source of truth for the fold count: it drives both the search
    # and the names of the per-fold score columns printed below.
    n_folds = 3

    # Same seed as getTreeModel so that repeated runs build the same trees
    # and the two helpers produce comparable models.
    dtree = DecisionTreeClassifier(random_state=11)
    parameters = {'max_depth': [1, 2, 3], 'min_samples_split': [2, 3]}

    # Every combination in the grid is evaluated with n_folds-fold CV.
    grid_dtree = GridSearchCV(dtree, param_grid=parameters, cv=n_folds)
    grid_dtree.fit(X_train, y_train)

    # Expose the search results as a DataFrame for inspection.
    scores_df = pd.DataFrame(grid_dtree.cv_results_)
    print(scores_df)
    split_columns = ['split{0}_test_score'.format(i) for i in range(n_folds)]
    print(scores_df[['params', 'mean_test_score', 'rank_test_score'] + split_columns])
    print('GridSearchCV 최적 파라미터:', grid_dtree.best_params_)
    print('GridSearchCV 최고 정확도: {0:.4f}'.format(grid_dtree.best_score_))

    # Classifier configured with the best-scoring parameters.
    return grid_dtree.best_estimator_

def printAccuracy(X_test, y_test, model, keras=False):
    """Print the accuracy of ``model`` on the test data.

    Args:
        X_test: Test features.
        y_test: Test labels (not one-hot encoded).
        model: Fitted model. With ``keras`` falsy it must offer ``predict``;
            with ``keras`` truthy it must offer ``evaluate``.
        keras: When truthy, the labels are one-hot encoded and
            ``model.evaluate`` is used; the result is read as
            ``[loss, accuracy]``, which matches a model compiled with
            ``metrics=['accuracy']``. Otherwise ``accuracy_score`` is
            computed on ``model.predict(X_test)``.
    """
    model_name = model.__class__.__name__
    if keras:
        # Named `scores` rather than `eval` to avoid shadowing the builtin.
        scores = model.evaluate(X_test, getOneHotEncodedLabel(y_test))
        print('{0} 정확도: {1:.4f}, Loss: {2:.4f}'.format(model_name, scores[1], scores[0]))
    else:
        pred = model.predict(X_test)
        print('{0} 정확도: {1:.4f}'.format(model_name, accuracy_score(y_test, pred)))

def getDenseModel(X_train, y_train):
    """Build, compile and train a small dense classifier.

    The network has a 16-unit ReLU layer declared with ``input_dim=4``
    followed by a 3-unit softmax layer. It is compiled with categorical
    cross-entropy, the Adam optimizer and the accuracy metric, then trained
    for 50 epochs with a batch size of 1.

    Args:
        X_train: Training features (four per sample, per ``input_dim=4``).
        y_train: One-hot encoded training labels (three classes, per the
            3-unit output layer).

    Returns:
        The trained ``Sequential`` model.
    """
    # Architecture: 4 inputs -> 16 ReLU units -> 3 softmax outputs.
    network = Sequential([
        Dense(16, input_dim=4, activation='relu'),
        Dense(3, activation='softmax'),
    ])

    # Training configuration.
    network.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )

    # Train on the supplied data.
    network.fit(X_train, y_train, batch_size=1, epochs=50)
    return network

# Script entry point: run the decision-tree experiment, then the dense-network
# experiment, each on its own freshly loaded and freshly split copy of the data.
if __name__ == "__main__":
    # --- Decision tree on integer labels ---
    df = loadData(False)
    eda(df)
    checkNull(df)
    X_train, X_test, y_train, y_test = dataSplit(df)
    model = getTreeModel(X_train, y_train)
    printAccuracy(X_test, y_test, model)

    # --- Dense network on species-name labels ---
    # The data is reloaded and split again, so this split is independent of
    # the one used for the tree above.
    df = loadData(True)
    X_train, X_test, y_train, y_test = dataSplit(df)
    # Training labels are one-hot encoded here; printAccuracy encodes the
    # test labels itself when its last argument is True.
    model = getDenseModel(X_train, getOneHotEncodedLabel(y_train))
    model.summary()
    printAccuracy(X_test, y_test, model, True)

-------INFO-------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   sepal length (cm)  150 non-null    float64
 1   sepal width (cm)   150 non-null    float64
 2   petal length (cm)  150 non-null    float64
 3   petal width (cm)   150 non-null    float64
 4   label              150 non-null    int32  
dtypes: float64(4), int32(1)
memory usage: 5.4 KB
None

-------DESCRIBE-------
       sepal length (cm)  sepal width (cm)  petal length (cm)  \
count         150.000000        150.000000         150.000000   
mean            5.843333          3.057333           3.758000   
std             0.828066          0.435866           1.765298   
min             4.300000          2.000000           1.000000   
25%             5.100000          2.800000           1.600000   
50%             5.800000          3.000000           4.350000   
75%    