# 플랫폼 업로드를 쉽게 하기 위한 로컬 개발 코드
- T3Q.ai(T3Q.cep + T3Q.dl): 빅데이터/인공지능 통합 플랫폼
- 플랫폼 업로드를 쉽게 하기 위하여 로컬에서 아래의 코드(파일 1)를 개발한다.
- 파일 1(파일명): 1_local_platform_image_classification.ipynb

### 전처리 객체 또는 학습모델 객체
- 전처리 객체나 학습모델 객체는 meta_data 폴더 아래에 저장한다.

### 데이터셋(학습 데이터/테스트 데이터)
- 학습과 테스트에 사용되는 데이터를 나누어 관리한다.
- 학습 데이터: dataset 폴더 아래에 저장하거나 dataset.zip 파일 형태로 저장한다.
- 테스트 데이터: test_dataset 폴더 아래에 저장하거나 test_dataset.zip 파일 형태로 저장한다.

### 로컬 개발 워크플로우(workflow)  
- 로컬 개발 워크플로우를 다음의 4단계로 분리한다.

1. 데이터셋 준비(Data Setup)
- 로컬 저장소에서 전처리 및 학습에 필요한 학습 데이터셋을 준비한다.

2. 데이터 전처리(Data Preprocessing)
- 데이터셋의 분석 및 정규화(Normalization)등의 전처리를 수행한다.
- 데이터를 모델 학습에 사용할 수 있도록 가공한다.
- 추론과정에서 필요한 경우, 데이터 전처리에 사용된 객체를 meta_data 폴더 아래에 저장한다.

3. 학습 모델 훈련(Train Model)
- 데이터를 훈련에 사용할 수 있도록 가공한 뒤에 학습 모델을 구성한다. 
- 학습 모델을 준비된 데이터셋으로 훈련시킨다.
- 정확도(Accuracy)나 손실(Loss)등 학습 모델의 성능을 검증한다.
- 학습 모델의 성능 검증 후, 학습 모델을 배포한다.
- 배포할 학습 모델을 meta_data 폴더 아래에 저장한다.

4. 추론(Inference)
- 저장된 전처리 객체나 학습 모델 객체를 준비한다.
- 추론에 필요한 테스트 데이터셋을 준비한다.
- 배포된 학습 모델을 통해 테스트 데이터에 대한 추론을 진행한다. 

In [1]:
from IPython.display import Image
#Image(filename='./T3Q.ai.jpg')

# 인공지능 통합플랫폼(T3Q.ai) 프로세스를 이해하고 인공지능 쉽게 하기

In [2]:
# 파일명: image_classification_preprocess.py

'''
from image_classification_preprocess_sub import exec_process
'''
import logging

logging.basicConfig(level=logging.INFO)

def process_for_train(pm):
    """Run the preprocessing step used before training.

    Delegates the actual work to ``exec_process`` and then logs that this
    function has reached its end.

    Args:
        pm: Object handed on unchanged to ``exec_process``.
    """
    exec_process(pm)

    end_message = '[hunmin log] the end line of the function [process_for_train]'
    logging.info(end_message)
    
    
def init_svc(im, rule):
    """Return the (empty) parameter dictionary for the preprocessing service.

    Args:
        im: Unused by this implementation.
        rule: Unused by this implementation.

    Returns:
        A new empty ``dict`` on every call.
    """
    service_params = dict()
    return service_params


def transform(df, params, batch_id):
    """Log a description of ``df`` and return it unchanged.

    Args:
        df: Object to describe; its value, ``shape`` attribute and type are
            logged.
        params: Unused by this implementation.
        batch_id: Unused by this implementation.

    Returns:
        The very same ``df`` object that was passed in.
    """
    content_line = '[hunmin log] df : {}'.format(df)
    logging.info(content_line)

    shape_line = '[hunmin log] df.shape : {}'.format(df.shape)
    logging.info(shape_line)

    type_line = '[hunmin log] type(df) : {}'.format(type(df))
    logging.info(type_line)

    logging.info('[hunmin log] the end line of the function [transform]')
    return df

In [3]:
# 파일명: image_classification_preprocess_sub.py

import os
import numpy as np
import pandas as pd
import zipfile
import logging


def exec_process(pm):
    """Extract ``dataset.zip`` from ``pm.source_path`` into ``pm.target_path``.

    The directory listings of the source and target paths are logged before
    and after extraction respectively.

    Args:
        pm: Object exposing ``source_path`` (directory that contains
            ``dataset.zip``) and ``target_path`` (directory the archive is
            extracted into).

    Raises:
        FileNotFoundError: If ``dataset.zip`` (or a listed directory) does
            not exist.
        zipfile.BadZipFile: If ``dataset.zip`` is not a valid zip archive.
    """
    logging.info('[hunmin log]  the start line of the function [exec_process]')

    logging.info('[hunmin log] pm.source_path : {}'.format(pm.source_path))

    # Log what is stored under the source path.
    list_files_directories(pm.source_path)

    # Extract <source_path>/dataset.zip into <target_path>. The context
    # manager guarantees the archive handle is closed even if extraction
    # fails part-way through.
    my_zip_path = os.path.join(pm.source_path, 'dataset.zip')
    with zipfile.ZipFile(my_zip_path) as extract_zip_file:
        extract_zip_file.extractall(pm.target_path)

    # Log what is stored under the target path after extraction.
    list_files_directories(pm.target_path)

    logging.info('[hunmin log]  the finish line of the function [exec_process]')



# 저장 파일 확인
def list_files_directories(path):
    """Log the names of the entries found directly under ``path``.

    Args:
        path: Directory to list with ``os.listdir``.
    """
    entries = os.listdir(path)

    header = '[hunmin log] Files and directories in {} :'.format(path)
    logging.info(header)

    listing = '[hunmin log] dir_list : {}'.format(entries)
    logging.info(listing)
    
    

INFO:numexpr.utils:Note: NumExpr detected 12 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
INFO:numexpr.utils:NumExpr defaulting to 8 threads.


In [4]:
# 파일명: image_classification_train.py

'''
from image_classification_train_sub import exec_train, exec_init_svc, exec_inference
'''
import logging


def train(tm):
    """Run model training.

    Delegates the actual work to ``exec_train`` and then logs that this
    function has reached its end.

    Args:
        tm: Object handed on unchanged to ``exec_train``.
    """
    exec_train(tm)

    end_message = '[hunmin log] the end line of the function [train]'
    logging.info(end_message)


def init_svc(im):
    """Prepare the parameters needed by the inference service.

    Args:
        im: Object handed on unchanged to ``exec_init_svc``.

    Returns:
        A new dict holding a shallow copy of the mapping returned by
        ``exec_init_svc``.
    """
    loaded = exec_init_svc(im)

    end_message = '[hunmin log] the end line of the function [init_svc]'
    logging.info(end_message)

    service_params = {**loaded}
    return service_params


def inference(df, params, batch_id):
    """Run inference for one request.

    Args:
        df: Input data, handed on unchanged to ``exec_inference``.
        params: Service parameters, handed on unchanged to ``exec_inference``.
        batch_id: Batch identifier, handed on unchanged to ``exec_inference``.

    Returns:
        A new dict holding a shallow copy of the mapping returned by
        ``exec_inference``.
    """
    outcome = exec_inference(df, params, batch_id)

    end_message = '[hunmin log] the end line of the function [inference]'
    logging.info(end_message)

    response = {**outcome}
    return response


In [5]:
# 파일명: image_classification_train_sub.py

# Imports
import os
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import utils
from tensorflow.keras import layers
from tensorflow.keras.models import load_model
import logging
import base64 
import io
from PIL import Image

# Module-level GPU setup: runs once when this module/cell is executed and
# leaves the global `gpus` list that exec_train() reads later.
logging.info(f'[hunmin log] tensorflow ver : {tf.__version__}')

# Index of the GPU to use.
os.environ["CUDA_VISIBLE_DEVICES"]='0'

# List of physical GPU devices reported by TensorFlow (empty when none).
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # Make every detected GPU visible to the TensorFlow runtime.
        tf.config.experimental.set_visible_devices(gpus, 'GPU')
        logging.info('[hunmin log] gpu set complete')
        logging.info('[hunmin log] num of gpu: {}'.format(len(gpus)))
    
    except RuntimeError as e:
        # Best effort: a failure here is only logged, not re-raised.
        logging.info('[hunmin log] gpu set failed')
        logging.info(e)
        
        
def exec_train(tm):
    """Train the image-classification CNN and save it as ``cnn_model.h5``.

    Steps:
      1. Data setup: load one ``<category>.npy`` array per category from
         ``<tm.train_data_path>/dataset``.
      2. Preprocessing: take at most ``idx`` images per category, scale the
         pixel values by 1/255, build matching integer labels, split into
         train/test sets and one-hot encode the labels.
      3. Training: build the model (inside a ``MirroredStrategy`` scope when
         two or more GPUs were detected), fit it, evaluate it on the test
         split and log loss/accuracy.
      4. Save the trained model to ``<tm.model_path>/cnn_model.h5``.

    Args:
        tm: Object exposing ``train_data_path``, ``model_path`` and
            ``param_info`` (a mapping with ``'batch_size'`` and ``'epoch'``
            entries convertible to ``int``).
    """
    logging.info('[hunmin log] the start line of the function [exec_train]')

    logging.info('[hunmin log] tm.train_data_path : {}'.format(tm.train_data_path))

    # Log what is stored under the training data path.
    list_files_directories(tm.train_data_path)

    ###########################################################################
    ## 1. Data Setup
    ###########################################################################

    my_path = os.path.join(tm.train_data_path, 'dataset')

    # Categories; the position in this list is the class label.
    dataset = ['ant', 'apple', 'bus', 'butterfly', 'cup', 'envelope', 'fish', 'giraffe', 'lightbulb', 'pig']
    dataset_num = len(dataset)

    # Load one numpy file per category, in category order.
    dataset_numpy = [np.load(os.path.join(my_path, name + '.npy')) for name in dataset]

    logging.info('[hunmin log] : (image_number, image_size)')

    for category_array in dataset_numpy:
        logging.info('[hunmin log] : {}'.format(category_array.shape))

    np.set_printoptions(linewidth=116)
    # Log the first image of the 'envelope' category as a sample.
    logging.info('[hunmin log] envelope : ')
    logging.info('{}'.format(dataset_numpy[dataset.index('envelope')][0]))

    ###########################################################################
    ## 2. Data Preprocessing
    ###########################################################################

    # Upper bound on the number of images used per category, so that every
    # category contributes (at most) the same number of samples.
    idx = 1000

    # X: input image data, Y: ground-truth labels.
    # Concatenating (instead of stacking with np.array) and deriving the
    # label counts from the actual slice lengths keeps X and Y aligned even
    # when a category holds fewer than `idx` images, and ties the number of
    # labels to the category list rather than a hard-coded 10.
    per_category = [data_numpy[:idx, :] for data_numpy in dataset_numpy]
    X = (np.concatenate(per_category) / 255.).astype('float32')
    X = X.reshape(-1, 28*28)
    Y = np.repeat(
        np.arange(dataset_num),
        [len(chunk) for chunk in per_category],
    ).astype('float32')

    # Train / evaluation split.
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

    # Reshape X to [samples][height][width][channels], the layout expected
    # by the model's Input(shape=(28, 28, 1)).
    X_train_cnn = X_train.reshape(X_train.shape[0], 28, 28, 1).astype('float32')
    X_test_cnn = X_test.reshape(X_test.shape[0], 28, 28, 1).astype('float32')

    # Compare the shapes before and after the reshape.
    logging.info('[hunmin log] X_train : {}'.format(X_train.shape))
    logging.info('[hunmin log] X_train_cnn : {}'.format(X_train_cnn.shape))

    # One-hot encode the labels. num_classes is passed explicitly so the
    # train and test encodings always have the same width, even if the
    # highest label happens to be missing from one of the splits.
    Y_train_cnn = utils.to_categorical(Y_train, num_classes=dataset_num)
    Y_test_cnn = utils.to_categorical(Y_test, num_classes=dataset_num)
    num_classes = Y_test_cnn.shape[1]

    # Compare the shapes before and after the encoding.
    logging.info('[hunmin log] Y_train : {}'.format(Y_train.shape))
    logging.info('[hunmin log] Y_train_cnn : {}'.format(Y_train_cnn.shape))
    logging.info('[hunmin log] class number : {}'.format(num_classes))

    ###########################################################################
    ## 3. Train Model
    ###########################################################################

    # Build a very simple Keras CNN for image classification.

    # Single GPU or CPU training.
    if len(gpus) < 2:
        model = model_build_and_compile(num_classes)
    # Multi-GPU training.
    else:
        strategy = tf.distribute.MirroredStrategy()
        logging.info('[hunmin log] gpu devices num {}'.format(strategy.num_replicas_in_sync))
        with strategy.scope():
            model = model_build_and_compile(num_classes)

    # User-supplied parameters.
    batch_size = int(tm.param_info['batch_size'])
    epochs = int(tm.param_info['epoch'])

    # Scale the batch size by the number of GPUs (unchanged on CPU).
    batch_size = batch_size * len(gpus) if len(gpus) > 0 else batch_size

    # Train the model.
    history = model.fit(X_train_cnn, Y_train_cnn,
                        batch_size=batch_size,
                        epochs=epochs,
                        validation_split=0.1,
                        verbose=0,
                        callbacks=[LossAndErrorPrintingCallback()]
                       )

    # Evaluate the model on the held-out test split.
    loss, acc = model.evaluate(X_test_cnn, Y_test_cnn, verbose=0, callbacks=[LossAndErrorPrintingCallback()])

    logging.info('[hunmin log] loss : {}'.format(loss))
    logging.info('[hunmin log] acc : {}'.format(acc))

    ###########################################################################
    ## Platform visualization (disabled for local runs)
    ###########################################################################
    '''
    plot_metrics(tm, history, model, X_test_cnn, Y_test_cnn)
    '''

    ###########################################################################
    ## Save the trained model
    ###########################################################################

    logging.info('[hunmin log] tm.model_path : {}'.format(tm.model_path))
    model.save(os.path.join(tm.model_path, 'cnn_model.h5'))

    # Log what is stored under the model path.
    list_files_directories(tm.model_path)

    logging.info('[hunmin log]  the finish line of the function [exec_train]')
    


def exec_init_svc(im):
    """Load the saved model for the inference service.

    Args:
        im: Object exposing ``model_path``, the directory that holds
            ``cnn_model.h5``.

    Returns:
        A dict with a single key ``'model'`` mapped to the object returned
        by ``load_model``.
    """
    logging.info('[hunmin log] im.model_path : {}'.format(im.model_path))

    # Log what is stored under the model path.
    list_files_directories(im.model_path)

    # Prepare the trained model.
    model_file = os.path.join(im.model_path, 'cnn_model.h5')
    loaded_model = load_model(model_file)

    return {'model': loaded_model}



def exec_inference(df, params, batch_id):
    """Classify the base64-encoded image stored in the first cell of ``df``.

    The value at ``df.iloc[0, 0]`` is base64-decoded, opened as an image,
    converted to mode ``'L'``, resized to 28x28, passed through
    ``np.invert``, scaled by 1/255 and reshaped to ``(-1, 28, 28, 1)``
    before being given to the model.

    Args:
        df: Object supporting ``.iloc[0, 0]`` that yields the base64 image.
        params: Mapping whose ``'model'`` entry provides ``summary`` and
            ``predict``.
        batch_id: Unused by this implementation.

    Returns:
        ``{'inference': <category name>}`` for the highest-scoring class.
    """
    logging.info('[hunmin log] the start line of the function [exec_inference]')

    # Prepare the trained model.
    classifier = params['model']
    logging.info('[hunmin log] model.summary() :')
    classifier.summary(print_fn=logging.info)

    # Category names; the position is the class index predicted by the model.
    labels = ('ant', 'apple', 'bus', 'butterfly', 'cup',
              'envelope', 'fish', 'giraffe', 'lightbulb', 'pig')

    # Image preprocessing.
    encoded_image = df.iloc[0, 0]
    raw_stream = io.BytesIO(base64.b64decode(encoded_image))
    picture = Image.open(raw_stream).convert('L')
    picture = picture.resize((28, 28))
    pixels = np.invert(picture).astype('float32') / 255.
    model_input = pixels.reshape(-1, 28, 28, 1)

    # Prediction.
    scores = classifier.predict(model_input)
    best_index = np.argmax(scores, axis=1)

    # Map the class index back to its category name.
    result = {'inference': labels[best_index[0]]}
    logging.info('[hunmin log] result : {}'.format(result))

    return result



# 저장 파일 확인
def list_files_directories(path):
    """Log the names of the entries found directly under ``path``.

    Args:
        path: Directory to list with ``os.listdir``.
    """
    found = os.listdir(path)

    title_line = '[hunmin log] Files and directories in {} :'.format(path)
    logging.info(title_line)

    entries_line = '[hunmin log] dir_list : {}'.format(found)
    logging.info(entries_line)



###########################################################################
## exec_train(tm) 호출 함수 
###########################################################################

# for epoch, loss
class LossAndErrorPrintingCallback(keras.callbacks.Callback):
    """Keras callback that logs the loss at the end of every epoch."""

    def on_epoch_end(self, batch, logs=None):
        """Log the epoch number (1-based) and its loss.

        Args:
            batch: Index supplied by the caller for the finished epoch; it
                is logged as ``batch + 1``. The parameter name is kept for
                backward compatibility.
            logs: Mapping of metric names to values; must contain ``'loss'``.
                Defaults to ``None`` rather than a shared mutable ``{}``.

        Raises:
            KeyError: If ``logs`` has no ``'loss'`` entry.
        """
        if logs is None:
            logs = {}
        logging.info('[hunmin log] For epoch {}, loss is {:.2f}.'.format(batch+1, logs['loss']))

def model_build_and_compile(num_classes):
    """Build and compile the CNN used for image classification.

    The network takes ``(28, 28, 1)`` inputs and ends in a softmax layer
    with ``num_classes`` units. Its summary is logged before compilation.

    Args:
        num_classes: Number of units in the final softmax layer.

    Returns:
        The compiled ``keras.Sequential`` model (categorical cross-entropy
        loss, Adam optimizer, accuracy metric).
    """
    # Layer stack, listed in forward order.
    layer_stack = [
        layers.Input(shape=(28, 28, 1)),
        layers.Conv2D(32, kernel_size=(3, 3), padding='same', activation="relu"),
        layers.Conv2D(64, kernel_size=(3, 3), padding='same', activation="relu"),
        layers.Dropout(0.25),
        layers.Conv2D(64, kernel_size=(3, 3), padding='same', activation="relu"),
        layers.MaxPooling2D(pool_size=(2, 2)),
        layers.Flatten(),
        layers.Dense(32, activation="relu"),
        layers.Dropout(0.25),
        layers.Dense(num_classes, activation="softmax"),
    ]
    network = keras.Sequential(layer_stack)

    logging.info('[hunmin log] model.summary() :')
    network.summary(print_fn=logging.info)

    # Compile the model.
    network.compile(
        loss="categorical_crossentropy",
        optimizer="adam",
        metrics=["accuracy"],
    )

    return network
    
# 시각화
def plot_metrics(tm, history, model, x_test, y_test):
    """Report training curves and evaluation results to the platform.

    For every entry of ``history.history['accuracy']`` / ``['loss']`` a
    metric dict (accuracy, loss, step) is passed to ``tm.save_stat_metrics``.
    Afterwards the predicted and actual class indices for the test data,
    the last validation accuracy/loss and the confusion matrix are passed
    to ``tm.save_result_metrics``.

    Args:
        tm: Object providing ``save_stat_metrics`` and ``save_result_metrics``.
        history: Object whose ``history`` mapping holds the ``'accuracy'``,
            ``'loss'``, ``'val_accuracy'`` and ``'val_loss'`` sequences.
        model: Object whose ``predict(x_test)`` output is reduced with
            ``argmax`` over axis 1.
        x_test: Test inputs given to ``model.predict``.
        y_test: Test targets, reduced with ``argmax`` over axis 1.
    """
    from sklearn.metrics import confusion_matrix

    accuracies = history.history['accuracy']
    losses = history.history['loss']

    # One stat record per training step.
    for step, (acc, loss) in enumerate(zip(accuracies, losses)):
        tm.save_stat_metrics({'accuracy': acc, 'loss': loss, 'step': step})

    predicted = np.argmax(model.predict(x_test), axis=1).tolist()
    actual = np.argmax(y_test, axis=1).tolist()

    eval_results = {
        'predict_y': predicted,
        'actual_y': actual,
        'accuracy': history.history['val_accuracy'][-1],
        'loss': history.history['val_loss'][-1],
    }

    # Confusion matrix of actual vs. predicted class indices.
    eval_results['confusion_matrix'] = confusion_matrix(actual, predicted).tolist()
    tm.save_result_metrics(eval_results)
    logging.info('[hunmin log] accuracy and loss curve plot for platform')
    

INFO:root:[hunmin log] tensorflow ver : 2.9.0
INFO:root:[hunmin log] gpu set complete
INFO:root:[hunmin log] num of gpu: 1


In [6]:
# PM 클래스: pm 객체
class PM:
    """Local stand-in for the ``pm`` object used by the preprocessing code."""

    def __init__(self):
        # Directory searched for dataset.zip.
        source_dir = './'
        # Directory the dataset is extracted into.
        target_dir = './meta_data'

        self.source_path = source_dir
        self.target_path = target_dir

# TM 클래스: tm 객체
class TM:
    """Local stand-in for the ``tm`` object used by the training code.

    ``param_info`` is created per instance. It used to be a class-level
    dict mutated in ``__init__``, so every ``TM`` shared (and overwrote)
    the same parameters.
    """

    def __init__(self):
        # Directory that holds the extracted training dataset.
        self.train_data_path = './meta_data'
        # Directory the trained model is saved to.
        self.model_path = './meta_data'
        # User-adjustable training parameters, private to this instance.
        self.param_info = {
            'batch_size': 10,
            'epoch': 20,
        }

# IM 클래스: im 객체
class IM:
    """Local stand-in for the ``im`` object used by the inference setup."""

    def __init__(self):
        # Directory the saved model is loaded from.
        model_dir = './meta_data'
        self.model_path = model_dir


# pm object: paths used by the preprocessing step
pm = PM()
print('pm.source_path:', pm.source_path)
print('pm.target_path: ', pm.target_path)

# tm object: paths and parameters used by the training step
tm = TM()
print('tm.train_data_path: ', tm.train_data_path)
print('tm.model_path: ', tm.model_path)
print('tm.param_info[\'batch_size\']: ', tm.param_info['batch_size'])
print('tm.param_info[\'epoch\']: ', tm.param_info['epoch'])

# im object: model path used when initializing the inference service
im = IM()
print('im.model_path: ', im.model_path)

# Inputs for the inference(df, params, batch_id) function
params = {}
batch_id = 0

import io
import pandas as pd

# Sample inference input: a single base64-encoded image wrapped in a
# one-row, one-column DataFrame (exec_inference reads it via df.iloc[0, 0]).
data = [['iVBORw0KGgoAAAANSUhEUgAAABwAAAAcCAIAAAD9b0jDAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAAAJcEhZcwAADsMAAA7DAcdvqGQAAACySURBVEhL7ZLRDoAgCEWt//9nc8EIEepStvXQeWkyPEKw1FrLbFb+TuWXzichXXb4cAokJR0tH+JFK21G8V6CSqlApMxGelBIsVBHeOOEzZYGdRzpussfL7eIazHPa0wrx0GMdBSiuMbkdINyb5og0gRL3dTbcPtNOpYZveQ2pA1n0hTakF5+hA9Lzd87pNFYLhkvsvT2lMhorndluxkRUuCYbzcp9ROi55+up8sLK1XKBj1wbx3DelAOAAAAAElFTkSuQmCC']]
df = pd.DataFrame(data)
print('df: ', df)
print('df.dtypes:', df.dtypes)
# Last expression of the cell: the notebook displays the column index.
df.columns

pm.source_path: ./
pm.target_path:  ./meta_data
tm.train_data_path:  ./meta_data
tm.model_path:  ./meta_data
tm.param_info['batch_size']:  10
tm.param_info['epoch']:  20
im.model_path:  ./meta_data
df:                                                     0
0  iVBORw0KGgoAAAANSUhEUgAAABwAAAAcCAIAAAD9b0jDAA...
df.dtypes: 0    object
dtype: object


RangeIndex(start=0, stop=1, step=1)

In [7]:
%%time
# End-to-end local run: preprocess -> train -> transform -> init -> inference.
# Step 1: extract dataset.zip into the target path.
process_for_train(pm)

# Step 2: train the model and save it under tm.model_path.
train(tm)

# Step 3: preprocessing-side transform (logs df and returns it; the result
# is not used here).
transform(df, params, batch_id)

# Step 4: load the saved model. NOTE: when the cells are run in order,
# init_svc here is the later one-argument definition, which shadows the
# earlier init_svc(im, rule).
params = init_svc(im)

# Step 5: classify the sample image; the returned dict is the cell's output.
inference(df, params, batch_id)

INFO:root:[hunmin log]  the start line of the function [exec_process]
INFO:root:[hunmin log] pm.source_path : ./
INFO:root:[hunmin log] Files and directories in ./ :
INFO:root:[hunmin log] dir_list : ['0_local_image_classification.ipynb', '0_local_image_classification_requirement.txt', '1_local_platform_image_classification.ipynb', '2_1_1_platform_image_classification_preprocess.py', '2_1_2_platform_image_classification_preprocess_sub.py', '2_2_1_platform_image_classification_train.py', '2_2_2_platform_image_classification_train_sub.py', 'dataset.zip', 'LICENSE.txt', 'meta_data', 'README.txt', 'T3Q.ai_platform_image_classification', 'test_dataset.zip']
INFO:root:[hunmin log] Files and directories in ./meta_data :
INFO:root:[hunmin log] dir_list : ['cnn_model.h5', 'dataset', 'test_dataset']
INFO:root:[hunmin log]  the finish line of the function [exec_process]
INFO:root:[hunmin log] the end line of the function [process_for_train]
INFO:root:[hunmin log] the start line of the function [e

INFO:root:[hunmin log] tm.model_path : ./meta_data
INFO:root:[hunmin log] Files and directories in ./meta_data :
INFO:root:[hunmin log] dir_list : ['cnn_model.h5', 'dataset', 'test_dataset']
INFO:root:[hunmin log]  the finish line of the function [exec_train]
INFO:root:[hunmin log] the end line of the function [train]
INFO:root:[hunmin log] df :                                                    0
0  iVBORw0KGgoAAAANSUhEUgAAABwAAAAcCAIAAAD9b0jDAA...
INFO:root:[hunmin log] df.shape : (1, 1)
INFO:root:[hunmin log] type(df) : <class 'pandas.core.frame.DataFrame'>
INFO:root:[hunmin log] the end line of the function [transform]
INFO:root:[hunmin log] im.model_path : ./meta_data
INFO:root:[hunmin log] Files and directories in ./meta_data :
INFO:root:[hunmin log] dir_list : ['cnn_model.h5', 'dataset', 'test_dataset']
INFO:root:[hunmin log] the end line of the function [init_svc]
INFO:root:[hunmin log] the start line of the function [exec_inference]
INFO:root:[hunmin log] model.summary() :
INF



INFO:root:[hunmin log] result : {'inference': 'apple'}
INFO:root:[hunmin log] the end line of the function [inference]


Wall time: 1min 12s


{'inference': 'apple'}