# 데이터 수집
dataframe에 데이터 가져오기

In [2]:
from google.colab import drive
# Attach the user's Google Drive to the Colab VM filesystem so the dataset
# archive stored there can be read by the following cells.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [3]:
# Unpack the dataset archive with 7-Zip.
# The input path is relative (drive/MyDrive/data.zip), so it is resolved against
# the current working directory -- presumably /content, where the Drive mount
# lives; confirm if the notebook is run from elsewhere.
# The output path (-o/drive/MyDrive/blocks_data) is absolute and starts at /drive,
# NOT at the mounted /content/drive: the files are extracted onto the VM's local
# disk. The data-collection cell reads from the same /drive/MyDrive/blocks_data
# prefix, so the two stay consistent.
!7z x drive/MyDrive/data.zip -o/drive/MyDrive/blocks_data


7-Zip [64] 16.02 : Copyright (c) 1999-2016 Igor Pavlov : 2016-05-21
p7zip Version 16.02 (locale=en_US.UTF-8,Utf16=on,HugeFiles=on,64 bits,8 CPUs Intel(R) Xeon(R) CPU @ 2.20GHz (406F0),ASM,AES-NI)

Scanning the drive for archives:
  0M Scan         1 file, 20208454033 bytes (19 GiB)

Extracting archive: drive/MyDrive/data.zip
 66% 4096 Open              --
Path = drive/MyDrive/data.zip
Type = zip
Physical Size = 20208454033
64-bit = +

  0%      0% 10 - 숫자인식/00/n00/4d_n00_08.jpg                                             0% 13 - 숫자인식/00/n00/4d_n00_11.jpg                                             0% 17 - 숫자인식/00/n00/4d_n00_15.jpg                                             0% 2

In [4]:
import os
import pandas as pd

# Root folders of the extracted dataset, one per category.
numbers_path = '/drive/MyDrive/blocks_data/숫자인식/'
alphabets_path = '/drive/MyDrive/blocks_data/알파벳인식/'
free_patterns_path = '/drive/MyDrive/blocks_data/자유패턴/'

number_classes = [str(i) for i in range(10)]
alphabet_classes = [chr(i) for i in range(ord('A'), ord('Z')+1)]

# Digit class folders are zero-prefixed on disk: '0' -> '00', ..., '9' -> '09'.
number_folders = {class_name: '0' + class_name for class_name in number_classes}


def _folder_records(folder_path, category, target, r):
    """Build one record per directory entry of ``folder_path``.

    Entries are visited in sorted name order. ``os.listdir`` itself returns
    names in arbitrary order, which would make the row order of the final
    DataFrame -- and any seeded split derived from it -- vary between
    filesystems and runs.

    Args:
        folder_path: Directory whose entries are listed (not recursive).
        category: Value stored in the ``category`` field of every record.
        target: Value stored in the ``target`` field of every record.
        r: Value stored in the ``r`` field of every record.

    Returns:
        list[dict]: Records with keys ``image_path``, ``category``,
        ``target`` and ``r``.

    Raises:
        FileNotFoundError: If ``folder_path`` does not exist.
    """
    return [
        {
            'image_path': os.path.join(folder_path, file_name),
            'category': category,
            'target': target,
            'r': r,
        }
        for file_name in sorted(os.listdir(folder_path))
    ]


data = []

# Digits: <numbers_path>/<NN>/n<NN> holds the r=False images and
# <numbers_path>/<NN>/r_n<NN> holds the r=True images.
for number, folder_name in number_folders.items():
    class_dir = os.path.join(numbers_path, folder_name)
    data.extend(_folder_records(os.path.join(class_dir, 'n' + folder_name), 'number', number, False))
    data.extend(_folder_records(os.path.join(class_dir, 'r_n' + folder_name), 'number', number, True))

# Letters: <alphabets_path>/<X>/<X> holds the r=False images and
# <alphabets_path>/<X>/r_<X> holds the r=True images.
for letter in alphabet_classes:
    class_dir = os.path.join(alphabets_path, letter)
    data.extend(_folder_records(os.path.join(class_dir, letter), 'alphabet', letter, False))
    data.extend(_folder_records(os.path.join(class_dir, 'r_' + letter), 'alphabet', letter, True))

# Free patterns have no class label: take every file under the tree and leave
# 'target' and 'r' empty.
for root, dirnames, filenames in os.walk(free_patterns_path):
    # Sorting in place makes os.walk descend into sub-folders deterministically.
    dirnames.sort()
    for file_name in sorted(filenames):
        data.append({
            'image_path': os.path.join(root, file_name),
            'category': 'free_pattern',
            'target': None,
            'r': None
        })


df = pd.DataFrame(data)
df.sample(10)

Unnamed: 0,image_path,category,target,r
280,/drive/MyDrive/blocks_data/숫자인식/02/r_n02/4d_r_...,number,2,True
3915,/drive/MyDrive/blocks_data/자유패턴/40/40_r/40_r13...,free_pattern,,
2847,/drive/MyDrive/blocks_data/알파벳인식/S/S/4d_al_S_0...,alphabet,S,False
1767,/drive/MyDrive/blocks_data/알파벳인식/H/r_H/4d_al_r...,alphabet,H,True
1136,/drive/MyDrive/blocks_data/알파벳인식/B/B/4d_al_b_2...,alphabet,B,False
2969,/drive/MyDrive/blocks_data/알파벳인식/T/T/4d_al_T_0...,alphabet,T,False
2165,/drive/MyDrive/blocks_data/알파벳인식/L/L/4d_al_L_3...,alphabet,L,False
235,/drive/MyDrive/blocks_data/숫자인식/02/n02/4d_n02_...,number,2,False
2934,/drive/MyDrive/blocks_data/알파벳인식/T/T/4d_al_T_1...,alphabet,T,False
1903,/drive/MyDrive/blocks_data/알파벳인식/I/r_I/4d_al_r...,alphabet,I,True


# 데이터 전처리
- 이미지 전처리 함수 실행
- 분류 원핫 인코딩

In [5]:
from google.colab import files
# Ask the user to upload a file through the Colab widget and store its bytes
# as preprocessing.py next to the notebook so a later cell can import it.
# Only the first uploaded file is used; its original name is ignored.
uploaded = files.upload()
src = list(uploaded.values())[0]

# The context manager guarantees the file is flushed and closed before the
# module is imported (the bare open(...).write(...) never closed the handle).
with open('preprocessing.py', 'wb') as module_file:
    bytes_written = module_file.write(src)

# Keep the number of bytes written as the cell's displayed result.
bytes_written

Saving preprocessing.py to preprocessing.py


3350

In [6]:
import cv2
import numpy as np
from preprocessing import preprocess_img
from sklearn.model_selection import train_test_split


# Fixed column order of the 3-way one-hot target.
CATEGORY_ORDER = ['number', 'alphabet', 'free_pattern']


def one_hot_category(labels):
    """One-hot encode the ``category`` column of ``labels``.

    The result always has exactly the columns in ``CATEGORY_ORDER``, in that
    order. A category that does not occur in ``labels`` yields an all-zero
    column instead of a ``KeyError``. The dtype is pinned to ``uint8`` so the
    encoding does not depend on the installed pandas version (pandas >= 2.0
    defaults to ``bool``).

    Args:
        labels: DataFrame containing a ``category`` column.

    Returns:
        pandas.DataFrame: One row per input row (same index), one column per
        entry of ``CATEGORY_ORDER``.
    """
    dummies = pd.get_dummies(labels['category'], dtype=np.uint8)
    return dummies.reindex(columns=CATEGORY_ORDER, fill_value=0)


X, y = df['image_path'], df[['category', 'target']]
# Stratify on both label columns; the fixed seed keeps the split repeatable
# for a given row order of df.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

# Two preprocessed variants of the training images: background kept / removed.
# preprocess_img comes from the uploaded preprocessing.py; it is called with an
# image path and presumably returns an image array (later cells pass its result
# to cv2.resize) -- confirm against that module.
X_train_pp = X_train.apply(lambda x: preprocess_img(x, remove_bg=False))
X_train_pp_br = X_train.apply(lambda x: preprocess_img(x, remove_bg=True))

y_train_3 = one_hot_category(y_train)
y_test_3 = one_hot_category(y_test)


# 모델 선택

### CNN 조정할 파라미터
- 이미지 사이즈: 32x32x3, 64x64x3, 244x244x3
- 이미지 전처리: 배경 제거 x / 배경 제거 o
- 커널 크기: (3, 3), (5, 5)
- 필터 수: 32, 64
- 은닉층 유닛 수(hidden units): 128, 256

In [12]:
# Resize every preprocessed training image to 64x64, then pack the images
# into a single array with the sample axis first.
resized_64 = [cv2.resize(image, (64, 64)) for image in X_train_pp]
X_train_pp_64 = np.stack(resized_64)
X_train_pp_64.shape

(3106, 64, 64, 3)

In [None]:
from sklearn.model_selection import GridSearchCV
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from tensorflow.keras.callbacks import EarlyStopping

# model generator for grid search
def create_model(image_size=(64, 64, 3), image_preprocessing=0, kernel_size=(3, 3), filters=32, hidden_units=128, num_classes=3):
    """Build and compile a small two-stage CNN classifier.

    Architecture: Conv2D -> MaxPooling2D -> Conv2D -> MaxPooling2D -> Flatten
    -> Dense -> softmax Dense.

    Args:
        image_size: Shape given to the first convolution as ``input_shape``.
        image_preprocessing: Not used by this function; kept so existing
            callers and parameter grids that pass it continue to work.
        kernel_size: Kernel size of both convolution layers.
        filters: Number of filters in the first convolution; the second
            convolution uses ``filters * 2``.
        hidden_units: Width of the dense layer before the output layer.
        num_classes: Number of softmax outputs. Defaults to 3, the width that
            was previously hard-coded.

    Returns:
        A compiled ``Sequential`` model (Adam optimizer, categorical
        cross-entropy loss, accuracy metric).
    """
    model = Sequential([
        Conv2D(filters, kernel_size, activation='relu', input_shape=image_size),
        MaxPooling2D((2, 2)),
        Conv2D(filters*2, kernel_size, activation='relu'),
        MaxPooling2D((2, 2)),
        Flatten(),
        Dense(hidden_units, activation='relu'),
        Dense(num_classes, activation='softmax')
    ])
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

# Stop a fit once val_loss has not improved for 5 epochs and restore the best
# weights. val_loss is available because fit() below receives validation_split.
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
model = KerasClassifier(build_fn=create_model, epochs=50, batch_size=32, callbacks=[early_stopping])

# Settings iterated by hand: they change the training array itself, so each
# combination gets its own GridSearchCV run.
iter_grid = {
    # NOTE(review): 244 may have been meant as 224 -- confirm before changing.
    'image_size': [(32, 32, 3), (64, 64, 3), (244, 244, 3)],
    'preprocessed_images': [X_train_pp, X_train_pp_br],
}

# Hyperparameters searched by GridSearchCV within each run.
search_grid = {
    'kernel_size': [(3, 3), (5, 5)],
    'filters': [32, 64],
    'hidden_units': [128, 256]
}


# One grid search per (image size, preprocessing variant). Every result is
# stored together with the settings that produced it, so the report below
# cannot drift out of step with the loop.
best_scores = []
search_results = []
for size in iter_grid['image_size']:
    for img_idx, preprocessed_images in enumerate(iter_grid['preprocessed_images']):
        X_train_cur = preprocessed_images.apply(lambda x: cv2.resize(x, size[:2]))
        X_train_cur = np.stack(X_train_cur.values)

        # Per-run copy: pin the model's input shape to the current image size
        # without mutating the shared search_grid.
        param_grid = {**search_grid, 'image_size': [size]}

        grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, verbose=1, n_jobs=-1)
        # validation_split is forwarded to the Keras fit call of every candidate.
        grid_result = grid.fit(X_train_cur, y_train_3, validation_split=0.2)

        best_scores.append(grid_result.best_score_)
        search_results.append({
            'image_size': size,
            'preprocessed_images': img_idx,
            'best_score': grid_result.best_score_,
            # Keep the winning hyperparameters; the score alone does not say
            # which configuration achieved it.
            'best_params': grid_result.best_params_,
        })

# Print the best score (and the hyperparameters behind it) for each
# combination of image size and preprocessing variant.
for result in search_results:
    print(f"Best score for image size {result['image_size']} and preprocessed images {result['preprocessed_images']}: {result['best_score']}")
    print(f"  best params: {result['best_params']}")

  model = KerasClassifier(build_fn=create_model, epochs=50, batch_size=32, callbacks=[early_stopping])


Fitting 3 folds for each of 8 candidates, totalling 24 fits




Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Fitting 3 folds for each of 8 candidates, totalling 24 fits
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Fitting 3 folds for each of 8 candidates, totalling 24 fits
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Fitting 3 folds for each of 8 candidates, totalling 24 fits
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Fitting 3 folds for each of 8 candidates, totalling 