In [1]:
import pandas as pd
import numpy as np
import os
import cv2
import math
import multiprocessing
from matplotlib import pyplot as plt

import keras
from keras import layers, models
from keras import Input
from keras.models import Model
from keras.models import Sequential
from keras.preprocessing.image import ImageDataGenerator
from keras import optimizers, initializers, regularizers, metrics
from keras.callbacks import ModelCheckpoint
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.utils import np_utils

Using TensorFlow backend.


In [2]:
# Korean colour names in class-id order: the position in this list is the
# integer label assigned to that colour.
# (red, orange, yellow, yellow-green, green, teal, blue, navy,
#  purple, pink, magenta, brown, gray, black, white, transparent)
_color_names = [
    '빨강', '주황', '노랑', '연두',
    '초록', '청록', '파랑', '남색',
    '보라', '분홍', '자주', '갈색',
    '회색', '검정', '하양', '투명',
]
color_label = {name: class_id for class_id, name in enumerate(_color_names)}
color_label

{'빨강': 0,
 '주황': 1,
 '노랑': 2,
 '연두': 3,
 '초록': 4,
 '청록': 5,
 '파랑': 6,
 '남색': 7,
 '보라': 8,
 '분홍': 9,
 '자주': 10,
 '갈색': 11,
 '회색': 12,
 '검정': 13,
 '하양': 14,
 '투명': 15}

In [3]:
# Project directories, given relative to the notebook's working directory.
# Image directory handed to flow_from_dataframe(directory=...).
DATA_PATH = '../data/mask_front'
# Label / prediction CSV files are read from and written to this directory.
LABEL_PATH = '../label'
# Model checkpoints (.h5) are written to and loaded from this directory.
SAVE_PATH = '../save_model'

In [4]:
# xls = pd.read_excel(os.path.join(LABEL_PATH, 'label_color.xls'))
# df = pd.DataFrame(xls)

In [5]:
# df['color_front'].unique()

In [6]:
# # When creating the CSV file for the first time
# img_list = os.listdir(DATA_PATH)
# img_list = [e.split('.')[0] for e in img_list]

# xls = pd.read_excel(os.path.join(LABEL_PATH, 'label_color.xls'))
# df = pd.DataFrame(xls)
# df2 = df[df.columns[-1]].map(color_label)
# df2 = pd.DataFrame(df2)
# df['color_front'] = df2['color_front']
# df2 = df[df['No'].isin(img_list)]
# df2 = df2.drop_duplicates(['No'], keep='first')
# df2 = df2.reset_index()
# for i in range(len(df2)):
#     df2['No'][i] = str(df2['No'][i]) + '.jpg'
#     if i % 100 == 0:
#         print(i)

# # Write the result out as a CSV file
# df2.to_csv(os.path.join(LABEL_PATH,'color.csv'), mode='w')
# dataset = df2
# dataset['color_front'] = dataset['color_front'].apply(str)

In [7]:
# Used when the CSV file already exists.
# The label column is cast to str because it is later passed as `y_col` to
# flow_from_dataframe with class_mode='categorical'.
dataset = (
    pd.read_csv(os.path.join(LABEL_PATH, 'color.csv'))
    .assign(color_front=lambda frame: frame['color_front'].apply(str))
)

In [8]:
from sklearn.model_selection import train_test_split

# Keep only the file-name column and the colour label.
dataset = dataset.loc[:, ['No', 'color_front']]

# Split row positions 80/20 with a fixed seed so the partition is repeatable.
its = np.arange(len(dataset))
train_idx, val_idx = train_test_split(its, train_size=0.8, random_state=42)

X_train = dataset.iloc[train_idx]
X_val = dataset.iloc[val_idx]

# Report the size of each partition, one shape per line.
print(X_train.shape, X_val.shape, sep='\n')

(16767, 2)
(4192, 2)


In [9]:
# Settings shared by the generator, training and prediction cells.
params = dict(
    img_size=(224, 224),
    input_shape=(224, 224, 3),
    nb_train_samples=X_train.shape[0],
    nb_validation_samples=X_val.shape[0],
    # Alternative 299x299 resolution, kept disabled:
    #     img_size=(299, 299),
    #     input_shape=(299, 299, 3),
    batch_size=24,
    epochs=10,
    # One worker per CPU core reported by the OS.
    nb_workers=multiprocessing.cpu_count(),
)

In [10]:
def get_steps(num_samples, batch_size):
    """Return how many batches of `batch_size` are needed to cover `num_samples`.

    A trailing partial batch counts as one extra step.
    """
    full_batches, leftover = divmod(num_samples, batch_size)
    return full_batches + 1 if leftover > 0 else full_batches

In [11]:
# One generator object serves both subsets: pixel values are scaled by 1/255
# and validation_split=0.2 is what makes the 'training' / 'validation'
# subsets below available.
datagen = ImageDataGenerator(rescale=1./255, validation_split=0.2)

# Arguments common to the training and validation flows.
_shared_flow_args = dict(
    dataframe=dataset,
    directory=DATA_PATH,
    x_col='No',
    y_col='color_front',
    class_mode='categorical',
    target_size=params['img_size'],
    batch_size=params['batch_size'],
)

train_generator = datagen.flow_from_dataframe(subset='training', **_shared_flow_args)

# Validation batches are served in a fixed order (shuffle=False).
valid_generator = datagen.flow_from_dataframe(subset='validation',
                                              shuffle=False,
                                              **_shared_flow_args)

Found 16768 validated image filenames belonging to 16 classes.
Found 4191 validated image filenames belonging to 16 classes.


In [12]:
# Top-k accuracy metrics, each exposed under a stable `top_<k>` name so they
# can be passed to model.compile(metrics=...) and to
# load_model(custom_objects=...).
from functools import partial


def _make_top_k_metric(k):
    """Return a top-k categorical accuracy metric named ``'top_<k>'``.

    The Keras function is wrapped in a ``partial`` instead of being renamed
    directly: assigning to ``keras.metrics.top_k_categorical_accuracy.__name__``
    would rename the library's own function object for everything else in the
    process, not just for this notebook's alias.
    """
    metric = partial(keras.metrics.top_k_categorical_accuracy, k=k)
    metric.__name__ = 'top_{}'.format(k)
    return metric


top_2 = _make_top_k_metric(2)
top_3 = _make_top_k_metric(3)
# k=5 is spelled out explicitly instead of aliasing the library function.
top_5 = _make_top_k_metric(5)
top_10 = _make_top_k_metric(10)

In [13]:
######################## VGG16 #########################
# Fine-tune an ImageNet-initialised VGG16 convolutional base with a new dense
# head ending in a 16-way softmax (one unit per entry in `color_label`).
from keras.applications import VGG16

cnn_model = VGG16(include_top=False, weights='imagenet', input_shape=params['input_shape'])
cnn_model.trainable = True  # the convolutional base is trained too, not frozen

model = Sequential()
model.add(cnn_model)
model.add(layers.Flatten())
model.add(layers.Dense(4096, activation='relu'))
model.add(layers.Dropout(0.5))
model.add(layers.Dense(1024, activation='relu'))
model.add(layers.Dropout(0.5))
model.add(layers.Dense(16, activation='softmax', kernel_initializer='he_normal'))
model.summary()

# Make sure the checkpoint directory exists before training starts, so the
# first checkpoint write cannot fail on a missing folder after a full epoch.
os.makedirs(SAVE_PATH, exist_ok=True)
filepath = os.path.join(SAVE_PATH, 'VGG16_color_ep{epoch:03d}_vloss-{val_loss:.4f}_vacc-{val_acc:.4f}.h5')

# Only write a checkpoint when validation accuracy improves.
checkpoint = ModelCheckpoint(filepath=filepath, monitor='val_acc', verbose=1, save_best_only=True)
# earlystop = EarlyStopping(monitor='val_acc', min_delta=0, patience=5, verbose=1, mode='auto')

model.compile(loss='categorical_crossentropy', optimizer=optimizers.RMSprop(lr=2e-5), metrics=['acc', top_2, top_3])

# Step counts are taken from the generators themselves. The generators split
# `dataset` via validation_split (16768 / 4191 images), which is not the same
# partition as the sklearn split stored in params['nb_*_samples']
# (16767 / 4192), so deriving steps from params could drift out of sync.
history = model.fit_generator(train_generator,
                              steps_per_epoch=len(train_generator),
                              epochs=params['epochs'],
                              validation_data=valid_generator,
                              validation_steps=len(valid_generator),
                              callbacks=[checkpoint],
                              workers=params['nb_workers'])

# from keras.models import load_model
# model.save(os.path.join(SAVE_PATH, 'VGG16.h5'))

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
vgg16 (Model)                (None, 7, 7, 512)         14714688  
_________________________________________________________________
flatten_1 (Flatten)          (None, 25088)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 4096)              102764544 
_________________________________________________________________
dropout_1 (Dropout)          (None, 4096)              0         
_________________________________________________________________
dense_2 (Dense)              (None, 1024)              4195328   
_________________________________________________________________
dropout_2 (Dropout)  

In [14]:
# Run the saved VGG16 colour model over every image listed in the
# prediction CSV and keep the class probabilities in `prediction`.
dataset_test = pd.read_csv(os.path.join(LABEL_PATH,'shape_color_prediction.csv'))
dataset_test = dataset_test[['No']]

from keras.models import load_model
params.update({
    'nb_test_samples': len(dataset_test)
})

# No labels are needed at prediction time (y_col=None, class_mode=None), and
# shuffle=False keeps the output rows in the same order as the CSV rows.
test_datagen = ImageDataGenerator(rescale=1./255)
test_generator = test_datagen.flow_from_dataframe(
    dataframe=dataset_test,
    directory=DATA_PATH,
    x_col='No',
    y_col=None,
    target_size=params['img_size'],
    class_mode=None,
    batch_size=params['batch_size'],
    shuffle=False)

# Custom metric functions the checkpoint was compiled with; load_model needs
# them to deserialise the model.
dependencies = {
    'top_2': top_2,
    'top_3': top_3,
    'top_5': top_5
}

model = load_model(os.path.join(SAVE_PATH, 'VGG16_color_ep007_vloss-0.4171_vacc-0.9003.h5'), custom_objects=dependencies)

# The step count comes from the generator rather than from the CSV row count:
# the generator only serves filenames it validated, so its own length is the
# number of batches that actually exist.
prediction = model.predict_generator(generator = test_generator,
                                     steps = len(test_generator),
                                     verbose=1,
                                     workers=params['nb_workers'])

Found 20959 validated image filenames.


In [15]:
# Most probable class index for every predicted row.
predicted_class_indices = np.argmax(prediction, axis=1)

# The generator maps label -> index; invert it to map index -> label.
labels = {index: label for label, index in train_generator.class_indices.items()}
predictions = [labels[index] for index in predicted_class_indices]

# Add the top-1 label as a column and write the CSV back to the same path.
_submission_csv = os.path.join(LABEL_PATH, 'shape_color_prediction.csv')
submission = pd.read_csv(_submission_csv)
submission['color_top1'] = predictions
submission.to_csv(_submission_csv, index=False)

In [38]:
def top_k_label(K, probs=None, class_indices=None):
    """Return the K most probable class labels for every predicted row.

    Parameters
    ----------
    K : int
        Number of labels to keep per row, with 0 <= K <= number of classes.
    probs : array-like of shape (n_rows, n_classes), optional
        Class probabilities. Defaults to the notebook-level `prediction`.
    class_indices : dict, optional
        Mapping label -> class index. Defaults to
        `train_generator.class_indices`.

    Returns
    -------
    list of list of float
        One inner list per row, labels ordered from most to least probable.
        Labels are converted with float(), matching the values this function
        has always produced (e.g. [2.0, 1.0, 3.0]).

    Raises
    ------
    ValueError
        If `probs` is not 2-D, or K is outside 0..n_classes.
    """
    # Fall back to the notebook globals so existing `top_k_label(3)` calls
    # keep working unchanged.
    if probs is None:
        probs = prediction
    if class_indices is None:
        class_indices = train_generator.class_indices

    probs = np.asarray(probs)
    if probs.size == 0:
        return []
    if probs.ndim != 2:
        raise ValueError('probs must be 2-D (rows x classes), got shape {}'.format(probs.shape))

    n_classes = probs.shape[1]
    if not 0 <= K <= n_classes:
        raise ValueError('K must be between 0 and {} (number of classes), got {}'.format(n_classes, K))

    # Swap keys and values: class index -> label.
    index_to_label = {index: label for label, index in class_indices.items()}

    label_list = []
    for class_prob in probs:
        # Indices of the K highest probabilities, best first.
        top_indices = (-class_prob).argsort()[:K].tolist()
        label_list.append([float(index_to_label[index]) for index in top_indices])
    return label_list


# top_k_label(3)

In [19]:
# Save the top-k labels to the CSV file.
_prediction_csv = os.path.join(LABEL_PATH, 'shape_color_prediction.csv')

df = pd.read_csv(_prediction_csv)
df['color_top3'] = top_k_label(3)  # one list of three labels per row
df.to_csv(_prediction_csv, index=False)
df.head()

Unnamed: 0.1,Unnamed: 0,index,No,color_front,color_top1,color_top3,shape,shape_top1,shape_top3
0,0,0,197400571.jpg,2,2,"[2.0, 1.0, 3.0]",5,5,"[5.0, 10.0, 7.0]"
1,1,1,198803039.jpg,11,11,"[11.0, 0.0, 1.0]",8,8,"[8.0, 7.0, 10.0]"
2,2,2,200703756.jpg,9,9,"[9.0, 11.0, 1.0]",5,5,"[5.0, 10.0, 8.0]"
3,3,3,199703153.jpg,14,14,"[14.0, 12.0, 2.0]",8,8,"[8.0, 10.0, 4.0]"
4,4,4,200004062.jpg,14,14,"[14.0, 12.0, 2.0]",5,5,"[5.0, 10.0, 8.0]"
