In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        file_path = os.path.join(folder, name)
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:

# Path of the CSV file that holds the samples
SAMPLE_FILE_PATH = "../input/hahahaha/icml_face_data.csv"

# Number of classes
NUM_CLASSES = 7

# Output paths of the HDF5 files for the training, validation and test sets
TRAIN_HDF5 = "./train.hdf5"
VAL_HDF5 = "./val.hdf5"
TEST_HDF5 = "./test.hdf5"

# Number of samples per batch
BATCH_SIZE = 128

# Directory where the project's output files are saved
OUTPUT_PATH = "./"

# Location and file name for the dataset's RGB mean values.
# os.path.join avoids the doubled separator ("./" + "/rgb_mean.json") that
# plain string concatenation produced.
DATASET_MEAN_FILE = os.path.join(OUTPUT_PATH, "rgb_mean.json")

# Location and file name for the saved model
MODEL_FILE = os.path.join(OUTPUT_PATH, "model.h5")

In [None]:
# 导入必要包
from tensorflow.keras.callbacks import Callback
import os



class EpochCheckpoint(Callback):
    """Keras callback that saves the model every ``every`` epochs.

    The callback keeps its own epoch counter (``start_epoch``), seeded from
    ``start_at``; the ``epoch`` argument Keras passes in is not used.
    Checkpoints are written to ``<output_path>/epoch_<N>.hdf5``.

    Args:
        output_path: Directory in which checkpoint files are written.
        every: Save a checkpoint each time the 1-based epoch count is a
            multiple of this value.
        start_at: Number of epochs already completed before this run.
    """

    def __init__(self, output_path, every=5, start_at=0):
        # super(Callback, self) would start the lookup *after* Callback in the
        # MRO and therefore skip Callback.__init__; name this class instead.
        super(EpochCheckpoint, self).__init__()
        self.output_path = output_path
        self.every = every
        self.start_epoch = start_at

    def on_epoch_end(self, epoch, logs=None):
        """Save the model when the running epoch count hits a multiple of ``every``."""
        finished_epochs = self.start_epoch + 1
        if finished_epochs % self.every == 0:
            checkpoint_path = os.path.join(
                self.output_path, "epoch_{}.hdf5".format(finished_epochs))
            self.model.save(checkpoint_path, overwrite=True)
        self.start_epoch = finished_epochs

In [None]:
import numpy as np
import h5py
from tensorflow.python.keras.utils import np_utils


class HDF5DatasetGenerator:
    """Yield ``(images, labels)`` batches read from an HDF5 file.

    The file is expected to contain an ``"images"`` and a ``"labels"``
    dataset; the number of samples is taken from the first dimension of
    ``"labels"``.

    Args:
        db_file: Path of the HDF5 file to read.
        batch_size: Maximum number of samples per yielded batch.
        preprocessors: Optional list of objects exposing ``preprocess(image)``;
            they are applied to every image in list order.
        aug: Optional object exposing ``flow(images, labels, batch_size=...)``
            whose first produced batch replaces the raw batch.
        binarize: When true, labels are one-hot encoded with ``classes`` columns.
        classes: Number of classes used for the one-hot encoding.
    """

    def __init__(self, db_file, batch_size, preprocessors=None,
                 aug=None, binarize=True, classes=2):
        self.batchSize = batch_size
        self.preprocessors = preprocessors
        self.aug = aug
        self.binarize = binarize
        self.classes = classes
        # Open explicitly read-only: the generator never writes, and the
        # default mode of h5py.File differs between h5py versions.
        self.db = h5py.File(db_file, "r")
        self.numImages = self.db["labels"].shape[0]

    def generator(self, passes=np.inf):
        """Yield batches, looping over the whole dataset ``passes`` times.

        Each pass produces ceil(numImages / batchSize) batches; the last batch
        of a pass is shorter when the sample count is not a multiple of the
        batch size. With the default ``passes`` the generator never ends.
        """
        epochs = 0
        while epochs < passes:
            for start in range(0, self.numImages, self.batchSize):
                stop = start + self.batchSize
                images = self.db["images"][start:stop]
                labels = self.db["labels"][start:stop]

                if self.binarize:
                    labels = np_utils.to_categorical(labels, self.classes)

                if self.preprocessors is not None:
                    processed_images = []
                    for image in images:
                        for p in self.preprocessors:
                            image = p.preprocess(image)
                        processed_images.append(image)
                    images = np.array(processed_images)

                if self.aug is not None:
                    # Take a single batch from the augmenter's flow.
                    (images, labels) = next(self.aug.flow(images, labels,
                                                          batch_size=self.batchSize))
                yield images, labels
            epochs += 1

    def close(self):
        """Close the underlying HDF5 file."""
        self.db.close()

In [None]:
import os
import h5py


class HDF5DatasetWriter:
    """Buffered writer that stores samples and integer labels in an HDF5 file.

    Two datasets are created up front: one named ``data_key`` with shape
    ``dims`` and one named ``"labels"`` with ``dims[0]`` entries. Rows are
    collected in memory and copied into the file whenever the buffer holds at
    least ``buf_size`` rows, and once more on ``close``.

    Raises:
        ValueError: If ``output_path`` already exists.
    """

    def __init__(self, dims, output_path, data_key="images", buf_size=1000):
        # Refuse to overwrite an existing file.
        already_there = os.path.exists(output_path)
        if already_there:
            raise ValueError('你提供的输出文件{}已经存在，请先手工输出'.format(output_path))

        self.db = h5py.File(output_path, 'w')
        self.data = self.db.create_dataset(data_key, dims, dtype="float")
        self.labels = self.db.create_dataset("labels", (dims[0],), dtype="int")

        self.buf_size = buf_size
        self.buffer = {"data": [], "labels": []}
        # Index of the next unwritten row in the datasets.
        self.idx = 0

    def add(self, raw, label):
        """Append the rows in ``raw`` and the entries in ``label`` to the buffer.

        The buffer is written out as soon as it holds ``buf_size`` rows or more.
        """
        pending_rows = self.buffer["data"]
        pending_rows.extend(raw)
        self.buffer["labels"].extend(label)
        if not len(pending_rows) < self.buf_size:
            self.flush()

    def flush(self):
        """Copy the buffered rows into the datasets and start a fresh buffer."""
        start = self.idx
        end = start + len(self.buffer["data"])
        self.data[start:end] = self.buffer["data"]
        self.labels[start:end] = self.buffer["labels"]
        self.idx = end
        self.buffer = {"data": [], "labels": []}

    def store_class_labels(self, class_labels):
        """Write the class names into a variable-length string dataset ``"label_names"``."""
        string_type = h5py.special_dtype(vlen=str)
        names = self.db.create_dataset("label_names",
                                       (len(class_labels),),
                                       dtype=string_type)
        names[:] = class_labels

    def close(self):
        """Flush any rows still buffered, then close the file."""
        if self.buffer["data"]:
            self.flush()
        self.db.close()


In [None]:
from tensorflow.keras.preprocessing.image import img_to_array


class ImageToArrayPreprocessor:
    """Preprocessor that converts an image through Keras' ``img_to_array``.

    Args:
        data_format: Value forwarded unchanged as the ``data_format`` argument
            of ``img_to_array``.
    """

    def __init__(self, data_format=None):
        self.data_format = data_format

    def preprocess(self, image):
        """Return ``img_to_array(image)`` using the stored ``data_format``."""
        converted = img_to_array(image, data_format=self.data_format)
        return converted


In [None]:
from tensorflow.keras.callbacks import BaseLogger
import matplotlib.pyplot as plt
import numpy as np
import json
import os



class TrainingMonitor(BaseLogger):
    """Keras callback that records per-epoch metrics and redraws a training plot.

    After every epoch the received metrics are appended to ``self.history``,
    optionally written to a JSON file, and plotted to ``fig_path``.

    Args:
        fig_path: Path of the image file the plot is saved to.
        json_path: Optional path of a JSON file used to persist the history.
        start_at: When greater than zero and a history file exists, every
            stored series is truncated to this many entries at train start.
    """

    # (history key, legend label) for each curve that is drawn when available.
    _CURVES = (("loss", "train_loss"),
               ("val_loss", "val_loss"),
               ("accuracy", "train_acc"),
               ("val_accuracy", "val_acc"))

    def __init__(self, fig_path, json_path=None, start_at=0):
        super(TrainingMonitor, self).__init__()
        self.history = {}
        self.fig_path = fig_path
        self.json_path = json_path
        self.start_at = start_at

    def on_train_begin(self, logs=None):
        """Reload a previously saved history, trimming it to ``start_at`` entries."""
        if self.json_path is None or not os.path.exists(self.json_path):
            return
        # Context manager so the file handle is released right after reading.
        with open(self.json_path) as history_file:
            self.history = json.loads(history_file.read())
        if self.start_at > 0:
            for key in self.history.keys():
                self.history[key] = self.history[key][:self.start_at]

    def on_epoch_end(self, epoch, logs=None):
        """Append this epoch's metrics, persist them and refresh the plot."""
        for (key, value) in (logs or {}).items():
            # Store plain floats: json cannot serialise NumPy scalar types.
            self.history.setdefault(key, []).append(float(value))

        if self.json_path is not None:
            with open(self.json_path, "w") as history_file:
                history_file.write(json.dumps(self.history))

        # A curve needs at least two points, so skip plotting until then.
        loss_history = self.history.get("loss", [])
        if len(loss_history) > 1:
            plt.style.use("ggplot")
            plt.figure()
            for (key, label) in self._CURVES:
                series = self.history.get(key)
                # Metrics that were never logged (e.g. no validation data)
                # are left out instead of raising KeyError.
                if series:
                    plt.plot(np.arange(0, len(series)), series, label=label)
            epochs = len(loss_history)
            plt.title("Training Loss & Accuracy [Epoch{}]".format(epochs))
            plt.xlabel("Epoch #")
            plt.ylabel("Loss/Accuracy")
            plt.legend()
            plt.savefig(self.fig_path)
            plt.close()


In [None]:
# 导包
import numpy as np
#from .HDF5DatasetWriter import HDF5DatasetWriter
#from config import setting

print("[信息]加载csv格式数据集文件...")

(train_images, train_labels) = ([], [])
(val_images, val_labels) = ([], [])
(test_images, test_labels) = ([], [])

count_by_label_train = {}
count_by_label_val = {}
count_by_label_test = {}

# Each value of the usage column maps to the containers of the split it feeds:
# (image list, label list, per-label sample counter).
split_by_usage = {
    "Training": (train_images, train_labels, count_by_label_train),
    "PublicTest": (val_images, val_labels, count_by_label_val),
    "PrivateTest": (test_images, test_labels, count_by_label_test),
}

# The context manager closes the CSV even when a row fails to parse.
with open(SAMPLE_FILE_PATH) as csv_file:
    next(csv_file, None)  # skip the header line

    for row in csv_file:
        row = row.strip()
        if not row:
            # Blank lines would otherwise break the three-way unpack below.
            continue
        (label, usage, image) = row.split(",")
        label = int(label)
        # Pixel values are whitespace separated; every sample is reshaped to 48x48.
        image = np.array(image.split(), dtype="uint8")
        image = image.reshape((48, 48))

        split = split_by_usage.get(usage.strip())
        if split is None:
            # Rows with an unrecognised usage value are ignored.
            continue
        (split_images, split_labels, split_counts) = split
        split_images.append(image)
        split_labels.append(label)
        split_counts[label] = split_counts.get(label, 0) + 1

print("[信息]训练样本数量：{}".format(len(train_images)))
print("[信息]校验样本数量：{}".format(len(val_images)))
print("[信息]测试样本数量：{}".format(len(test_images)))

# Header added for the training distribution so all three dumps are labelled.
print("[信息]训练样本分布：")
print(count_by_label_train)
print("[信息]校验样本分布：")
print(count_by_label_val)
print("[信息]测试样本分布：")
print(count_by_label_test)

datasets = [(train_images, train_labels, TRAIN_HDF5),
            (val_images, val_labels, VAL_HDF5),
            (test_images, test_labels, TEST_HDF5)]

# Write one HDF5 file per split. HDF5DatasetWriter raises ValueError when the
# target file already exists, so stale outputs must be removed before re-running.
for (images, labels, outputPath) in datasets:
    print("[信息构建]{}...".format(outputPath))
    writer = HDF5DatasetWriter((len(images), 48, 48), outputPath)
    try:
        for (image, label) in zip(images, labels):
            writer.add([image], [label])
    finally:
        # Always release the file handle, even if adding a sample fails.
        writer.close()


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.layers import MaxPooling2D
from tensorflow.keras.layers import Activation
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Dense
from tensorflow.keras.regularizers import l1, l2
from tensorflow.keras import backend


# 定义VGG11类
class VGG11:
    """Factory for the Sequential convolutional network named "VGG11"."""

    @staticmethod
    def build(width, height, channel, classes, reg=0.0002):
        """Assemble and return the (uncompiled) network.

        Args:
            width: Input image width.
            height: Input image height.
            channel: Number of input channels.
            classes: Number of units in the final softmax layer.
            reg: Factor of the kernel regularizers on the dense layers
                (L2 on the two hidden layers, L1 on the output layer).

        Returns:
            The assembled ``Sequential`` model.
        """
        model = Sequential(name="VGG11")

        # Input shape and batch-norm axis follow the backend's data format.
        if backend.image_data_format() == "channels_first":
            shape = (channel, height, width)
            channel_dimension = 1
        else:
            shape = (height, width, channel)
            channel_dimension = -1

        def add_conv_unit(filters, dropout=False, **conv_options):
            # 3x3 "same" convolution -> batch norm -> ReLU (-> optional dropout).
            model.add(Conv2D(filters, (3, 3), padding="same", **conv_options))
            model.add(BatchNormalization(axis=channel_dimension))
            model.add(Activation("relu"))
            if dropout:
                model.add(Dropout(0.5))

        def add_dense_unit(units):
            # L2-regularised dense layer -> batch norm -> ReLU -> dropout.
            model.add(Dense(units, kernel_regularizer=l2(reg)))
            model.add(BatchNormalization(axis=channel_dimension))
            model.add(Activation("relu"))
            model.add(Dropout(0.5))

        def add_halving_pool():
            model.add(MaxPooling2D(pool_size=(2, 2), strides=(2, 2)))

        # Stage 1: 64 filters (this layer carries the input shape).
        add_conv_unit(64, input_shape=shape)
        add_halving_pool()

        # Stage 2: 128 filters.
        add_conv_unit(128)
        add_halving_pool()

        # Stage 3: two 256-filter units.
        add_conv_unit(256)
        add_conv_unit(256)
        add_halving_pool()

        # Stage 4: two 512-filter units.
        add_conv_unit(512)
        add_conv_unit(512)
        add_halving_pool()

        # Stage 5: two 512-filter units with dropout, then a stride-1 pool.
        add_conv_unit(512, dropout=True)
        add_conv_unit(512, dropout=True)
        model.add(MaxPooling2D(pool_size=(2, 2), padding="same", strides=(1, 1)))

        # Classifier head.
        model.add(Flatten())
        add_dense_unit(256)
        add_dense_unit(128)

        model.add(Dense(classes, kernel_regularizer=l1(reg)))
        model.add(Activation("softmax"))

        return model


# Smoke check: build the network for 48x48 single-channel input and show its layers.
if __name__ == "__main__":
    my_model = VGG11.build(width=48, height=48, channel=1, classes=7, reg=0.0002)
    # summary() prints the table itself and returns None, so wrapping it in
    # print() only appended a stray "None" line.
    my_model.summary()


In [None]:
import matplotlib


'''from config import setting
from utils.ImageToArrayPreprocessor import ImageToArrayPreprocessor
from utils.EpochCheckpoint import EpochCheckpoint
from utils.TrainingMonitor import TrainingMonitor
from utils.HDF5DatasetGenerator import HDF5DatasetGenerator
from mini_vgg_11 import VGG11'''
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import load_model
import tensorflow.keras.backend as K
import argparse
import os


# Select the non-interactive Agg backend for figure rendering.
matplotlib.use("Agg")

# Training batches get light augmentation; both pipelines rescale pixels by 1/255.
train_aug = ImageDataGenerator(rotation_range=10,
                               zoom_range=0.1,
                               rescale=1 / 255.0,
                               fill_mode="nearest")
val_aug = ImageDataGenerator(rescale=1 / 255.0)

iap = ImageToArrayPreprocessor()

train_gen = HDF5DatasetGenerator(TRAIN_HDF5,
                                 BATCH_SIZE,
                                 aug=train_aug,
                                 preprocessors=[iap],
                                 classes=NUM_CLASSES)
val_gen = HDF5DatasetGenerator(VAL_HDF5,
                               BATCH_SIZE,
                               aug=val_aug,
                               preprocessors=[iap],
                               classes=NUM_CLASSES)

try:
    # `learning_rate` replaces the deprecated `lr` keyword.
    opt = Adam(learning_rate=1e-3)
    model = VGG11.build(width=48, height=48, channel=1, classes=NUM_CLASSES)
    model.compile(loss="categorical_crossentropy", optimizer=opt, metrics=["accuracy"])

    fig_path = os.path.join(OUTPUT_PATH, "{}.png".format(os.getpid()))
    callbacks = [TrainingMonitor(fig_path=fig_path)]

    # One pass of HDF5DatasetGenerator yields ceil(n / batch) batches (the last
    # one may be short). Ceiling division keeps each epoch, and each validation
    # run, aligned with exactly one pass instead of drifting by the remainder.
    train_steps = -(-train_gen.numImages // BATCH_SIZE)
    val_steps = -(-val_gen.numImages // BATCH_SIZE)

    # Model.fit accepts Python generators; fit_generator is deprecated.
    model.fit(train_gen.generator(),
              steps_per_epoch=train_steps,
              validation_data=val_gen.generator(),
              validation_steps=val_steps,
              epochs=50,
              max_queue_size=BATCH_SIZE * 2,
              callbacks=callbacks,
              verbose=1)

    print("[信息] 保存模型...")
    model.save(MODEL_FILE, overwrite=True)
finally:
    # Release the HDF5 handles even when training is interrupted or fails.
    train_gen.close()
    val_gen.close()


In [None]:

from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import load_model

# Set up the image preprocessing used for the test set (rescale only).
testAug = ImageDataGenerator(rescale=1 / 255.0)
iap = ImageToArrayPreprocessor()

# Set up the generator over the test dataset.
testGen = HDF5DatasetGenerator(TEST_HDF5,
                               BATCH_SIZE,
                               aug=testAug,
                               preprocessors=[iap],
                               classes=NUM_CLASSES)

try:
    # Load the network trained earlier.
    print("[信息]加载网络模型")
    model = load_model(MODEL_FILE)

    # Ceiling division so the final, shorter batch is evaluated too; floor
    # division left those samples out of the reported accuracy.
    test_steps = -(-testGen.numImages // BATCH_SIZE)

    # Evaluate the network. Model.evaluate replaces the deprecated
    # evaluate_generator; verbose=0 keeps that method's silent default.
    (loss, acc) = model.evaluate(testGen.generator(),
                                 steps=test_steps,
                                 max_queue_size=BATCH_SIZE * 2,
                                 verbose=0)
    print("[信息]测试集准确率：{:.2f}%".format(acc * 100))
finally:
    # Close the HDF5 dataset even if loading or evaluation fails.
    testGen.close()
