# 数据预处理

在进行神经网络的训练之前，首先需要对数据集进行预处理，预处理的步骤分别如下：
* 彩色图像灰度化
* haar脸部检测器检测人脸，并提取人脸图片
* 灰度直方图均衡（CLAHE）
* 图片尺寸归一化

In [1]:
import cv2 as cv2
import numpy as np
import pandas as pd
import os
import re

## 读取文件

为了方便文件的读取，这里将发过来的CASME_A和CASME_B数据集解压后放在datasets文件夹下

在读取文件之前，首先需要了解excel文件中每一栏代表的内容，如下图所示：

![](../imgs/excel.png)

因此，将数据集根目录路径、subject对应的文件夹名（如`sub08`）和Filename依次拼接，即可得到对应表情的图片文件夹（或视频）的路径

下面这个cell中的前三行是section A的路径设置；要处理section A，需要取消前三行的注释并注释掉后面三行，然后运行之后的cell

In [2]:
# base_path = '..\\datasets\\CASME_A\\Section A\\'
# processed_path = '..\\datasets\\CASME_processed\\'
# info_name = 'Section A.xls'
info_name = 'Section B.xls'  # annotation workbook; later appended to base_path
processed_path = '..\\datasets\\CASME_processed\\'  # root folder for the cropped face images
base_path = '..\\datasets\\CASME_B\\Section B\\'  # raw data root; keeps its trailing separator

In [3]:
# Load the annotation workbook (the .xls file named by info_name, located
# directly under base_path) and drop every column that contains a NaN.
info = pd.read_excel(base_path + info_name).dropna(axis='columns', how='any')
info.head()

Unnamed: 0,Subject,Filename,OnsetF,ApexF1,OffsetF,Onset,Total,AU,Emotion
0,8,EP12_11_1,63,69,79,116.666667,283.333,4+9,disgust
1,8,EP12_11_2,11,16,22,100.0,200.0,4+9,disgust
2,8,EP12_2_1,21,26,33,100.0,216.667,4+9,disgust
3,8,EP12_2_10,17,23,32,116.666667,266.667,1+4+15,sadness
4,8,EP12_2_11,31,39,52,150.0,366.667,1+2+4+34,surprise


### 截取脸部图片

以下代码是对原始的数据集进行预处理：
* 首先将图像转换为灰度图像，然后利用haar脸部检测器检测人脸并定位出脸部区域的图片
* 将截取的脸部区域图片resize为固定大小，然后保存在对应文件夹之下

> Section A 和 Section B 的图片文件名中帧序号的格式不同：前者的序号没有补零，后者的序号固定为三位长度、右对齐并在左侧补零（如 `001`）

> excel文件中部分样本的结束帧（OffsetF）信息缺失（以 `\` 标记），这里直接用文件夹中的最后一帧当做表情的结束

这段代码运行时间十分漫长，运行后得到的数据集在`datasets\CASME_processed`文件夹下

In [20]:
# Before running the preprocessing, the image names in this directory turned
# out to be non-standard, so this snippet was used once to rename them.
# dir = "..\\datasets\\CASME_B\\Section B\\sub17\\EP17_4"
# files = os.listdir(dir)
# for file in files:
#     os.rename(os.path.join(dir, file), os.path.join(dir, "EP17_4-" + file.split('-')[-1]))

In [22]:
from faceRecognize import FaceRecognize

# Preprocess every annotated clip listed in `info`: crop the face from each
# frame between onset and offset, equalise its contrast with CLAHE, resize it
# and save it under processed_path/<subNN>/<Filename>/ with the same file name.

# Target size handed to cv2.resize as dsize, i.e. (width, height).
resize_shape = (120, 140)

# Frames that were expected from the annotations but are not on disk.
missing_frames = []


def _frame_names(filename, subject, onset, offset, n_files):
    """Return the image file names of one clip, onset to offset inclusive.

    filename: clip name; every frame is stored as '<filename>-<index>.jpg'.
    subject:  subject number. Subjects below 8 use unpadded frame indices,
              the others use indices left-padded with zeros to three digits.
    onset:    first frame index of the expression.
    offset:   last frame index, or the string '\\' when the sheet has no
              offset; the clip then runs up to frame number n_files.
    n_files:  number of entries in the clip folder.
    """
    last_frame = n_files if offset == '\\' else int(offset)
    names = []
    # + 1 so that the offset (or last) frame itself is part of the clip.
    for frame in range(int(onset), last_frame + 1):
        index = str(frame) if subject < 8 else str(frame).rjust(3, '0')
        names.append(filename + '-' + index + '.jpg')
    return names


faceRecognizer = FaceRecognize()
clahe = cv2.createCLAHE(clipLimit=1.0, tileGridSize=(8,8))
n_clips = len(info)
# Walk over the annotation rows, one clip per row.
for i, row in enumerate(info.itertuples(index=False)):
    # Subject folder name, e.g. 'sub08'.
    sub_path = 'sub' + str(row.Subject).rjust(2, '0')
    # Clip folder name.
    filename = row.Filename
    # Folder holding the raw frames of this clip.
    path = os.path.join(base_path, sub_path, filename)
    img_files = os.listdir(path)

    # Names of the frames that belong to the micro-expression.
    files = _frame_names(filename, row.Subject, row.OnsetF, row.OffsetF, len(img_files))

    # Output folder for this clip; created once per clip rather than per frame.
    save_path = os.path.join(processed_path, sub_path, filename)
    os.makedirs(save_path, exist_ok=True)

    for file in files:
        src = os.path.join(path, file)
        if not os.path.isfile(src):
            # Record the gap instead of handing a non-existent path to getFace.
            missing_frames.append(src)
            continue
        # NOTE(review): getFace is project-local; a None result is treated
        # here as "no face found" and the frame is skipped - confirm.
        face = faceRecognizer.getFace(src)
        if face is None:
            continue
        # Contrast-limited adaptive histogram equalisation.
        hist_face = clahe.apply(np.array(face, dtype='uint8'))
        # Resize to resize_shape; use the public cv2.INTER_CUBIC constant.
        img_resize = cv2.resize(hist_face, resize_shape, interpolation=cv2.INTER_CUBIC)
        cv2.imwrite(os.path.join(save_path, file), img_resize)
    # Progress bar, redrawn in place on a single line.
    print('\r' + '▇'*(i//2) + "{:.3f}".format(i * 100.0 / float(n_clips)) + '%', end='')
print('\r' + '▇'*(len(info)//2) + "100.000" + '%', end='')
print('finish preprocess imgs')


▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇100.000%finish preprocess imgs


## 合并标签文件

将Section A.xls和Section B.xls合并到一起，方便后续处理

In [10]:
# Destination of the merged label table.
infoPath = '..\\datasets\\CASME_processed\\info.csv'

# Source annotation workbooks, one per section.
a_infoPath = '..\\datasets\\CASME_A\\Section A\\Section A.xls'
b_infoPath = '..\\datasets\\CASME_B\\Section B\\Section B.xls'

In [16]:
# Load both annotation workbooks, discarding any column that contains a NaN.
a_info, b_info = [
    pd.read_excel(xls_path).dropna(axis=1)
    for xls_path in (a_infoPath, b_infoPath)
]

In [19]:
# Display the Section A annotation table.
a_info

Unnamed: 0,Subject,Filename,OnsetF,ApexF1,OffsetF,Onset,Total,AU,Emotion
0,1,EP01_12,73,81,91,150.000000,316.667,4,tense
1,1,EP01_12,163,169,177,116.666667,250,4,tense
2,1,EP01_5,113,121,133,150.000000,350,12,happiness
3,1,EP01_8,67,75,81,150.000000,250,14,repression
4,1,EP03_1,79,91,105,216.666667,450,17,repression
...,...,...,...,...,...,...,...,...,...
91,7,EP17_1,41,45,59,83.333333,316.667,4,tense
92,7,EP17_2,61,67,76,116.666667,266.667,4,tense
93,7,EP17_3,81,84,90,66.666667,166.667,4,tense
94,7,EP17_7,90,97,107,133.333333,300,4,tense


In [21]:
# Display the Section B annotation table.
b_info

Unnamed: 0,Subject,Filename,OnsetF,ApexF1,OffsetF,Onset,Total,AU,Emotion
0,8,EP12_11_1,63,69,79,116.666667,283.333,4+9,disgust
1,8,EP12_11_2,11,16,22,100.000000,200,4+9,disgust
2,8,EP12_2_1,21,26,33,100.000000,216.667,4+9,disgust
3,8,EP12_2_10,17,23,32,116.666667,266.667,1+4+15,sadness
4,8,EP12_2_11,31,39,52,150.000000,366.667,1+2+4+34,surprise
...,...,...,...,...,...,...,...,...,...
96,19,EP07_9,46,55,90,150.000000,733.333,L14,repression
97,19,EP09_3,106,111,120,83.333333,233.333,4,tense
98,19,EP12_1,91,95,111,66.666667,333.333,1+2,surprise
99,19,EP12_2,39,48,57,150.000000,300,4,tense


In [29]:
# Stack the Section A rows on top of the Section B rows into a single table.
# NOTE(review): each input keeps its own row labels, so labels repeat in the
# result (ignore_index=True would renumber them) - confirm nothing downstream
# relies on unique labels.
info = pd.concat(objs=[a_info, b_info], axis=0)
info

Unnamed: 0,Subject,Filename,OnsetF,ApexF1,OffsetF,Onset,Total,AU,Emotion
0,1,EP01_12,73,81,91,150.000000,316.667,4,tense
1,1,EP01_12,163,169,177,116.666667,250,4,tense
2,1,EP01_5,113,121,133,150.000000,350,12,happiness
3,1,EP01_8,67,75,81,150.000000,250,14,repression
4,1,EP03_1,79,91,105,216.666667,450,17,repression
...,...,...,...,...,...,...,...,...,...
96,19,EP07_9,46,55,90,150.000000,733.333,L14,repression
97,19,EP09_3,106,111,120,83.333333,233.333,4,tense
98,19,EP12_1,91,95,111,66.666667,333.333,1+2,surprise
99,19,EP12_2,39,48,57,150.000000,300,4,tense


In [30]:
# Write the merged label table to infoPath as CSV.
info.to_csv(path_or_buf=infoPath)

## 文件重命名

为了方便按顺序读取图片，将处理后图片名称前面的英文前缀去掉，并去掉帧序号前面的补零（例如 `EP17_4-001.jpg` 重命名为 `1.jpg`）

In [1]:
processed_path = '..\\datasets\\cropped-by Li Xiaobai\\Cropped'

# Rename every frame under processed_path/<subject>/<clip>/ to "<index>.jpg",
# where <index> is the number after the last '-' in the old name with its
# leading zeros removed. Along the way, minValue tracks the smallest number
# of entries found in any clip folder (starting from an upper bound of 100).
subfiles = os.listdir(processed_path)
minValue = 100
for subfile in subfiles:
    sub_path = os.path.join(processed_path, subfile)
    # Skip the label csv and any other stray file next to the subject folders.
    if subfile.endswith('.csv') or not os.path.isdir(sub_path):
        continue
    epfiles = os.listdir(sub_path)
    for epfile in epfiles:
        ep_path = os.path.join(sub_path, epfile)
        if not os.path.isdir(ep_path):
            continue
        imgfiles = os.listdir(ep_path)
        minValue = min(minValue, len(imgfiles))

        for imgfile in imgfiles:
            try:
                frame_no = int(imgfile.split('-')[-1].split('.')[0])
            except ValueError:
                # Not a frame image (no numeric index in the name); leave it.
                continue
            new_name = str(frame_no) + ".jpg"
            if new_name == imgfile:
                # Already renamed by an earlier run.
                continue
            src = os.path.join(ep_path, imgfile)
            dst = os.path.join(ep_path, new_name)
            # os.rename silently replaces an existing file on POSIX; refuse to
            # clobber a different frame on every platform.
            if os.path.exists(dst) and not os.path.samefile(src, dst):
                raise FileExistsError(
                    "cannot rename {!r}: {!r} already exists".format(src, dst))
            os.rename(src, dst)

# Uncomment to report the smallest clip length found:
# print(minValue)


8
