In [None]:
# Jupyter notebook magic command: render matplotlib figures inline in the notebook.
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
from IPython import display

In [None]:
import os
# Detect whether the code is running under IPython / Jupyter
def is_in_ipython():
    """Report whether the process looks like an IPython/Jupyter session.

    The check looks at the basename of the ``_`` environment variable and at
    the presence of ``JPY_PARENT_PID`` in the environment.

    Returns:
        bool: True if either hint indicates IPython/Jupyter, otherwise False.
    """
    # JPY_PARENT_PID present in the environment is treated as a notebook kernel.
    if 'JPY_PARENT_PID' in os.environ:
        return True

    launcher = os.path.basename(os.getenv('_', ''))
    # 'jupyter' also matches 'jupyter-notebook', so two markers are enough.
    return any(marker in launcher for marker in ('ipython', 'jupyter'))


# Detect whether the notebook is running on Google Colab
def is_in_colab():
    """Report whether the code runs on Google Colab.

    Returns False immediately when ``is_in_ipython()`` is False; otherwise
    probes whether ``google.colab`` can be imported.

    Returns:
        bool: True if running under IPython/Jupyter and ``google.colab``
        imports successfully, otherwise False.
    """
    if not is_in_ipython():
        return False
    try:
        from google import colab  # noqa: F401 -- imported only to probe availability
    except Exception:
        # Best-effort probe: any import-time failure means "not Colab".
        # `Exception` (instead of a bare except) lets KeyboardInterrupt and
        # SystemExit propagate normally.
        return False
    return True

# Detect whether the notebook is running in a Kaggle kernel
def is_in_kaggle_kernal():
    """Report whether the code runs in a Kaggle kernel.

    The check looks for the substring ``'kaggle'`` in the ``PYTHONPATH``
    environment variable. A missing ``PYTHONPATH`` is treated as "not Kaggle"
    instead of raising ``KeyError``.

    Returns:
        bool: True if ``PYTHONPATH`` is set and contains ``'kaggle'``.
    """
    return 'kaggle' in os.environ.get('PYTHONPATH', '')

# When running on Colab, mount Google Drive at /content/gdrive.
if is_in_colab():
    from google.colab import drive
    drive.mount('/content/gdrive')

In [None]:
os.environ['TRIDENT_BACKEND'] = 'pytorch'

if is_in_kaggle_kernal():
    os.environ['TRIDENT_HOME'] = './trident'
    
elif is_in_colab():
    os.environ['TRIDENT_HOME'] = '/content/gdrive/My Drive/trident'

#為確保安裝最新版 

!pip uninstall tridentx -y
!pip install ../input/trident/tridentx-0.7.3.21-py3-none-any.whl --upgrade
#!pip install cupy

import json
import copy
import numpy as np
#調用trident api
import trident as T
from trident import *

from trident.layers.pytorch_initializers import orthogonal
import random
from tqdm import tqdm
import glob
import scipy
import time

這個比賽的表情標籤編碼如下：
0=Angry, 1=Disgust, 2=Fear, 3=Happy, 4=Sad, 5=Surprise, 6=Neutral

In [None]:
import pandas as pd
# Load the competition CSV into a DataFrame and show it as the cell output.
train_df = pd.read_csv('../input/challenges-in-representation-learning-facial-expression-recognition-challenge/icml_face_data.csv')
train_df

In [None]:
#train_df


# Read the raw CSV lines (header row included). The context manager makes sure
# the file handle is closed once the lines have been read.
with open('../input/challenges-in-representation-learning-facial-expression-recognition-challenge/icml_face_data.csv','r',encoding='utf-8-sig') as csv_file:
    data=csv_file.readlines()
print(len(data))

# Preview the first sample: the last comma-separated field holds the
# space-separated pixel values, parsed explicitly as integers (no eval on
# file content) and reshaped to 48x48.
first_pixels=[int(value) for value in data[1].split(',')[-1].split()]
array2image(np.array(first_pixels).reshape((48,48)))

In [None]:
def _pixels_to_image(pixel_text):
    """Convert a space-separated pixel string into a (48, 48, 1) float32 array."""
    # Parse the values explicitly as integers instead of eval()-ing file content.
    pixels = [int(value) for value in pixel_text.split()]
    return np.array(pixels).reshape((48, 48, 1)).astype(np.float32)


# Split the CSV rows into a training set (second column == 'Training') and a
# test set (every other value of the second column).
emotions=[]
images=[]
test_emotions=[]
test_images=[]
for row in tqdm(data[1:]):  # data[0] is the header line
    if not row.strip():
        # Skip blank lines rather than failing on a missing column.
        continue
    cols=row.split(',')
    label=int(cols[0])
    image=_pixels_to_image(cols[-1])
    if cols[1]=='Training':
        emotions.append(label)
        images.append(image)
    else:
        test_emotions.append(label)
        test_images.append(image)


print(len(emotions))
print(len(images))
print(len(test_emotions))
print(len(test_images))

在數據增強部分，相較於上一個案例，我們加入了仿射轉換，以及CLAHE(限制對比度自適應直方圖均衡化)。後者可以強化圖片的對比均衡，將細微的肌肉線條更為凸顯，以便獲取五官以外的肌肉紋理特徵。

In [None]:
# Training datasets: images paired with integer emotion labels.
ds1=ImageDataset(images,symbol='images')
ds2=LabelDataset(emotions,symbol='emotions_label')
# Class names listed in label-index order (0=Angry ... 6=Neutral).
ds2.class_names=['Angry', 'Disgust', 'Fear', 'Happy', 'Sad', 'Surprise', 'Neutral']

# Test datasets built the same way from the non-'Training' rows.
test_ds1=ImageDataset(test_images,symbol='images')
test_ds2=LabelDataset(test_emotions,symbol='emotions_label')
test_ds2.class_names=['Angry', 'Disgust', 'Fear', 'Happy', 'Sad', 'Surprise', 'Neutral']

# Bundle the train and test iterators into a single provider.
data_provider=DataProvider(traindata=Iterator(data=ds1,label=ds2),testdata=Iterator(data=test_ds1,label=test_ds2))


# Image transform pipeline, listed in the order given here.
# NOTE(review): it is set on the provider as a whole -- confirm whether the
# random augmentations are also applied to the test data.
data_provider.image_transform_funcs=[
    ToRGB(),
    Resize(output_size=(112,112)),
    CLAHE(),
    RandomTransform(rotation_range=15,zoom_range=0.1, shift_range=0.05,  shear_range= 0.1,random_flip= 0.3),
    RandomAdjustGamma(gamma_range=(0.6,1.4)),
    RandomAdjustContrast(value_range=(0.6,1.4)),
    RandomAdjustHue(scale=(-0.3,0.3)),# adjust hue
    RandomAdjustSaturation(scale=(0.6,1.4)),# adjust saturation
    RandomBlur(scale=(3,7)),# random blur
    SaltPepperNoise(prob=0.001),# salt-and-pepper noise
    RandomErasing(size_range=(0.08,0.2),transparency_range=(0.4,0.6),transparancy_ratio=0.8),
    # presumably maps 0..255 pixel values to roughly -1..1 -- TODO confirm argument order (mean, std)
    Normalize(127.5,127.5)]


In [None]:
# Fetch the next item from the provider (presumably one mini-batch) and print
# the shapes of the image and label arrays as a sanity check.
img_data,emotion_data=data_provider.next()
print(img_data.shape)
print(emotion_data.shape)

# Display a preview of the images produced by the provider.
data_provider.preview_images()

這次的骨幹是基於「口罩人臉識別」實作的成果，等於是在ArcFace之上，再加上理解特徵點幾何位置的能力，以及復原臉部的「腦補」能力。由於表情與特徵點息息相關，因此使用這個骨幹的效果會比一般骨幹來得更好。首先建構與「口罩人臉識別」相同的模型結構，並且載入訓練權重。

In [None]:
from trident.models import arcfacenet
# Output size of the face-classification 'fc' layer below
# (presumably the number of identities in the face-recognition training set -- confirm).
num_faces=10575
# Standard construction of the backbone,
# without the original classifier (include_top=False).
se_resnet50 =arcfacenet.SEResNet_IR_50_512(include_top=False,
             pretrained=True,
             freeze_features=True,
             input_shape=(3,112,112))

# Add the output_layer (dropout -> flatten -> 512-unit dense).
se_resnet50.model.add_module('output_layer', 
    Sequential(
        Dropout(dropout_rate=0.4),
        Flatten(),
        Dense((512),use_bias=False,keep_output=True),
    ))

#se_resnet50.model.output_layer[0].inplace = False
se_resnet50.model.add_module('l2norm',L2Norm())
se_resnet50.model.add_module('fc',Dense((num_faces),use_bias=False,weights_norm='l2'))


#fc_weight=se_resnet50.model[-1].weight.data
# Collect the three modules just appended (output_layer, l2norm, fc) into the
# face-classification head.
face_head=Sequential(se_resnet50.model[-3:])

# Landmark head: 68*2 output channels followed by global average pooling
# (presumably x/y coordinates of 68 facial landmarks -- confirm).
landmark_head=Sequential(
    Conv2d((3,3),68*2,strides=1,auto_pad=True,activation='sigmoid',use_bias=False),
    GlobalAvgPool2d()
        )

# Decoder head: alternating conv blocks and 2x upsampling steps, ending in a
# 3-channel tanh convolution. The trailing shape comments are the original
# author's annotations.
decoder=Sequential(
        Conv2d_Block((3,3),128,strides=1,auto_pad=True,activation='leaky_relu',normalization='batch',use_bias=False) ,#((128,7,7))
        Upsampling2d(scale_factor=2,mode='pixel_shuffle'),
        Conv2d_Block((3,3),128,strides=1,auto_pad=True,activation='leaky_relu',normalization='batch',use_bias=False),#(128,14,14)
        Upsampling2d(scale_factor=2,mode='pixel_shuffle'),
        Conv2d_Block((3,3),128,strides=1,auto_pad=True,activation='leaky_relu',normalization='batch',use_bias=False,dropout_rate=0.2),#(128,28,28)
        Upsampling2d(scale_factor=2,mode='pixel_shuffle'),
        Conv2d_Block((3,3),128,strides=1,auto_pad=True,activation='leaky_relu',normalization='batch',use_bias=False),#(128,56,56)
        Upsampling2d(scale_factor=2,mode='bicubic'),
        Conv2d_Block((3,3),128,strides=1,auto_pad=True,activation='leaky_relu',normalization='batch',use_bias=False),#(128,112,112)
        Conv2d((1,1),3,strides=1,auto_pad=True,activation='tanh',use_bias=False)
    )


# Group the three heads under one module.
head=ModuleDict({'class_pred':face_head,
                    'landmark_pred':landmark_head,
                    'face_restructure':decoder
                    },is_multicasting=True)
# Remove the three modules that now live inside face_head from the backbone,
# then attach the combined head in their place.
se_resnet50.model.remove_at(-1)
se_resnet50.model.remove_at(-1)
se_resnet50.model.remove_at(-1)
se_resnet50.model.add_module('head',head)

# With is_resume=False the local './Models' checkpoint is never used; the
# Kaggle input checkpoint is loaded if it exists. If it does not exist, no
# checkpoint is loaded and nothing is reported.
is_resume=False
if is_resume and os.path.exists('./Models/arcface_with_mask.pth'):
    se_resnet50.load_model('./Models/arcface_with_mask.pth')
    print('./Models/arcface_with_mask.pth loaded')
else:
    if os.path.exists('../input/face-recognition-with-mask/Models/arcface_with_mask.pth'):
        se_resnet50.load_model('../input/face-recognition-with-mask/Models/arcface_with_mask.pth')
se_resnet50.summary()

將整個模型設定為不可訓練，移除掉最後一層的head，然後依序加入一層通道數為7的卷積層，再透過GlobalAvgPool2d移除掉空間維度，再加上全連接層以及SoftMax，即可完成第一個finetune的模型。

In [None]:
# Mark the existing model as non-trainable before the new layers are added.
se_resnet50.model.trainable=False
# Drop the last module (the 'head' attached when the backbone was built,
# assuming the cells are run in order).
se_resnet50.model.remove_at(-1)
# New classification layers: 1x1 conv to 7 channels (one per emotion class),
# global average pooling, a 7-unit dense layer and a softmax.
se_resnet50.model.add_module('last_conv',Conv2d((1,1),7,strides=1,auto_pad=True,use_bias=False,activation=None))
se_resnet50.model.add_module('pool',GlobalAvgPool2d())
se_resnet50.model.add_module('fc',Dense(7))
se_resnet50.model.add_module('softmax',SoftMax(-1,add_noise=True,noise_intensity=0.08))

In [None]:
# Print the model summary to inspect the structure after the head replacement.
se_resnet50.summary()

為了能夠有效的比較各種技巧對於模型效果的差異，我在此又建構了第二個finetune model

In [None]:
# Second backbone, constructed with the same arguments as the first one:
# standard construction, without the original classifier (include_top=False).
se_resnet50_2 =arcfacenet.SEResNet_IR_50_512(include_top=False,
             pretrained=True,
             freeze_features=True,
             input_shape=(3,112,112))

# Add the output_layer (dropout -> flatten -> 512-unit dense).
se_resnet50_2.model.add_module('output_layer', 
    Sequential(
        Dropout(dropout_rate=0.4),
        Flatten(),
        Dense((512),use_bias=False,keep_output=True),
    ))

#se_resnet50.model.output_layer[0].inplace = False
se_resnet50_2.model.add_module('l2norm',L2Norm())
se_resnet50_2.model.add_module('fc',Dense((num_faces),use_bias=False,weights_norm='l2'))


#fc_weight=se_resnet50.model[-1].weight.data
# Collect the three modules just appended (output_layer, l2norm, fc) into the
# face-classification head.
face_head2=Sequential(se_resnet50_2.model[-3:])

# Landmark head: 68*2 output channels followed by global average pooling
# (presumably x/y coordinates of 68 facial landmarks -- confirm).
landmark_head2=Sequential(
    Conv2d((3,3),68*2,strides=1,auto_pad=True,activation='sigmoid',use_bias=False),
    GlobalAvgPool2d()
        )

# Decoder head: alternating conv blocks and 2x upsampling steps, ending in a
# 3-channel tanh convolution. The trailing shape comments are the original
# author's annotations.
decoder2=Sequential(
        Conv2d_Block((3,3),128,strides=1,auto_pad=True,activation='leaky_relu',normalization='batch',use_bias=False) ,#((128,7,7))
        Upsampling2d(scale_factor=2,mode='pixel_shuffle'),
        Conv2d_Block((3,3),128,strides=1,auto_pad=True,activation='leaky_relu',normalization='batch',use_bias=False),#(128,14,14)
        Upsampling2d(scale_factor=2,mode='pixel_shuffle'),
        Conv2d_Block((3,3),128,strides=1,auto_pad=True,activation='leaky_relu',normalization='batch',use_bias=False,dropout_rate=0.2),#(128,28,28)
        Upsampling2d(scale_factor=2,mode='pixel_shuffle'),
        Conv2d_Block((3,3),128,strides=1,auto_pad=True,activation='leaky_relu',normalization='batch',use_bias=False),#(128,56,56)
        Upsampling2d(scale_factor=2,mode='bicubic'),
        Conv2d_Block((3,3),128,strides=1,auto_pad=True,activation='leaky_relu',normalization='batch',use_bias=False),#(128,112,112)
        Conv2d((1,1),3,strides=1,auto_pad=True,activation='tanh',use_bias=False)
    )


# Group the three heads under one module.
head2=ModuleDict({'class_pred':face_head2,
                    'landmark_pred':landmark_head2,
                    'face_restructure':decoder2
                    },is_multicasting=True)
# Remove the three modules that now live inside face_head2 from the backbone,
# then attach the combined head in their place.
se_resnet50_2.model.remove_at(-1)
se_resnet50_2.model.remove_at(-1)
se_resnet50_2.model.remove_at(-1)
se_resnet50_2.model.add_module('head',head2)

# With is_resume=False the local './Models' checkpoint is never used; the
# Kaggle input checkpoint is loaded if it exists. If it does not exist, no
# checkpoint is loaded and nothing is reported.
is_resume=False
if is_resume and os.path.exists('./Models/arcface_with_mask.pth'):
    se_resnet50_2.load_model('./Models/arcface_with_mask.pth')
    print('./Models/arcface_with_mask.pth loaded')
else:
    if os.path.exists('../input/face-recognition-with-mask/Models/arcface_with_mask.pth'):
        se_resnet50_2.load_model('../input/face-recognition-with-mask/Models/arcface_with_mask.pth')
se_resnet50_2.summary()

其中最大的差異在於加入了類別活化映射模組(Class Activation Mapping, CAM)，基本上這等於是特徵內的注意力機制。我們首先透過一個卷積層將通道數降為7(等於分類類別數)，然後再將輸出複製一份，透過GlobalAvgPool2d將它去除空間維度後，再透過1x1卷積變回二維空間結構，再與原來的輸出進行點乘後，透過Sigmoid確保其數值介於0\~1之間。這樣的結構可以強化通道與類別的耦合強度，來獲得更好的分類效果。

In [None]:
# Mark the existing model as non-trainable before the new layers are added.
se_resnet50_2.model.trainable=False
# Drop the last module (the 'head' attached when the backbone was built,
# assuming the cells are run in order).
se_resnet50_2.model.remove_at(-1)
# Reduce to 7 channels (one per emotion class), then apply dropout.
se_resnet50_2.model.add_module('last_conv',Conv2d_Block((3,3),num_filters=7,use_bias=False,activation=None, normalization='l2'))
se_resnet50_2.model.add_module('dropout',Dropout(0.2))
# CAM block: two branches combined with mode='dot'.
cam=ShortCut(
    # Branch 1: the input passed through unchanged.
    Identity(),
    # Branch 2: global average pool -> reshape to (7,1,1) -> 1x1 conv with 7 filters.
    Sequential(
    GlobalAvgPool2d(),
    Reshape((7,1,1)),
    Conv2d((1,1),num_filters=7,use_bias=False,activation=None)
    )
,mode='dot'
)

se_resnet50_2.model.add_module('cam',cam)
# After the CAM block: sum over axes 2 and 3, reshape to 7 values, sigmoid,
# 7-unit dense layer and softmax -- in this order.
se_resnet50_2.model.add_module('aggregate',Aggregation('sum',axis=[2,3]))
se_resnet50_2.model.add_module('reshape',Reshape((7)))
se_resnet50_2.model.add_module('sigmoid',Sigmoid())
se_resnet50_2.model.add_module('fc',Dense((7)))
se_resnet50_2.model.add_module('softmax',SoftMax(axis=-1,add_noise=True,noise_intensity=0.08))
se_resnet50_2.summary()

透過撰寫以training_context為引數的函數，搭配模型的trigger_when，就可以輕鬆地在學習階段的指定時點執行指定的任務。我們預計從第5、10個epoch開始，陸續開放兩層網路的權重供其訓練。

In [None]:
def unfreeze(training_context):
    """Make selected body layers trainable once specific step counts are reached.

    Args:
        training_context: mapping that provides ``'steps'`` (the current step
            counter) and ``'current_model'`` (an object exposing an indexable
            ``body``).

    When ``steps`` equals 450*5, ``body[23].trainable`` is set to True; when it
    equals 450*10, ``body[22].trainable`` is set to True. Any other step count
    leaves the model untouched.
    """
    # (trigger step, index of the body layer to unfreeze).
    # 450 is presumably the number of batches per epoch -- confirm.
    schedule = ((450*5, 23), (450*10, 22))
    for trigger_step, layer_index in schedule:
        if training_context['steps']==trigger_step:
            training_context['current_model'].body[layer_index].trainable=True

In [None]:


# Training configuration for the first fine-tune model, written as one
# parenthesised fluent chain.
(
    se_resnet50
    .with_optimizer(optimizer=DiffGrad, lr=1e-3, betas=(0.9, 0.999), gradient_centralization='all')
    .with_loss(CrossEntropyLoss(label_smooth=True))
    .with_metric(accuracy, name='accuracy')
    .with_regularizer('l2', reg_weight=1e-5)
    .with_accumulate_grads(10)
    .with_model_save_path('./Models/emotions_dtection.pth')
    .with_callbacks(PrintGradientsCallback(frequency=500))
    # Call `unfreeze` at the end of every batch so it can check the step counter.
    .trigger_when(when='on_batch_end', frequency=1, unit='batch', action=unfreeze)
    .with_automatic_mixed_precision_training()
)
    

在第二個finetune模型中，我還加入了Mixup機制，也就是一次將兩張圖基於透明度混合，而最後標籤則是由兩張圖均分。這技術有助於緩解softmax的缺陷。

In [None]:

# Training configuration for the second fine-tune model: the same settings as
# the first model, plus a MixupCallback and its own save path.
(
    se_resnet50_2
    .with_optimizer(optimizer=DiffGrad, lr=1e-3, betas=(0.9, 0.999), gradient_centralization='all')
    .with_loss(CrossEntropyLoss(label_smooth=True))
    .with_metric(accuracy, name='accuracy')
    .with_regularizer('l2', reg_weight=1e-5)
    .with_accumulate_grads(10)
    .with_model_save_path('./Models/emotions_dtection2.pth')
    .with_callbacks(PrintGradientsCallback(frequency=500))
    .with_callbacks(MixupCallback(alpha=1, loss_criterion=CrossEntropyLoss, loss_weight=0.5))
    # Call `unfreeze` at the end of every batch so it can check the step counter.
    .trigger_when(when='on_batch_end', frequency=1, unit='batch', action=unfreeze)
    .with_automatic_mixed_precision_training()
)

In [None]:
#
# Training plan that registers both fine-tune models against the same data
# provider, with the scheduling intervals given below (all in batches).
plan = (
    TrainingPlan()
    .add_training_item(se_resnet50, name='arcface')
    .add_training_item(se_resnet50_2, name='arcface2')
    .with_data_loader(data_provider)
    .repeat_epochs(300)
    .with_batch_size(64)
    .print_progress_scheduling(10, unit='batch')
    .out_sample_evaluation_scheduling(100, unit='batch')
    .display_loss_metric_curve_scheduling(200, unit='batch', imshow=True)
    .save_model_scheduling(50, unit='batch')
)

第二個finetune模型因為有加入dropout以及mixup，初期訓練的效度會比較差，但是隨著訓練到後期，則效果會越來越好，切換回eval模式時，由於不再有dropout，效度還會再次提升。

In [None]:
# Start executing the training plan configured above.
plan.start_now()