# Fine-tuning of the model


In [1]:
import copy
import glob
import sys
import os
sys.path.append('..')

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from PIL import Image
# from sklearn.metrics import plot_confusion_matrix
import timm
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
from torchvision import transforms
from torchvision.models import resnet101, mobilenet_v2
from tqdm.notebook import tqdm


from robust_optimization import RobustOptimizer

print(f'Torch: {torch.__version__}')
print(f'Timm: {timm.__version__}')

Torch: 2.2.2+cu121
Timm: 0.9.7


In [2]:

os.environ["CUDA_VISIBLE_DEVICES"] = "6"

In [3]:

print(torch.cuda.device_count())  # Should be 1 (only GPU 6 is visible)
print(torch.cuda.current_device())  # Should return 0 (since only one GPU is visible)
print(torch.cuda.get_device_name(0))

1
0
NVIDIA A16


In [4]:
device = torch.device("cuda:5")

In [5]:
# device

In [2]:
affectnet_dir = '../../split'
backbone = 'efficientnet_b2'

In [3]:
# Training settings
batch_size = 16 #48# 32# 32 #16 #8 #
epochs = 40
lr = 3e-5
gamma = 0.7
seed = 42
device = 'cuda'
use_cuda = torch.cuda.is_available()
print(use_cuda)
# IMG_SIZE = 224

True


In [4]:
IMG_SIZE=260 if 'b2' in backbone else 224 # 300 # 80 #
train_transforms = transforms.Compose(
    [
        transforms.Resize((IMG_SIZE,IMG_SIZE)),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    ]
)

test_transforms = transforms.Compose(
    [
        transforms.Resize((IMG_SIZE,IMG_SIZE)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    ]
)
print(test_transforms)

Compose(
    Resize(size=(260, 260), interpolation=bilinear, max_size=None, antialias=True)
    ToTensor()
    Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
)


In [5]:
kwargs = {'num_workers': 0, 'pin_memory': True} if use_cuda else {}

In [6]:
print("Current working directory:", os.getcwd())

Current working directory: /home/jovyan/CapitaSelecta/face-emotion-recognition-main/src/affectnet


## AffectNet Dataloader

In [7]:
class AffectNet(data.Dataset):
    def __init__(self, aff_path, phase, use_cache=True, transforms=None, force=False):
        self.phase = phase
        self.transforms = transforms
        self.aff_path = aff_path
        self.base_path = os.path.join(self.aff_path, f'{self.phase}_set/')
        
        if use_cache:
            cache_path = os.path.join(aff_path,f'affectnet_{phase}.csv')
            if os.path.exists(cache_path) and not force:
                df = pd.read_csv(cache_path)
            else:
                df = self.get_df()
                df.to_csv(cache_path)
        else:
            df = self.get_df()

        self.data = df[df['phase'] == phase]

        self.file_paths = self.data.loc[:, 'img_path'].values
        self.label = self.data.loc[:, 'label'].values

        self.emotion_labels=['Neutral','Happiness', 'Sadness', 'Surprise', 'Fear', 'Disgust', 'Anger']
        sample_label, sample_counts = np.unique(self.label, return_counts=True)
        for l, c in zip(sample_label, sample_counts):
            print(f'{self.emotion_labels[l]}: {c} ', end='')
        print(f'\n{len(self)} images')

    def get_df(self):
        data = []
        
        for anno in glob.glob(self.base_path + 'annotations/*_exp.npy'):
            idx = os.path.basename(anno).split('_')[0]
            img_path = f'images/{idx}.jpg'
            label = int(np.load(anno))
            data.append([self.phase,img_path,label])
        
        return pd.DataFrame(data = data,columns = ['phase','img_path','label'])

    def get_weight(self):
        self.emotion_labels=['Neutral','Happiness', 'Sadness', 'Surprise', 'Fear', 'Disgust', 'Anger']
        self.class_to_idx = {}
        self.idx_to_class = {}
        for i, emotion in enumerate(self.emotion_labels):
            self.class_to_idx[emotion] = i
            self.idx_to_class[i] = emotion
        sample_label, sample_counts = np.unique(self.label, return_counts=True)
        for l, c in zip(sample_label, sample_counts):
            print(f'{self.emotion_labels[l]}: {c} ', end='')
        print('')
        
        cw = 1/sample_counts
        cw /= cw.min()
        class_weights = {i:cwi for i, cwi in zip(sample_label, cw)}
        print(class_weights)
        return class_weights

    def __len__(self):
        return len(self.file_paths)

    def __getitem__(self, idx):
        path = os.path.join(self.base_path, self.file_paths[idx])
        image = Image.open(path).convert('RGB')
        label = self.label[idx]

        if self.transforms is not None:
            image = self.transforms(image)
        
        return image, label

In [12]:
# from torch.utils.data import DataLoader
# import numpy as np

# IMG_SIZE=260 if 'b2' in backbone else 224

# # Temporary transform to only convert images to tensors
# temp_transform = transforms.Compose([
#     transforms.Resize((IMG_SIZE, IMG_SIZE)),
#     transforms.ToTensor(),
# ])

# # Use the dataset with the temporary transform
# temp_dataset = AffectNet( affectnet_dir, phase='train', transforms=temp_transform)
# temp_loader = DataLoader(temp_dataset, batch_size=16, shuffle=False)

# # Compute mean and std
# mean = 0.0
# std = 0.0
# total_images = 0

# for images, _ in temp_loader:
#     total_images += images.size(0)  # Batch size
#     # print(total_images)
#     mean += images.mean(dim=[0, 2, 3]) * images.size(0)
#     std += images.std(dim=[0, 2, 3]) * images.size(0)

# mean /= total_images
# std /= total_images

# print(f"Mean: {mean}, Std: {std}")


In [13]:
# train_transforms = transforms.Compose(
#     [
#         transforms.Resize((IMG_SIZE, IMG_SIZE)),
#         transforms.RandomHorizontalFlip(),
#         transforms.ToTensor(),
#         transforms.Normalize(mean=mean.tolist(), std=std.tolist())
#     ]
# )

# test_transforms = transforms.Compose(
#     [
#         transforms.Resize((IMG_SIZE, IMG_SIZE)),
#         transforms.ToTensor(),
#         transforms.Normalize(mean=mean.tolist(), std=std.tolist())
#     ]
# )


In [8]:
trainset = AffectNet(affectnet_dir, 'train', transforms=train_transforms, force=False)
valset = AffectNet(affectnet_dir, 'test', transforms=test_transforms)
trainloader = data.DataLoader(trainset, batch_size=batch_size, shuffle=True, **kwargs)
valloader = data.DataLoader(valset, batch_size=batch_size, shuffle=False, **kwargs)

Neutral: 1435 Happiness: 835 Sadness: 124 Surprise: 132 Fear: 145 Disgust: 102 Anger: 575 
3348 images
Neutral: 264 Happiness: 139 Sadness: 19 Surprise: 19 Fear: 23 Disgust: 20 Anger: 97 
581 images


In [9]:
class_weights = trainset.get_weight()

Neutral: 1435 Happiness: 835 Sadness: 124 Surprise: 132 Fear: 145 Disgust: 102 Anger: 575 
{0: 1.0, 1: 1.718562874251497, 2: 11.57258064516129, 3: 10.871212121212121, 4: 9.89655172413793, 5: 14.068627450980392, 6: 2.4956521739130437}


## Functions

In [10]:
#adapted from https://pytorch.org/tutorials/beginner/finetuning_torchvision_models_tutorial.html
def set_parameter_requires_grad(model, requires_grad):
    for param in model.parameters():
        param.requires_grad = requires_grad

In [11]:
# loss function
weights = torch.FloatTensor(list(class_weights.values())).cuda()

def label_smooth(target, n_classes: int, label_smoothing=0.1):
    # convert to one-hot
    batch_size = target.size(0)
    target = torch.unsqueeze(target, 1)
    soft_target = torch.zeros((batch_size, n_classes), device=target.device)
    soft_target.scatter_(1, target, 1)
    # label smoothing
    soft_target = soft_target * (1 - label_smoothing) + label_smoothing / n_classes
    return soft_target

def cross_entropy_loss_with_soft_target(pred, soft_target):
    #logsoftmax = nn.LogSoftmax(dim=-1)
    return torch.mean(torch.sum(- weights*soft_target * torch.nn.functional.log_softmax(pred, -1), 1))

def cross_entropy_with_label_smoothing(pred, target):
    soft_target = label_smooth(target, pred.size(1)) #num_classes) #
    return cross_entropy_loss_with_soft_target(pred, soft_target)

criterion=cross_entropy_with_label_smoothing

In [12]:
def train(model,n_epochs=epochs, learningrate=lr, robust=False):
    # optimizer
    if robust:
        optimizer = RobustOptimizer(filter(lambda p: p.requires_grad, model.parameters()), optim.Adam, lr=learningrate)
        #print(optimizer)
    else:
        optimizer=optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=learningrate)
    # scheduler
    #scheduler = StepLR(optimizer, step_size=1, gamma=gamma)
    best_acc=0
    best_model=None
    for epoch in range(n_epochs):
        epoch_loss = 0
        epoch_accuracy = 0
        model.train()
        for data, label in tqdm(trainloader):
            data = data.to(device)
            label = label.to(device)

            output = model(data)
            loss = criterion(output, label)

            if robust:
                #optimizer.zero_grad()
                loss.backward()
                optimizer.first_step(zero_grad=True)
  
                # second forward-backward pass
                output = model(data)
                loss = criterion(output, label)
                loss.backward()
                optimizer.second_step(zero_grad=True)
            else:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            acc = (output.argmax(dim=1) == label).float().sum()
            epoch_accuracy += acc
            epoch_loss += loss
        epoch_accuracy /= len(trainset)
        epoch_loss /= len(trainset)
        
        model.eval()
        with torch.no_grad():
            epoch_val_accuracy = 0
            epoch_val_loss = 0
            for data, label in valloader:
                data = data.to(device)
                label = label.to(device)

                val_output = model(data)
                val_loss = criterion(val_output, label)

                acc = (val_output.argmax(dim=1) == label).float().sum()
                epoch_val_accuracy += acc
                epoch_val_loss += val_loss
        epoch_val_accuracy /= len(valset)
        epoch_val_loss /= len(valset)
        print(
            f"Epoch : {epoch+1} - loss : {epoch_loss:.4f} - acc: {epoch_accuracy:.4f} - val_loss : {epoch_val_loss:.4f} - val_acc: {epoch_val_accuracy:.4f}\n"
        )
        if best_acc<epoch_val_accuracy:
            best_acc=epoch_val_accuracy
            best_model=copy.deepcopy(model.state_dict())
        #scheduler.step()
    
    if best_model is not None:
        model.load_state_dict(best_model)
        print(f"Best acc:{best_acc}")
        model.eval()
        with torch.no_grad():
            epoch_val_accuracy = 0
            epoch_val_loss = 0
            for data, label in valloader:
                data = data.to(device)
                label = label.to(device)

                val_output = model(data)
                val_loss = criterion(val_output, label)

                acc = (val_output.argmax(dim=1) == label).float().sum()
                epoch_val_accuracy += acc
                epoch_val_loss += val_loss
        epoch_val_accuracy /= len(valset)
        epoch_val_loss /= len(valset)
        print(
            f"val_loss : {epoch_val_loss:.4f} - val_acc: {epoch_val_accuracy:.4f}\n"
        )
    else:
        print(f"No best model Best acc:{best_acc}")

## Train

In [13]:
state_dict = torch.load('../../models/affectnet_emotions/enet_b2_8.pt')
print(type(state_dict))

<class 'timm.models.efficientnet.EfficientNet'>


In [14]:
state_dict = state_dict.state_dict()
state_dict

OrderedDict([('conv_stem.weight',
              tensor([[[[-6.9361e-01, -4.8171e-01, -4.0217e-01],
                        [-6.4064e-01, -6.1880e-01,  9.8463e-02],
                        [-3.1046e-01, -3.1120e-01, -9.2515e-02]],
              
                       [[-5.5262e-01,  6.1485e-02,  2.8445e-01],
                        [-3.3870e-01, -1.1033e-01, -5.7082e-02],
                        [ 7.2248e-02,  8.7730e-02,  2.6900e-01]],
              
                       [[-2.3909e-01,  1.3394e-01,  1.3763e-01],
                        [-8.6096e-02, -5.0733e-03,  3.0582e-02],
                        [ 3.5005e-02,  1.2699e-03, -2.6725e-01]]],
              
              
                      [[[ 3.2973e-01,  1.4515e+00,  3.7029e-01],
                        [ 1.6524e-01,  2.5951e-01,  1.2196e-02],
                        [ 1.2905e-01,  6.0088e-02,  3.0516e-01]],
              
                       [[ 7.6487e-02,  1.5512e+00, -3.9789e-01],
                        [-9.9902e-02,  1.

In [15]:
model = timm.create_model('tf_efficientnet_b2_ns', pretrained=False)

# Modify the classifier to match the checkpoint's output
model.classifier = torch.nn.Linear(model.classifier.in_features, 8)  # Match to checkpoint size

# Load the state_dict
loaded_model = torch.load('../../models/affectnet_emotions/enet_b2_8.pt')
state_dict = loaded_model.state_dict()
model.load_state_dict(state_dict)

  model = create_fn(


<All keys matched successfully>

In [16]:
num_classes = len(trainset.emotion_labels)
model.classifier = nn.Sequential(nn.Linear(in_features=1408, out_features=num_classes)) #1792 #1280 #1536
#model.head.fc=nn.Linear(in_features=3072, out_features=num_classes)
#model.head=nn.Sequential(nn.Linear(in_features=768, out_features=num_classes))
model=model.to(device)
print(model)

EfficientNet(
  (conv_stem): Conv2dSame(3, 32, kernel_size=(3, 3), stride=(2, 2), bias=False)
  (bn1): BatchNormAct2d(
    32, eps=0.001, momentum=0.1, affine=True, track_running_stats=True
    (drop): Identity()
    (act): SiLU(inplace=True)
  )
  (blocks): Sequential(
    (0): Sequential(
      (0): DepthwiseSeparableConv(
        (conv_dw): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
        (bn1): BatchNormAct2d(
          32, eps=0.001, momentum=0.1, affine=True, track_running_stats=True
          (drop): Identity()
          (act): SiLU(inplace=True)
        )
        (se): SqueezeExcite(
          (conv_reduce): Conv2d(32, 8, kernel_size=(1, 1), stride=(1, 1))
          (act1): SiLU(inplace=True)
          (conv_expand): Conv2d(8, 32, kernel_size=(1, 1), stride=(1, 1))
          (gate): Sigmoid()
        )
        (conv_pw): Conv2d(32, 16, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn2): BatchNormAct2d(
          16, eps=

In [17]:
set_parameter_requires_grad(model, requires_grad=True)
train(model,6,1e-4,robust=True)


  0%|          | 0/210 [00:00<?, ?it/s]

Epoch : 1 - loss : 0.3681 - acc: 0.3070 - val_loss : 0.4676 - val_acc: 0.1170



  0%|          | 0/210 [00:00<?, ?it/s]

Epoch : 2 - loss : 0.2396 - acc: 0.6735 - val_loss : 0.5574 - val_acc: 0.1670



  0%|          | 0/210 [00:00<?, ?it/s]

Epoch : 3 - loss : 0.2110 - acc: 0.7876 - val_loss : 0.5200 - val_acc: 0.1945



  0%|          | 0/210 [00:00<?, ?it/s]

Epoch : 4 - loss : 0.1966 - acc: 0.8408 - val_loss : 0.5168 - val_acc: 0.1773



  0%|          | 0/210 [00:00<?, ?it/s]

Epoch : 5 - loss : 0.1904 - acc: 0.8784 - val_loss : 0.5530 - val_acc: 0.1670



  0%|          | 0/210 [00:00<?, ?it/s]

Epoch : 6 - loss : 0.1824 - acc: 0.9041 - val_loss : 0.5402 - val_acc: 0.1515

Best acc:0.19449225068092346
val_loss : 0.5200 - val_acc: 0.1945



In [19]:
PATH = f'../../models/affectnet_emotions/affectnet_FT_1.pt'

In [20]:
# Save
torch.save(model, PATH)