# Facial key points

The training of the facial key points dataset is performed here. We select a basic CNN model and consider two basic changes:
* 1 The preprocessing: there are NaNs in the dataset, so three ways of filling them are investigated
    * Forward fill
    * Fill with average of column (constant)
    * Fill with samples from a simple fitted Gaussian distribution

* 2 The comparison of a non-CNN model and a full CNN model.
    * It makes sense to benchmark against a non-convolutional model, since this is a simpler model

# 1 . Imports

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch.optim as optim
import logging

import torch
import torch.nn.functional as F
import torch.nn as nn
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

import gc
%matplotlib inline

# 2. Opening data and preprocessing

In [None]:
# Load the Kaggle training table
df = pd.read_csv('/kaggle/input/facial-keypoints-detection/training/training.csv')

# Count the missing entries per column and keep only the columns that have any;
# the trailing bare expression displays the result in the notebook.
null = df.isna().sum().loc[lambda counts: counts != 0]
null

So there are NaN values in the dataset. We now consider the different preprocessing operations:

In [None]:
def fillNaN(df_series, method):
    """
    Return a copy of a series with its NaN entries filled in.

    Input:
        df_series [pandas.core.series.Series] : A pandas series type with NaN to be filled in.
        method [string] : A string valued method to identify how the NaN handling should take place:
            "forwardFill"  : propagate the last valid value forward; leading NaNs (which have
                             no predecessor) are back-filled from the first valid value.
            "constantMean" : replace every NaN with the mean of the series.
            "sampleDist"   : replace every NaN with a draw from a normal distribution whose
                             mean and standard deviation are those of the series.
    Output:
        A new pandas series with the NaN handled. The input series is NOT modified,
        so the caller must use the returned value.
    Raises:
        ValueError : if method is not one of the three names above.
    """
    if method == "forwardFill":
        # fillna(method='ffill') is deprecated in pandas; ffill() is the replacement.
        # The extra bfill() only touches NaNs that sit before the first valid value.
        return df_series.ffill().bfill()

    if method == "constantMean":
        return df_series.fillna(df_series.mean())

    if method == "sampleDist":
        mean = df_series.mean()
        std = df_series.std()
        if pd.isna(std):
            # A single valid value has no sample standard deviation; fall back to
            # a degenerate distribution located at the mean.
            std = 0.0
        draws = np.random.normal(mean, std, len(df_series))
        # Substitute positionally. Filling with a freshly built Series would align
        # on index labels and silently leave NaNs wherever the labels do not match
        # the default 0..n-1 index.
        return df_series.where(df_series.notna(), draws)

    raise ValueError("Method not identified : must be in forwardFill, constantMean or sampleDist")

In [None]:
# Build one independent copy of the raw frame per NaN-handling strategy
data_processing = {
    strategy: df.copy()
    for strategy in ('forwardFill', 'constantMean', 'sampleDist')
}

# Fill every column listed in `null`; the dictionary key doubles as the
# method name handed to fillNaN.
for key, temp_df in data_processing.items():
    for null_keys in null.keys():
        temp_df[null_keys] = fillNaN(temp_df[null_keys], key)

We process the images

In [None]:
def _standardise_pixels(pixel_string):
    """Parse a space-separated pixel string and return it as a zero-mean, unit-std list."""
    pixels = np.array([int(token) for token in pixel_string.split(' ')])
    return list((pixels - pixels.mean()) / pixels.std())


# Parse and normalise every image in one pass
df_images = df['Image'].map(_standardise_pixels)

# The raw pixel strings are not regression targets: drop them from every output frame
for key in data_processing:
    data_processing[key].pop('Image')

So far we have the following data:
* A dictionary of dataframes, keyed by the NaN handling method : data_processing
* The corresponding images (a pandas Series), which have been normalised : df_images

# 3. Set up train-validation sets

We define two classes:
* processTrainValidation : provides some data preprocessing and creates the training/validation sets
* sampler : a utility class used for sampling mini-batches stochastically

In [None]:
class processTrainValidation:
    """
    Holds the images and keypoints as numpy arrays and cuts them into k
    contiguous folds for cross-validation.

    Attributes set by the constructor:
        k : number of folds
        N : number of samples
        X : array built from image_data (one row per image)
        Y : array of the keypoint_data values
    """

    def __init__(self, k, image_data, keypoint_data):
        self.k = k
        self.N = len(image_data)

        # Kept on the instance only until _process has converted them
        self.image_data = image_data
        self.keypoint_data = keypoint_data
        self._process()

    def splitter(self, i):
        """Row offset at which fold i starts, i.e. int((i / k) * N)."""
        return int((i / self.k) * self.N)

    def _process(self):
        """Convert the stored pandas objects to numpy arrays, then release them."""
        self.X = np.array([np.array(row) for row in self.image_data.values])

        every_column = self.keypoint_data.columns
        self.Y = self.keypoint_data[every_column].values

        del self.image_data, self.keypoint_data
        gc.collect()

    def create_set(self, I):
        """
        Split the data using fold I as the validation set.

        Returns X_train, y_train, X_val, y_val, where the training arrays are
        all rows before and after the held-out fold, concatenated in order.
        """
        start, stop = self.splitter(I), self.splitter(I + 1)

        X_val = self.X[start:stop]
        y_val = self.Y[start:stop]

        X_train = np.concatenate((self.X[:start], self.X[stop:]))
        y_train = np.concatenate((self.Y[:start], self.Y[stop:]))

        return X_train, y_train, X_val, y_val
    
class sampler:
    """
    Draws mini-batches of indices without replacement until the pool is used up.

    While more than sample_size indices remain, sample() returns a random
    batch and removes it from the pool. Once sample_size or fewer remain,
    sample() returns the remaining pool unchanged and getDontStop() becomes False.
    """

    def __init__(self, indices, sample_size):
        self.indices, self.sample_size = indices, sample_size
        self.dontStop = True

    def sample(self):
        """Return the next batch of indices (the final batch is whatever is left)."""
        if len(self.indices) > self.sample_size:
            self.dontStop = True
            drawn = np.random.choice(self.indices, size=self.sample_size, replace=False)
            self.indices = list(set(self.indices) - set(drawn))
            return drawn

        # Pool is down to the last batch: hand it over and signal the end
        self.dontStop = False
        return self.indices

    def getDontStop(self):
        """True until sample() has returned the final batch."""
        return self.dontStop

# 4. Define the models

We now define two models:
* A model built entirely from linear layers (called lin_mod), which is our benchmark model.
* A model which includes a CNN (called cnn_mod).

In [None]:
class c_unit(nn.Module):
    """Convolution block: Conv2d -> BatchNorm2d -> ReLU, optionally followed by 2x2 max-pooling."""

    def __init__(self, in_channels, out_channels, kernel_size):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size)
        self.conv1_bn = nn.BatchNorm2d(out_channels)

    def forward(self, x, with_max_pool = True):
        """Apply the block to x; skip the pooling step when with_max_pool is False."""
        activated = F.relu(self.conv1_bn(self.conv1(x)))
        if not with_max_pool:
            return activated
        return F.max_pool2d(activated, kernel_size=2)
    
class r_unit(nn.Module):
    """Fully connected block: Linear -> (optional BatchNorm1d) -> ReLU."""

    def __init__(self, in_dim, out_dim):
        super().__init__()
        self.fc1 = nn.Linear(in_dim, out_dim)
        # Always created, but only applied when forward() is asked to
        self.bm = nn.BatchNorm1d(out_dim)

    def forward(self, x, with_batch_norm = False):
        """Apply the block to x; batch normalisation runs only if with_batch_norm is True."""
        projected = self.fc1(x)
        return F.relu(self.bm(projected) if with_batch_norm else projected)
    
class lin_mod(nn.Module):
    """Benchmark model: one linear block mapping the 9216 flattened pixels to 30 outputs."""

    def __init__(self):
        super().__init__()
        self.r_unit_1 = r_unit(9216, 30)
        # Defined on the module but not applied in forward()
        self.dp1 = nn.Dropout(p=0.4)

    def forward(self, x, verbose=False):
        """Flatten x to rows of 9216 values and return the ReLU-activated prediction."""
        flattened = x.view(-1, 9216)
        return F.relu(self.r_unit_1(flattened))

class cnn_mod(nn.Module):
    """
    CNN model: five convolution blocks (kernel size 2, each followed by dropout),
    then three fully connected blocks reducing 512*2*2 features to 30 outputs.
    """

    def __init__(self):
        super().__init__()

        # c_unit_1 .. c_unit_5, chaining the channel counts 1 -> 12 -> ... -> 512
        channels = (1, 12, 64, 128, 256, 512)
        for position, (n_in, n_out) in enumerate(zip(channels, channels[1:]), start=1):
            setattr(self, f"c_unit_{position}", c_unit(n_in, n_out, 2))

        self.r_unit_1 = r_unit(512*2*2, 1024)
        self.r_unit_2 = r_unit(1024,256)
        self.r_unit_3 = r_unit(256,30)
        self.dp1 = nn.Dropout(p=0.4)

    def forward(self, x, verbose=False):
        """Run x through the convolution stack and the fully connected head."""
        conv_stack = (self.c_unit_1, self.c_unit_2, self.c_unit_3, self.c_unit_4, self.c_unit_5)
        for conv_block in conv_stack:
            x = self.dp1(conv_block(x))

        x = x.view(-1, 512*2*2)

        # Fully connected head: the first two blocks use batch norm and dropout
        x = self.dp1(self.r_unit_1(x, True))
        x = self.dp1(self.r_unit_2(x, True))

        return F.relu(self.r_unit_3(x))

Define a training function

In [None]:
def train(data_sampler, number_of_epochs, X_train, X_val, y_train, y_val, model, optimiser,\
         criterion,metric, validate_every, l,v,m,_):
    """
    Run one epoch of mini-batch training, optionally followed by validation.

    Input:
        data_sampler : object exposing getDontStop(), sample() and sample_size;
                       batches are drawn until getDontStop() is False.
        number_of_epochs : unused here, kept for interface compatibility.
        X_train, y_train : numpy arrays of training images / targets.
        X_val, y_val : numpy arrays of validation images / targets.
        model, optimiser, criterion, metric : the usual PyTorch objects.
        validate_every : validation runs when the epoch number is a multiple of this.
        l, v, m : lists that training loss, validation loss and validation
                  metric values are appended to.
        _ : the current epoch number.
    Output:
        The (l, v, m) lists, mutated in place.

    Tensors are placed on whatever device the model's parameters live on, so the
    function works on both GPU and CPU.
    """
    first_param = next(model.parameters(), None)
    dev = first_param.device if first_param is not None else torch.device("cpu")

    N = len(X_train)
    i = 0
    Pyloss = None  # stays None if the sampler yields no batch at all
    while data_sampler.getDontStop():
        sample = data_sampler.sample()
        XX = torch.tensor(X_train[sample]).view(-1,1,96,96).to(dev).float()
        YY = torch.tensor(y_train[sample]).to(dev).float()

        loss = criterion(model(XX),YY)
        optimiser.zero_grad()
        loss.backward()
        optimiser.step()

        Pyloss = loss.item()
        l.append(Pyloss)
        del loss
        del XX
        del YY
        gc.collect()

        print('\r', 'Epoch', _, 'Iteration',i,'of',int(N/data_sampler.sample_size), 'current_loss',Pyloss,end='')
        i+=1

    if _%validate_every==0:
        # Evaluate with dropout disabled and BatchNorm using its running statistics,
        # and without building an autograd graph for the whole validation set.
        was_training = model.training
        model.eval()
        with torch.no_grad():
            XX = torch.tensor(X_val).view(-1,1,96,96).to(dev).float()
            YY = torch.tensor(y_val).to(dev).float()
            prediction = model(XX)
            PyVal_loss = criterion(prediction,YY).item()
            PyMae = metric(prediction,YY).item()
        model.train(was_training)

        v.append(PyVal_loss)
        m.append(PyMae)
        del prediction
        del XX
        del YY
        print(' ')
        print('loss : ', Pyloss)
        print('validation loss : ', PyVal_loss)
        print('MAE : ', PyMae)
        print(' ')

        # Decay the learning rate only when the fresh validation loss is worse than
        # the previous one. The original test (< 0) cut the rate every time the
        # model improved.
        if (len(v) >2 ) and v[-1] - v[-2] > 0:
            optimiser.param_groups[0]['lr'] = optimiser.param_groups[0]['lr']/10
            print('Optimiser learning rate reduced to '+str(optimiser.param_groups[0]['lr']))

    return l,v,m

# 5. Train and validate

We now perform a grid search along two axes:
* Preprocessing : forwardFill, constantMean and sampleDist
* Model used : lin_mod and cnn_mod

We do this over 5 validation sets, with 5 epochs each (see `number_of_epochs` below), and average the scores over the validation sets to obtain a result.

In [None]:
def train_and_validate(criterion, metric, process_train_validation,model, number_of_epochs,\
                      sample_size, weight_init):
    """
    Train and validate a model on every fold of a k-fold split.

    Input:
        criterion, metric : loss and metric modules forwarded to train().
        process_train_validation : object exposing k and create_set(I), which returns
                                   X_train, y_train, X_val, y_val for fold I.
        model : the model to train; it is reset to weight_init before every fold.
        number_of_epochs : number of epochs per fold.
        sample_size : mini-batch size handed to the sampler.
        weight_init : state dict holding the initial weights.
    Output:
        Loss, Validation, MeanAbsError : three lists with one entry per fold, each
        entry being the list collected by train() for that fold.
    """
    target_device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    # state_dict() returns references to the module's tensors, so take a private
    # copy up front; every fold then really restarts from the same weights.
    initial_state = {name: tensor.detach().clone() for name, tensor in weight_init.items()}

    Loss, Validation, MeanAbsError = [],[],[]
    for I in range(0,process_train_validation.k):
        print('TRAIN/VAL SET NUMBER : ',I)
        # Use the splitter that was passed in, not a notebook-level global
        X_train, y_train, X_val, y_val = process_train_validation.create_set(I)
        model.load_state_dict(initial_state)
        model.to(target_device)
        optimiser = optim.Adam(model.parameters(), lr=0.01,amsgrad=True)

        l, v, m = [], [], []
        # NOTE: the original called clip_grad_norm_ here, before any backward pass.
        # With no gradients present that call changes nothing, so it was removed.
        # Effective clipping would have to run between loss.backward() and
        # optimiser.step() inside train().

        idxs = np.arange(X_train.shape[0])

        for _ in range(number_of_epochs):
            data_sampler = sampler(idxs, sample_size)
            l,v,m = train(data_sampler, number_of_epochs, X_train, X_val, y_train, y_val, model, optimiser,\
             criterion,metric,1,l,v,m,_)

        Loss.append(l)
        Validation.append(v)
        MeanAbsError.append(m)

    return Loss, Validation, MeanAbsError

In [None]:
# Shared configuration for the grid search
criterion, metric = nn.MSELoss(), nn.L1Loss()
sample_size, number_of_epochs = 300, 5

# results[model name][NaN-handling method]; every slot starts at the placeholder 0
results = {
    model_name: {fill_method: 0 for fill_method in ('forwardFill', 'constantMean', 'sampleDist')}
    for model_name in ('lin_mod', 'cnn_mod')
}


In [None]:
#linear model

# Cross-validate the linear benchmark once per NaN-handling method
for key in data_processing.keys():
    print('========= process : '+key+' =========')
    pTV = processTrainValidation(5,df_images,data_processing[key])
    model = lin_mod().train()
    # state_dict() returns references to the live parameter tensors; clone them so
    # the "initial weights" handed to train_and_validate cannot follow the training.
    weight_inits = {name: tensor.detach().clone() for name, tensor in model.state_dict().items()}

    Loss, Validation, Mae = train_and_validate(criterion, metric, pTV, model, number_of_epochs, sample_size,\
                                               weight_inits)
    # One list of validation losses per fold
    results['lin_mod'][key] = Validation

In [None]:
#cnn model

# Cross-validate the CNN once per NaN-handling method
for key in data_processing.keys():
    print('========= process : '+key+' =========')
    pTV = processTrainValidation(5,df_images,data_processing[key])
    model = cnn_mod().train()
    # state_dict() returns references to the live parameter tensors; clone them so
    # the "initial weights" handed to train_and_validate cannot follow the training.
    weight_inits = {name: tensor.detach().clone() for name, tensor in model.state_dict().items()}

    Loss, Validation, Mae = train_and_validate(criterion, metric, pTV, model, number_of_epochs, sample_size,\
                                               weight_inits)
    # One list of validation losses per fold
    results['cnn_mod'][key] = Validation

In [None]:
def save_obj(obj, name ):
    """Pickle obj into the file name + '.pkl' using the highest available protocol."""
    target = name + '.pkl'
    with open(target, 'wb') as handle:
        pickle.dump(obj, handle, protocol=pickle.HIGHEST_PROTOCOL)

def load_obj(name ):
    """Return the object unpickled from the file name + '.pkl'."""
    # Unpickling can execute arbitrary code: only load files you created yourself.
    source = name + '.pkl'
    with open(source, 'rb') as handle:
        restored = pickle.load(handle)
    return restored

In [None]:
import pickle
# Persist the grid-search results dictionary to validation_results.pkl
save_obj(results,'validation_results')

# 6. Pick a model and train on full data

The best performing processing/model combination is constantMean with cnn_mod, so we now train on the full dataset.

In [None]:
def train(data_sampler, number_of_epochs, X_train, y_train, model, optimiser, criterion,metric, l, m, _):
    """
    Run one epoch of mini-batch training on the full data set (no validation).

    Input:
        data_sampler : object exposing getDontStop(), sample() and sample_size;
                       batches are drawn until getDontStop() is False.
        number_of_epochs : unused here, kept for interface compatibility.
        X_train, y_train : numpy arrays of training images / targets.
        model, optimiser, criterion, metric : the usual PyTorch objects.
        l, m : lists that the per-batch loss and metric values are appended to.
        _ : the current epoch number (only used in the progress print).
    Output:
        The (l, m) lists, mutated in place.

    Tensors are placed on whatever device the model's parameters live on, so the
    function works on both GPU and CPU.
    """
    first_param = next(model.parameters(), None)
    dev = first_param.device if first_param is not None else torch.device("cpu")

    N = len(X_train)
    i = 0
    while data_sampler.getDontStop():
        sample = data_sampler.sample()
        XX = torch.tensor(X_train[sample]).view(-1,1,96,96).to(dev).float()
        YY = torch.tensor(y_train[sample]).to(dev).float()

        predict = model(XX)

        loss = criterion(predict,YY)
        optimiser.zero_grad()
        loss.backward()
        optimiser.step()

        Pyloss = loss.item()
        l.append(Pyloss)

        # The metric is only reported, never back-propagated: keep it out of autograd.
        # It is measured on the predictions made before this optimiser step.
        with torch.no_grad():
            PyMae = metric(predict.detach(), YY).item()
        m.append(PyMae)

        del loss
        del predict
        del XX
        del YY
        gc.collect()

        print('\r', 'Epoch', _, 'Iteration',i,'of',int(N/data_sampler.sample_size), 'current_loss',Pyloss,end='')
        i+=1
    return l,m

In [None]:
# Full-data training set: constantMean-filled keypoints and the normalised images
y_train = data_processing['constantMean'].values
X_train = df_images.values
X_train = np.array([np.array(x) for x in X_train])

criterion = nn.MSELoss()
metric = nn.L1Loss()

model = cnn_mod().train()

number_of_epochs = 50
sample_size = 300
# `device` is chosen in the imports cell (GPU when available, otherwise CPU)
model.to(device)
optimiser = optim.Adam(model.parameters(), lr=0.01,amsgrad=True)

l, m = [], []
# NOTE: the original called clip_grad_norm_ here, before any backward pass. With no
# gradients present that call changes nothing, so it was removed. Effective clipping
# would have to run between loss.backward() and optimiser.step() inside train().

idxs = np.arange(X_train.shape[0])

# MAE of the last mini-batch of every epoch; drives the learning-rate decay below
epoch_level_mae = []

for _ in range(number_of_epochs):
    data_sampler = sampler(idxs, sample_size)
    l,m = train(data_sampler, number_of_epochs, X_train, y_train, model, optimiser,criterion, metric, l,m,_)
    epoch_level_mae.append(m[-1])
    print('')
    print('Epoch',_)
    print('current loss : ',l[-1])
    print('current mae : ',m[-1])
    # Decay the learning rate only when the MAE got worse than in the previous
    # epoch. The original test (< 0) cut the rate every time the model improved.
    if (len(epoch_level_mae) >2 ) and epoch_level_mae[-1] - epoch_level_mae[-2] > 0:
        optimiser.param_groups[0]['lr'] = optimiser.param_groups[0]['lr']/10
        print('Optimiser learning rate reduced to '+str(optimiser.param_groups[0]['lr']))

In [None]:
# Save the trained model's state dict to cnn_mod.pth
torch.save(model.state_dict(), 'cnn_mod.pth')