In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print file paths.
# NOTE: the `break` below leaves the inner loop after the first file, so only
# one file per directory is printed (not every file, as the comment above says).
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
        break

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Import libraries and check versions

In [None]:
from fastai.vision.all import *
import matplotlib.pyplot as plt

In [None]:
# Report GPU availability, torch's CUDA version and the torch version, one per line.
print(torch.cuda.is_available(), torch.version.cuda, torch.__version__, sep='\n')

## Explore the input data

In [None]:
# Load the lookup table (indexed by RowId) that names the image and feature
# requested for each submission row.
id_lookup_table_csv = pd.read_csv(
    '../input/facial-keypoints-detection/IdLookupTable.csv',
    index_col='RowId',
)
# Print the first rows, then the summary statistics.
for overview in (id_lookup_table_csv.head(), id_lookup_table_csv.describe()):
    print(overview)

In [None]:
# Read the training table from the zipped CSV, then separate the raw pixel
# strings (the 'Image' column) from the keypoint coordinate columns.
train_csv = pd.read_csv('../input/facial-keypoints-detection/training.zip')
train_csv_images = train_csv.loc[:, 'Image']
train_csv = train_csv.drop(columns=['Image'])
train_csv.head()

The data frame info() function tells us that we have lots of NaNs in our data.
Pandas provides some functions to replace NaNs with other plausible values (e.g. median),
but that's not a really satisfying solution.
We can however choose to ignore coordinates with NaNs during loss value computation.

In [None]:
# Column overview of the keypoint table; used here to spot columns with missing (NaN) values.
train_csv.info()

Let's convert the training image pixel values to actual image data:

In [None]:
# Each 'Image' entry is a space-separated string of pixel values; parse it into a 96x96 array.
train_images = [np.fromstring(pixels, sep=' ').reshape([96,96]) for pixels in train_csv_images]
# Each row of keypoint columns becomes a (15, 2) array of (x, y) pairs.
train_points = [coords.reshape([15,2]) for coords in train_csv.to_numpy()]

Have a look at a sample training image and its feature points:

In [None]:
# Show the first training sample: raw coordinates, then the image with its keypoints drawn as green crosses.
first_img, first_pnts = train_images[0], train_points[0]
print(first_pnts)
plt.imshow(first_img, cmap='gray')
plt.plot(first_pnts[:,0], first_pnts[:,1], 'gx')


Just like the training data, we convert the test data for easier access and display. 

In [None]:
# Same treatment as the training table: keep the pixel strings separately and
# drop them from the frame that holds the remaining columns.
test_csv = pd.read_csv('../input/facial-keypoints-detection/test.zip')
test_csv_images = test_csv.loc[:, 'Image']
test_csv = test_csv.drop(columns=['Image'])
print(test_csv.head())

In [None]:
# Parse every space-separated pixel string of the test set into a 96x96 array.
test_images = [np.fromstring(pixels, sep=' ').reshape([96,96]) for pixels in test_csv_images]

In [None]:
# Display the first test image in grayscale (test data has no keypoints to overlay).
plt.imshow(test_images[0], cmap='gray')

For convenience, we define a display function that can plot training and test data, and additionally predicted feature points (if given):

In [None]:
def show_image_and_points(img, true_pnts=None, pred_pnts=None):
    """Draw a grayscale image and overlay whichever point sets are given.

    Args:
        img: 2D image array passed to `plt.imshow`.
        true_pnts: optional array indexed as [:, 0] (x) and [:, 1] (y); drawn as green crosses.
        pred_pnts: optional array with the same layout; drawn as red plus signs.
    """
    plt.imshow(img, cmap='gray')
    # Ground truth is drawn first, predictions second.
    for pnts, marker in ((true_pnts, 'gx'), (pred_pnts, 'r+')):
        if pnts is None:
            continue
        plt.plot(pnts[:,0], pnts[:,1], marker)
def show_test(i, learner=None):
    """Predict the keypoints of test image `i` and plot them on the image.

    Args:
        i: index into `test_images`.
        learner: object providing `predict(image)` that returns a 3-tuple whose
            first element is the predicted points. When omitted, the
            notebook-level `learner` variable is used, so existing
            `show_test(i)` calls keep working.

    Raises:
        NameError: if no learner is passed and no global `learner` exists yet.
    """
    if learner is None:
        # The parameter shadows the global name, so look it up explicitly.
        try:
            learner = globals()['learner']
        except KeyError:
            raise NameError("name 'learner' is not defined") from None
    pred,_,_ = learner.predict(test_images[i])
    show_image_and_points(test_images[i], pred_pnts=pred)
def show_train(i, learner=None):
    """Plot training image `i` with its ground-truth points.

    If a `learner` is supplied, its prediction for the same image is drawn as well.
    """
    image = train_images[i]
    predicted = None
    if learner is not None:
        # predict() yields three values; only the first (decoded points) is plotted.
        predicted, _, _ = learner.predict(image)
    show_image_and_points(image, true_pnts=train_points[i], pred_pnts=predicted)

Let's check a sample image:

In [None]:
# No learner is passed, so only the ground-truth points are drawn.
show_train(2010)

## Augment the training data
In addition to FastAI's on-the-fly augmentation, we deliberately augment our training set with shift, rotation and scaling augmentations. The intention here is to add augmentation in a non-probabilistic way that pushes our model to make good predictions for the original image _and_ a shifted/rotated/scaled version of the same image during each epoch.

First, we define our augmentation function that takes a training image and training points and returns
a tuple with the transformed image and training points.  
After the transformation, point coordinates may be outside the valid range of [0,96], so we make sure we
set these points' coordinates to NaN.

In [None]:
def augment(img, pnts, rot_deg, zoom_factor, x_shift_pix, y_shift_pix):
    """Apply one fixed rotation/zoom/shift to an image and its keypoints.

    The image and the points go through the same `AffineCoordTfm` pipeline, so
    the transformed points are meant to stay aligned with the transformed image.

    Args:
        img: image data that can be viewed as [1,1,96,96].
        pnts: keypoints that can be viewed as [1,15,2], in pixel coordinates.
        rot_deg: rotation angle in degrees.
        zoom_factor: divisor applied to the rotation terms of the affine matrix
            (1.0 leaves them unscaled).
        x_shift_pix: x translation in pixels (divided by 48 to get normalized units).
        y_shift_pix: y translation in pixels (same convention).

    Returns:
        Tuple `(image, points)` of NumPy arrays viewed as [96,96] and [15,2].
        Points that fall outside the open range (-1, 1) in normalized
        coordinates after the transform are returned as NaN.
    """
    sz = img.shape[-2:]
    def get_rotation(x):
        # Build one affine matrix per batch element; only the batch size of `x` is used.
        mysz = x.new_ones(x.shape[0])
        rot_rad = torch.ones_like(mysz)*(rot_deg / 180.0 * np.pi)
        m11 = rot_rad.cos() / zoom_factor
        m12 = rot_rad.sin() / zoom_factor
        # 48 pixels is half of the 96-pixel image side, i.e. one unit in [-1, 1] coordinates.
        t0 = torch.ones_like(mysz)*(x_shift_pix/48.0)
        t1 = torch.ones_like(mysz)*(y_shift_pix/48.0)
        return affine_mat(m11, m12, t0, -m12, m11, t1)
    # NOTE: this `t1` is the transform object, unrelated to the local `t1` inside get_rotation.
    t1 = AffineCoordTfm(aff_fs=get_rotation, size=sz)
    p1 = Pipeline(funcs=t1)
    x = TensorImage(img).view([1,1,96,96])
    y = TensorPoint(pnts, img_size=[96,96]).view([1,15,2])
    # Map pixel coordinates [0, 96] to [-1, 1] before running the pipeline.
    x,y = p1((x,y/48.0-1.0))
    y = y.view([15,2])
    # A point is kept only if BOTH coordinates lie strictly inside (-1, 1);
    # comparisons with NaN are False, so points that were already NaN stay NaN.
    coord_ok = (y[:,0] > -1.0) & (y[:,0] < 1.0) & (y[:,1] > -1.0) & (y[:,1] < 1.0)
    coord_ok = torch.stack([coord_ok, coord_ok], dim=1)
    y = y.where(coord_ok, tensor(np.nan))
    # Back to pixel coordinates.
    y = y*48.0+48.0
    return np.array(x.view([96,96])), np.array(y)

Let's check visually that our function works. Note that the augmented points have been correctly transformed so that they align with the actual facial features of the transformed image.

In [None]:
# Try one transform on the first sample: 10 degree rotation, zoom factor 1.0,
# x shift of 16 pixels and y shift of -16 pixels.
aug_img, aug_pnts = augment(train_images[0], train_points[0], 10.0, 1.0, 16.0, -16.0)
show_image_and_points(aug_img, true_pnts=aug_pnts)

We define augmentation with x-shift from -10...+10 pixels, y-shift from -10...+10 pixels, rotations from -10...+10 degrees, and scale factor from 90% to 110%.

In [None]:
one_pixel = 2.0/96.0
# Every entry is [rot_deg, zoom_factor, x_shift_pix, y_shift_pix], matching the
# trailing parameters of augment(). The identity transform is left out of each group.

# Pure shifts: every (dx, dy) combination in -10..+10 pixels.
augs = [
    [0.0, 1.0, dx-10.0, dy-10.0]
    for dx in range(21)
    for dy in range(21)
    if not (dx==10 and dy==10)
]
# Pure rotations: -10..+10 degrees.
augs.extend([rot_deg-10, 1.0, 0.0, 0.0] for rot_deg in range(21) if rot_deg!=10)
# Pure scalings: zoom factors 0.90..1.10 in steps of 0.01.
augs.extend([0.0, 0.9 + 0.01*scale, 0.0, 0.0] for scale in range(21) if scale!=10)
print(len(augs))

So we have a total of 480 augmentation transformations.  
Now create the augmented training data set, interleaving original training data with augmented training data.

In [None]:
# Build the interleaved data set: each original sample is immediately followed
# by one augmented version. The transforms are taken from `augs` in order,
# wrapping around when the list is exhausted.
aug_images, aug_points = [], []
aug_ind = 0
for orig_img, orig_pnt in zip(train_images, train_points):
    img, pnt = augment(orig_img, orig_pnt, *augs[aug_ind])
    aug_images.extend((orig_img, img))
    aug_points.extend((orig_pnt, pnt))
    aug_ind = (aug_ind + 1) % len(augs)
print(len(aug_images))

This gives us 14098 training images in total: each original image is followed by one augmented version (+ their feature points).  
Finally, replace the training data set with the augmented data set.

In [None]:
# From here on `train_images` / `train_points` refer to the interleaved lists.
# NOTE: this rebinding is not idempotent across cells - re-running the
# augmentation cell after this one would augment the already-augmented data.
# Use "Restart & Run All" to get back to a clean state.
train_images = aug_images
train_points = aug_points

### Visually check some training images and their augmented versions:
In particular, we verify that the feature points of our augmented images align with the actual augmented image data.

In [None]:
# After interleaving, even indices hold the original samples; index 0 is the first original.
print("Original")
show_train(0)

In [None]:
# Index 1 is the augmented version of sample 0, created with the first entry
# of `augs` ([0.0, 1.0, -10.0, -10.0], i.e. a pure shift).
print("Augmented (shifted right+down)")
show_train(1)

## Define Loss function for use with NANs
The original input data as well as our augmented training points may contain NaNs.
In order to enable training with NaN coordinates, let's define a loss function that computes a meaningful loss value even if some points cannot be used.  
The definition below might look a little complicated at first glance, but it is actually a copy
of the implementation of BaseLoss in Lib\site-packages\fastai\losses.py, with only a small modification in the `__call__()` method.
This modification simply sets the loss value to zero for those coordinates that are NaN.

In [None]:
class MyBaseLoss():
    """Wrap `loss_cls`, flattening input and target and ignoring NaN targets.

    Adapted from fastai's `BaseLoss`; the difference is in `__call__`, where NaN
    entries of the target are replaced by the corresponding prediction so that
    they contribute zero error to the wrapped loss.
    """
    activation=decodes=noops
    def __init__(self, loss_cls, *args, axis=-1, flatten=True, floatify=False, is_2d=True, **kwargs):
        # Remember the flattening options and instantiate the wrapped loss with the remaining arguments.
        store_attr("axis,flatten,floatify,is_2d")
        self.func = loss_cls(*args,**kwargs)
        functools.update_wrapper(self, self.func)

    def __repr__(self): return f"MyFlattenedLoss of {self.func}"
    # `reduction` is forwarded to the wrapped loss object in both directions.
    @property
    def reduction(self): return self.func.reduction
    @reduction.setter
    def reduction(self, v): self.func.reduction = v

    def _contiguous(self,x):
        # Move `self.axis` to the last position and make the tensor contiguous; non-tensors pass through.
        return TensorBase(x.transpose(self.axis,-1).contiguous()) if isinstance(x,torch.Tensor) else x

    def __call__(self, inp, targ, **kwargs):
        inp,targ  = map(self._contiguous, (inp,targ))
        if self.floatify and targ.dtype!=torch.float16: targ = targ.float()
        if targ.dtype in [torch.int8, torch.int16, torch.int32]: targ = targ.long()
        if self.flatten: inp = inp.view(-1,inp.shape[-1]) if self.is_2d else inp.view(-1)
        # Give the target the same shape as the (possibly flattened) input so both can be compared element-wise.
        tmptarg2 = targ.view(inp.shape)
        # Where the target is NaN, substitute the prediction itself: the error at that position becomes zero.
        # NOTE(review): the wrapped loss still receives these positions, so with a 'mean'
        # reduction they presumably still count in the average - confirm this is intended.
        tmptarg3 = torch.where(torch.isnan(tmptarg2), inp, tmptarg2)
        tmptarg4 = tmptarg3.view(-1) if self.flatten else tmptarg3
        return self.func.__call__(inp, tmptarg4, **kwargs)

Similarly to BaseLoss, we define our version of MSELossFlat based on the original implementation, returning our customized loss from above: 

In [None]:
@use_kwargs_dict(reduction='mean')
def MyMSELossFlat(*args, axis=-1, floatify=True, **kwargs):
    "NaN-tolerant flattened MSE loss: `nn.MSELoss` wrapped in `MyBaseLoss` with `is_2d=False`."
    wrapper_options = dict(axis=axis, floatify=floatify, is_2d=False)
    return MyBaseLoss(nn.MSELoss, *args, **wrapper_options, **kwargs)


## Training/Validation data split
Since an augmented image was inserted directly after each original training image, we split our data at a predefined index (without randomization). This keeps an original image and its augmented copy on the same side of the split (provided the cut index is even), so that augmented versions of training images do not accidentally end up in the validation set.

In [None]:
def MySplitter(valid_pcts=0.2):
    """Create a deterministic splitter: leading items train, trailing items validate.

    The returned function takes the items and returns `(train, valid)` as two
    `L` lists, where the last `valid_pcts` fraction (after truncating the cut
    index to an int) forms the validation part. No shuffling takes place.
    """
    def _inner(item_range):
        n_train = int(len(item_range) * (1.0-valid_pcts))
        items = list(item_range)
        return L(items[:n_train]), L(items[n_train:])
    return _inner


## Defining our DataLoaders
We use the DataBlock API to define images (ImageBlock) as input and 2D points (PointBlock) as output.

Also, we enable FastAI's on-the-fly augmentation transformations for increased robustness.
Note that it is necessary to pass "do_flip=False", otherwise the model will get confused because
the image and its associated training points will get horizontally flipped, but the model will still
interpret e.g. the "left" eye on the left side of the flipped image,
which would cause a very large loss and thus disturb the model's weights.

In [None]:
def get_x(ind):
    "Return the image stored at position `ind` of `train_images`."
    return train_images[ind]
def get_y(ind):
    "Return the keypoints stored at position `ind` of `train_points`."
    return train_points[ind]
# The source handed to the DataBlock (a range of indices) is used as the item list unchanged.
def get_items(i): return i
# Inputs are images, targets are 2D points; items are plain indices that
# get_x / get_y resolve against the in-memory training lists.
db = DataBlock(
    blocks=[ImageBlock, PointBlock],
    get_items=get_items,
    get_x=get_x,
    get_y=get_y,
    item_tfms=Resize([96,96]),
    splitter=MySplitter(0.2),
    batch_tfms=aug_transforms(
        size=[96,96],
        mult=1.0,
        max_rotate=8.0,
        flip_vert=False,
        do_flip=False,  # flipping would move left/right features without relabelling them
        pad_mode='border', # 'border' or 'reflection'
        max_zoom=1.0,
        min_zoom=0.9,
        max_lighting=0.1,
        max_warp=0.0,
    ),
)

We can use the summary() function to check whether the DataBlock definition works:

In [None]:
# The source is the range of sample indices, matching what get_items / get_x / get_y expect.
db.summary(range(len(train_images)))

Finally create the DataLoaders from the DataBlock, using a batch size of 64.  
Also, we override the standard loss with our custom loss function.

In [None]:
# Build the DataLoaders over all sample indices with a batch size of 64.
dls = db.dataloaders(range(len(train_images)), bs=64)
# Attach the NaN-tolerant loss to the training dataset. No loss is passed when the
# learner is created later, so this attribute is how the custom loss reaches it.
dls.train_ds.loss_func = MyMSELossFlat()

In [None]:
# Visual sanity check of one batch (images with their points), rendered in grayscale.
dls.show_batch(cmap='gray', unique=False)

## Define Learner
Create a convolutional neural network learner from the DataLoaders.
We use transfer learning based on resnet18.  
Using deeper resnets does not improve accuracy.
Also, accuracy gets worse when the outputs are limited with y_range=(-1,1).


In [None]:
# No loss_func is passed here; the custom loss set on dls.train_ds earlier is expected to be picked up.
learner = cnn_learner(dls, resnet18)

We can have a look at the model architecture using the summary() function, which also tells us the activations' shape at several points in the model.

In [None]:
# Print the layer-by-layer summary; the output also names the loss function in use.
learner.summary()

In the above output, we note that our custom loss function is indeed used.

Another way to look inside our model is to directly print the model, which gives more details about the PyTorch model layers, but does not tell us about FastAI's added functionality:

In [None]:
# Display the underlying PyTorch module tree.
learner.model

## Training loop

Let's use the learning rate finder to find out what learning rate makes sense:

In [None]:
# Run the learning rate finder; its result is used to choose lr_max for the training run below.
learner.lr_find()

We pick 1e-2 as our learning rate and start training for "one cycle":

In [None]:
print('Starting fit one cycle')
# 15 epochs with a maximum learning rate of 1e-2.
learner.fit_one_cycle(15, lr_max=1e-2)

Plot training and validation loss:

In [None]:
# Loss curves recorded during the fit_one_cycle run above.
learner.recorder.plot_loss()

Let's look at some sample predictions on our training data:

In [None]:
# ds_idx=0 selects the training set; shuffle=False keeps the samples in data set order.
learner.show_results(ds_idx=0, shuffle=False, nrows=2, ncols=4)

Now look at some sample predictions on the validation data:

In [None]:
# Without ds_idx, results are shown for the validation set.
learner.show_results(nrows=3, ncols=4, max_n=16)

We try to improve our model using fine tuning: 

In [None]:
print('Starting fine tuning')
# Continue training the already fitted learner for 75 epochs.
# NOTE(review): fine_tune presumably runs an extra frozen phase before the 75 epochs - see the fastai docs.
learner.fine_tune(75)
print('Fine tuning finished')

Again, plot training and validation loss:

In [None]:
# Loss curves recorded during the fine-tuning run above.
learner.recorder.plot_loss()

Validation loss has improved, but reached a plateau.  
Show some validation images with predictions:

In [None]:
# Validation samples with predictions after fine tuning.
learner.show_results(nrows=3, ncols=4, max_n=16)

## Generate predictions for the test data

Depending on the input data, our model might predict coordinates outside of the range (0,96).  
Since a submission is only valid if all its coordinates are in (0,96),
we compute our predictions for all test images and then clamp the coordinates to (0,96):

In [None]:
# One row per test image, 30 values each (15 points x 2 coordinates).
test_predictions = np.zeros([len(test_images), 30])
for k in range(len(test_images)):
    # predict() returns three values; only the first one (`pred`) is used below.
    pred, pred_loss, pred_prob = learner.predict(test_images[k])
    # Push out-of-range coordinates just inside the valid interval:
    # values >= 96 become 95.99, values <= 0 become 0.01.
    pred = torch.where(pred>=96, torch.ones(pred.shape)*95.99, pred)
    pred = torch.where(pred<=0, torch.ones(pred.shape)*0.01, pred)
    # Flatten the points into one 30-element row.
    test_predictions[k] = pred.view([30])

## Prepare submission file


The IdLookupTable.csv file tells us which feature of which image to output, for each row of the submission file.

In [None]:
# Same file as `id_lookup_table_csv` loaded earlier, read again under a shorter name.
lut = pd.read_csv('../input/facial-keypoints-detection/IdLookupTable.csv', index_col='RowId')
lut.head()

In the above output, we note that ImageIDs are starting with 1 (i.e. are 1-based), so we need to subtract 1 later when accessing our predictions array.

Let's read the sample submission file.  
We will prepare the submission by simply replacing the Location entry in each row with our own predictions

In [None]:
# Load the sample submission (indexed by RowId); its 'Location' column is overwritten with predictions later.
sample = pd.read_csv('../input/facial-keypoints-detection/SampleSubmission.csv', index_col='RowId')
# Use the builtin `float`: the `np.float` alias was deprecated in NumPy 1.20 and
# removed in 1.24, where it raises AttributeError. Both mean float64 here.
sample['Location'] = sample['Location'].astype(float)
sample.head()

For easier mapping of column names to prediction array indices, we create a dictionary:

In [None]:
# Map each keypoint column name to its position, which is also its index in a flattened prediction row.
namedict = {name: position for position, name in enumerate(train_csv.columns)}
print(namedict)

Now we can access the predicted facial features with their column names:

In [None]:
# Spot check: look up one feature of the first test image by its column name.
first_prediction = test_predictions[0]
print(test_predictions.shape)
print(first_prediction[:])
eye_y_col = namedict['right_eye_center_y']
print(eye_y_col)
print(first_prediction[eye_y_col])

Now all we have to do is to go through the sample submission file, extract the image id and feature name from each row,
and then insert the predicted coordinate into the sample DataFrame.

In [None]:
# Fill the whole 'Location' column in one step.
# The earlier row-by-row version assigned through `sample.iloc[k1]['Location']`,
# i.e. to a temporary row object (chained assignment); depending on the pandas
# version that write never reaches `sample`. Assigning the column directly is
# reliable and avoids a Python-level loop over every submission row.
n_rows = sample.shape[0]
lut_rows = lut.iloc[:n_rows]  # rows are matched to `sample` by position, as before
image_idx = lut_rows['ImageId'].to_numpy() - 1  # ImageId is 1-based
# A plain dict lookup keeps the KeyError for an unknown feature name.
feature_idx = [namedict[featurename] for featurename in lut_rows['FeatureName']]
sample['Location'] = test_predictions[image_idx, feature_idx]

Check that the sample dataframe now contains our predictions:

In [None]:
# 'Location' should now show predicted coordinates rather than the sample file's original values.
sample.head()

Finally output the sample DataFrame as 'submission.csv' file: 

In [None]:
# The index (RowId) is written as the first column, next to 'Location'.
sample.to_csv('submission.csv')