In [None]:
import numpy as np
import pandas as pd
import os

import cv2
from PIL import Image as pil_image
from tqdm import tqdm

import matplotlib
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

import albumentations as A

In [None]:
# Root folder of the competition data. It ends with '/' because other cells build
# paths by plain string concatenation (e.g. PROJECT_FOLDER + 'train_masks/').
PROJECT_FOLDER = '../input/hotel-id-to-combat-human-trafficking-2022-fgvc9/'

In [None]:
def show_im_with_channels(image):
    """Plot an image next to each of its first three channels.

    A 1x4 matplotlib figure is created: the first panel shows ``image`` as
    is, the remaining three show ``image[:, :, 0]``, ``image[:, :, 1]`` and
    ``image[:, :, 2]`` titled "Red", "Green" and "Blue" respectively.

    Parameters
    ----------
    image : array-like
        Image indexed as ``image[row, column, channel]`` with at least
        three channels.
    """
    fig, panels = plt.subplots(1, 4, figsize=(22,6))

    # Full image first, then one panel per colour channel
    panels[0].imshow(image)
    for panel, (channel_idx, channel_name) in zip(panels[1:], enumerate(("Red", "Green", "Blue"))):
        panel.imshow(image[:, :, channel_idx])
        panel.set_title(channel_name)
        
def open_img_as_array(im_path):
    """Load an image file into a ``uint8`` NumPy array.

    Parameters
    ----------
    im_path : str or path-like
        Location of the image file, passed to ``PIL.Image.open``.

    Returns
    -------
    np.ndarray
        The decoded pixel data cast to ``np.uint8``.
    """
    # The context manager releases the file handle as soon as the pixel data
    # has been copied into the array, instead of relying on garbage collection.
    with pil_image.open(im_path) as im:
        return np.array(im).astype(np.uint8)

def pad_and_resize(img, img_size):
    """Zero-pad an image to a square and resize it to ``img_size`` x ``img_size``.

    The shorter side is padded with zeros on both ends so the content stays
    centred; the square result is then resized with ``cv2.resize``.

    Parameters
    ----------
    img : np.ndarray
        Image laid out as (rows, columns) or (rows, columns, channels).
    img_size : int
        Side length, in pixels, of the returned square image.

    Returns
    -------
    np.ndarray
        The padded and resized image.
    """
    # NumPy reports the shape as (rows, columns[, channels]): height comes first.
    # Slicing [:2] also accepts single-channel (2-D) images.
    height, width = np.shape(img)[:2]

    # Split the missing pixels between both sides; when the difference is odd the
    # extra pixel goes to the far side so the padded image is exactly square.
    diff = abs(height - width)
    near = diff // 2
    far = diff - near

    if height > width:
        # Taller than wide: pad left and right
        img = cv2.copyMakeBorder(img, 0, 0, near, far, cv2.BORDER_CONSTANT, value=0)
    else:
        # Wider than tall (or already square): pad top and bottom
        img = cv2.copyMakeBorder(img, near, far, 0, 0, cv2.BORDER_CONSTANT, value=0)

    return cv2.resize(img, (img_size, img_size))

# Occlusions in test dataset

Unlike last year's competition, the test set images in this competition are all partially masked to emulate the added challenge of having a person blocking part of the view of the room.

In [None]:
# PROJECT_FOLDER already ends with '/', so join the parts instead of
# concatenating a second leading slash (which produced '...//test_images/...').
test_image = open_img_as_array(os.path.join(PROJECT_FOLDER, 'test_images', 'abc.jpg'))
show_im_with_channels(test_image)

# Provided ~~train~~ test masks

**train_masks** - Occlusions like the ones that will be present in the images in the test set.

[Abby Stylianou [Competition Host]](https://www.kaggle.com/competitions/hotel-id-to-combat-human-trafficking-2022-fgvc9/discussion/313547#1732937)
> The mask files are available for your convenience -- if there's a query image called 0001.jpg and a mask called 0001.png, the mask is simply a PNG that includes the exact mask from the query JPG. You can use these for processing of the query images in case it's easier than detecting the mask from the JPG, and also may also use any of the masks for whatever purpose in your training (as some other comments have suggested). You aren't required to use the PNG masks for anything -- like I said, they're just there for convenience.

In [None]:
# Build the path from PROJECT_FOLDER rather than repeating the dataset root
mask_image = open_img_as_array(os.path.join(PROJECT_FOLDER, 'train_masks', '00000.png'))
show_im_with_channels(mask_image)

## Mapping masks to images?
> if there's a query image called 0001.jpg and a mask called 0001.png, the mask is simply a PNG that includes the exact mask from the query JPG

In [None]:
# Collect the mask file names (sorted) and read the training metadata
MASK_FOLDER = PROJECT_FOLDER + 'train_masks/'
mask_listing = pd.DataFrame(data={"mask_id": os.listdir(MASK_FOLDER)})
mask_df = mask_listing.sort_values(by="mask_id")
train_df = pd.read_csv('../input/hotelid-2022-train-images-256x256/train.csv')

# Preview the first five names of each, in sorted order
first_image_names = train_df["image_id"].sort_values().head().values
first_mask_names = mask_df["mask_id"].sort_values().head().values
print("Images:", first_image_names)
print("Masks:", first_mask_names)

In [None]:
# Drop the file extensions so image and mask names can be compared directly
train_id = train_df["image_id"].str.replace('.jpg', '', regex=False)
mask_id = mask_df["mask_id"].str.replace('.png', '', regex=False)

for caption, ids in (("Image ids:", train_id), ("Mask ids:", mask_id)):
    print(caption, ids.sort_values().head().values)

# Count the mask ids that also occur among the training image ids
n_matching_ids = mask_id.isin(train_id).sum()
print("Number of mask ids matching image ids:", n_matching_ids)

~~But **there are no masks matching any images in train dataset**.~~

~~## No matching masks and images. Try to match it and display result
The name format is not the same, so maybe if we prepend the mask name with additional 0s to match the training image names it might work. Let's try to display some examples.
The resolution of mask and image is not always the same so they probably don't belong together.~~

In [None]:
# def load_and_display_pair(image_id, hotel_id, mask_id):
#     train_image = open_img_as_array(f"{PROJECT_FOLDER}train_images/{hotel_id}/{image_id}")
#     mask_image = open_img_as_array(f"{MASK_FOLDER}{mask_id}")

#     fig, ax = plt.subplots(1,2, figsize=(16, 4))
#     ax[0].imshow(train_image)
#     ax[0].set_title(f"{image_id} {np.shape(train_image)}")
#     ax[1].imshow(mask_image)
#     ax[1].set_title(f"{mask_id} {np.shape(mask_image)}")

# load_and_display_pair("000000000.jpg", "95500", "00000.png")
# load_and_display_pair("000001694.jpg", "209817", "01694.png")
# load_and_display_pair("000005235.jpg", "204287", "05235.png")

[Abby Stylianou [Competition Host]](https://www.kaggle.com/competitions/hotel-id-to-combat-human-trafficking-2022-fgvc9/discussion/313547#1734687)
> Oh! I have figured out the source of the confusion. **There was a mixup on the host end -- the "train_masks" folder should be named "test_masks"** (I've asked the kaggle team to update this). There are no training masks provided. This matches the real world setting, where the "test" images (from investigations) have occlusions in the region of the image where the victim is located.
>
> Training images, on the other hand, are not (by default) occluded. Competitors may choose to include occlusions in their training process, but we do not dictate that (or any other approach). If a competitor chose to incorporate masks, they could either generate their own, or repurpose the ones that match the test images (resizing them as necessary).

## Plot occlusion areas and image size
We can calculate what part of test images is covered by provided occlusions so we can simulate it better during training.

In [None]:
IMG_SIZE = 256

# np.shape order is (rows, columns, channels), so the first axis is the image
# height and the second axis is the image width.
x_dim_array = []  # first array axis of every mask (rows)
y_dim_array = []  # second array axis of every mask (columns)
occ_area = []  # fraction (0-1) of pixels whose first channel is non-zero
occ_img = np.zeros((IMG_SIZE, IMG_SIZE, 3))  # running sum of the resized masks

mask_files = os.listdir(MASK_FOLDER)
# Dedicated loop names keep the `mask_id` / `mask_image` variables created by
# earlier cells intact, so those cells' results are not silently overwritten.
for mask_file in tqdm(mask_files):
    mask_arr = open_img_as_array(MASK_FOLDER + mask_file)
    n_rows, n_cols = mask_arr.shape[:2]
    x_dim_array.append(n_rows)
    y_dim_array.append(n_cols)
    occ_area.append((mask_arr[:, :, 0] > 0).mean())

    # Only the first three channels are accumulated (any extra channel is dropped)
    occ_img += pad_and_resize(mask_arr[:, :, :3], IMG_SIZE)

# Mean over all masks; max(..., 1) avoids a division by zero on an empty folder
occ_img /= max(len(mask_files), 1)
occ_img = occ_img.astype(int)


In [None]:
# Histogram (with a marginal box plot) of the occluded fraction per mask
fig = (
    px.histogram(occ_area, nbins=50, marginal="box", height=350)
    .update_layout(title="Distribution of occlusion coverage in test images")
    .update_traces(hovertemplate="Image count: %{y} <br>Occlusion coverage: %{x}")
)
fig.show()

Mean over all masks to show most common position.

In [None]:
# Draw the averaged mask image on an explicitly created figure and axes
mask_fig = plt.figure(figsize=(6,6))
mask_ax = mask_fig.add_subplot(111)
mask_ax.imshow(occ_img)
mask_fig.suptitle('Projection of all masks')

Test images should have the same size as masks so we can plot dimensions of images in test dataset.

In [None]:
fig = go.Figure()
# The arrays were filled from np.shape(mask), whose order is (rows, columns, ...):
# x_dim_array therefore holds heights and y_dim_array holds widths. Name the
# traces accordingly so the plot cannot be misread.
fig.add_trace(go.Box(x=y_dim_array, name="Width", boxpoints="all"))
fig.add_trace(go.Box(x=x_dim_array, name="Height", boxpoints="all"))
fig.update_yaxes(title="Axis")
fig.update_xaxes(title="Pixels")
fig.update_layout(title="Box plots - dimensions of images in test dataset")
fig.show()

# Simulating occlusions using Albumentations CoarseDropout
To simulate the masks we can use CoarseDropout from the [Albumentations](https://github.com/albumentations-team/albumentations) library.

In [None]:
# Load one resized training image and display it with its colour channels
train_image_path = '../input/hotelid-2022-train-images-256x256/images/000000000.jpg'
train_image = open_img_as_array(train_image_path)
show_im_with_channels(train_image)

Use Albumentations' CoarseDropout with fill_value=(255,0,0) to simulate the red rectangular occlusions in the dataset.

In [None]:
IMG_SIZE = 256
# One red rectangle per image, each side between a quarter and a half of the image size
dropout = A.CoarseDropout(p=1., max_holes=1,
                          min_height=IMG_SIZE//4, max_height=IMG_SIZE//2,
                          min_width=IMG_SIZE//4,  max_width=IMG_SIZE//2,
                          fill_value=(255,0,0))

# Store the result under a new name: overwriting `train_image` made this cell
# non-idempotent, because every re-run stacked another hole on the previous output.
occluded_image = dropout(image=train_image)["image"]
show_im_with_channels(occluded_image)

The result looks similar to the test image, but it's not perfect. The occlusions in the test masks are located mainly in the center, while with CoarseDropout they will be (should be) evenly distributed across the image.

In [None]:
# PROJECT_FOLDER already ends with '/', so join the parts instead of
# concatenating a second leading slash (which produced '...//test_images/...').
test_image = open_img_as_array(os.path.join(PROJECT_FOLDER, 'test_images', 'abc.jpg'))
show_im_with_channels(test_image)