In [None]:
import os
import pathlib

import numpy as np
import pandas as pd
from PIL import Image as pil_img
import cv2

from tqdm import tqdm
from joblib import Parallel, delayed

# Globals

In [None]:
# Root of the read-only competition data
data_dir = pathlib.Path("/kaggle/input/hotel-id-to-combat-human-trafficking-2022-fgvc9/")

# Root of everything this notebook writes
working_dir = pathlib.Path("./")

# Input folders inside the data directory (train images, train masks, test images)
train_dir = data_dir / "train_images"
train_mask_dir = data_dir / "train_masks"
test_dir = data_dir / "test_images"

# Output folders inside the working directory
train_out_dir = working_dir / "pad_and_resize/train_images"
test_out_dir = working_dir / "pad_and_resize/test_images"

# Create the output folders up front; re-running the cell is harmless
for _out_dir in (train_out_dir, test_out_dir):
    _out_dir.mkdir(parents=True, exist_ok=True)

In [None]:
# Seed constant (not referenced by the cells visible here)
SEED = 42

# Whether to zero-pad each image to a square before it is resized
PAD = True

# Target size handed to cv2.resize for every processed image
PATCH = (512, 512)

# Load data

In [None]:
# Every sub-folder of the train directory is one class (its name is the hotel id)
chain_names = os.listdir(train_dir)

# CSV caches of the file listings, so re-runs do not have to walk the input folders
train_file = working_dir / "train.csv"
mask_file = working_dir / "mask.csv"
test_file = working_dir / "test.csv"


def _load_or_build_index(cache_file, columns, build_records):
    """Return the index cached at `cache_file`, creating the cache on first use.

    Args:
        cache_file: CSV path used as the cache.
        columns: ordered list of column names for a freshly built frame.
        build_records: zero-argument callable returning a list of row dicts;
            only called when the cache does not exist yet.

    Returns:
        A DataFrame read from the cache, or built from the records and then
        written to the cache.
    """
    if cache_file.exists():
        return pd.read_csv(cache_file)

    # Build the frame in one go: DataFrame.append was removed in pandas 2.0 and
    # grew the frame quadratically; `columns` is a list so the order is fixed.
    frame = pd.DataFrame(build_records(), columns=columns)
    frame.to_csv(cache_file, index=False)
    return frame


# NOTE(review): a freshly built frame holds the ids as strings, whereas a frame
# re-read from the CSV gets whatever dtype read_csv infers — keep this in mind
# when comparing ids across runs.
train_df = _load_or_build_index(
    train_file,
    ["image_id", "hotel_id"],
    lambda: [
        {"image_id": image_id, "hotel_id": hotel_id}
        for hotel_id in chain_names
        for image_id in os.listdir(train_dir / hotel_id)
    ],
)

mask_df = _load_or_build_index(
    mask_file,
    ["image_id"],
    lambda: [{"image_id": image_id} for image_id in os.listdir(train_mask_dir)],
)

test_df = _load_or_build_index(
    test_file,
    ["image_id"],
    lambda: [{"image_id": image_id} for image_id in os.listdir(test_dir)],
)

In [None]:
# Quick sanity check of what was indexed, followed by a preview of the train index
print(f"Number of train images: {len(train_df)}")
print(f"Number of test images: {len(test_df)}")
print(f"Number of different classes: {len(chain_names)}")
train_df.head()

# Preprocessor

In [None]:
def process_img(img_path: pathlib.Path) -> np.ndarray:
    """Load an image, optionally pad it to a square, and resize it to PATCH.

    Args:
        img_path: path of the image file to read.

    Returns:
        The image resized to PATCH by cv2.resize.

    Raises:
        ValueError: if OpenCV cannot read or decode the file.
    """
    img = cv2.imread(str(img_path))

    # cv2.imread reports failure by returning None rather than raising, which
    # would otherwise surface later as an unrelated unpacking/resize error.
    if img is None:
        raise ValueError(f"Could not read image: {img_path}")

    if PAD:
        img = pad(img)

    return cv2.resize(img, PATCH)

def save_img(img_path: pathlib.Path, img: np.ndarray):
    """Write an image to disk with OpenCV.

    Args:
        img_path: destination file; its extension selects the output format.
        img: image array to write.

    Raises:
        OSError: if OpenCV reports that the file was not written.
    """
    # cv2.imwrite returns False on failure instead of raising, so an unchecked
    # call would silently leave images missing from the output folder.
    if not cv2.imwrite(str(img_path), img):
        raise OSError(f"Could not write image: {img_path}")

def pad(img):
    """Zero-pad an image along its shorter side so that it becomes square.

    The image content stays centred; when the size difference is odd, the
    extra pixel goes to the bottom/right so the result is exactly square.

    Args:
        img: array shaped (height, width) or (height, width, channels).

    Returns:
        A new array of the same dtype whose first two dimensions both equal
        max(height, width).
    """
    # NumPy image arrays are indexed (rows, cols, ...), i.e. height comes first
    height, width = np.shape(img)[:2]

    missing = abs(height - width)
    before = missing // 2
    after = missing - before  # takes the odd remainder, if any

    if height > width:
        # Portrait: add columns on the left and right
        pad_widths = [(0, 0), (before, after)]
    else:
        # Landscape (or already square): add rows on the top and bottom
        pad_widths = [(before, after), (0, 0)]

    # Leave any remaining axes (e.g. colour channels) untouched
    pad_widths += [(0, 0)] * (np.ndim(img) - 2)

    return np.pad(img, pad_widths, mode="constant", constant_values=0)

In [None]:
def process_train(data_folder, chain_name, out_dir):
    """Process every image of one chain folder and store the results in out_dir.

    Each file found in data_folder / chain_name is run through process_img and
    written to out_dir under its original file name. The output is flat, so two
    files with the same name in different chain folders end up at the same path.
    """
    source_folder = data_folder / chain_name

    for file_name in os.listdir(source_folder):
        processed = process_img(source_folder / file_name)
        save_img(out_dir / file_name, processed)


# Process the chain folders on 4 worker threads, one task per folder.
# process_train returns nothing, so dfs_proc ends up as a list of None.
dfs_proc = Parallel(n_jobs=4, prefer='threads')(
    delayed(process_train)(train_dir, chain_name, train_out_dir)
    for chain_name in chain_names
)

In [None]:
!cd /kaggle/working/pad_and_resize & zip -jqr images.zip .
!find . -name "*.jpg" -delete