In [None]:
import os
import numpy as np
import pandas as pd
import cv2
import random

from tqdm import tqdm
from joblib import Parallel, delayed

import matplotlib
import matplotlib.pyplot as plt

import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

In [None]:
# When True, open_and_preprocess_image zero-pads every image to a square
# (via pad_image) before resizing it.
PAD = True
# Output size in pixels; passed to cv2.resize as (WIDTH, HEIGHT).
WIDTH = 256
HEIGHT = 256

# Per-hotel sample target. process_chain generates the rounded average of this
# value and the number of training images actually available for the hotel.
TARGET_SAMPLE_COUNT = 20

# When True, images drawn during the first pass over a hotel's image list are
# saved without random cropping; cropping only starts once the list has been
# exhausted and refilled. When False, every training image is randomly cropped.
ONE_UNCROPPED_SAMPLE = False
# Lower bounds, as a fraction of the original side length, for the random crop
# size drawn in crop_randomly (the upper bound is the full side length).
MIN_DIM_MIN_CROP = 0.5
MAX_DIM_MIN_CROP = 0.9

In [None]:
# Root of the competition dataset as mounted inside a Kaggle kernel.
data_folder = "/kaggle/input/hotel-id-to-combat-human-trafficking-2022-fgvc9/"
# train_images/ holds one sub-directory per hotel; the directory names are used
# as hotel ids further down (see the loop that calls process_chain).
train_folder = os.path.join(data_folder, 'train_images')
chain_names = os.listdir(train_folder)

# Debug aid: uncomment to process only the first three hotels.
#chain_names = chain_names[:3]

# Quick sanity check of the dataset layout and the number of hotels found.
print(os.listdir(data_folder))
print(len(chain_names))

In [None]:
def pad_image(img):
    """Zero-pad the shorter side of an image so that the result is square.

    The padding is split as evenly as possible between the two sides of the
    shorter axis; when the difference is odd the extra pixel goes to the
    bottom/right, so the output is always exactly square.

    Args:
        img: Array-like image of shape (rows, cols) or (rows, cols, channels).

    Returns:
        A new ``numpy.ndarray`` with the same dtype and channel layout whose
        first two dimensions are both ``max(rows, cols)``.
    """
    img = np.asarray(img)
    # numpy image layout is (rows, cols[, channels]) i.e. (height, width).
    rows, cols = img.shape[:2]

    diff = abs(rows - cols)
    before = diff // 2
    after = diff - before  # absorbs the odd pixel so the result is square

    if rows > cols:
        # Taller than wide: pad left and right.
        pad_width = [(0, 0), (before, after)]
    else:
        # Wider than tall (or already square): pad top and bottom.
        pad_width = [(before, after), (0, 0)]
    # Never pad trailing axes such as the colour channels.
    pad_width += [(0, 0)] * (img.ndim - 2)

    return np.pad(img, pad_width, mode="constant", constant_values=0)

def get_random_crop(image, crop_height, crop_width):
    """Cut a randomly positioned window out of an image.

    Args:
        image: Array of shape (rows, cols[, channels]).
        crop_height: Height of the window in pixels.
        crop_width: Width of the window in pixels.

    Returns:
        The slice ``image[y:y + crop_height, x:x + crop_width]`` for a
        uniformly drawn top-left corner ``(x, y)``. This is a view of
        ``image``, not a copy.

    Raises:
        ValueError: If the requested window is larger than the image.
    """
    max_x = image.shape[1] - crop_width
    max_y = image.shape[0] - crop_height
    if max_x < 0 or max_y < 0:
        raise ValueError(
            "Crop size {}x{} exceeds image size {}x{}".format(
                crop_height, crop_width, image.shape[0], image.shape[1]))

    # randint's upper bound is exclusive; +1 makes the last valid offset
    # reachable and allows a crop that is exactly as large as the image.
    x = np.random.randint(0, max_x + 1)
    y = np.random.randint(0, max_y + 1)

    return image[y: y + crop_height, x: x + crop_width]

def crop_randomly(img):
    """Return a random crop of ``img`` with a randomly chosen size.

    The crop height and width are each drawn with ``np.random.randint`` from
    ``[fraction * side, side)``. For an image that is taller than wide the
    height uses ``MIN_DIM_MIN_CROP`` and the width ``MAX_DIM_MIN_CROP``;
    otherwise the two fractions are swapped. The crop position is then chosen
    by ``get_random_crop``.

    NOTE(review): the longer side receives the ``MIN_DIM_*`` bound, which reads
    inverted against the constant names -- confirm this is intended.
    """
    full_height, full_width = img.shape[0], img.shape[1]

    if full_height > full_width:
        height_fraction, width_fraction = MIN_DIM_MIN_CROP, MAX_DIM_MIN_CROP
    else:
        height_fraction, width_fraction = MAX_DIM_MIN_CROP, MIN_DIM_MIN_CROP

    # Height is drawn before width so the random stream is consumed in a fixed order.
    crop_height = np.random.randint(full_height * height_fraction, full_height)
    crop_width = np.random.randint(full_width * width_fraction, full_width)

    return get_random_crop(img, crop_height, crop_width)


def open_and_preprocess_image(image_folder, image_name, random_crop=True):
    """Load an image from disk and bring it to the configured output format.

    Steps, in order: read with ``cv2.imread``, optionally take a random crop
    (``crop_randomly``), optionally zero-pad to a square (``pad_image``, only
    when ``PAD`` is set), resize to ``(WIDTH, HEIGHT)`` and transpose.

    Args:
        image_folder: Directory containing the image.
        image_name: File name of the image inside ``image_folder``.
        random_crop: Whether to apply a random crop before padding/resizing.

    Returns:
        The processed image as returned by ``cv2.transpose``.

    Raises:
        IOError: If the file is missing or cannot be decoded.
    """
    image_path = os.path.join(image_folder, image_name)
    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread reports failure by returning None instead of raising;
        # fail here with the offending path rather than deep inside numpy.
        raise IOError("Could not read image: {}".format(image_path))

    if random_crop:
        img = crop_randomly(img)

    if PAD:
        img = pad_image(img)

    img = cv2.resize(img, (WIDTH, HEIGHT))
    # NOTE(review): this swaps the row and column axes of the final image;
    # kept as in the original pipeline -- confirm that it is intentional.
    img = cv2.transpose(img)

    return img


def save_image(path, img):
    """Write ``img`` to ``path`` with ``cv2.imwrite``.

    Args:
        path: Destination file path; the extension selects the image format.
        img: Image array to encode.

    Raises:
        IOError: If OpenCV reports that the image was not written (for
            example because the target directory does not exist).
    """
    # cv2.imwrite returns False on failure instead of raising; surface it so
    # the generated CSV never references an image that is not on disk.
    if not cv2.imwrite(path, img):
        raise IOError("Could not write image: {}".format(path))


def _append_rows(df, rows):
    """Return ``df`` extended by ``rows`` (a list of dicts) using one concat."""
    if not rows:
        return df
    return pd.concat([df, pd.DataFrame(rows)], ignore_index=True)


def process_chain(data_folder, chain_name, train_df, valid_df):
    """Generate the validation and training images for one hotel folder.

    One randomly chosen image is held out, preprocessed without cropping and
    written to ``valid_images/``. From the remaining images (or from the single
    available image when the folder holds only one)
    ``int(TARGET_SAMPLE_COUNT*0.5 + n*0.5 + 0.5)`` samples are written to
    ``train_images/``. Images are drawn without replacement; once all of them
    have been used the pool is refilled and drawing continues.

    Args:
        data_folder: Directory that contains one sub-directory per hotel.
        chain_name: Name of the hotel sub-directory; stored as ``hotel_id``.
        train_df: DataFrame with ``image_id``/``hotel_id`` rows collected so far.
        valid_df: Same as ``train_df`` but for the validation split.

    Returns:
        ``(train_df, valid_df)`` with the rows for this hotel appended. Both
        frames are returned unchanged when the hotel folder is empty.

    Raises:
        ValueError: If a training image's file name does not start with an
            integer stem (the stem is embedded in the generated file name).
    """
    chain_folder = os.path.join(data_folder, chain_name)
    all_image_names = os.listdir(chain_folder)
    if not all_image_names:
        # Nothing to sample from; np.random.randint(0, 0) would raise.
        return train_df, valid_df

    # Hold out one random image for validation, saved without random cropping.
    image_names = list(all_image_names)
    valid_name = image_names.pop(np.random.randint(0, len(image_names)))
    img = open_and_preprocess_image(chain_folder, valid_name, random_crop=False)
    save_image("valid_images/{}.jpg".format(valid_name), img)
    valid_rows = [{'image_id': valid_name, 'hotel_id': chain_name}]

    if not image_names:
        # Single-image hotel: reuse that image for training as well.
        image_names = list(all_image_names)

    pool = image_names.copy()
    reset_images = False
    target_count = int(TARGET_SAMPLE_COUNT*0.5 + len(pool)*0.5 + 0.5)
    train_rows = []
    for nr in range(target_count):
        if not pool:
            # Every image has been used once; start another pass.
            pool = image_names.copy()
            reset_images = True

        image_name = pool.pop(np.random.randint(0, len(pool)))
        image_nr = int(image_name.split(".")[0])

        # "1" + source image number + 7-digit running index within this hotel.
        save_name = "1{}{:07d}".format(image_nr, nr)
        img = open_and_preprocess_image(
            chain_folder, image_name,
            random_crop=(reset_images or not ONE_UNCROPPED_SAMPLE))
        save_image("train_images/{}.jpg".format(save_name), img)
        train_rows.append({'image_id': save_name, 'hotel_id': chain_name})

    # One concat per frame instead of DataFrame.append per row: append was
    # removed in pandas 2.0 and copied the whole frame on every call.
    train_df = _append_rows(train_df, train_rows)
    valid_df = _append_rows(valid_df, valid_rows)
    return train_df, valid_df

In [None]:
# Make sure both output directories exist before any image is written.
for output_dir in ("train_images", "valid_images"):
    os.makedirs(output_dir, exist_ok=True)

In [None]:
%%time
#dfs_proc = Parallel(n_jobs=4, prefer='threads')(delayed(process_chain)(train_folder, chain_names[i]) for i in range(0, len(chain_names)))

# Column names are given as a list: a set has no defined order, so the column
# order of the exported CSVs could differ between runs, and recent pandas
# versions reject a set here altogether.
train_df = pd.DataFrame(columns=['image_id', 'hotel_id'])
valid_df = pd.DataFrame(columns=['image_id', 'hotel_id'])

# Each entry of chain_names is one hotel folder; process_chain writes its
# images to disk and returns both frames with the new rows added.
for hotel_id in tqdm(chain_names):
    train_df, valid_df = process_chain(train_folder, hotel_id, train_df, valid_df)

In [None]:
# Report how many files ended up in each output directory.
for _label, _folder in (("Output count train:", "train_images"),
                        ("Output count valid:", "valid_images")):
    print(_label, len(os.listdir(_folder)))

In [None]:
# Number of generated training images per hotel, largest count first.
group_df = (
    train_df.groupby(["hotel_id"])
    .size()
    .to_frame("image_count")
    .sort_values("image_count")[::-1]
    .reset_index()
)

# The 50 hotels with the most and with the fewest images.
top_df = group_df.iloc[:50]
low_df = group_df.iloc[-50:]

# Layout: one full-width overview on top, two bar charts side by side below.
fig = make_subplots(
    rows=2,
    cols=2,
    specs=[[{"colspan": 2}, None], [{}, {}]],
    horizontal_spacing=0.02,
    vertical_spacing=0.2,
    shared_yaxes=False,
    subplot_titles=("", "Top 50", "Bottom 50"),
)

fig.add_trace(
    go.Scatter(x=group_df["hotel_id"], y=group_df["image_count"], showlegend=False),
    row=1, col=1,
)
fig.add_trace(
    go.Bar(x=top_df["hotel_id"], y=top_df["image_count"], showlegend=False),
    row=2, col=1,
)
fig.add_trace(
    go.Bar(x=low_df["hotel_id"], y=low_df["image_count"], showlegend=False),
    row=2, col=2,
)

# Only the left-hand plots carry a y-axis title.
for _row in (1, 2):
    fig.update_yaxes(title_text="Image count", row=_row, col=1)

# Hotel ids are labels, not numbers; hide them on the crowded overview plot.
fig.update_xaxes(type="category", visible=False, row=1, col=1)
for _col in (1, 2):
    fig.update_xaxes(title_text="Hotel ID", type="category", row=2, col=_col)

fig.update_layout(title="Image count per hotel in training df", height=550)
fig.show()

In [None]:
#!cd /kaggle/working/images/ & zip -jqr images.zip .
#!find . -name "*.jpg" -delete
# Show each split and persist it next to the generated image folders.
for _frame, _csv_name in ((train_df, 'train.csv'), (valid_df, 'valid.csv')):
    print(_frame)
    _frame.to_csv(_csv_name, index=False)

from PIL import Image as pil_image

class HotelTrainDataset:
    """Index-based access to the generated images listed in a DataFrame.

    Args:
        data: DataFrame with an ``image_id`` column; rows are read with ``.iloc``.
        transform: Optional callable invoked as ``transform(image=array)`` that
            returns a mapping whose ``"image"`` entry replaces the loaded array.
        data_path: Prefix prepended to the image file name (include the
            trailing separator).
    """

    def __init__(self, data, transform=None, data_path="train_images/"):
        self.data = data
        self.data_path = data_path
        self.transform = transform

    def __len__(self):
        """Number of rows in the underlying DataFrame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Load, resize and optionally transform the image of row ``idx``.

        Returns:
            A dict with the image under ``"image"`` and a constant ``0`` under
            ``"target"``.
        """
        record = self.data.iloc[idx]
        image_id = record["image_id"]
        if isinstance(image_id, str):
            image_path = "{}{}.jpg".format(self.data_path, image_id)
        else:
            # Numeric ids are zero-padded to seven digits.
            image_path = "{}{:07d}.jpg".format(self.data_path, image_id)

        # PIL's resize takes (width, height). The context manager closes the
        # file handle as soon as the pixels have been read.
        with pil_image.open(image_path) as pil_img:
            image = np.array(pil_img.resize((WIDTH, HEIGHT))).astype(np.uint8)

        if self.transform:
            transformed = self.transform(image=image)
            image = transformed["image"]

        return {
            "image" : image,
            "target" : 0,
        }

def show_images(ds, title_text, n_images=5):
    """Plot the first images of a dataset side by side in one row.

    Args:
        ds: Indexable dataset whose items are dicts with an ``"image"`` entry.
        title_text: Text used as the y-axis label of the first subplot.
        n_images: Maximum number of images to show; fewer are shown when the
            dataset is smaller. Nothing is drawn for an empty dataset.
    """
    count = min(n_images, len(ds))
    if count <= 0:
        return

    # squeeze=False keeps a 2-D axes array even when only one image is shown.
    fig, axes = plt.subplots(1, count, figsize=(22, 8), squeeze=False)
    row_axes = axes[0]

    row_axes[0].set_ylabel(title_text)

    for i in range(count):
        row_axes[i].imshow(ds[i]["image"])
    
# Preview the first generated training images straight from disk (no transform).
train_dataset = HotelTrainDataset(
    data=train_df,
    transform=None,
    data_path="train_images/",
)
show_images(train_dataset, title_text='Training Images')