In [None]:
import os
import pandas as pd
from PIL import Image
import matplotlib.pyplot as plt
import random

In [None]:
# Target (width, height) in pixels passed to resize() for every saved crop.
SIZE = (512, 512)
# Root directory that is walked with os.walk() to find the source images.
IN_DIR  = '/kaggle/input/hotel-id-to-combat-human-trafficking-2022-fgvc9/train_images/'
# Directory the resized crops are written to (a flat folder, one file per crop).
OUT_DIR = '/kaggle/working/train_images/'

In [None]:
# Create the output directory up front; exist_ok=True keeps this cell safe to re-run.
os.makedirs(OUT_DIR, exist_ok=True)

In [None]:
# Upper bound on the number of crops per directory: every file found there can
# contribute at most three crops. Directories without files are skipped.
crops_per_chain = [
    3 * len(filenames)
    for _, _, filenames in os.walk(IN_DIR)
    if filenames
]

# Mean of that upper bound over all non-empty directories (displayed as the cell output).
avg_max_crops_per_chain = sum(crops_per_chain) / len(crops_per_chain)
avg_max_crops_per_chain

In [None]:
# Sort in place (ascending) so the plotted curve is monotonic.
# The slicing cells further down rely on this sorted order.
crops_per_chain.sort()
plt.plot(crops_per_chain)

In [None]:
# Zoom in on the first 3000 entries (the smallest values once the list has been sorted).
plt.plot(crops_per_chain[:3000])

In [None]:
# Zoom in on the last 25 entries (the largest values once the list has been sorted).
plt.plot(crops_per_chain[-25:])

In [None]:
def get_distribution(y, z, scale):
    '''
    Compute the two sampling parameters used when generating crops for one chain.

    Parameters
    ----------
    y : int
        Number of images for this hotel chain
    z : int
        Average of maximum number of crops for all chains
    scale : int or float
        Multiplier applied to ``1 - z / y`` when computing the keep factor

    Returns
    -------
    tuple
        ``(chance, pct)`` where ``chance`` is ``3 / 2 - z / (2 * y)`` limited
        to the range [0, 1] and ``pct`` is
        ``max(1, 1 + (1 - z / y) * scale) * min(1, z / y)``.
        ``pct`` is not capped and may exceed 1.
    '''
    ratio = z / y

    # Limit the raw value to [0, 1]; the bounds themselves are returned as ints.
    raw_chance = 3 / 2 - z / (2 * y)
    if not raw_chance < 1:
        chance = 1
    elif not raw_chance > 0:
        chance = 0
    else:
        chance = raw_chance

    # Never lets the factor drop below 1; it only grows once y exceeds z.
    boost = 1 + (1 - ratio) * scale
    if not boost > 1:
        boost = 1

    # Compensation factor: z / y, capped at 1.
    comp = ratio if ratio < 1 else 1

    return chance, boost * comp

In [None]:
def clamp01(x):
    '''Limit ``x`` to the range [0, 1]; out-of-range inputs yield the int bound.'''
    capped = x if x < 1 else 1
    return capped if capped > 0 else 0

def get_distribution_total(y, z, scale):
    '''
    Combine the sampling parameters for one chain into a single total.

    Parameters
    ----------
    y : int
        Number of images for this hotel chain
    z : int
        Average of maximum number of crops for all chains
    scale : int or float
        Multiplier applied to ``1 - z / y`` when computing the keep factor

    Returns
    -------
    int or float
        ``pct * total`` where ``total`` counts one crop for the ``chance``
        share of the ``y`` images and three crops for the remaining share.
    '''
    ratio = z / y
    chance = clamp01(3 / 2 - z / (2 * y))

    comp = ratio if ratio < 1 else 1
    boost = 1 + (1 - ratio) * scale
    pct = (boost if boost > 1 else 1) * comp

    # One crop for the `chance` share of images, three for the rest.
    center_only = chance * y
    all_three = (1 - chance) * y * 3
    return pct * (center_only + all_three)

In [None]:
# Evaluate get_distribution_total (scale=5) for every entry of crops_per_chain and plot the result.
# NOTE(review): the values passed as `y` here are 3 * image count, while the
# function's docstring describes `y` as the image count - confirm which is intended.
dist = [get_distribution_total(max_crops, avg_max_crops_per_chain, 5) for max_crops in crops_per_chain]
plt.plot(dist)

In [None]:
def _square_crops(im):
    '''Return three square crops of ``im`` taken at the start, center and end of its longer axis.'''
    w, h = im.size
    side = min(w, h)
    if w > h:
        # Image is in landscape mode: slide a side x side window horizontally.
        offset = (w - h) // 2
        return [
            im.crop((0, 0, side, side)),                # Left
            im.crop((offset, 0, offset + side, side)),  # Center
            im.crop((w - side, 0, w, side)),            # Right
        ]

    # Image is in portrait mode (or already square): slide the window vertically.
    offset = (h - w) // 2
    return [
        im.crop((0, 0, side, side)),                # Top
        im.crop((0, offset, side, offset + side)),  # Center
        im.crop((0, h - side, side, h)),            # Bottom
    ]


def generate_images(df, imgs, dirname, avg_max_crops_per_chain):
    '''
    Write resized square crops for every image of one directory and record them.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with ``hotel_id`` and ``image_id`` columns to extend
    imgs : list of tuple
        ``(image, filename)`` pairs in processing order. Only the filename is
        used; each file is re-opened from ``dirname`` and closed again after
        cropping.
    dirname : str
        Directory containing the files; its last path component is stored as
        the ``hotel_id``
    avg_max_crops_per_chain : float
        Average of maximum number of crops for all chains

    Returns
    -------
    pandas.DataFrame
        ``df`` itself when nothing was saved, otherwise a new frame holding
        the rows of ``df`` followed by one row per saved crop.
    '''
    num_images = len(imgs)
    if num_images == 0:
        # Nothing to do; also avoids dividing by zero in get_distribution.
        return df

    one_crop_pct, pct = get_distribution(num_images, avg_max_crops_per_chain, 5)
    # Equals the old dirname[75:] slice whenever dirname is a direct child of IN_DIR.
    hotel_id = os.path.basename(dirname)
    records = []

    for idx, (_, filename) in enumerate(imgs):
        t = idx / num_images

        # crop() loads the pixel data and returns independent images, so the
        # source file can be closed as soon as the crops exist.
        with Image.open(os.path.join(dirname, filename)) as im:
            crops = _square_crops(im)

        if t < one_crop_pct:
            # Only keep center crop
            crops = [crops[1]]

        image_id = os.path.splitext(filename)[0]  # Remove the file extension

        for i, crop in enumerate(crops):
            # Each crop is kept with probability pct (always kept when pct >= 1).
            if random.random() > pct:
                continue

            # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same filter.
            crop = crop.resize(SIZE, Image.LANCZOS)
            name = image_id + str(i) + '.jpg'
            crop.save(os.path.join(OUT_DIR, name))
            records.append({'hotel_id': hotel_id, 'image_id': name})

    if not records:
        return df

    # Add all rows at once; DataFrame.append no longer exists in pandas 2.x.
    return pd.concat([df, pd.DataFrame(records)], ignore_index=True)

In [None]:
def load_images(dirname, filenames):
    '''
    Open every file of ``filenames`` inside ``dirname``.

    Returns a list of ``(image, filename)`` tuples in the order of
    ``filenames``. The images are returned as opened by ``Image.open`` and
    are not closed here.
    '''
    opened = []
    for filename in filenames:
        path = dirname + '/' + filename
        opened.append((Image.open(path), filename))
    return opened

def absolute_aspect_ratio(image):
    '''
    Return the longer side of ``image.size`` divided by the shorter side.

    The result is orientation independent: a landscape image and its
    portrait counterpart give the same value, and a square image gives 1.0.
    '''
    width, height = image.size
    return width / height if width > height else height / width

In [None]:
# One row per saved crop is accumulated here.
df = pd.DataFrame(columns=['hotel_id', 'image_id'])

for dirname, _, filenames in os.walk(IN_DIR):
    if not filenames:
        # Directories without files have nothing to process.
        continue

    print(f'Processing directory {dirname}')

    # Stable ascending sort by aspect ratio, then reversed, so the most
    # elongated images come first.
    # NOTE(review): generate_images keeps only the center crop for the images
    # at the front of this list - confirm that the most elongated images are
    # the ones meant to lose their side crops.
    imgs = sorted(
        load_images(dirname, filenames),
        key=lambda pair: absolute_aspect_ratio(pair[0]),
    )
    imgs.reverse()

    df = generate_images(df, imgs, dirname, avg_max_crops_per_chain)

In [None]:
# Renumber the rows 0..n-1 and discard the previous index instead of keeping it as a column.
df = df.reset_index(drop=True)
df.head()

In [None]:
# Persist the hotel_id / image_id table to the current working directory.
# to_csv is called with its defaults, so the row index is written as the first column.
df.to_csv('train_df.csv')