# Import code needed for dataset exploration and model training 

In [None]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
import PIL
import tensorflow as tf
import random
from IPython.core.debugger import set_trace
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras import Sequential, layers
from tensorflow.keras.callbacks import EarlyStopping
import cv2

In [None]:
pwd

In [None]:
ls

# **Let's create our dataframe with pictures and masks**

# First we create a global dataframe with all picture files in our data folder

In [None]:
# Empty catalogue of files: where each file lives, the hotel it belongs to
# (if any), its file name, and its pixel dimensions.
catalogue_columns = [
    'directory',
    'hotel_id',
    'image_id',
    'image_width',
    'image_height',
    'image_size',
]
df = pd.DataFrame(columns=catalogue_columns)

In [None]:
%%time

# Walk the competition input folder and record, for every file PIL can open,
# its directory, hotel id, file name and pixel size. Rows are collected in a
# list and appended to `df` in a single concat, instead of re-copying the
# whole frame once per file.
train_images_prefix = '/kaggle/input/hotel-id-to-combat-human-trafficking-2022-fgvc9/train_images/'
records = []
skipped_files = []

for dirname, _, filenames in os.walk('/kaggle/input/hotel-id-to-combat-human-trafficking-2022-fgvc9/'):
    # Only files below train_images/<hotel_id>/ carry a hotel id; everything
    # else is catalogued with hotel_id = None.
    if train_images_prefix in dirname:
        hotel_id = dirname.replace(train_images_prefix, '')
    else:
        hotel_id = None

    for filename in filenames:
        path = os.path.join(dirname, filename)
        try:
            # Only the size is needed; the context manager releases the file
            # handle as soon as it has been read.
            with PIL.Image.open(path) as image:
                image_size = image.size
        except Exception as error:
            # Best effort: files that cannot be opened as images are left out
            # of the catalogue, but remembered so the skip is not silent.
            # Catching Exception (not a bare except) keeps Ctrl-C working.
            skipped_files.append((path, repr(error)))
            continue

        records.append({
            'directory': dirname,
            'hotel_id': hotel_id,
            'image_id': filename,
            'image_width': image_size[0],
            'image_height': image_size[1],
            'image_size': image_size,
        })
        print(hotel_id, filename, image_size)

if records:
    df = pd.concat([df, pd.DataFrame(records)], ignore_index=True)

print('skipped', len(skipped_files), 'files that could not be opened as images')

In [None]:
pwd

In [None]:
ls

In [None]:
# Renumber the rows from 0; the previous index is kept as an 'index' column.
df = df.reset_index()

In [None]:
# Discard the 'index' column of the catalogue.
df = df.drop(columns=['index'])

In [None]:
# Remove the final row of the catalogue in place: df.tail(1) selects it and
# its index label is what gets dropped (every row sharing that label would go
# too, so this relies on the index being unique).
# NOTE(review): the reason for discarding the last catalogued file is not
# evident from this notebook — confirm it is intended.
df.drop(df.tail(1).index,inplace=True)

In [None]:
df

In [None]:
df.to_csv('df.csv')

# Then we can create our image dataframe containing all the unmasked pictures

In [None]:
# Pictures are the catalogue rows that carry a hotel id.
has_hotel_id = df['hotel_id'].notnull()
image_df = df[has_hotel_id]

In [None]:
image_df

In [None]:
image_df.to_csv('image_df.csv')

**We are able to analyze the data from images :**

**Data about hotel chains**

In [None]:
# Number of pictures per hotel_id.
pictures_by_hotel = image_df.groupby('hotel_id')
chains = pictures_by_hotel.size()

In [None]:
# Order the per-hotel counts from smallest to largest.
chains = chains.sort_values()

In [None]:
# Summarise the per-hotel picture counts: how many groups, mean (rounded) and median.
chain_total = chains.count()
mean_pictures = round(chains.mean())
median_pictures = chains.median()
print(
    'we have a total of', chain_total,
    'hotel chains, and there is an average of', mean_pictures,
    'pictures per chain, but as it is a skewed representation, we should consider the median which is', median_pictures,
)

In [None]:
%%time

# Bar chart of picture counts, one bar per hotel_id; the visible y-range is
# limited to 0-100.
hotel_labels, picture_counts = chains.index, chains.values
hotel_id_size = sns.barplot(x=hotel_labels, y=picture_counts)
hotel_id_size.set_ylabel('Number of Pictures')
hotel_id_size.set_xlabel('Hotel chain')
hotel_id_size.set_title('Number of pictures available per hotel chain')
plt.ylim(0,100)

**Data about picture sizes**

In [None]:
# Number of pictures for each distinct (width, height) pair.
pictures_by_size = image_df.groupby('image_size')
picture_sizes = pictures_by_size.size()

In [None]:
# Order the per-size counts from smallest to largest.
picture_sizes = picture_sizes.sort_values()

In [None]:
picture_sizes

In [None]:
# Report how many distinct sizes exist and the label of the last entry, which
# is the most frequent size only if picture_sizes is sorted ascending by count.
distinct_sizes = picture_sizes.count()
last_size = picture_sizes.tail(1).index[0]
print('we have a total of', distinct_sizes, 'picture sizes, the most represented size is', last_size)

In [None]:
%%time

# Bar chart of how many pictures share each image size.
size_labels, size_counts = picture_sizes.index, picture_sizes.values
picture_shape = sns.barplot(x=size_labels, y=size_counts)
picture_shape.set_xlabel('Picture size')
picture_shape.set_title('Picture size population')
picture_shape.set_ylabel('Number of Pictures')

In [None]:
# Masks are the catalogue rows without a hotel id. An explicit copy is taken
# because this frame is modified in place afterwards (reset_index), which
# should never be done on a selection that may still be tied to `df`.
mask_df = df[df['hotel_id'].isnull()].copy()

In [None]:
# Renumber the mask rows from 0; the previous index is kept as an 'index' column.
mask_df = mask_df.reset_index()

In [None]:
mask_df

In [None]:
mask_df.to_csv('mask_df.csv')

In [None]:
mask_df.groupby('image_size').size().sort_values()

In [None]:
# Count masks per image size (sorted ascending) once, then report the number
# of distinct sizes and the size with the highest count (last after sorting).
mask_size_counts = mask_df.groupby('image_size').size().sort_values()
distinct_mask_sizes = mask_size_counts.count()
top_mask_size = mask_size_counts.tail(1).index[0]
print('we have a total of', distinct_mask_sizes, 'mask sizes, the most represented size is', top_mask_size)

**After data analysis, we can see that we have some problems to deal with in our later preprocessing, modeling and training :**

* Unbalanced dataset : Some hotel chains have many more pictures than others, so there is a risk that the model learns mostly from these chains

* Image ratio variety : We find a wide variety of image sizes and image ratios which makes it difficult to resize all images to have the same shape

* Mask size variety : I am not sure how the masks are meant to be used, but we have a wide range of mask sizes and ratios.

**In the next steps we will try to :**

* Reduce image size but keep the same image ratio

* Use data augmentation on the complete dataset and try to resize to (256,256)

* Apply a random mask to each picture during the data augmentation process

In [None]:
pwd

In [None]:
# Make sure the output folder exists in the current working directory;
# an already existing folder is not an error.
output_folder = 'hotel_id_dataset_512x512'
os.makedirs(output_folder, exist_ok=True)

In [None]:
def prepare_image(dirname,filename):
    """Overlay a random mask on one picture and save a reduced copy.

    The picture at ``dirname/filename`` and a mask drawn at random from the
    global ``mask_df`` are each rotated by 90 degrees when wider than tall,
    then shrunk to fit within 512x512 (``thumbnail`` keeps the aspect ratio).
    The mask is pasted at the top-left corner of the picture, using the mask
    itself as the transparency mask. The result is written as
    ``preproc_<filename>`` in the matching sub-folder of
    ``/kaggle/working/hotel_id_dataset_512x512/``, which is created if needed.

    Args:
        dirname: Directory containing the picture.
        filename: File name of the picture inside ``dirname``.

    Returns:
        The processed ``PIL.Image`` (already saved to disk).

    Raises:
        ValueError: If ``mask_df`` contains no mask to choose from.
    """
    max_size = (512, 512)
    mask_count = len(mask_df)
    if mask_count == 0:
        raise ValueError('mask_df is empty: no mask available to apply')

    picture = PIL.Image.open(os.path.join(dirname, filename))
    # Sample over however many masks were catalogued instead of assuming a
    # fixed count: a hard-coded upper bound raises IndexError when there are
    # fewer masks and never picks the extra ones when there are more.
    cover = mask_df.iloc[random.randint(0, mask_count - 1)]

    if picture.width > picture.height:
        picture = picture.rotate(90, expand=True)
    picture.thumbnail(max_size)

    # The mask is only needed until it has been pasted, so its file handle is
    # released as soon as the block exits.
    with PIL.Image.open(os.path.join(cover['directory'], cover['image_id'])) as mask_file:
        if mask_file.width > mask_file.height:
            mask = mask_file.rotate(90, expand=True)
        else:
            mask = mask_file
        mask.thumbnail(max_size)
        picture.paste(mask, (0, 0), mask)

    new_filename = 'preproc_' + filename
    new_dirname = dirname.replace('/kaggle/input/hotel-id-to-combat-human-trafficking-2022-fgvc9/','/kaggle/working/hotel_id_dataset_512x512/')
    os.makedirs(new_dirname, exist_ok=True)
    picture.save(os.path.join(new_dirname, new_filename))
    return picture

In [None]:
%%time

# Run prepare_image on every file below train_images/ whose name contains
# '.jpg'. A picture that fails is skipped so one bad file does not abort the
# whole run, but each failure is recorded and the total is reported instead
# of being discarded silently.
train_pictures_prefix = '/kaggle/input/hotel-id-to-combat-human-trafficking-2022-fgvc9/train_images/'
failed_pictures = []

for dirname, _, filenames in os.walk('/kaggle/input/hotel-id-to-combat-human-trafficking-2022-fgvc9'):
    if train_pictures_prefix not in dirname:
        continue
    for filename in filenames:
        if '.jpg' not in filename:
            continue
        try:
            prepare_image(dirname, filename)
        except Exception as error:
            # Catching Exception (not a bare except) keeps Ctrl-C able to
            # stop this long-running loop.
            failed_pictures.append((os.path.join(dirname, filename), repr(error)))

print('could not preprocess', len(failed_pictures), 'pictures')

# We now have a complete set of masked images with a maximum size of 512x512 that keep their aspect ratio