# Preparación para el TF MDS

## Preparación de los datos
### Creación de un dataset muy pequeño (toy dataset) para facilitar el diseño del modelo

Dataset: https://www.yelp.com/dataset/documentation/main

Extraer las fotos del RAR en `dataset/photos/` (ruta relativa a este notebook).

Requiere la ejecución previa del notebook 00_EDA.

In [1]:
import pandas as pd
from PIL import Image
import math
import numpy as np
import pickle
from tqdm.notebook import trange, tqdm
import torchvision.transforms as T
import matplotlib.pyplot as plt

In [2]:
# --- Output location and naming ---
# Folder where the processed data is written.
OUTPUT_FOLDER = 'dataset/processed'
# Prefix used in the names of the files generated by this notebook.
PREFIX = 'toy'

# --- Subset and image parameters ---
# Maximum number of photos sampled for each label.
SUBSET_SIZE = 500
# Side length (in pixels) of the cropped photos, chosen with a 224x224 transformer in mind.
PHOTO_SIZE = 224.0
# Number of images held in memory before they are flushed to disk
# (reduces memory consumption while processing images).
BATCH_SIZE = 20000

In [3]:
# Load the list of photos with dimensions > 224 x 224.
# NOTE(review): unpickling can execute arbitrary code, so only load checkpoints
# produced by a trusted run (presumably the 00_EDA notebook — confirm).
# The context manager closes the file handle as soon as the data is loaded.
with open('checkpoints/df5.pkl', 'rb') as checkpoint_file:
    photo_data = pickle.load(checkpoint_file)

Vamos a crear un sub-dataset con hasta `SUBSET_SIZE` (500) fotos de cada etiqueta.

In [4]:
# Preview the first two rows of the loaded photo metadata.
photo_data.head(2)

Unnamed: 0,photo_id,label,x_dim,y_dim,z_channels,pixels,drink,food,inside,menu,outside
0,--0h6FMC0V8aMtKQylojEg,inside,400.0,300.0,3.0,120000.0,0.0,0.0,1.0,0.0,0.0
1,--3JQ4MlO-jHT9xbo7liug,food,400.0,400.0,3.0,160000.0,0.0,1.0,0.0,0.0,0.0


In [5]:
# Count how many photos are available for each label (most frequent first).
label_column = photo_data['label']
total_photos = label_column.value_counts()
total_photos

food       106262
inside      55214
outside     18189
drink       15412
menu         1583
Name: label, dtype: int64

In [6]:
# Empty frame that fixes the column layout of the subset before any rows are added.
subset_columns = [
    'photo_id', 'label',
    'x_dim', 'y_dim', 'z_channels', 'pixels',
    'drink', 'food', 'inside', 'menu', 'outside',
]
df_subset = pd.DataFrame([], columns=subset_columns)

In [7]:
# Build the subset: for every label keep a random sample of SUBSET_SIZE photos,
# or all of its photos when the label has SUBSET_SIZE photos or fewer.
SAMPLING_SEED = 42  # fixed seed so that Restart & Run All selects the same photos

label_samples = []
for label in total_photos.index:
    label_rows = photo_data.loc[photo_data.label == label]
    if total_photos[label] > SUBSET_SIZE:  # take sample of photos
        label_rows = label_rows.sample(n=SUBSET_SIZE, random_state=SAMPLING_SEED)
    # otherwise: keep all photos of this label
    label_samples.append(label_rows)

# Concatenate once instead of growing the frame inside the loop. Building the
# result from scratch also makes the cell idempotent: re-running it no longer
# appends a second copy of every row to an already populated df_subset.
df_subset = pd.concat(label_samples) if label_samples else photo_data.iloc[0:0]

In [8]:
# Total number of photos selected for the subset.
len(df_subset)

2500

In [9]:
# Number of photos per label in the subset.
df_subset.label.value_counts()

food       500
inside     500
outside    500
drink      500
menu       500
Name: label, dtype: int64

In [10]:
# Save the sampled subset as a checkpoint. The context manager guarantees the
# file is flushed and closed before any later cell reads it back.
with open("checkpoints/df_subset.pkl", "wb") as checkpoint_file:
    pickle.dump(df_subset, checkpoint_file)

In [11]:
# checkpoint: reload the sampled subset so the notebook can be resumed from here
# NOTE(review): unpickling can execute arbitrary code; only load files written by this notebook.
with open("checkpoints/df_subset.pkl", 'rb') as checkpoint_file:
    df_subset = pickle.load(checkpoint_file)

## Escalar y recortar las fotos del subset

In [12]:
# Resize every photo of the subset so that its smallest side equals PHOTO_SIZE,
# centre-crop it to a PHOTO_SIZE x PHOTO_SIZE square and keep it as a numpy array.
# The arrays are flushed to a pickle file every BATCH_SIZE photos to limit memory
# use; whatever is left after the loop goes to '<PREFIX>_images_remainder.pkl'.
photo_counter = 0
images = []  # defined up front so the final dump also works for an empty subset
crop_size = int(PHOTO_SIZE)
center_crop = T.CenterCrop(size=crop_size)  # loop-invariant, built once

# Iterate the three needed columns directly instead of calling .iloc repeatedly.
photo_rows = zip(df_subset.photo_id, df_subset.x_dim, df_subset.y_dim)
for photo_id, x_dim, y_dim in tqdm(photo_rows, total=len(df_subset),
                                   desc='Photos cropped', miniters=100):
    # load image; the context manager closes the underlying file handle
    with Image.open('dataset/photos/' + photo_id + '.jpg') as im:
        # resize smallest dimension to PHOTO_SIZE
        # NOTE(review): this arithmetic treats x_dim as the height and y_dim as
        # the width — confirm against the 00_EDA notebook.
        if y_dim < x_dim:  # narrow image
            width = crop_size
            height = math.floor(PHOTO_SIZE * x_dim / y_dim)
        else:  # wide image
            width = math.floor(PHOTO_SIZE * y_dim / x_dim)
            height = crop_size

        resized = T.Resize((height, width))(im)
        cropped = center_crop(resized)

        # convert to numpy array
        img_np = np.array(cropped)

    # save images to file to preserve memory
    if photo_counter > 0 and photo_counter % BATCH_SIZE == 0:
        # The file name needs one placeholder per argument: folder, prefix and
        # counter. With only two placeholders the counter was silently dropped,
        # so every batch overwrote the same file.
        batch_path = '{}/{}_images_{}.pkl'.format(OUTPUT_FOLDER, PREFIX, photo_counter)
        with open(batch_path, 'wb') as batch_file:
            pickle.dump(images, batch_file)
        images = []  # reset image list

    images.append(img_np)  # add image to list
    photo_counter += 1

# Photos accumulated since the last flush.
remainder_path = '{}/{}_images_remainder.pkl'.format(OUTPUT_FOLDER, PREFIX)
with open(remainder_path, 'wb') as remainder_file:
    pickle.dump(images, remainder_file)

Photos cropped:   0%|          | 0/2500 [00:00<?, ?it/s]

In [13]:
# Reload every image file written by the cropping step into a single list.
# A batch file named '<PREFIX>_images_<k * BATCH_SIZE>.pkl' is expected for each
# full batch that was flushed, followed by '<PREFIX>_images_remainder.pkl'.
# The count is based on the total number of photos, len(df_subset), not on the
# per-label SUBSET_SIZE. Using (n - 1) // BATCH_SIZE matters when n is an exact
# multiple of BATCH_SIZE: the last full batch is then stored in the remainder
# file, not in a numbered one.
n_photos = len(df_subset)
n_batch_files = max(n_photos - 1, 0) // BATCH_SIZE

file_list = ['{}/{}_images_{}.pkl'.format(OUTPUT_FOLDER, PREFIX, k * BATCH_SIZE)
             for k in range(1, n_batch_files + 1)]
# The remainder file is named with PREFIX, matching the name used when it is written.
file_list.append('{}/{}_images_remainder.pkl'.format(OUTPUT_FOLDER, PREFIX))

images = []
for file in tqdm(file_list, desc='Files loaded', miniters=1):
    # NOTE(review): unpickling can execute arbitrary code; only load files written by this notebook.
    with open(file, 'rb') as batch_file:
        images += pickle.load(batch_file)

In [14]:
# Save the complete list of cropped images in a single file; the context manager
# guarantees the file is flushed and closed.
with open('{}/full_{}.pkl'.format(OUTPUT_FOLDER, PREFIX), 'wb') as full_file:
    pickle.dump(images, full_file)

In [15]:
# Number of cropped images loaded; it should match the number of rows in df_subset.
len(images)

2500

In [16]:
# Number of rows in the subset, for comparison with the number of images.
len(df_subset)

2500

In [17]:
# Attach the cropped image arrays as a new 'image' column. This relies on
# `images` being in the same row order as df_subset, which holds because the
# cropping loop walks df_subset positionally from the first row to the last.
df_subset['image'] = images

In [27]:
# Preview the first two rows of the subset, now including the 'image' column.
df_subset.head(2)

Unnamed: 0,photo_id,label,x_dim,y_dim,z_channels,pixels,drink,food,inside,menu,outside,image
130133,d_hQrQ_RDFFPffjVls0gyA,food,400.0,600.0,3.0,240000.0,0.0,1.0,0.0,0.0,0.0,"[[[173, 190, 206], [173, 190, 206], [176, 193,..."
148638,jXgxVPQMkaE5CdXmG_rjyw,food,379.0,600.0,3.0,227400.0,0.0,1.0,0.0,0.0,0.0,"[[[24, 33, 18], [25, 34, 19], [29, 37, 22], [3..."


In [28]:
# Shape of the first stored image array.
df_subset.iloc[0].image.shape

(224, 224, 3)

In [29]:
# Save the final subset (metadata plus image arrays); the context manager
# guarantees the file is flushed and closed.
with open('{}/{}_dataset.pkl'.format(OUTPUT_FOLDER, PREFIX), 'wb') as dataset_file:
    pickle.dump(df_subset, dataset_file)