# Preparación para el TF MDS

## Preparación de los datos - enfoque en subconjunto del dataset

Repo para trabajo de preparación

Dataset: https://www.yelp.com/dataset/documentation/main

Extraer las fotos del RAR en `dataset/photos/` (ruta relativa a este notebook).

Extraer el archivo `photos.json` en `dataset/`.

In [3]:
import pandas as pd
from PIL import Image
import math
import numpy as np
import pickle
from tqdm.notebook import trange, tqdm
import torchvision.transforms as T
import matplotlib.pyplot as plt

In [4]:
# Load the list of photos with dimensions > 224 x 224.
# NOTE: pickle can execute arbitrary code while loading; only open checkpoint
# files that were produced by this project.
with open('checkpoints/df5.pkl', 'rb') as checkpoint_file:
    photo_data = pickle.load(checkpoint_file)

Vamos a crear un sub-dataset con hasta 20.000 fotos de cada etiqueta.

In [None]:
# Upper bound on the number of photos kept for each label.
SUBSET_SIZE = 20_000

In [5]:
# Peek at the first two rows to see which columns are available.
photo_data.head(n=2)

Unnamed: 0,photo_id,label,x_dim,y_dim,z_channels,pixels,drink,food,inside,menu,outside
0,--0h6FMC0V8aMtKQylojEg,inside,400.0,300.0,3.0,120000.0,0.0,0.0,1.0,0.0,0.0
1,--3JQ4MlO-jHT9xbo7liug,food,400.0,400.0,3.0,160000.0,0.0,1.0,0.0,0.0,0.0


In [19]:
# Count how many photos are available for each label.
total_photos = photo_data['label'].value_counts()
total_photos

food       106262
inside      55214
outside     18189
drink       15412
menu         1583
Name: label, dtype: int64

In [28]:
# Start from an empty frame; the per-label selections are concatenated onto it
# in the next cell.
df_subset = pd.DataFrame(
    data=[],
    columns=[
        'photo_id', 'label',
        'x_dim', 'y_dim', 'z_channels', 'pixels',
        'drink', 'food', 'inside', 'menu', 'outside',
    ],
)

In [29]:
# Fixed seed so that re-running the notebook selects the same photos.
SAMPLE_SEED = 42

# Gather one frame per label and concatenate them in a single call instead of
# growing df_subset inside the loop.
subset_parts = [df_subset]
for label in total_photos.index:
    label_photos = photo_data.loc[photo_data.label == label]
    if total_photos[label] > SUBSET_SIZE:
        # Label has more than SUBSET_SIZE photos: keep a random sample of that size.
        label_photos = label_photos.sample(n=SUBSET_SIZE, random_state=SAMPLE_SEED)
    # Otherwise every photo of the label is kept.
    subset_parts.append(label_photos)

df_subset = pd.concat(subset_parts)

In [30]:
# Total number of photos selected across all labels.
df_subset.shape[0]

75184

In [31]:
# Per-label photo counts in the subset; no label should exceed SUBSET_SIZE.
df_subset['label'].value_counts()

food       20000
inside     20000
outside    18189
drink      15412
menu        1583
Name: label, dtype: int64

In [33]:
# Checkpoint the subset to disk; the context manager guarantees the file is
# flushed and closed once the dump finishes.
with open("checkpoints/df_subset.pkl", "wb") as checkpoint_file:
    pickle.dump(df_subset, checkpoint_file)

In [None]:
# Checkpoint: restore the subset saved to checkpoints/df_subset.pkl.
# NOTE: pickle can execute arbitrary code while loading; only open checkpoint
# files that were produced by this project.
with open("checkpoints/df_subset.pkl", 'rb') as checkpoint_file:
    df_subset = pickle.load(checkpoint_file)

## Escalar y recortar las fotos del subset

In [34]:
# Number of photos per intermediate file; reduces memory use during image processing.
BATCH_SIZE = 20_000
# Target edge length, chosen for https://huggingface.co/facebook/deit-tiny-patch16-224
# Kept as a float so the aspect-ratio arithmetic below is done in floating point.
PHOTO_SIZE = float(224)

In [35]:
# Resize every photo of the subset so that its smallest dimension equals
# PHOTO_SIZE, center-crop it to a PHOTO_SIZE x PHOTO_SIZE square, and pickle the
# resulting arrays in batches to preserve memory.
target_size = int(PHOTO_SIZE)
center_crop = T.CenterCrop(size=target_size)  # loop-invariant, so built once

# Defined before the loop so that an empty df_subset still produces a valid
# (empty) remainder file instead of raising NameError on the final dump.
images = []
photo_counter = 0

# Read the three needed columns once instead of rebuilding a row Series with
# df_subset.iloc[...] several times per photo.
photo_rows = zip(df_subset.photo_id, df_subset.x_dim, df_subset.y_dim)

for photo_id, x_dim, y_dim in tqdm(photo_rows, total=len(df_subset),
                                   desc='Photos cropped', miniters=100):
    # A full batch is written only when another photo follows it, so the last
    # batch (even a full one) always ends up in the remainder file.
    if photo_counter > 0 and photo_counter % BATCH_SIZE == 0:
        batch_path = 'dataset/processed/subset_images_{}.pkl'.format(photo_counter)
        with open(batch_path, 'wb') as batch_file:
            pickle.dump(images, batch_file)
        images = []  # start a new batch

    # Resize the smallest dimension to PHOTO_SIZE, keeping the aspect ratio.
    # x_dim is treated as the height and y_dim as the width; this assumes the
    # columns were recorded that way in the checkpoint -- TODO confirm.
    if y_dim < x_dim:  # narrow image
        width = target_size
        height = math.floor(PHOTO_SIZE * x_dim / y_dim)
    else:  # wide image
        width = math.floor(PHOTO_SIZE * y_dim / x_dim)
        height = target_size

    # The context manager releases the underlying file handle once the resized
    # copy has been produced.
    with Image.open('dataset/photos/' + photo_id + '.jpg') as im:
        resized = T.Resize((height, width))(im)
    cropped = center_crop(resized)

    # Store the cropped photo as a numpy array.
    images.append(np.array(cropped))
    photo_counter += 1

# Whatever is left after the last numbered batch goes to the remainder file.
with open('dataset/processed/subset_images_remainder.pkl', 'wb') as remainder_file:
    pickle.dump(images, remainder_file)

Photos cropped:   0%|          | 0/75184 [00:00<?, ?it/s]

In [37]:
# Paths of the pickled batches written by the cropping cell, in processing order.
# A numbered file is only written when at least one more photo follows a full
# batch, so there are (n - 1) // BATCH_SIZE numbered files; the last batch (even
# a full one) is always stored in the remainder file. Using n // BATCH_SIZE here
# would name a file that does not exist whenever n is an exact multiple of
# BATCH_SIZE.
numbered_batches = max(len(df_subset) - 1, 0) // BATCH_SIZE
file_list = ['dataset/processed/subset_images_{}.pkl'.format(batch * BATCH_SIZE)
             for batch in range(1, numbered_batches + 1)]
file_list.append('dataset/processed/subset_images_remainder.pkl')

In [39]:
# Reload every batch file and gather all photos in a single in-memory list.
# Each file is closed as soon as its contents have been read.
images = []
for file in tqdm(file_list, desc='Files loaded', miniters=1):
    with open(file, 'rb') as batch_file:
        images.extend(pickle.load(batch_file))

Files loaded:   0%|          | 0/4 [00:00<?, ?it/s]

## La celda siguiente requiere 22,5 GB de memoria

In [40]:
# Write the complete list of processed photos to a single file; the context
# manager guarantees the file is flushed and closed once the dump finishes.
with open('dataset/processed/full_subset.pkl', 'wb') as subset_file:
    pickle.dump(images, subset_file)