## Crear &#129303; Hugging Face dataset

In [1]:
import pandas as pd
from PIL import Image
import pickle

In [2]:
# Load the photo metadata (one JSON object per line) and keep only the
# columns that are not dropped below.
df = pd.read_json(r'dataset/photos.json', lines=True)
df = df.drop(columns=['business_id', 'caption'])

contador = 0
discarded = 0
corrupt_ids = []  # photo_ids whose file could not be opened or verified
for photo_id in df.photo_id.tolist():
    if (contador % 30 == 0) and (contador > 0):
        print('Procesadas {:7,} mil fotos de las cuales {} están corruptas'.format(contador, discarded), end='\r')
    contador += 1
    try:
        # Use a separate name for the opened image: rebinding the loop
        # variable would make the id unavailable when verify() fails.
        # The context manager also closes the file handle.
        with Image.open('dataset/photos/' + photo_id + '.jpg') as photo:
            photo.verify()
    except Exception:
        # File is missing, unreadable or corrupt. `Exception` (rather than a
        # bare except) lets KeyboardInterrupt stop the scan instead of being
        # counted as a corrupt photo.
        corrupt_ids.append(photo_id)
        discarded += 1

# Remove every corrupt photo in a single pass instead of re-filtering the
# frame once per failure.
df = df.loc[~df.photo_id.isin(corrupt_ids)]

Procesadas 200,070 mil fotos de las cuales 106 están corruptas

In [3]:
# DataFrame ('photo_id', 'label') holding every valid photo of the dataset.
# The context manager guarantees the checkpoint is flushed and closed.
with open('checkpoints/valid.pkl', 'wb') as checkpoint_file:
    pickle.dump(df, checkpoint_file)

## &#128190; Checkpoint

In [29]:
import pandas as pd
from tqdm.notebook import trange, tqdm
from PIL import Image
import pickle
import os

In [30]:
# Restore the DataFrame of valid photos written by the checkpoint cell.
# NOTE: pickle.load can execute arbitrary code; only load checkpoints that
# were produced locally by this notebook.
with open('checkpoints/valid.pkl', 'rb') as checkpoint_file:
    df = pickle.load(checkpoint_file)

In [31]:
# Distinct labels, ordered from most to least frequent (value_counts order).
categorias = df['label'].value_counts().index

In [32]:
# One empty {category: DataFrame} mapping per subset; filled in by later cells.
dataset = {subset: {} for subset in ('train', 'test', 'validate')}

In [33]:
# Split every category 80/10/10 into train/test/validate, keeping the row
# order of `df` (no shuffling happens here).
for categoria in categorias:
    # Filter once per category instead of re-evaluating the mask for every use.
    fotos_categoria = df.loc[df.label == categoria]
    slice_80 = int(len(fotos_categoria) * 0.8)
    slice_10 = int(len(fotos_categoria) * 0.1)
    dataset['train'][categoria] = fotos_categoria.iloc[:slice_80]
    dataset['test'][categoria] = fotos_categoria.iloc[slice_80:slice_80 + slice_10]
    # validate takes the remainder, so rounding leftovers end up here
    dataset['validate'][categoria] = fotos_categoria.iloc[slice_80 + slice_10:]

In [34]:
random_state = 42  # Douglas Adams' answer to everything
max_photos = 2500

# Maximum number of photos per category in each subset (80/10/10 of max_photos).
subset_max_photos = {'train': int(max_photos * 0.8), 'test': int(max_photos * 0.1)}
# validate receives whatever is left after train and test
subset_max_photos['validate'] = max_photos - sum(subset_max_photos.values())

# Downsample every (subset, category) that exceeds its cap; frames that are at
# or below the cap are left untouched.
for subset, limite in subset_max_photos.items():
    for categoria in categorias:
        fotos = dataset[subset][categoria]
        if len(fotos) > limite:
            dataset[subset][categoria] = fotos.sample(
                n=limite,
                random_state=random_state,
                replace=False,  # each photo at most once
                ignore_index=True,
            )

In [35]:
dataset_origin_path = 'dataset/photos'
dataset_output_path = 'dataset/yelp'
dataset_json_file = 'dataset/photos.json'

import shutil

# Build the folder structure <output>/<subset>/<category>, starting from a
# clean slate: an existing output tree is deleted first.
if os.path.isdir(dataset_output_path):
    shutil.rmtree(dataset_output_path)
    print(f'{dataset_output_path} removed')

os.mkdir(dataset_output_path)
print(f'{dataset_output_path} created')

for subset in ('train', 'test', 'validate'):
    subset_dir = f'{dataset_output_path}/{subset}'
    os.mkdir(subset_dir)
    print(f'{subset_dir} created')
    for categoria in categorias:
        categoria_dir = f'{subset_dir}/{categoria}'
        os.mkdir(categoria_dir)
        print(f'{categoria_dir} created')


dataset/yelp removed
dataset/yelp created
dataset/yelp/train created
dataset/yelp/train/food created
dataset/yelp/train/inside created
dataset/yelp/train/outside created
dataset/yelp/train/drink created
dataset/yelp/train/menu created
dataset/yelp/test created
dataset/yelp/test/food created
dataset/yelp/test/inside created
dataset/yelp/test/outside created
dataset/yelp/test/drink created
dataset/yelp/test/menu created
dataset/yelp/validate created
dataset/yelp/validate/food created
dataset/yelp/validate/inside created
dataset/yelp/validate/outside created
dataset/yelp/validate/drink created
dataset/yelp/validate/menu created


In [36]:
# Copy the selected photos into the output tree and, for every
# (subset, category) that is short of its target size, generate augmented
# copies until the target is reached.

from torchvision import transforms as T

random_state = 42  # Douglas Adams' answer to everything

from torch import manual_seed
manual_seed(random_state)

import random
random.seed(random_state)

max_photos = 2500

subset_max_photos = {
    'train': int(max_photos * 0.8),
    'test': int(max_photos * 0.1)
}

subset_max_photos.update({
    'validate': max_photos - subset_max_photos['train'] - subset_max_photos['test']  # whatever is left
})

# Random colour and geometric perturbation applied to each base image.
pipeline = T.Compose([
    T.ColorJitter(brightness=.5, hue=.3),
    T.RandomAffine(degrees=(30, 70), scale=(0.9, 1.1))
])

# NOTE(review): augmented images are also generated for 'test' and 'validate';
# confirm that evaluating on synthetic copies is intended.
for subset in ['train', 'test', 'validate']:
    for categoria in categorias:
        categoria_dir = dataset_output_path + '/' + subset + '/' + categoria
        originals = dataset[subset][categoria]

        # copy the original photos
        contador = 0
        for photo_id in originals.photo_id:
            shutil.copyfile(dataset_origin_path + '/' + photo_id + '.jpg',
                            categoria_dir + '/' + photo_id + '.jpg')
            if (contador % 30 == 0) and (contador > 0):
                print('{} - copiadas {:7,} fotos'.format(categoria_dir, contador), end='\r')
            contador += 1
        print('{} - copiadas {:7,} fotos'.format(categoria_dir, contador))

        # generate images only when the category is short of its target
        n_missing = subset_max_photos[subset] - len(originals)
        if n_missing <= 0:
            continue
        if len(originals) == 0:
            # nothing to augment from; report it instead of crashing in sample()
            print('{} - sin fotos base para generar imagenes'.format(categoria_dir))
            continue

        # Pick the base images. Each photo is used once when there are enough
        # of them; otherwise sample with replacement so the target can still
        # be reached (sample(replace=False) would raise ValueError).
        reuse_bases = n_missing > len(originals)
        df_base_images = originals.sample(n=n_missing,
                                          random_state=random_state,
                                          replace=reuse_bases,
                                          ignore_index=True)

        # open original > transform > save transformed image > record new photo
        contador = 0
        new_rows = []
        for img_name in df_base_images.photo_id:
            # The new id carries no extension, like every other photo_id, so
            # that photo_id + '.jpg' is the file actually written to disk.
            # A running number keeps names unique when a base image is reused.
            new_photo_id = img_name + '_tr' + (str(contador) if reuse_bases else '')
            with Image.open(dataset_origin_path + '/' + img_name + '.jpg') as base_img:
                augmented_image = pipeline(base_img)
                augmented_image.save(categoria_dir + '/' + new_photo_id + '.jpg')

            # the label is the category being processed; no lookup in df needed
            new_rows.append({'photo_id': new_photo_id, 'label': categoria})
            if (contador % 30 == 0) and (contador > 0):
                print('{} - generadas {:7,} fotos'.format(categoria_dir, contador), end='\r')
            contador += 1

        # Append all generated rows at once (concat inside the loop is
        # quadratic) and renumber the index so labels stay unique.
        dataset[subset][categoria] = pd.concat([originals, pd.DataFrame(new_rows)],
                                               ignore_index=True)

        print('{} - generadas {:7,} fotos'.format(categoria_dir, contador))


dataset/yelp/train/food - copiadas   2,000 fotos
dataset/yelp/train/inside - copiadas   2,000 fotos
dataset/yelp/train/outside - copiadas   2,000 fotos
dataset/yelp/train/drink - copiadas   2,000 fotos
dataset/yelp/train/menu - copiadas   1,342 fotos
dataset/yelp/train/menu - generadas     658 fotos
dataset/yelp/test/food - copiadas     250 fotos
dataset/yelp/test/inside - copiadas     250 fotos
dataset/yelp/test/outside - copiadas     250 fotos
dataset/yelp/test/drink - copiadas     250 fotos
dataset/yelp/test/menu - copiadas     167 fotos
dataset/yelp/test/menu - generadas      83 fotos
dataset/yelp/validate/food - copiadas     250 fotos
dataset/yelp/validate/inside - copiadas     250 fotos
dataset/yelp/validate/outside - copiadas     250 fotos
dataset/yelp/validate/drink - copiadas     250 fotos
dataset/yelp/validate/menu - copiadas     169 fotos
dataset/yelp/validate/menu - generadas      81 fotos
