**Función:** crear un sub-dataset 10k limpio y balanceado, dividirlo en train/test y guardarlo en formato ndarray.

**Requiere:**
- ../datasets/df_final_bal.pkl

**Crea:**
- ../datasets/X_train.pkl
- ../datasets/X_test.pkl
- ../datasets/y_train.pkl
- ../datasets/y_test.pkl

In [None]:
ejecutadoEnGoogleDrive = False # Set to True when running from Google Drive (Colab); selects the base PATH below

In [None]:
# Base directory of the project. Every later cell builds paths as
# PATH + 'datasets/...', so PATH must end with a path separator.
if ejecutadoEnGoogleDrive:
    from google.colab import drive
    drive.mount('/content/drive')
    # Trailing slash added: without it PATH + 'datasets/...' resolved to
    # '/content/drive/clasificacion_imagenesdatasets/...'.
    # NOTE(review): Colab usually exposes user files under
    # '/content/drive/MyDrive/...' -- confirm this folder exists.
    PATH = '/content/drive/clasificacion_imagenes/'
else:
    PATH = '../'

In [1]:
import pandas as pd
import pickle
import numpy as np
import math
import torchvision.transforms as T
from sklearn.preprocessing import OneHotEncoder
from tqdm.notebook import trange, tqdm
from PIL import Image

In [2]:
# Directory holding the source photos; each one is read as <photo_path>/<photo_id>.jpg.
# NOTE(review): this uses 'dataset/' (singular) while the pickles live under
# 'datasets/' -- confirm the folder name is intentional.
photo_path = PATH + 'dataset/photos'
# Side length, in pixels, of the square images produced by resize_pictures.
img_size = 224

# Create train and test DataFrames

In [3]:
# Load the balanced dataset and expose its 'reclass_label' column as 'label'.
# The file is opened in a context manager so the handle is closed after reading.
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
with open(PATH + 'datasets/df_final_bal.pkl', 'rb') as pkl_file:
    df = pickle.load(pkl_file).rename(columns={'reclass_label': 'label'})
df.head(2)

Unnamed: 0,photo_id,label
1416,sKR4OiR14nD4NqfZFAsaHA,menu
1509,V4tsc_w3NtKOin03WMVm1g,menu


In [4]:
# Number of rows per label in the loaded dataset.
df.label.value_counts()

menu       1375
food       1375
inside     1375
drink      1375
outside    1375
Name: label, dtype: int64

In [5]:
# Per-label row count scaled by 0.8 (the train fraction used further down).
round(df.label.value_counts() * 0.8, 0)

menu       1100.0
food       1100.0
inside     1100.0
drink      1100.0
outside    1100.0
Name: label, dtype: float64

In [6]:
def create_df(num_samples, dataframe, random_state=None):
    """Draw a stratified random sample of roughly `num_samples` rows.

    Each label contributes a number of rows proportional to its share of
    `dataframe`, rounded to the nearest integer, so the total can differ
    from `num_samples` by a few rows.

    Args:
        num_samples: Target number of rows (int or float).
        dataframe: DataFrame with a `label` column to sample from.
        random_state: Optional seed / RandomState forwarded to
            `DataFrame.sample` to make the draw reproducible.

    Returns:
        A DataFrame with the sampled rows, grouped label by label and keeping
        their original index. Empty if `dataframe` is empty.

    Raises:
        ValueError: If a label would need more rows than it has (raised by
            `DataFrame.sample`).
    """
    # Proportions come from the frame passed in, not from a notebook global.
    label_counts = dataframe['label'].value_counts()
    proportions = label_counts / len(dataframe)

    samples_per_label = {
        label: int(round(float(share) * num_samples))
        for label, share in proportions.items()
    }

    sampled_parts = [
        dataframe[dataframe['label'] == label].sample(n_rows, random_state=random_state)
        for label, n_rows in samples_per_label.items()
    ]

    # pd.concat rejects an empty list, so handle the no-label case explicitly.
    if not sampled_parts:
        return dataframe.head(0)

    return pd.concat(sampled_parts, axis=0)


In [7]:
# Seed NumPy's global RNG, which DataFrame.sample falls back to when no
# random_state is given, so the train/test split is the same on every run.
np.random.seed(42)

# 80% of the rows go to train; the value is a row count, so keep it an int.
train_num = int(round(df.shape[0] * 0.8))
df_train = create_df(train_num, df)

In [8]:
# Replace the index inherited from `df` with a fresh 0..n-1 range.
df_train = df_train.reset_index(drop=True)
df_train.tail()

Unnamed: 0,photo_id,label
5495,rZ_96jzHCaBIXHrsCVwQQg,outside
5496,ph0pihKLWOinxXfNR5J84w,outside
5497,G85boFMSAGOqxdX3OPjmJQ,outside
5498,SdHobz_v44IOuI44TTpu1w,outside
5499,5RoT372KtJ-Y2aTwz0jw7A,outside


In [9]:
# Left-join the full dataset against the train rows; the `_merge` indicator
# column records whether each photo_id was found in train ('both') or not
# ('left_only').
train_unique = df_train.drop_duplicates()
compared = pd.merge(df, train_unique, how='left', on='photo_id', indicator=True)
compared.head(2)

Unnamed: 0,photo_id,label_x,label_y,_merge
0,sKR4OiR14nD4NqfZFAsaHA,menu,menu,both
1,V4tsc_w3NtKOin03WMVm1g,menu,menu,both


In [10]:
# Test set = rows of the full dataset whose photo_id is absent from train.
not_in_train = compared['_merge'] == 'left_only'
df_test = (
    compared[not_in_train]
    .drop(columns=['_merge', 'label_y'])
    .rename(columns={'label_x': 'label'})
)
df_test.head(2)

Unnamed: 0,photo_id,label
8,lZhO_oA6JH6e5CSiBEUOjg,menu
10,ockFtmG4btt1bf9GgVRNSg,menu


In [11]:
# Report the number of rows in each split.
print(f'El tamaño de Train es de {df_train.shape[0]} y el de Test es de {df_test.shape[0]}')

El tamaño de Train es de 5500 y el de Test es de 1375


# Reescalar

In [12]:
def resize_pictures(dataframe, img_path, size_rescaled):
    """Load every photo, scale its shorter side to `size_rescaled` and center-crop it.

    The aspect ratio is preserved by the resize; the center crop then trims
    the longer side so each result is a `size_rescaled` x `size_rescaled`
    square.

    Args:
        dataframe: DataFrame with a `photo_id` column; each photo is read from
            `{img_path}/{photo_id}.jpg`.
        img_path: Directory containing the JPEG files.
        size_rescaled: Side length, in pixels, of the square output images.

    Returns:
        A copy of `dataframe` with an `image` column holding the cropped PIL
        images.
    """
    target = int(size_rescaled)
    center_crop = T.CenterCrop(size=target)

    df_temp = dataframe.copy()
    df_temp['image'] = ''
    # Look the column up by name instead of assuming it sits at position 2.
    image_col = df_temp.columns.get_loc('image')

    for im_pos in tqdm(range(len(df_temp)), desc='Photos cropped', miniters=len(df_temp)//100):

        with Image.open('{}/{}.jpg'.format(img_path, df_temp.iloc[im_pos].photo_id)) as image:

            width, height = image.size

            # Scale so the SHORTER side equals `target`, keeping the aspect ratio.
            if height < width:  # landscape: height is the shorter side
                new_height = target
                new_width = math.floor(target * width / height)
            else:  # portrait or square: width is the shorter side
                new_width = target
                new_height = math.floor(target * height / width)

            # torchvision expects the size as (height, width).
            resized = T.Resize((new_height, new_width))(image)
            df_temp.iloc[im_pos, image_col] = center_crop(resized)

    return df_temp

In [13]:
# Create new train DF
# Train split with an added `image` column of img_size x img_size crops.
df2_train = resize_pictures(df_train, photo_path, img_size)

# Test split, processed the same way.
df2_test = resize_pictures(df_test, photo_path, img_size)

Photos cropped:   0%|          | 0/5500 [00:00<?, ?it/s]

Photos cropped:   0%|          | 0/1375 [00:00<?, ?it/s]

In [14]:
# Array shape of the second training image (column 2 is `image` given the
# photo_id / label / image layout shown below).
np.array(df2_train.iloc[1, 2]).shape

(224, 224, 3)

In [15]:
# Preview both splits, now including the `image` column.
display(df2_train.head(2))
display(df2_test.head(2))

Unnamed: 0,photo_id,label,image
0,4ZFZhO1gtME2s_YD-uZWZg,menu,<PIL.Image.Image image mode=RGB size=224x224 a...
1,hr8M8sHhxRwLnZiq_2MIEg,menu,<PIL.Image.Image image mode=RGB size=224x224 a...


Unnamed: 0,photo_id,label,image
8,lZhO_oA6JH6e5CSiBEUOjg,menu,<PIL.Image.Image image mode=RGB size=224x224 a...
10,ockFtmG4btt1bf9GgVRNSg,menu,<PIL.Image.Image image mode=RGB size=224x224 a...


In [16]:
# Pixel values of the same image as a flat sequence, one row per pixel.
np.array(df2_train.iloc[1, 2].getdata())

array([[ 1, 45, 30],
       [ 1, 45, 30],
       [ 1, 45, 30],
       ...,
       [61, 57, 48],
       [73, 69, 60],
       [59, 55, 46]])

In [17]:
# Convert each PIL image to a (img_size, img_size, 3) array and stack them.
# np.asarray reads the pixel buffer directly as uint8, which is far faster and
# about 8x smaller than building an int64 array from img.getdata(); the pixel
# values are the same. The reshape is kept as a guard: it raises if an image
# is not img_size x img_size with 3 channels.
series_X_train = df2_train.image.map(lambda img: np.asarray(img).reshape(img_size, img_size, 3))
X_train = np.stack(series_X_train.to_numpy(copy=True))
X_train.shape

(5500, 224, 224, 3)

In [18]:
# Cast to float32 and scale pixel values by 1/255.
# NOTE: re-running this cell on its own divides by 255 a second time.
X_train = X_train.astype('float32') / 255

In [19]:
# Convert each PIL image to a (img_size, img_size, 3) array and stack them.
# np.asarray reads the pixel buffer directly as uint8, which is far faster and
# about 8x smaller than building an int64 array from img.getdata(); the pixel
# values are the same. The reshape is kept as a guard: it raises if an image
# is not img_size x img_size with 3 channels.
series_X_test = df2_test.image.map(lambda img: np.asarray(img).reshape(img_size, img_size, 3))
X_test = np.stack(series_X_test.to_numpy(copy=True))
X_test.shape

(1375, 224, 224, 3)

In [20]:
# Cast to float32 and scale pixel values by 1/255.
# NOTE: re-running this cell on its own divides by 255 a second time.
X_test = X_test.astype('float32') / 255

In [23]:
# `output_path` is not defined anywhere in this notebook, so the original line
# raised NameError on a fresh kernel. Write to the same location the final
# export cells use, and close the file handle once the dump completes.
with open(PATH + 'datasets/X_test.pkl', 'wb') as pkl_file:
    pickle.dump(X_test, pkl_file)

In [24]:
# Fix the category order explicitly so the one-hot column order does not
# depend on which labels appear in the data or in what order.
categories = [['food', 'inside', 'outside', 'drink', 'menu']]
# drop=None keeps one output column per category.
enc = OneHotEncoder(categories=categories, drop=None)

In [25]:
# One-hot encode the train labels as a dense float32 array.
y_train = (
    enc.fit_transform(df_train[['label']])
    .astype('float32')
    .toarray()
)

In [26]:
# Encode the test labels with the encoder already fitted on train; the test
# split should never be used to (re)fit a transformer. With the explicit
# `categories` list the resulting columns are the same as before.
y_test = enc.transform(df_test[['label']]).astype('float32').toarray()

In [27]:
# Confirm the encoded labels are a dense NumPy array.
type(y_train)

numpy.ndarray

In [28]:
# Expected per-image dimensions (rows, columns, channels) used by the
# validation cell below.
img_rows, img_cols, img_ch = 224, 224, 3
# Expected number of one-hot label columns.
num_labels = 5

In [29]:
# test de los datos de entrada
# Validate the arrays before saving them.
# Types are checked first: the remaining checks read .shape / .dtype and
# would raise AttributeError on anything that is not an ndarray.
arrays_to_check = {'X_train': X_train, 'X_test': X_test, 'y_train': y_train, 'y_test': y_test}
failed_checks = [
    'type({}) != np.ndarray'.format(name)
    for name, array in arrays_to_check.items()
    if type(array) is not np.ndarray
]

if not failed_checks:
    expected_image_shape = (img_rows, img_cols, img_ch)
    checks = {
        'X_train.shape[1:] != (img_rows, img_cols, img_ch)': X_train.shape[1:] != expected_image_shape,
        'X_test.shape[1:] != (img_rows, img_cols, img_ch)': X_test.shape[1:] != expected_image_shape,
        'y_train.shape[1:] != (num_labels,)': y_train.shape[1:] != (num_labels,),
        'y_test.shape[1:] != (num_labels,)': y_test.shape[1:] != (num_labels,),
        "X_train.dtype != 'float32'": X_train.dtype != 'float32',
        "X_test.dtype != 'float32'": X_test.dtype != 'float32',
        "y_train.dtype != 'float32'": y_train.dtype != 'float32',
        "y_test.dtype != 'float32'": y_test.dtype != 'float32',
        'X_train.shape[0] != y_train.shape[0]': X_train.shape[0] != y_train.shape[0],
        'X_test.shape[0] != y_test.shape[0]': X_test.shape[0] != y_test.shape[0],
    }
    failed_checks = [description for description, is_wrong in checks.items() if is_wrong]

if failed_checks:
    # Same exception type and message as before, followed by the failing checks.
    raise Exception("El dataset no tiene las dimensiones esperadas! " + '; '.join(failed_checks))
else:
    print('El dataset tiene las dimensiones esperadas')

El dataset tiene las dimensiones esperadas


In [30]:
# Save the train images; the context manager flushes and closes the file.
with open(PATH + 'datasets/X_train.pkl', 'wb') as pkl_file:
    pickle.dump(X_train, pkl_file)

In [31]:
# Save the test images; the context manager flushes and closes the file.
with open(PATH + 'datasets/X_test.pkl', 'wb') as pkl_file:
    pickle.dump(X_test, pkl_file)

In [32]:
# Save the train labels; the context manager flushes and closes the file.
with open(PATH + 'datasets/y_train.pkl', 'wb') as pkl_file:
    pickle.dump(y_train, pkl_file)

In [33]:
# Save the test labels; the context manager flushes and closes the file.
with open(PATH + 'datasets/y_test.pkl', 'wb') as pkl_file:
    pickle.dump(y_test, pkl_file)