**Función:** crear un sub-dataset 10k balanceado y limpio combinando 8 ficheros de fotos reclasificadas (5 del conjunto 10k y 3 con fotos adicionales para las etiquetas menos representadas)

**Requiere:**
- ../datasets/food_rev.json
- ../datasets/inside_rev.json
- ../datasets/outside_rev.json
- ../datasets/drink_rev.json
- ../datasets/menu_rev.json
- ../datasets/final_rev_menu.json
- ../datasets/final_rev_outside.json
- ../datasets/final_rev_drink.json

**Crea:**
- ../datasets/df_final_bal.pkl

In [1]:
import pandas as pd
import pickle

In [None]:
ejecutadoEnGoogleDrive = False # True if the notebook is running on Google Drive (Colab)

In [None]:
# Base directory of the project. Later cells build file locations as
# PATH + 'datasets/...', so PATH must always end with a slash.
if ejecutadoEnGoogleDrive:
    # Colab only: mount Google Drive so the project folder becomes reachable.
    from google.colab import drive
    drive.mount('/content/drive')
    # NOTE(review): Drive content is normally exposed under
    # /content/drive/MyDrive or /content/drive/Shareddrives — confirm that
    # this folder location is correct.
    PATH = '/content/drive/clasificacion_imagenes/'
else:
    # Local run: resolve everything relative to the parent of the working directory.
    PATH = '../'

In [3]:
# Reviewed-label files, one per category (suffix _1).
food_1, inside_1, outside_1 = 'food_rev.json', 'inside_rev.json', 'outside_rev.json'
drink_1, menu_1 = 'drink_rev.json', 'menu_rev.json'

# Additional photos that were not in 10k (suffix _2), used to increase the
# less-represented labels.
menu_2, outside_2, drink_2 = (
    'final_rev_menu.json',
    'final_rev_outside.json',
    'final_rev_drink.json',
)

In [4]:
# Read the five reviewed-label files from the datasets folder, in the same
# order as the target names on the left-hand side.
df_food_1, df_inside_1, df_outside_1, df_drink_1, df_menu_1 = (
    pd.read_json(PATH + 'datasets/' + nombre_fichero)
    for nombre_fichero in (food_1, inside_1, outside_1, drink_1, menu_1)
)
# Quick look at the first rows of the menu frame.
df_menu_1.head(2)

Unnamed: 0,photo_id,original_label,reclass_label,status
0,H1T4jXaxP4Hpg_zZ7ckWvA,menu,menu,OK
1,ockFtmG4btt1bf9GgVRNSg,menu,menu,OK


In [5]:
# Names given to the columns produced by reading the index-oriented files:
# the JSON keys (moved out of the index) and the three positional fields.
_REV_COLUMNS = {'index': 'photo_id',
                0: 'original_label',
                1: 'reclass_label',
                2: 'status'}


def _load_final_rev(filename):
    """Load one of the additional review files as a tidy DataFrame.

    The file is read with ``orient='index'``, so its top-level keys become
    the index; the index is then moved into a regular column and all columns
    are renamed to match the frames loaded from the ``*_rev.json`` files.

    Parameters
    ----------
    filename : str
        File name inside ``PATH + 'datasets/'``.

    Returns
    -------
    pandas.DataFrame
        Columns ``photo_id``, ``original_label``, ``reclass_label``, ``status``.
        Assumes each JSON value holds exactly three positional fields —
        TODO confirm against the files.
    """
    return (pd.read_json(PATH + 'datasets/' + filename, orient='index')
              .reset_index(drop=False)
              .rename(columns=_REV_COLUMNS))


df_outside_2 = _load_final_rev(outside_2)
df_drink_2 = _load_final_rev(drink_2)
df_menu_2 = _load_final_rev(menu_2)
df_menu_2.head(2)

Unnamed: 0,photo_id,original_label,reclass_label,status
0,6LJa8QZOGBjt9maE5miLhw,menu,menu,OK
1,DtsfhqhqUB-tjs5XbwJfXw,menu,menu,OK


In [6]:
# Column names of the first menu frame, shown for comparison with df_menu_2.
df_menu_1.columns

Index(['photo_id', 'original_label', 'reclass_label', 'status'], dtype='object')

In [7]:
# Column names of the additional menu frame; they should match df_menu_1 before concatenating.
df_menu_2.columns

Index(['photo_id', 'original_label', 'reclass_label', 'status'], dtype='object')

In [8]:
# Stack the five original review frames followed by the three additional ones.
# Row labels are kept as they are, so index values repeat across source frames.
frames_revisadas = [
    df_food_1, df_inside_1, df_outside_1, df_drink_1, df_menu_1,
    df_outside_2, df_drink_2, df_menu_2,
]
df_revisado = pd.concat(frames_revisadas, axis=0)
df_revisado.head(2)

Unnamed: 0,photo_id,original_label,reclass_label,status
0,fB-J1F60DgMRoc8eI0z0Ew,food,food,OK
1,n2hdfzYgxL0twgfSDbro1w,food,food,OK


In [9]:
# Disabled export of the combined frame to JSON (taken before any filtering).
# NOTE(review): if re-enabled, drop the assignment — DataFrame.to_json returns
# None when a path is given, which would overwrite df_revisado.
# df_revisado = df_revisado.to_json('/content/drive/Shareddrives/The Valley - TFM/Ejercicio clasificación imagenes/Ejercicio 2 - Balanced/df_clean_withKO.json', orient='records')

In [10]:
# Rows per reviewed label in the combined frame, before any filtering.
df_revisado.reclass_label.value_counts()

food       4807
inside     2170
KO         1649
drink      1580
outside    1485
menu       1375
Name: reclass_label, dtype: int64

In [11]:
# Keep every row whose reviewed label is not 'KO'.
no_es_ko = df_revisado['reclass_label'].ne('KO')
df_limpias = df_revisado.loc[no_es_ko]

In [12]:
# Last rows of the filtered frame.
df_limpias.tail()

Unnamed: 0,photo_id,original_label,reclass_label,status
1392,OFHzlQEuitFAE_ICfdsQhQ,menu,menu,OK
1393,x4acVHke3X7o6hrVaulmQA,menu,menu,OK
1394,8ZldnphijLWEvWWX6Y57Lg,menu,menu,OK
1395,wq2E-d6NuiCP5_DCC_6e6g,menu,menu,OK
1396,ZtuEsa-qc7gfy7m5UnfaZw,menu,menu,OK


In [13]:
# Rows per reviewed label after removing the 'KO' rows.
df_limpias.reclass_label.value_counts()

food       4807
inside     2170
drink      1580
outside    1485
menu       1375
Name: reclass_label, dtype: int64

In [14]:
# Narrow the frame to the photo id and the reviewed label.
columnas_finales = ['photo_id', 'reclass_label']
df_limpias = df_limpias.loc[:, columnas_finales]

In [15]:
# Display the two-column frame.
df_limpias

Unnamed: 0,photo_id,reclass_label
0,fB-J1F60DgMRoc8eI0z0Ew,food
1,n2hdfzYgxL0twgfSDbro1w,food
2,Z77aLYI_SvAl7PaJ62umJQ,food
3,tBdw-FR8aNZcJLa5bX-SdA,food
4,z-ftfswcyAAERjoMzKFJ5A,food
...,...,...
1392,OFHzlQEuitFAE_ICfdsQhQ,menu
1393,x4acVHke3X7o6hrVaulmQA,menu
1394,8ZldnphijLWEvWWX6Y57Lg,menu
1395,wq2E-d6NuiCP5_DCC_6e6g,menu


In [16]:
# Labels to be sampled in the balancing step; 'menu' is not listed because it is taken whole.
categorias = ['food', 'inside', 'drink', 'outside']

In [17]:
# Fixed seed so the sampled subset (and the pickle built from it) is the same
# on every Restart & Run All.
RANDOM_STATE = 42

# 'menu' sets the per-class size: all of its rows are kept.
filas_menu = df_limpias.loc[df_limpias.reclass_label == 'menu']
num_photos = len(filas_menu)

# Draw num_photos rows without replacement from each remaining label.
# sample() raises ValueError if a label has fewer than num_photos rows.
muestras = [
    df_limpias.loc[df_limpias.reclass_label == cat].sample(n=num_photos,
                                                           replace=False,
                                                           random_state=RANDOM_STATE)
    for cat in categorias
]

# Single concat (menu first, then the labels in `categorias` order) instead of
# growing the frame inside a loop.
balanced = pd.concat([filas_menu] + muestras)
balanced.reclass_label.value_counts()

menu       1375
food       1375
inside     1375
drink      1375
outside    1375
Name: reclass_label, dtype: int64

In [19]:
# Persist the balanced frame. The context manager closes the file handle even
# if pickling fails, so the output is flushed and no descriptor is leaked.
with open(PATH + 'datasets/df_final_bal.pkl', 'wb') as fichero_pkl:
    pickle.dump(balanced, fichero_pkl)