**Función:** crea listado de fotos para el sub dataset 10k (por limpiar). No incluye ficheros corruptos.

**Requiere:**
- ../datasets/photos (con todas las fotos del dataset)
- ../datasets/photos_train.json (fichero json con labels de todas las fotos)

**Crea:**
- ../datasets/corrupt_ids.json
- ../datasets/df_10k.json

In [None]:
import pandas as pd
import json
import os
from PIL import Image

In [None]:
# Execution-environment switch read by the next cell to choose the project root.
ejecutadoEnGoogleDrive = False # Set to True when running in Colab with Google Drive mounted

In [None]:
# Select the project root depending on where the notebook runs.
# PATH must end with a path separator: the cells below build every file path
# by plain string concatenation (PATH + 'datasets/...').
if ejecutadoEnGoogleDrive:
    # Imported lazily so the notebook still runs where google.colab is not installed.
    from google.colab import drive
    drive.mount('/content/drive')
    # NOTE(review): Colab usually exposes personal files under
    # /content/drive/MyDrive/ - confirm the location of this folder.
    PATH = '/content/drive/clasificacion_imagenes/'
else:
    PATH = '../'

Mounted at /content/drive


# Detección de fotos corruptas

In [None]:
# Try to open every file in the photos folder and record which IDs Pillow
# can and cannot open.
folder = PATH + 'datasets/photos'
# Sorted so the resulting ID lists (and the JSON saved from them) have a
# reproducible order; os.listdir returns entries in arbitrary order.
item_list = sorted(os.listdir(folder))
ok_ids = []
corrupt_ids = []
for i in item_list:
    # The photo ID is the file name without its extension.
    photo_id = os.path.splitext(i)[0]
    try:
        # Image.open identifies the file lazily (pixel data is not decoded);
        # the context manager closes the file handle deterministically.
        # NOTE(review): a truncated file with a valid header may still pass;
        # consider calling .verify() if a stricter check is wanted.
        with Image.open(folder + "/" + i):
            pass
    except Exception:
        # Exception instead of a bare except, so a keyboard/kernel interrupt
        # is not swallowed and misreported as a corrupt file.
        corrupt_ids.append(photo_id)
    else:
        ok_ids.append(photo_id)

In [None]:
# Persist the IDs of the photos that failed to open.
corrupt_ids_path = PATH + 'datasets/corrupt_ids.json'
with open(corrupt_ids_path, 'w') as out_file:
    out_file.write(json.dumps(corrupt_ids))

In [None]:
# Display the IDs of the files that could not be opened.
print(corrupt_ids)

['74upe0h6XxwgzqpdnAh_7Q', '-BIybLxzoFt2d2zbYRcfHA', '-NGY_19QK2zq913HdiYc5A', 'm3oIKhKKCQD54y1E-dBKSw', 'feUGw0P5byOq4U40C77tyQ', 'AkiGRjaMKHdJyV7bdHsQjw', 'n6Q9vNuxz7786ESEfautxQ', '9BvYOtforBBP6MvvDogtmw', 'juDNZOOnkgG3QINFrulsAg', 'pY32hIagdxrL4Nsi959EQg', 'E7Wpzn-1fCnVJ8_zKpecPQ', 'ytJ4lihJrvyzMMRG-WwDNw', '0fac-NlXqfBO2pWRkmM9aw', 'NKEFWvRriK-LvagPz2QRxw', 'jU-dKl2Ye4L_5x602yoctQ', '0TpeNZPs3Gu8s30KVXudcg', '1MOGQBWogR8oJr1WgERi9g', 'qxSXsYMA3aWuAfigeqeOOQ', 'K6pfRNwGodm1m1gFVQlj-Q', 'rrfwGSwt3eHxxypfu5PGTA', '9X4YPM8nYFjf7hY8xUdc6Q', 'RIeulJUzgemFugkkgg4qgA', 'MZj64XNUN6Og178-6XYR6g', '9RDbbAZB0HnL4hndCWB58w', 'PFD3ykdI1WVhvZ8IX4PmLQ', 'hclqCX1FWcV_TtJJoI3BpQ', 'IExxMfr1h0bxw54jsanyKA', 'lrfy4UVIWtj0xwboLgUreQ', 'UG2JuFFa_WxhPEtMOtq-JQ', 'OK6HsALzFcBAUlrroKHZGg', 'JGpfPj8VEvnq1B-Xqr3w-A', 'c73YwNh1JsYR5Hz-u_bOrg', 'IB2ZjqjtS1W_DadQoPPdgg', 'O0bVFyP58TOEix6IjERXQA', 'DMCTwC3UT2w5QzHOQoqBPw', 'ke4ohxa93GJz0KH9H2kwsQ', 'MduVueqYTBlEkX-axrh1ug', 'WGmGujPl5BmR_fCUZnoe9w', 'W94rrCn0O5

In [None]:
# Number of files in the photos folder that opened without error.
readable_count = len(item_list) - len(corrupt_ids)
print(readable_count)

199992


# Eliminación de corruptos

In [None]:
# Load the label table for the photos; lines=True tells pandas to parse the
# file as one JSON object per line.
json_train_path = PATH + 'datasets/photos_train.json'
df = pd.read_json(json_train_path, lines=True)
df.head()

In [None]:
# Column dtypes and non-null counts of the full label table.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180090 entries, 0 to 180089
Data columns (total 4 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   photo_id     180090 non-null  object
 1   business_id  180090 non-null  object
 2   caption      180090 non-null  object
 3   label        180090 non-null  object
dtypes: object(4)
memory usage: 5.5+ MB


In [None]:
# Keep only the rows whose photo_id is not in the corrupt list.
is_corrupt = df['photo_id'].isin(corrupt_ids)
df_clean = df[~is_corrupt]
df_clean.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 179997 entries, 0 to 180089
Data columns (total 4 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   photo_id     179997 non-null  object
 1   business_id  179997 non-null  object
 2   caption      179997 non-null  object
 3   label        179997 non-null  object
dtypes: object(4)
memory usage: 6.9+ MB


In [None]:
# Number of label rows dropped because their photo is in the corrupt list.
removed_rows = len(df) - len(df_clean)
print(removed_rows)

93


# Creación del sub dataset 10k (por limpiar)
(misma proporción que el dataset completo)

In [None]:
# Share of each label in the full (unfiltered) label table.
label_counts = df['label'].value_counts()
label_counts / len(df)

food       0.540491
inside     0.280016
outside    0.092798
drink      0.078311
menu       0.008385
Name: label, dtype: float64

In [None]:
def create_df(num_samples, dataframe, reference=None, random_state=None):
    """Draw a label-stratified random sample of ``dataframe``.

    Parameters
    ----------
    num_samples : int
        Total number of rows to draw.
    dataframe : pandas.DataFrame
        Frame to sample from; must contain a ``label`` column.
    reference : pandas.DataFrame, optional
        Frame whose ``label`` distribution determines how many rows are drawn
        per label. Defaults to ``dataframe``.
    random_state : int, optional
        Seed forwarded to ``DataFrame.sample`` for reproducible draws.

    Returns
    -------
    pandas.DataFrame
        The sampled rows with their original index, grouped by label in
        descending order of label frequency in ``reference``. An empty frame
        is returned when ``reference`` has no labels.

    Raises
    ------
    ValueError
        Propagated from ``DataFrame.sample`` when a label's quota is larger
        than the number of rows available for that label in ``dataframe``.
    """
    if reference is None:
        reference = dataframe
    # Labels are read from the index of value_counts(); the former
    # reset_index()['index'] lookup raises KeyError on pandas >= 2.0.
    counts = reference['label'].value_counts()
    if counts.empty:
        return pd.DataFrame()

    # Largest-remainder allocation: floor every quota, then hand the rows
    # still missing to the labels with the largest fractional part, so the
    # quotas always add up to num_samples (independent rounding may not).
    exact = counts / counts.sum() * num_samples
    quotas = exact.astype(int)
    shortfall = int(num_samples - quotas.sum())
    if shortfall > 0:
        top_up = (exact - quotas).sort_values(ascending=False).index[:shortfall]
        quotas.loc[top_up] += 1

    # Sample each label separately and concatenate once at the end.
    parts = [
        dataframe[dataframe['label'] == label].sample(int(quota), random_state=random_state)
        for label, quota in quotas.items()
    ]
    return pd.concat(parts, axis=0)

# Rows are sampled from df_clean (corrupt photos excluded), while the label
# proportions are taken from the full dataset df, as before.
# The fixed seed makes the 10k subset reproducible across runs.
df_reduced = create_df(10000, df_clean, reference=df, random_state=42)
df_reduced.label.value_counts()

food       5405
inside     2800
outside     928
drink       783
menu         84
Name: label, dtype: int64

In [None]:
# Check archivos únicos
df_reduced.photo_id.nunique()

10000

In [None]:
# Write the 10k sample to disk as a JSON array of records.
df_10k_path = PATH + 'datasets/df_10k.json'
# NOTE: to_json returns None when it is given a path, so df_clean_json is
# always None; the name is kept only for compatibility.
df_clean_json = df_reduced.to_json(df_10k_path, orient='records')

# mini EDA del 10k

In [None]:
# Example: expected number of photos per category in a 10k sample.
label_counts = df['label'].value_counts()
# Labels come from the index of value_counts(); the former
# reset_index()['index'] lookup raises KeyError on pandas >= 2.0.
labels = list(label_counts.index)
proportion = list(label_counts / df.shape[0])
fotos = 10000
# Original (misspelled) name kept; the list holds the rounded per-label
# counts, not running totals. Converted to plain floats so the printed dict
# looks the same regardless of the NumPy scalar repr.
cummulative = [float(round(share * fotos, 0)) for share in proportion]
print(dict(zip(labels, cummulative)))
print(sum(cummulative))

{'food': 5405.0, 'inside': 2800.0, 'outside': 928.0, 'drink': 783.0, 'menu': 84.0}
10000.0
