# Prepare image list

## Install dependencies, download annotations, and import libraries

In [1]:
!pip install boto3 tqdm -q

[K     |████████████████████████████████| 133kB 10.8MB/s 
[K     |████████████████████████████████| 71kB 5.5MB/s 
[K     |████████████████████████████████| 7.2MB 16.5MB/s 
[K     |████████████████████████████████| 143kB 55.7MB/s 
[31mERROR: requests 2.23.0 has requirement urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1, but you'll have urllib3 1.26.3 which is incompatible.[0m
[31mERROR: datascience 0.10.6 has requirement folium==0.2.1, but you'll have folium 0.8.3 which is incompatible.[0m
[?25h

In [2]:
# Bounding-box annotations per split: the train file comes from the v6 path,
# validation and test from the v5 path. -q silences wget's progress output.
!wget https://storage.googleapis.com/openimages/v6/oidv6-train-annotations-bbox.csv -q
!wget https://storage.googleapis.com/openimages/v5/validation-annotations-bbox.csv -q
!wget https://storage.googleapis.com/openimages/v5/test-annotations-bbox.csv -q
# Class descriptions; read below as (LabelName, ClassName) pairs.
!wget https://storage.googleapis.com/openimages/v5/class-descriptions-boxable.csv -q
# Downloader script from the openimages/dataset repository; invoked later
# with an image list file to fetch the actual images.
!wget https://raw.githubusercontent.com/openimages/dataset/master/downloader.py -q

In [3]:
import os
import io
import glob
import cv2
import shutil
import numpy as np
import pandas as pd

In [4]:
# Load the bounding-box annotation table of each split, in the order
# train -> validation -> test.
train, valid, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/oidv6-train-annotations-bbox.csv',
        '/content/validation-annotations-bbox.csv',
        '/content/test-annotations-bbox.csv',
    )
)

# The class description file is read with explicit column names.
class_desc = pd.read_csv(
    '/content/class-descriptions-boxable.csv',
    names=['LabelName', 'ClassName'],
)

## Find the LabelName of each class in the subset

In [6]:
# Classes to keep. Each entry is matched exactly against the ClassName column
# of the class description table.
# Class hierarchy browser:
# https://storage.googleapis.com/openimages/2018_04/bbox_labels_600_hierarchy_visualizer/circle.html
subset = [
    'Chair', 'Desk', 'Couch', 'Bookcase', 'Table',
    'Cupboard', 'Bench', 'Drawer', 'Stool', 'Shelf',
]

In [7]:
def find_label_name(class_desc_df, subset):
    '''
    Function to find the LabelName of every class listed in subset

    Arguments:
        class_desc_df (df)   : Class description dataframe with 'LabelName'
                               and 'ClassName' columns (not modified)
        subset (list)        : ClassName values to keep (exact match)
    Returns:
        labels (ndarray)     : LabelName of each matched class
        labels_map (dict)    : Dict to map LabelName into ClassName, with
                               spaces in ClassName replaced by underscores
    '''
    filter_class = class_desc_df.loc[class_desc_df['ClassName'].isin(subset)].copy()
    # Assign the result back instead of calling replace(inplace=True) on the
    # selected column: an in-place update of a column selection is not
    # guaranteed to reach the parent frame under pandas Copy-on-Write.
    filter_class['ClassName'] = filter_class['ClassName'].replace(' ', '_', regex=True)
    labels = filter_class['LabelName'].values
    # Build the mapping explicitly from the two columns it is made of.
    labels_map = dict(zip(filter_class['LabelName'], filter_class['ClassName']))
    return labels, labels_map

In [8]:
# Resolve the chosen class names to their LabelName values.
labels, labels_map = find_label_name(class_desc_df=class_desc, subset=subset)

## Filter dataset

In [9]:
def filter_dataset(df, labels, labels_map, sample=None):
    '''
    Function to filter annotation rows by LabelName
    Arguments:
        df (df)         : Image dataframe (train, validation, or test) with
                          'ImageID' and 'LabelName' columns (not modified)
        labels (list)   : labels to filter LabelName
        labels_map (dict):Dictionary mapping LabelName to ClassName
        sample (int)    : Optional. For each label, pick the first `sample`
                          distinct images (in row order). Every kept row of
                          a picked image is retained, so an image picked for
                          one label also contributes its boxes of the other
                          labels. None keeps all images.
    Return:
        filter_df (df)  : Filtered dataframe by labels, with a 'ClassName'
                          column added and a fresh 0..n-1 index
    '''
    filter_df = df.loc[df['LabelName'].isin(labels)].reset_index(drop=True)
    filter_df['ClassName'] = filter_df['LabelName'].map(labels_map)
    if sample is not None:
        # One row per (label, image) pair, then the first `sample` pairs of
        # each label. This avoids groupby.apply with a Python lambda.
        picked_img = (
            filter_df.drop_duplicates(subset=['LabelName', 'ImageID'])
            .groupby('LabelName')
            .head(sample)['ImageID']
        )
        filter_df = filter_df.loc[filter_df['ImageID'].isin(picked_img)].reset_index(drop=True)
    print('There is {} label and {} unique images.'.format(len(filter_df), filter_df.ImageID.nunique()))
    return filter_df

**Download at most 1000 images per class**

In [19]:
# A single image can contain a chair, a table, and a drawer, in which case
# it is counted under all three classes. That is why some classes show more
# than 1000 unique images even though the per-class limit is 1000.
filter_train = filter_dataset(train, labels, labels_map, sample=1000)

# Per class: number of distinct images, then number of annotation rows.
display(
    filter_train.groupby('ClassName')['ImageID'].nunique(),
    filter_train.groupby('ClassName')['ImageID'].count(),
)

There is 28585 label and 7967 unique images.


ClassName
Bench       1000
Bookcase    1041
Chair       1665
Couch       1012
Cupboard     872
Desk        1234
Drawer      1132
Shelf       1466
Stool        576
Table       2227
Name: ImageID, dtype: int64

ClassName
Bench       1987
Bookcase    1611
Chair       7619
Couch       1263
Cupboard    1353
Desk        1518
Drawer      3089
Shelf       4647
Stool       1254
Table       4244
Name: ImageID, dtype: int64

In [20]:
# The validation split is kept in full (no per-class limit).
filter_valid = filter_dataset(valid, labels, labels_map)

# Per class: number of distinct images, then number of annotation rows.
display(
    filter_valid.groupby('ClassName')['ImageID'].nunique(),
    filter_valid.groupby('ClassName')['ImageID'].count(),
)

There is 3130 label and 1073 unique images.


ClassName
Bench         6
Bookcase     78
Chair       311
Couch        49
Cupboard    103
Desk        133
Drawer       99
Shelf       194
Stool        18
Table       593
Name: ImageID, dtype: int64

ClassName
Bench         9
Bookcase    118
Chair       827
Couch        61
Cupboard    174
Desk        214
Drawer      255
Shelf       491
Stool        32
Table       949
Name: ImageID, dtype: int64

## Create the image lists

In [21]:
def create_images_list(temp_df, split, dir):
    '''
    Function to write the '<split>/<ImageID>' list file for a dataframe
    Arguments:
        temp_df (df)    : Image dataframe (train, validation, or test) with
                          an 'ImageID' column (not modified)
        split (string)  : 'train', 'validation', or 'test'
        dir (string)    : directory to save txt file
    Return:
        image_list (ndarray) : '<split>/<ImageID>' entries, one per unique
                               image, in order of first appearance
    Side effects:
        Writes '<split>_list.txt' (one entry per line) into dir.
    '''
    # Only the ImageID column is read, so no copy of the frame is needed.
    img_id = temp_df['ImageID'].drop_duplicates().to_numpy(dtype=object)
    img_list = str(split) + '/' + img_id
    print('There is {} unique images.'.format(len(img_id)))
    print('Write IMAGE_LIST_FILE to...')
    print(dir)
    name = str(split) + '_list.txt'
    # os.path.join copes with a trailing separator in dir and, unlike
    # os.path.sep.join, does not turn an empty dir into the filesystem root.
    np.savetxt(os.path.join(dir, name), img_list, fmt='%s')
    return img_list

In [22]:
# Write /content/train_list.txt and /content/validation_list.txt; these list
# files are passed to downloader.py further down.
train_list = create_images_list(filter_train, split='train', dir='/content/')
valid_list = create_images_list(filter_valid, split='validation', dir='/content/')

There is 7967 unique images.
Write IMAGE_LIST_FILE to...
/content/
There is 1073 unique images.
Write IMAGE_LIST_FILE to...
/content/


# DOWNLOAD IMAGES

In [23]:
# Download the training images listed in /content/train_list.txt into
# /content/OI/train/ (downloader.py is run with --num_processes=5).
!python downloader.py /content/train_list.txt --download_folder=/content/OI/train/ --num_processes=5

Downloading images: 100% 7967/7967 [07:18<00:00, 18.17it/s]


In [25]:
# Download the validation images listed in /content/validation_list.txt into
# /content/OI/validation/ (downloader.py is run with --num_processes=5).
!python downloader.py /content/validation_list.txt --download_folder=/content/OI/validation/ --num_processes=5

Downloading images: 100% 1073/1073 [00:57<00:00, 18.58it/s]


In [26]:
# Report the total disk usage of the download folder.
!du -sh /content/OI/

2.6G	/content/OI/


# Create CSV annotations

## Get Image Size

In [27]:
def get_image_size(temp_df, dir):
    '''
    Function to add the pixel 'Height' and 'Width' of every image

    Each unique image is read once with cv2.imread, which decodes the whole
    file, so the runtime grows with the number of unique images.

    Arguments:
        temp_df (df)    : Image dataframe with an 'ImageID' column
                          (not modified)
        dir (string)    : directory holding the '<ImageID>.jpg' files
    Return:
        df (df)         : Copy of temp_df with 'Height' and 'Width' columns
    Raises:
        IOError         : if one or more images are missing or unreadable
    '''
    df = temp_df.copy()
    heights = {}
    widths = {}
    unreadable = []

    for image_id in df['ImageID'].drop_duplicates().values:
        # os.path.join works with or without a trailing separator in dir.
        img = cv2.imread(os.path.join(dir, image_id + '.jpg'))
        if img is None:
            # cv2.imread signals a missing or undecodable file by returning
            # None instead of raising; collect these and report them together.
            unreadable.append(image_id)
            continue
        heights[image_id], widths[image_id] = img.shape[:2]

    if unreadable:
        raise IOError(
            'Could not read {} image(s) from {}; first few ImageIDs: {}'.format(
                len(unreadable), dir, unreadable[:5]))

    df['Height'] = df['ImageID'].map(heights)
    df['Width'] = df['ImageID'].map(widths)

    return df

In [28]:
%%time
# Add Height/Width columns to both splits by reading the downloaded images.
# (%%time has to stay on the first line of the cell.)
filter_train = get_image_size(filter_train, '/content/OI/train/')
filter_valid = get_image_size(filter_valid, '/content/OI/validation/')

CPU times: user 2min 10s, sys: 1.84 s, total: 2min 12s
Wall time: 2min 13s


In [29]:
# Save the annotation tables into the download folder.
# index=False leaves the row index out of the file. index_label=False only
# drops the index name from the header while still writing the index values,
# which makes the header one field shorter than every data row.
filter_train.to_csv('/content/OI/train.csv', index=False)
filter_valid.to_csv('/content/OI/validation.csv', index=False)

# Zip and Move

In [None]:
!cd /content/OI/; zip -r /content/oi-furniture-1k.zip .

In [37]:
%%time
# Copy the archive to Google Drive under a different file name
# (oi-mini-furniture-1k.zip).
# NOTE(review): assumes Drive is already mounted at /content/drive; no mount
# cell is visible in this notebook.
!cp -r /content/oi-furniture-1k.zip /content/drive/MyDrive/DATASET/oi-mini-furniture-1k.zip

CPU times: user 101 ms, sys: 49.5 ms, total: 150 ms
Wall time: 51.8 s


# Create pbtxt

In [None]:
%%writefile /content/OI/label_map.pbtxt
item {
    id: 1
    name: 'Chair'
}

item {
    id: 2
    name: 'Desk'
}

item {
    id: 3
    name: 'Couch'
}

item {
    id: 4
    name: 'Bookcase'
}

item {
    id: 5
    name: 'Table'
}

item {
    id: 6
    name: 'Cupboard'
}

item {
    id: 7
    name: 'Bench'
}

item {
    id: 8
    name: 'Drawer'
}

item {
    id: 9
    name: 'Stool'
}

item {
    id: 10
    name: 'Shelf'
}