# Exploration: custom image training sets from OpenImages

## Before you get started
```
$ pip install quilt
```

This notebook documents the **exploratory process** by which the `get_trainable` function in [the main notebook](./Custom training sets from OpenImages.ipynb) was developed. If you just want the working code, skip this notebook.

In [1]:
import pandas as pd
import quilt

## Acquire and import OpenImages meta-data
Quilt package: [examples/openimages](https://quiltdata.com/package/examples/openimages)

In [2]:
# get and import data
# The hash argument pins the package to one specific revision so that reruns
# see the same data; force=True presumably skips the overwrite prompt — TODO confirm.
# NOTE: this downloads roughly 2.7 GB on first run (see the output below).
quilt.install('examples/openimages', force=True, hash='144ec58d177c7d')
# This import has to come after the install: it resolves to the package that
# was just written to the local quilt store.
from quilt.data.examples import openimages as oi    

Downloading package metadata...


100%|██████████| 2.74G/2.74G [00:00<00:00, 452GB/s]

Downloading 49 fragments (2736683377 bytes before compression)...





## What's in this quilt data package?
See also [examples/openimages](https://quiltdata.com/package/examples/openimages)

In [3]:
# Displaying the package node lists its top-level groups and data nodes.
oi

<PackageNode '/root/.local/share/QuiltCli/quilt_packages/pkgs/Quilt/examples/openimages'>
github/
test/
train/
validation/
README
build_yml
class_descriptions
classes
classes_bbox
classes_bbox_trainable
classes_trainable

## Merge classes with descriptions

In [4]:
# Attach the human-readable description to every trainable class ID.
# Both frames are joined on their shared column '0'. The left join keeps every
# trainable class, and validate='1:1' makes pandas raise if either side
# repeats a key.
trainable = pd.merge(
    left=oi.classes_trainable(),
    right=oi.class_descriptions(),
    how='left',
    on='0',
    validate='1:1',
)

In [5]:
# Preview the merge result: column '0' carries the class ID and column '1'
# the description (see the output below).
trainable.head()

Unnamed: 0,0,1
0,/m/010dmf,Isle of man tt
1,/m/010jjr,Amusement park
2,/m/010l12,Roller coaster
3,/m/01_12b,Granny smith
4,/m/0117z,Air show


In [6]:
# List the data nodes available under the train split.
oi.train

<GroupNode>
annotations_human
annotations_human_bbox
annotations_machine
images

In [7]:
# Calling the node materializes the human annotations for the train split;
# the preview below shows one row per (ImageID, LabelName) judgement.
annotated = oi.train.annotations_human()
annotated.head()

Unnamed: 0,ImageID,Source,LabelName,Confidence
0,000002b66c9c498e,crowdsource-verification,/m/01kcnl,1
1,000002b66c9c498e,verification,/m/014l8n,0
2,000002b66c9c498e,verification,/m/015p6,0
3,000002b66c9c498e,verification,/m/015zfk,0
4,000002b66c9c498e,verification,/m/0167gd,0


## Confidence has only two values (for human-labeled data)

In [8]:
# Check which distinct Confidence values occur before filtering on them.
annotated['Confidence'].unique()

array([1, 0])

In [9]:
# Keep only the annotations whose Confidence equals 1.
confident = annotated.loc[annotated['Confidence'].eq(1)]

In [10]:
# Join the trainable classes to their confident annotations.
# The inner join (the pandas default) drops classes that have no confident
# label, and validate='1:m' asserts that each class ID is unique on the left.
trainable_confident = pd.merge(
    left=trainable,
    right=confident,
    how='inner',
    left_on='0',
    right_on='LabelName',
    validate='1:m',
)

In [11]:
# Peek at the last rows; the final index value also hints at the total row count.
trainable_confident.tail()

Unnamed: 0,0,1,ImageID,Source,LabelName,Confidence
8865381,/m/0zvk5,Helmet,ffe78faeafcb264c,verification,/m/0zvk5,1
8865382,/m/0zvk5,Helmet,ffe88ba93fcfd67d,verification,/m/0zvk5,1
8865383,/m/0zvk5,Helmet,ffec9006ceeefa02,verification,/m/0zvk5,1
8865384,/m/0zvk5,Helmet,ffef3e3e3ecec097,verification,/m/0zvk5,1
8865385,/m/0zvk5,Helmet,fff7fc0b862a971b,verification,/m/0zvk5,1


In [12]:
# Count the non-null entries of every column for each label, giving one row
# per LabelName.
image_counts = trainable_confident.groupby('LabelName').count()

In [13]:
# In the rows shown, every column holds the same count for a given label, so
# any single column (ImageID is used later) can stand in for "images per label".
image_counts.head()

Unnamed: 0_level_0,0,1,ImageID,Source,Confidence
LabelName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
/m/010dmf,211,211,211,211,211
/m/010jjr,491,491,491,491,491
/m/010l12,335,335,335,335,335
/m/0117z,384,384,384,384,384
/m/0118ms9c,480,480,480,480,480


In [14]:
# Keep only the labels backed by at least 1000 confident annotations
# (hence the "_1k" suffix), then preview the survivors.
trainable_confident_1k = image_counts.loc[image_counts['ImageID'].ge(1000)]
trainable_confident_1k.head()

Unnamed: 0_level_0,0,1,ImageID,Source,Confidence
LabelName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
/m/011k07,1418,1418,1418,1418,1418
/m/01226z,4590,4590,4590,4590,4590
/m/012v4j,1765,1765,1765,1765,1765
/m/0130jx,4504,4504,4504,4504,4504
/m/0138tl,22602,22602,22602,22602,22602


In [15]:
# Pull back the individual annotation rows for the labels that passed the
# 1k filter. The left frame is keyed by its index (LabelName); the right frame
# is keyed by its LabelName column.
# Column names shared by both frames get pandas' default suffixes: the "_x"
# columns are the per-label counts, the "_y" columns are the original values.
candidates = pd.merge(
    left=trainable_confident_1k,
    right=trainable_confident,
    how='inner',
    left_index=True,
    right_on='LabelName',
)
candidates.head()

Unnamed: 0,0_x,1_x,ImageID_x,Source_x,Confidence_x,0_y,1_y,ImageID_y,Source_y,LabelName,Confidence_y
4416,1418,1418,1418,1418,1418,/m/011k07,Tortoise,000404f4b9ace468,verification,/m/011k07,1
4417,1418,1418,1418,1418,1418,/m/011k07,Tortoise,0008de4e82e99aa0,verification,/m/011k07,1
4418,1418,1418,1418,1418,1418,/m/011k07,Tortoise,000aee0af66d4237,verification,/m/011k07,1
4419,1418,1418,1418,1418,1418,/m/011k07,Tortoise,001ef0f9885fa3d4,verification,/m/011k07,1
4420,1418,1418,1418,1418,1418,/m/011k07,Tortoise,00a5bf99b538fa23,verification,/m/011k07,1


In [16]:
# '1_y' is the class description column after the merge; list the distinct
# classes that have enough images.
candidates['1_y'].unique()

array(['Tortoise', 'General football', 'Hiking', 'Sink', 'Toy', 'Statue',
       'Tractor', 'Apple', 'Eye', 'Cosmetics', 'Paddle', 'Snowman',
       'Beer', 'Beard', 'Bridge', 'Bird', 'Traffic light', 'Doll',
       'Skull', 'Glove', 'Sunglasses', 'Baseball', 'Cart', 'Basketball',
       'Ball', 'Bike', 'Stadium', 'Home appliance', 'Boat', 'Smile',
       'Surfboard', 'Fast food', 'Sunset', 'Boot', 'Headphones',
       'Bowling', 'Shorts', 'Bus', 'Boy', 'Bicycle wheel', 'Sky',
       'Laptop', 'Dress', 'Portrait', 'Bear', 'Tower', 'Person',
       'Swimwear', 'Brassiere', 'Bee', 'Bathroom', 'Balloon', 'Tent',
       'Concert', 'Licence plate', 'Birthday', 'Billboard', 'Necklace',
       'Carnivore', 'Stairs', 'Computer keyboard', 'Traffic sign',
       'Chair', 'Shirt', 'Poster', 'Fire hydrant', 'Land vehicle', 'Tie',
       'Watercraft', 'Cabinetry', 'Muffin', 'Christmas', 'Snack', 'Clock',
       'Cattle', 'Cello', 'Coat', 'Suit', 'Jungle', 'Desk', 'Cat',
       'Bronze sculpture', '

In [17]:
# Select the annotation rows for a single class.
# NOTE(review): the variable is called "waterfall_ids" but the filter picks the
# 'Goose' class. The name looks like a leftover from an earlier experiment and
# should be renamed together with its later uses.
waterfall_ids = candidates.loc[candidates['1_y'].eq('Goose')]
waterfall_ids.head()

Unnamed: 0,0_x,1_x,ImageID_x,Source_x,Confidence_x,0_y,1_y,ImageID_y,Source_y,LabelName,Confidence_y
7596754,3891,3891,3891,3891,3891,/m/0dbvp,Goose,0001ff6b8fc43d43,verification,/m/0dbvp,1
7596755,3891,3891,3891,3891,3891,/m/0dbvp,Goose,00046a9adb6776ad,verification,/m/0dbvp,1
7596756,3891,3891,3891,3891,3891,/m/0dbvp,Goose,000a93fcac30e4c0,verification,/m/0dbvp,1
7596757,3891,3891,3891,3891,3891,/m/0dbvp,Goose,0017095493836d41,verification,/m/0dbvp,1
7596758,3891,3891,3891,3891,3891,/m/0dbvp,Goose,00178f2f5ed06d6a,verification,/m/0dbvp,1


In [18]:
# Number of annotation rows for the selected class.
len(waterfall_ids)

3891

In [19]:
# Sanity check: if the number of distinct image IDs equals the row count,
# no image appears twice for this class.
waterfall_ids['ImageID_y'].nunique()

3891

In [20]:
# Look up the image metadata (including OriginalURL) for the selected
# annotations by matching ImageID_y against the images table's ImageID.
waterfall_images = pd.merge(
    left=waterfall_ids,
    right=oi.train.images(),
    how='inner',
    left_on='ImageID_y',
    right_on='ImageID',
)

In [21]:
# Materialize the source URLs as a plain Python list.
list(waterfall_images['OriginalURL'])

['https://farm4.staticflickr.com/7617/16609368120_457d671a2d_o.jpg',
 'https://c6.staticflickr.com/5/4149/4961015376_d07049b7f6_o.jpg',
 'https://farm1.staticflickr.com/5302/5652630025_c86de5ab13_o.jpg',
 'https://farm2.staticflickr.com/3833/10822950025_5860a3e172_o.jpg',
 'https://c3.staticflickr.com/1/464/19330162782_2e174df5aa_o.jpg',
 'https://farm7.staticflickr.com/8197/8226619709_31a757df4c_o.jpg',
 'https://farm5.staticflickr.com/2128/2454051557_1e33138b69_o.jpg',
 'https://c4.staticflickr.com/6/5236/7179937246_b700c86ba4_o.jpg',
 'https://c1.staticflickr.com/7/6118/6226389198_00f628e041_o.jpg',
 'https://farm1.staticflickr.com/121/310001520_2744f1799d_o.jpg',
 'https://c8.staticflickr.com/8/7670/17334910441_a58b72eb89_o.jpg',
 'https://farm3.staticflickr.com/190/528409597_dc408f31b1_o.jpg',
 'https://c2.staticflickr.com/3/2224/2367459204_fd1954c1ca_o.jpg',
 'https://c2.staticflickr.com/9/8039/7925816248_b40d0a9f8e_o.jpg',
 'https://farm8.staticflickr.com/3608/3488801765_fe064bf

In [22]:
# Show the train split's nodes again, for comparison with the test and
# validation splits displayed next.
oi.train

<GroupNode>
annotations_human
annotations_human_bbox
annotations_machine
images

In [23]:
# List the data nodes available under the test split.
oi.test

<GroupNode>
annotations_human
annotations_human_bbox
annotations_machine
images

In [24]:
# List the data nodes available under the validation split.
oi.validation

<GroupNode>
annotations_human
annotations_human_bbox
annotations_machine
images