### Import libraries

In [27]:
import pandas as pd
import json
import shutil
import os
import json

### Class hierarchy

In [28]:
from IPython.display import IFrame

# Embed the interactive Open Images class-hierarchy visualizer in the notebook.
# The IFrame must stay the last expression of the cell so it is rendered.
HIERARCHY_VISUALIZER_URL = "https://storage.googleapis.com/openimages/2018_04/bbox_labels_600_hierarchy_visualizer/circle.html"
IFrame(src=HIERARCHY_VISUALIZER_URL, width=1500, height=1500)

### Categories

In [29]:
# Display names of the top-level categories to collect images for.
# They are matched against the "DisplayName" column of labels.csv further down.
categories = [
    "Food",
    "Vehicle",
    "Clothing",
    "Sports equipment",
    "Person",
    "Tool",
    "Kitchenware",
    "Furniture",
]

### Retrieve label names

In [30]:
# Table mapping each machine label id ("LabelName") to its human-readable "DisplayName".
LABELS_CSV = './labels.csv'
labels = pd.read_csv(LABELS_CSV)

labels

Unnamed: 0,LabelName,DisplayName
0,/m/0c4936,'Nduja
1,/m/06w6y06,10 cane
2,/m/079zcf,100 metres hurdles
3,/m/02pv3hz,100plus
4,/m/0bb154,110 metres hurdles
...,...,...
20926,/m/05_5x82,Škoda rapid
20927,/m/0d96wb,Škoda roomster
20928,/m/03vql3,Škoda superb
20929,/m/05_5ppm,Škoda yeti


### Get category labels

In [31]:
# Map every category display name to its label id.
# `.item()` raises ValueError unless exactly one row of `labels` has that DisplayName.
category_labels = {
    display_name: labels.loc[labels["DisplayName"] == display_name, "LabelName"].item()
    for display_name in categories
}

category_labels

{'Food': '/m/02wbm',
 'Vehicle': '/m/07yv9',
 'Clothing': '/m/09j2d',
 'Sports equipment': '/m/05y5lj',
 'Person': '/m/01g317',
 'Tool': '/m/07k1x',
 'Kitchenware': '/m/03_wxk',
 'Furniture': '/m/0c_jw'}

### Get Subcategory labels

In [32]:
def extractSubCategories(subcategory, subcategoryLabels):
    """Collect the label of every node in a hierarchy subtree.

    Walks `subcategory` (a list of dicts, each with a "LabelName" key and an
    optional nested "Subcategory" list) depth-first, parent before children,
    and appends each "LabelName" to `subcategoryLabels` in place.

    Returns None; the result is the mutated `subcategoryLabels` list.
    """
    exhausted = object()
    # Explicit stack of iterators, one per nesting level currently being walked.
    levels = [iter(subcategory)]
    while levels:
        node = next(levels[-1], exhausted)
        if node is exhausted:
            # This level is finished; resume the parent level.
            levels.pop()
            continue
        subcategoryLabels.append(node["LabelName"])
        if "Subcategory" in node:
            # Descend into the children before visiting the next sibling.
            levels.append(iter(node["Subcategory"]))
    return


# Load the class hierarchy; the context manager closes the file even if parsing fails.
with open('hierarchy.json') as file:
    hierarchy = json.load(file)

# Index the root's direct children once instead of rescanning them for every category.
# A later duplicate overwrites an earlier one, i.e. the last match wins, as in a full scan.
top_level_nodes = {node["LabelName"]: node for node in hierarchy["Subcategory"]}

# category display name -> [category label id, then every descendant label id]
category_all_labels = {}
for display, label in category_labels.items():
    node = top_level_nodes.get(label)
    if node is None:
        # Only direct children of the hierarchy root are searched; make the omission
        # visible instead of silently producing no entry for this category.
        print(f"Warning: category '{display}' ({label}) is not a top-level node in hierarchy.json; skipped")
        continue
    subcategoryLabels = [label]
    # A category without children has no "Subcategory" key; treat it as having no descendants.
    extractSubCategories(node.get("Subcategory", []), subcategoryLabels)
    category_all_labels[display] = subcategoryLabels

# Persist the mapping to category_labels.json.
with open("category_labels.json", "w") as outfile:
    json.dump(category_all_labels, outfile)

### Download images

#### Validation

Generate download `.txt` files for each category.  

In [19]:
val_human_labels = pd.read_csv('./validation_data/val-human-imagelabels.csv')
val_seg = pd.read_csv('./validation_data/val-segmentation.csv')

# Only get images with 100% label confidence
val_human_labels = val_human_labels[val_human_labels["Confidence"] == 1.0]

# The loop variable is deliberately NOT named `labels`: that name holds the DataFrame
# read from labels.csv, and overwriting it would break re-running earlier cells.
for category, category_label_names in category_all_labels.items():
    # Image ids carrying at least one label that belongs to this category ...
    labelled_ids = val_human_labels.loc[val_human_labels['LabelName'].isin(category_label_names), "ImageID"]
    # ... restricted to ids that also appear in the segmentation CSV.
    segmented_ids = val_seg.loc[val_seg['ImageID'].isin(labelled_ids), "ImageID"]
    # De-duplicate in order of first appearance so the output file is identical on every run.
    human_ver_category_imageIDs = segmented_ids.unique()

    # "Sports equipment" -> "sports_equipment": lower-case, every space becomes "_".
    file_name = category.lower().replace(' ', '_')
    # One "validation/<ImageID>" line per image, the format consumed by downloader.py.
    with open(f"./validation_data/{file_name}.txt", "w") as file:
        file.writelines(f"validation/{imageID} \n" for imageID in human_ver_category_imageIDs)

If your system is Linux-compatible, run the cell below to download the required data automatically.  
If it is not, then once all the `.txt` files have been generated, run the following command for each category to download the images listed in that category's `.txt` file.  
```bash
python downloader.py $IMAGE_LIST_FILE --download_folder=$DOWNLOAD_FOLDER --num_processes=5
```
Where `IMAGE_LIST_FILE` => `./validation_data/${category}.txt` and `DOWNLOAD_FOLDER` => `./images/validation/${category}`

In [26]:
%%bash
# Download the validation images of every category listed in ./validation_data/<category>.txt.
# %%bash hands the cell body to bash unchanged, so the array syntax below does not depend
# on which shell `!` would use, nor on IPython's `$var` / `{expr}` interpolation of `!` lines.
categories=("food" "vehicle" "clothing" "sports_equipment" "person" "tool" "kitchenware" "furniture")
for category in "${categories[@]}"; do
    # Only start the download when the target folder could be created.
    mkdir -p "./images/validation/${category}" &&
        python ./downloader.py "./validation_data/${category}.txt" \
            --download_folder="./images/validation/${category}/" \
            --num_processes=5
done

Downloading images:  26%|█████▏              | 464/1773 [00:18<01:04, 20.45it/s]^C


#### Test

Generate download `.txt` files for each category.  

In [20]:
test_human_labels = pd.read_csv('./test_data/test-human-imagelabels.csv')
test_seg = pd.read_csv('./test_data/test-segmentation.csv')

# Only get images with 100% label confidence
test_human_labels = test_human_labels[test_human_labels["Confidence"] == 1.0]

# The loop variable is deliberately NOT named `labels`: that name holds the DataFrame
# read from labels.csv, and overwriting it would break re-running earlier cells.
for category, category_label_names in category_all_labels.items():
    # Image ids carrying at least one label that belongs to this category ...
    labelled_ids = test_human_labels.loc[test_human_labels['LabelName'].isin(category_label_names), "ImageID"]
    # ... restricted to ids that also appear in the segmentation CSV.
    segmented_ids = test_seg.loc[test_seg['ImageID'].isin(labelled_ids), "ImageID"]
    # De-duplicate in order of first appearance so the output file is identical on every run.
    human_ver_category_imageIDs = segmented_ids.unique()

    # "Sports equipment" -> "sports_equipment": lower-case, every space becomes "_".
    file_name = category.lower().replace(' ', '_')
    # One "test/<ImageID>" line per image, the format consumed by downloader.py.
    with open(f"./test_data/{file_name}.txt", "w") as file:
        file.writelines(f"test/{imageID} \n" for imageID in human_ver_category_imageIDs)

If your system is Linux-compatible, run the cell below to download the required data automatically.  
If it is not, then once all the `.txt` files have been generated, run the following command for each category to download the images listed in that category's `.txt` file.  
```bash
python downloader.py $IMAGE_LIST_FILE --download_folder=$DOWNLOAD_FOLDER --num_processes=5
```
Where `IMAGE_LIST_FILE` => `./test_data/${category}.txt` and `DOWNLOAD_FOLDER` => `./images/test/${category}`

In [None]:
%%bash
# Download the test images of every category listed in ./test_data/<category>.txt.
# %%bash hands the cell body to bash unchanged, so the array syntax below does not depend
# on which shell `!` would use, nor on IPython's `$var` / `{expr}` interpolation of `!` lines.
categories=("food" "vehicle" "clothing" "sports_equipment" "person" "tool" "kitchenware" "furniture")
for category in "${categories[@]}"; do
    # Only start the download when the target folder could be created.
    mkdir -p "./images/test/${category}" &&
        python ./downloader.py "./test_data/${category}.txt" \
            --download_folder="./images/test/${category}/" \
            --num_processes=5
done

#### Training

Generate download `.txt` files for each category.
`train-human-imagelabels.csv` and `train-segmentation.csv` files need to be downloaded before running the below cell as they were too big (>100MB) to push to GitHub.  
Download `train-human-imagelabels.csv` from here, https://storage.googleapis.com/openimages/v7/oidv7-train-annotations-human-imagelabels.csv  
Download `train-segmentation.csv` from here, https://storage.googleapis.com/openimages/v5/train-annotations-object-segmentation.csv  
After downloading the two files above rename them accordingly and put them in the `train_data` folder.

In [21]:
train_human_labels = pd.read_csv('./train_data/train-human-imagelabels.csv')
train_seg = pd.read_csv('./train_data/train-segmentation.csv')

# Only get images with 100% label confidence
train_human_labels = train_human_labels[train_human_labels["Confidence"] == 1.0]

# The loop variable is deliberately NOT named `labels`: that name holds the DataFrame
# read from labels.csv, and overwriting it would break re-running earlier cells.
for category, category_label_names in category_all_labels.items():
    # Image ids carrying at least one label that belongs to this category ...
    labelled_ids = train_human_labels.loc[train_human_labels['LabelName'].isin(category_label_names), "ImageID"]
    # ... restricted to ids that also appear in the segmentation CSV.
    segmented_ids = train_seg.loc[train_seg['ImageID'].isin(labelled_ids), "ImageID"]
    # De-duplicate in order of first appearance so the output file is identical on every run.
    human_ver_category_imageIDs = segmented_ids.unique()

    # "Sports equipment" -> "sports_equipment": lower-case, every space becomes "_".
    file_name = category.lower().replace(' ', '_')
    # One "train/<ImageID>" line per image, the format consumed by downloader.py.
    with open(f"./train_data/{file_name}.txt", "w") as file:
        file.writelines(f"train/{imageID} \n" for imageID in human_ver_category_imageIDs)

If your system is Linux-compatible, run the cell below to download the required data automatically.  
If it is not, then once all the `.txt` files have been generated, run the following command for each category to download the images listed in that category's `.txt` file.  
```bash
python downloader.py $IMAGE_LIST_FILE --download_folder=$DOWNLOAD_FOLDER --num_processes=5
```
Where `IMAGE_LIST_FILE` => `./train_data/${category}.txt` and `DOWNLOAD_FOLDER` => `./images/train/${category}`

In [None]:
%%bash
# Download the training images of every category listed in ./train_data/<category>.txt.
# %%bash hands the cell body to bash unchanged, so the array syntax below does not depend
# on which shell `!` would use, nor on IPython's `$var` / `{expr}` interpolation of `!` lines.
categories=("food" "vehicle" "clothing" "sports_equipment" "person" "tool" "kitchenware" "furniture")
for category in "${categories[@]}"; do
    # Only start the download when the target folder could be created.
    mkdir -p "./images/train/${category}" &&
        python ./downloader.py "./train_data/${category}.txt" \
            --download_folder="./images/train/${category}/" \
            --num_processes=5
done