In [1]:
import requests
import cv2
import json
import os
from tqdm import tqdm_notebook

## Example

* http://www.image-net.org/api/text/imagenet.synset.geturls?wnid=n04154340

Classes from https://s3.amazonaws.com/deep-learning-models/image-models/imagenet_class_index.json

In [19]:
# Ask the ImageNet API for the image URLs of one example synset.
r = requests.get(
    "http://www.image-net.org/api/text/imagenet.synset.geturls",
    params=dict(wnid="n04154340"),
)

In [27]:
# Size of the raw response body, in bytes.
len(r.content)

56000

In [3]:
# Decode the response body and split it on whitespace into individual URLs.
urls = str(r.content, "utf8").split()

In [4]:
# Number of candidate image URLs returned for the synset.
len(urls)

875

In [5]:
# Peek at the first few URLs to sanity-check the parsing.
urls[:5]

['http://www.oroshiuri.net/PERFUME/catalog/images/SCREW%20.jpg',
 'http://www.made-in-china.com/image/2f0j00dBStFnMJQaWLM/Hexagon-Socket-Countersunk-Head-Screw-DIN-7991-.jpg',
 'http://imghost.indiamart.com/data/2/M/ETO-1110388/leadscrew23_640x480.jpg',
 'http://fasteningconcepts.com/Merchant2/graphics/00000001/screw5.jpg',
 'http://www.bobvila.com/images/ProductServices/ProductDirectory/Tapcon.jpg']

In [9]:
# Load the class index: a JSON object keyed by the class id as a string,
# each value being a [wnid, name] pair.
with open("./imagenet_class_index.json") as f:
    categories_raw = json.load(f)

# Flatten to (class_id, wnid, name) tuples, ordered by numeric class id
# (the JSON keys are strings, so they are converted before sorting).
categories = [
    (class_id, *fields)
    for class_id, fields in sorted(
        (int(key), value) for key, value in categories_raw.items()
    )
]
categories[:5]

[(0, 'n01440764', 'tench'),
 (1, 'n01443537', 'goldfish'),
 (2, 'n01484850', 'great_white_shark'),
 (3, 'n01491361', 'tiger_shark'),
 (4, 'n01494475', 'hammerhead')]

## Action

Require at least 7 kB per downloaded file to filter out trivial placeholders such as "This photo is no longer available" (there were many of those, ~1/4).

In [28]:
def urlretrieve(url, filepath, timeout=10, min_siz_kB=7.):
    """Download `url` and save the body to `filepath` if it is large enough.

    The URL is requested exactly once, and the target file is only created
    after the size check has passed, so a rejected or failed download leaves
    nothing behind on disk.

    Args:
        url: Address of the resource to download (redirects are followed).
        filepath: Destination path; written in binary mode.
        timeout: Timeout in seconds, passed through to `requests.get`.
        min_siz_kB: Size threshold in kB (1 kB = 1024 bytes). The body must
            be strictly larger than this to be saved.

    Raises:
        ValueError: If the downloaded body is not larger than `min_siz_kB`.
        Exception: Whatever `requests.get` / `raise_for_status` raise on
            connection problems, timeouts or HTTP error statuses.
    """
    response = requests.get(url, allow_redirects=True, timeout=timeout)
    # Error pages (404, 500, ...) are not images; reject them up front.
    response.raise_for_status()

    content = response.content
    size_kB = len(content) / 1024
    if size_kB <= min_siz_kB:
        raise ValueError("Zu klein! {:.1f} kB".format(size_kB))

    # Open the file only now, so failures above never create an empty file.
    with open(filepath, 'wb') as f:
        f.write(content)

In [38]:
def download_imagenet_category(wnid="n04154340",
                               timeout=2,
                               path="data/imgs",
                               resize=True,
                               resize_to=(250, 250),
                               limit=10):
    """Download up to `limit` images of one ImageNet synset into `path`.

    The URL list for `wnid` is fetched from the ImageNet API and the URLs are
    tried in order. Each successfully downloaded (and, if requested, resized)
    image is stored as `{path}/{n}.jpg`, with `n` counting up from 0. URLs
    that fail for any reason are skipped, and any partial file they left
    behind is removed so the directory only contains usable images.

    Args:
        wnid: WordNet id of the synset, sent as the `wnid` query parameter.
        timeout: Per-image timeout in seconds, passed to `urlretrieve`.
        path: Output directory; created if it does not exist.
        resize: If true, each image is re-read with OpenCV, resized and
            written back over the downloaded file.
        resize_to: Target size passed to `cv2.resize`.
        limit: Maximum number of images to save. `None` means no limit;
            zero or a negative number saves nothing.

    Returns:
        None.
    """
    r = requests.get("http://www.image-net.org/api/text/imagenet.synset.geturls",
                     params={"wnid": wnid})
    urls = r.content.decode('utf-8').split()

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(path, exist_ok=True)

    pic_num = 0

    tqdm_success = tqdm_notebook(total=limit, desc="{path} saved".format(path=path), leave=False)
    tqdm_tried = tqdm_notebook(urls, desc="{path} tried".format(path=path), leave=False)
    try:
        if limit is not None and limit <= 0:
            return

        for url in tqdm_tried:
            filepath = "{path}/{pic_num}.jpg".format(path=path, pic_num=pic_num)
            try:
                urlretrieve(url, filepath, timeout=timeout)

                if resize:
                    img = cv2.imread(filepath)
                    # imread signals an undecodable file by returning None
                    # instead of raising; turn that into an explicit failure.
                    if img is None:
                        raise ValueError("Not a decodable image: {}".format(url))
                    cv2.imwrite(filepath, cv2.resize(img, resize_to))
            except Exception:
                # Best effort: dead links, timeouts and broken images are
                # expected, so skip this URL. Remove whatever was written so
                # no junk file is left under this index.
                try:
                    os.remove(filepath)
                except OSError:
                    pass
                continue

            pic_num += 1
            tqdm_success.update(n=1)

            if limit is not None and pic_num >= limit:
                break
    finally:
        # Close both bars even on early exit or error, so no stale
        # progress widgets remain in the notebook.
        tqdm_tried.close()
        tqdm_success.close()

In [41]:
# Smoke test on a single synset (n01491361, listed as 'tiger_shark' in the
# class index) using the defaults: up to 10 images saved under data/imgs.
download_imagenet_category('n01491361')

In [42]:
# Class index the download loop starts from. The loop overwrites this with the
# index it is currently working on, so after a crash re-run only the loop cell
# (re-running this cell would restart from the first category).
last_id = 0  # for recovery

In [43]:
# Download a sample of every category, resuming at `last_id`.
# NOTE: slicing by `last_id` assumes each category's id equals its position
# in `categories` (ids sorted and contiguous from 0) - confirm if the index
# file changes.
for i, wnid, name in tqdm_notebook(categories[last_id:], desc="Categories"):
    # Record progress before downloading, so a failure resumes at this category.
    last_id = i
    category_dir = "imagenet_sample/{}_{}_{}".format(str(i).zfill(3), wnid, name)
    download_imagenet_category(wnid, path=category_dir)





ConnectionError: HTTPConnectionPool(host='www.image-net.org', port=80): Max retries exceeded with url: /api/text/imagenet.synset.geturls?wnid=n02091635 (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x12370a978>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known',))