Here's a faster way to download images using the principles of concurrency and parallelism — in other words, asynchronous tasking and multiprocessing. It works best on machines with 8+ cores and when downloading large files such as videos.

Please let me know if you have other good methods for downloading large numbers of files!

In [None]:
# Libraries for concurrency and parallelism
! pip install asks trio

In [None]:
from pathlib import Path
import requests
from os import cpu_count

import datatable as dt
import asks
import trio

In [None]:
# Output directory for the downloaded images; no error if it already exists.
Path('./pics').mkdir(exist_ok=True)

# Load only the image_url column from the tab-separated test set.
img_urls = dt.fread(
    "../input/wikipedia-image-caption/test.tsv",
    sep='\t',
    columns={'image_url'},
)
# Take the first column of the frame as a plain list and keep its first 1000 URLs.
links = img_urls.to_list()[0][:1000]

In [None]:
%%time
# fast way

async def fetch_pic(s, url):
    """Issue a GET for *url* through session *s* and return the response's ``content``."""
    return (await s.get(url)).content


async def save_pic(s, url):
    """Download *url* through session *s* and write the bytes to a file under ``pics/``.

    The file name is the text after the last ``/`` in *url*, cut to at most
    100 characters.
    """
    data = await fetch_pic(s, url)
    name = url.rsplit('/', 1)[-1][:100]
    with open(f"pics/{name}", 'wb') as out:
        out.write(data)

        
async def main(links):
    """Download every URL in *links* concurrently via ``save_pic``.

    One asks session is shared by all downloads; its ``connections`` limit is
    set to twice the CPU count. One ``save_pic`` task is started per URL inside
    a trio nursery, and the ``async with`` block does not exit until all of
    those tasks have finished.

    Args:
        links: iterable of image URLs to download.
    """
    dname = 'https://upload.wikimedia.org'
    # os.cpu_count() returns None when the count cannot be determined;
    # fall back to 1 so the multiplication below cannot raise TypeError.
    max_connections = (cpu_count() or 1) * 2
    s = asks.sessions.Session(dname, connections=max_connections)
    async with trio.open_nursery() as n:
        for url in links:
            n.start_soon(save_pic, s, url)

            
# Run the async entry point with `links` as its argument; returns once main() has finished.
trio.run(main, links)

In [None]:
%%bash
# Count the entries that ended up in pics/
ls pics | wc -l
# Show six entries in directory order (GNU ls -U: do not sort).
# The option is placed before the operand, as standard utility syntax expects,
# instead of relying on GNU argument permutation.
ls -U pics | head -6

In [None]:
%%time
# regular way

# Baseline for comparison: fetch the URLs one after another with blocking calls.
for url in links:
    r = requests.get(url, stream=True)
    # Accessing .content reads the whole response body into memory.
    content = r.content
    # File name: text after the last '/', capped at 100 characters.
    filename = f"pics/{url.rsplit('/', 1)[-1][:100]}"
    with open(filename, 'wb') as f:
        f.write(content)
    