Segmented file downloads (#89)

* segmented file downloads * add cert kwargs to poolmanager * use urllib3[secure] pkg * handle cases without suffix * remove wget dependency
runwayml · Dec 24, 2019 · 1646b02 · 1646b02
1 parent 6971eb5
commit 1646b02
Show file tree

Hide file tree

Showing 4 changed files with 67 additions and 7 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -6,6 +6,10 @@ The Runway Model SDK follows [semantic versioning](https://semver.org/). Be awar
 
 Until version 1.0.0, expect that minor version changes may introduce breaking changes. We will take care not to introduce new behavior, features, or breaking changes in patch releases. If you require stability and reproducible behavior you *may* pin to a version or version range of the model SDK like `runway-python>=0.2.0` or `runway-python>=0.2,<0.3`.
 
+## v.0.5.5
+
+- Speed up downloads for `runway.file` and `runway.directory` data types by using segmented file transfer.
+
 ## v.0.5.4
 
 - Add support for nearest-neighbor color matching for segmentation input.

diff --git a/requirements.txt b/requirements.txt
@@ -3,8 +3,8 @@ Flask-Cors>=3.0.2
 numpy>=1.15.0
 Pillow>=4.3.0
 gevent>=1.4.0
-wget>=3.2
 six>=1.12.0
 colorcet>=2.0.1
 Flask-Sockets==0.2.1
-scipy>=1.2.1
+scipy>=1.2.1
+urllib3[secure]>=1.25.7
diff --git a/runway/__version__.py b/runway/__version__.py
@@ -1 +1 @@
-__version__ = '0.5.4'
+__version__ = '0.5.5'
diff --git a/runway/utils.py b/runway/utils.py
@@ -2,18 +2,22 @@
 import tarfile
 import inspect
 import re
-import wget
 import os
 import functools
 import sys
 import gzip
 import datetime
 import colorcet
 import uuid
+import urllib3
+import multiprocessing
+import certifi
 if sys.version_info[0] < 3:
     from cStringIO import StringIO as IO
+    from urlparse import urlparse
 else:
     from io import BytesIO as IO
+    from urllib.parse import urlparse
 import numpy as np
 from flask import after_this_request, request, jsonify
 
@@ -53,9 +57,61 @@ def is_url(path):
     return re.match(URL_REGEX, path)
 
 
-def download_file(url):
-    download_dir = tempfile.mkdtemp()
-    return wget.download(url, out=download_dir)
+def get_file_suffix_from_url(url):
+    """Return the file suffix found in the path component of `url`.
+
+    The suffix is everything from the first '.' of the path's basename
+    onward, so compound extensions are kept whole ('archive.tar.gz' gives
+    '.tar.gz'). The query string and fragment are ignored because only the
+    parsed path is inspected. An empty string is returned when the basename
+    contains no '.'.
+    """
+    basename = os.path.basename(urlparse(url).path)
+    # partition() splits on the first '.', and leaves both the separator and
+    # the tail empty when there is no '.', so their concatenation is exactly
+    # the suffix (or '').
+    _stem, dot, extension = basename.partition('.')
+    return dot + extension
+
+
+def get_download_chunks(total_size, chunk_size=1e7):
+    """Yield inclusive ``[start, end]`` byte ranges that together cover a file.
+
+    The file is divided into ``total_size // chunk_size`` (at least one)
+    equally sized chunks; the final chunk absorbs the remainder so the
+    ranges cover bytes ``0`` through ``total_size - 1`` with no gaps or
+    overlap. Both bounds are inclusive, matching the HTTP ``Range`` header
+    convention.
+
+    Args:
+        total_size: Size of the file in bytes.
+        chunk_size: Approximate size of each chunk in bytes. May be given as
+            a float (the default is ``1e7``); it is truncated to an integer
+            and clamped to at least 1.
+
+    Yields:
+        Two-element lists ``[start, end]`` of integer byte offsets. Nothing
+        is yielded when ``total_size`` is zero or negative, since there is
+        no satisfiable range for an empty file.
+    """
+    total_size = int(total_size)
+    if total_size <= 0:
+        return
+    # The default chunk_size is a float; without the int() conversion the
+    # floor division below yields a float and range() raises TypeError for
+    # any file at least one chunk in size.
+    chunk_size = max(1, int(chunk_size))
+    n_chunks = max(1, total_size // chunk_size)
+    span = total_size // n_chunks
+    for i in range(n_chunks):
+        start = span * i
+        if i == n_chunks - 1:
+            # The last chunk takes whatever the integer division left over;
+            # the final valid byte index is total_size - 1.
+            end = total_size - 1
+        else:
+            end = span * (i + 1) - 1
+        yield [start, end]
+
+
+def download_worker(url, queue, filename):
+    """Fetch queued byte ranges of `url` and write them into `filename`.
+
+    Repeatedly takes a ``[start, end]`` range from `queue`, requests that
+    range with an HTTP ``Range`` header, and writes the response body at
+    offset ``start`` of the file. Returns once the queue can no longer be
+    read from.
+
+    Args:
+        url: URL of the resource to download.
+        queue: Queue of ``[start, end]`` inclusive byte ranges; must support
+            ``get_nowait()``.
+        filename: Path of the destination file. It is opened in ``'r+b'``
+            mode, so it must already exist.
+    """
+    http = urllib3.PoolManager(cert_reqs='CERT_REQUIRED', ca_certs=certifi.where())
+    while True:
+        try:
+            rng = queue.get_nowait()
+        except Exception:
+            # An empty queue (or a queue that is no longer reachable) means
+            # there is no more work. Catching Exception rather than using a
+            # bare except lets KeyboardInterrupt/SystemExit still stop the
+            # worker.
+            break
+        start, end = rng
+        resp = http.request('GET', url, headers={'Range': 'bytes=' + str(start) + '-' + str(end)})
+        # NOTE(review): the response status is not checked; a server that
+        # ignores the Range header would have its full body written at
+        # `start`. Confirm whether a 206 check is wanted here.
+        with open(filename, 'r+b') as f:
+            f.seek(start)
+            f.write(resp.data)
+
+
+def download_file(url, n_processes=16):
+    """Download `url` into a new temporary file and return the file's path.
+
+    A ``HEAD`` request is issued first. If the server advertises
+    ``Accept-Ranges: bytes`` and a ``Content-Length``, the file is split
+    into byte ranges that are downloaded concurrently by worker processes
+    running `download_worker`; otherwise the whole body is fetched with a
+    single ``GET``.
+
+    Args:
+        url: URL of the resource to download.
+        n_processes: Maximum number of worker processes used for a
+            segmented download.
+
+    Returns:
+        Path of the downloaded file. The file is created with
+        ``delete=False``, so it is not removed automatically.
+    """
+    tmp = tempfile.NamedTemporaryFile(suffix=get_file_suffix_from_url(url), delete=False)
+    filename = tmp.name
+    # Only the reserved path is needed; close the handle so it is not leaked
+    # while the file is re-opened by name for writing below.
+    tmp.close()
+    http = urllib3.PoolManager(cert_reqs='CERT_REQUIRED', ca_certs=certifi.where())
+    initial_response = http.request('HEAD', url)
+    headers = initial_response.headers
+    enable_segmented_download = headers.get('accept-ranges') == 'bytes' and \
+        headers.get('content-length') is not None
+    if enable_segmented_download:
+        content_length = int(headers['content-length'])
+        chunks = list(get_download_chunks(content_length))
+        manager = multiprocessing.Manager()
+        try:
+            queue = manager.Queue()
+            for chunk in chunks:
+                queue.put(chunk)
+            # No point starting more workers than there are chunks to fetch.
+            n_workers = min(n_processes, len(chunks))
+            processes = [
+                multiprocessing.Process(target=download_worker, args=(url, queue, filename))
+                for _ in range(n_workers)
+            ]
+            for process in processes:
+                process.start()
+            for process in processes:
+                process.join()
+            # NOTE(review): worker exit codes are not inspected, so a worker
+            # that failed leaves its ranges unwritten without any error here.
+        finally:
+            # Stop the manager's server process instead of leaving it running.
+            manager.shutdown()
+    else:
+        resp = http.request('GET', url)
+        with open(filename, 'wb') as f:
+            f.write(resp.data)
+    return filename
 
 
 def extract_tarball(path):