Skip to content
This repository has been archived by the owner on Mar 28, 2022. It is now read-only.

Commit

Permalink
Segmented file downloads (#89)
Browse files Browse the repository at this point in the history
* segmented file downloads

* add cert kwargs to poolmanager

* use urllib3[secure] pkg

* handle cases without suffix

* remove wget dependency
  • Loading branch information
agermanidis committed Dec 24, 2019
1 parent 6971eb5 commit 1646b02
Show file tree
Hide file tree
Showing 4 changed files with 67 additions and 7 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Expand Up @@ -6,6 +6,10 @@ The Runway Model SDK follows [semantic versioning](https://semver.org/). Be awar
Until version 1.0.0, expect that minor version changes may introduce breaking changes. We will take care not to introduce new behavior, features, or breaking changes in patch releases. If you require stability and reproducible behavior you *may* pin to a version or version range of the model SDK like `runway-python>=0.2.0` or `runway-python>=0.2,<0.3`.

## v.0.5.5

- Speed up downloads for `runway.file` and `runway.directory` data types by using segmented file transfer.

## v.0.5.4

- Add support for nearest-neighbor color matching for segmentation input.
Expand Down
4 changes: 2 additions & 2 deletions requirements.txt
Expand Up @@ -3,8 +3,8 @@ Flask-Cors>=3.0.2
numpy>=1.15.0
Pillow>=4.3.0
gevent>=1.4.0
wget>=3.2
six>=1.12.0
colorcet>=2.0.1
Flask-Sockets==0.2.1
scipy>=1.2.1
scipy>=1.2.1
urllib3[secure]>=1.25.7
2 changes: 1 addition & 1 deletion runway/__version__.py
@@ -1 +1 @@
__version__ = '0.5.4'
__version__ = '0.5.5'
64 changes: 60 additions & 4 deletions runway/utils.py
Expand Up @@ -2,18 +2,22 @@
import tarfile
import inspect
import re
import wget
import os
import functools
import sys
import gzip
import datetime
import colorcet
import uuid
import urllib3
import multiprocessing
import certifi
if sys.version_info[0] < 3:
from cStringIO import StringIO as IO
from urlparse import urlparse
else:
from io import BytesIO as IO
from urllib.parse import urlparse
import numpy as np
from flask import after_this_request, request, jsonify

Expand Down Expand Up @@ -53,9 +57,61 @@ def is_url(path):
return re.match(URL_REGEX, path)


def download_file(url):
download_dir = tempfile.mkdtemp()
return wget.download(url, out=download_dir)
def get_file_suffix_from_url(url):
    """Return the file suffix of the last path component of ``url``.

    The suffix is everything from the first ``.`` of the file name onwards,
    so compound extensions are kept whole (``model.tar.gz`` -> ``.tar.gz``).
    The query string and fragment are ignored because only the parsed URL
    path is inspected. An empty string is returned when the file name
    contains no dot.
    """
    file_name = os.path.basename(urlparse(url).path)
    # partition() yields ('name', '', '') when there is no dot, so the
    # concatenation below naturally collapses to ''.
    _stem, dot, extensions = file_name.partition('.')
    return dot + extensions


def get_download_chunks(total_size, chunk_size=1e7):
    """Yield inclusive ``[start, end]`` byte ranges that cover a file.

    The ranges are contiguous, non-overlapping and suitable for an HTTP
    ``Range: bytes=start-end`` header (both bounds inclusive).

    Args:
        total_size: Size of the file in bytes.
        chunk_size: Approximate size of each chunk in bytes. Floats such as
            the default ``1e7`` are accepted and truncated to an integer.

    Yields:
        Two-element lists ``[start, end]``. Nothing is yielded when
        ``total_size`` is zero or negative, since there are no bytes to fetch.

    Raises:
        ValueError: If ``chunk_size`` is smaller than one byte.
    """
    # Work in integers throughout: floor-dividing by a float yields a float,
    # which range() rejects on Python 3.
    total_size = int(total_size)
    chunk_size = int(chunk_size)
    if chunk_size < 1:
        raise ValueError('chunk_size must be at least 1 byte, got %r' % chunk_size)
    if total_size <= 0:
        return

    n_chunks = max(1, total_size // chunk_size)
    step = total_size // n_chunks
    for i in range(n_chunks):
        start = step * i
        if i == n_chunks - 1:
            # The last chunk absorbs the remainder of the integer division;
            # the final valid byte index is total_size - 1.
            end = total_size - 1
        else:
            end = start + step - 1
        yield [start, end]


def download_worker(url, queue, filename):
    """Download byte ranges of ``url`` into ``filename`` until ``queue`` is empty.

    Each queue item is a ``[start, end]`` pair of inclusive byte offsets. The
    worker requests that range from the server and writes the returned bytes
    at offset ``start`` of the (already existing) file.

    Args:
        url: URL of the remote file.
        queue: Queue of ``[start, end]`` ranges; it must be fully populated
            before the worker starts, because the worker exits as soon as it
            finds the queue empty.
        filename: Path of an existing file to write the ranges into.

    Raises:
        IOError: If the server does not answer a range request with
            ``206 Partial Content``.
    """
    # Imported locally (and under a different name) because the ``queue``
    # parameter shadows the standard-library module of the same name.
    try:
        from queue import Empty
    except ImportError:  # Python 2
        from Queue import Empty

    http = urllib3.PoolManager(cert_reqs='CERT_REQUIRED', ca_certs=certifi.where())
    while True:
        try:
            start, end = queue.get_nowait()
        except Empty:
            # No work left; only an empty queue ends the loop, other errors
            # are allowed to surface instead of being silently swallowed.
            break
        resp = http.request('GET', url, headers={'Range': 'bytes=%d-%d' % (start, end)})
        if resp.status != 206:
            # Anything else (e.g. 200 with the full body, or an error page)
            # would be written at the wrong offset and corrupt the file.
            raise IOError(
                'Range request for bytes %d-%d of %s returned HTTP status %d'
                % (start, end, url, resp.status)
            )
        with open(filename, 'r+b') as f:
            f.seek(start)
            f.write(resp.data)


def _download_whole_file(http, url, filename):
    """Stream ``url`` into ``filename`` with a single GET request."""
    resp = http.request('GET', url, preload_content=False)
    try:
        with open(filename, 'wb') as f:
            # Stream in blocks so large files are not held in memory at once.
            for block in resp.stream(1024 * 1024):
                f.write(block)
    finally:
        resp.release_conn()


def download_file(url, n_processes=16):
    """Download ``url`` to a new temporary file and return the file's path.

    A HEAD request is issued first. When the server advertises byte-range
    support and a positive content length, the file is split into chunks
    that are fetched in parallel by worker processes. Otherwise, or if any
    worker exits with an error, the file is fetched with one plain GET
    request.

    Args:
        url: URL of the file to download.
        n_processes: Maximum number of worker processes for a segmented
            download.

    Returns:
        Path of the temporary file. The file keeps the suffix of the URL's
        file name and is not deleted automatically.
    """
    tmp = tempfile.NamedTemporaryFile(suffix=get_file_suffix_from_url(url), delete=False)
    filename = tmp.name
    # Only the reserved name is needed; close the handle so it is not leaked
    # and so the workers can reopen the file themselves.
    tmp.close()

    http = urllib3.PoolManager(cert_reqs='CERT_REQUIRED', ca_certs=certifi.where())
    initial_response = http.request('HEAD', url)
    headers = initial_response.headers

    content_length = 0
    accepts_ranges = (headers.get('accept-ranges') or '').strip().lower() == 'bytes'
    if initial_response.status == 200 and accepts_ranges:
        try:
            content_length = int(headers.get('content-length') or 0)
        except (TypeError, ValueError):
            # Unparseable length: fall back to the non-segmented download.
            content_length = 0

    segmented_ok = False
    if content_length > 0:
        chunks = list(get_download_chunks(content_length))
        manager = multiprocessing.Manager()
        try:
            queue = manager.Queue()
            # The queue must be full before any worker starts, because
            # workers stop as soon as they see it empty.
            for chunk in chunks:
                queue.put(chunk)
            # Spawning more workers than there are chunks is wasted effort.
            n_workers = max(1, min(n_processes, len(chunks)))
            processes = [
                multiprocessing.Process(target=download_worker, args=(url, queue, filename))
                for _ in range(n_workers)
            ]
            for process in processes:
                process.start()
            for process in processes:
                process.join()
        finally:
            manager.shutdown()
        # A non-zero exit code means a worker died and left a hole in the file.
        segmented_ok = all(process.exitcode == 0 for process in processes)

    if not segmented_ok:
        _download_whole_file(http, url, filename)
    return filename


def extract_tarball(path):
Expand Down

0 comments on commit 1646b02

Please sign in to comment.