Figure out how to timeout #261
Comments
Maybe another path is to have a flexible thread pool: instead of using a fixed number like 32 threads, increase or decrease the count depending on the current speed. If many requests are already in flight, don't start more; if not, start more. Measuring the bytes/s could help estimate this. |
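For illustration only, a minimal sketch of what such a throughput-based policy could look like. The class name, thresholds, step sizes, and window length are all made up, and the actual thread management is left out:

```python
import time


class ThroughputController:
    """Toy policy for a flexible pool: decide how many download threads to allow
    based on the measured bytes/s. All thresholds and step sizes are made up."""

    def __init__(self, min_threads=4, max_threads=64, window=5.0):
        self.limit = min_threads
        self.min_threads = min_threads
        self.max_threads = max_threads
        self.window = window          # seconds between adjustments
        self.bytes_done = 0
        self.last_rate = 0.0
        self.window_start = time.time()

    def record(self, nbytes):
        # workers call this once a download finishes
        # (a real version would need a lock, since workers run concurrently)
        self.bytes_done += nbytes

    def thread_limit(self):
        # the scheduler calls this before deciding whether to start more threads
        elapsed = time.time() - self.window_start
        if elapsed < self.window:
            return self.limit
        rate = self.bytes_done / elapsed
        if rate >= self.last_rate and self.limit < self.max_threads:
            self.limit += 4           # throughput still growing: allow more threads
        elif rate < self.last_rate and self.limit > self.min_threads:
            self.limit -= 4           # throughput dropped: stop starting more
        self.last_rate = rate
        self.bytes_done = 0
        self.window_start = time.time()
        return self.limit
```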
New idea:
|
Another idea: use the timeout arg of join (https://docs.python.org/3/library/threading.html#threading.Thread.join). So simply put the slow thread in a bad pool and continue starting more threads. |
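A minimal (and purely sequential) sketch, just to show the join-with-timeout mechanism; `drain` and `download` are placeholder names, and the GoodBadPool attempt further down is the parallel version of this idea:

```python
from threading import Thread


def drain(items, download, timeout):
    """Start a thread per item and wait at most `timeout` seconds for each;
    threads that are still running get parked in a 'bad' pool and we move on."""
    bad_threads = []
    for item in items:
        t = Thread(target=download, args=(item,))
        t.start()
        t.join(timeout)             # returns after `timeout` even if the thread is stuck
        if t.is_alive():
            bad_threads.append(t)   # don't block on it, just remember it
    return bad_threads
```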
reproducing example:

```python
import urllib.request
import time
import io

example_urls = [
    (12, 'http://www.herteldenbirname.com/wp-content/uploads/2014/05/Italia-Independent-Flocked-Aviator-Sunglasses-150x150.jpg'),
    (124, 'http://image.rakuten.co.jp/sneak/cabinet/shoes-03/cr-ucrocs5-a.jpg?_ex=128x128'),
    (146, 'http://www.slicingupeyeballs.com/wp-content/uploads/2009/05/stoneroses452.jpg'),
    (122, 'https://media.mwcradio.com/mimesis/2013-03/01/2013-03-01T153415Z_1_CBRE920179600_RTROPTP_3_TECH-US-GERMANY-EREADER_JPG_475x310_q85.jpg'),
    (282, 'https://8d1aee3bcc.site.internapcdn.net/00/images/media/5/5cfb2eba8f1f6244c6f7e261b9320a90-1.jpg'),
    (298, 'https://my-furniture.com.au/media/catalog/product/cache/1/small_image/295x295/9df78eab33525d08d6e5fb8d27136e95/a/u/au0019-stool-01.jpg'),
    (300, 'http://images.tastespotting.com/thumbnails/889506.jpg'),
    (330, 'https://www.infoworld.pk/wp-content/uploads/2016/02/Cool-HD-Valentines-Day-Wallpapers-480x300.jpeg'),
    (361, 'http://pendantscarf.com/image/cache/data/necklace/JW0013-(2)-150x150.jpg'),
    (408, 'https://www.solidrop.net/photo-6/animorphia-coloring-books-for-adults-children-drawing-book-secret-garden-style-relieve-stress-graffiti-painting-book.jpg'),
]


def download_image(row, timeout):
    """Download an image with urllib"""
    key, url = row
    img_stream = None
    user_agent_string = "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:72.0) Gecko/20100101 Firefox/72.0"
    start_time = time.time()
    try:
        request = urllib.request.Request(url, data=None, headers={"User-Agent": user_agent_string})
        with urllib.request.urlopen(request, timeout=timeout) as r:
            img_stream = io.BytesIO(r.read())
        return key, img_stream, None, time.time() - start_time
    except Exception as err:  # pylint: disable=broad-except
        if img_stream is not None:
            img_stream.close()
        return key, None, str(err), time.time() - start_time


def main():
    for example_url in example_urls:
        a = download_image(example_url, 2)
        print(a)


if __name__ == "__main__":
    main()
```
|
one try:

```python
import queue
from threading import Thread
import time


class GoodBadPool:
    def __init__(self, generator, runner, timeout, pool_size, out_queue_max) -> None:
        self.generator = generator
        self.runner = runner
        self.timeout = timeout
        self.pool_size = pool_size
        self.out_queue_max = out_queue_max
        self.results = []
        self.good_threads = []
        self.bad_threads = []
        self.outqueue = queue.SimpleQueue()
        self.good_done = {}
        self.item_left = True

    def call(self, start_time, item):
        result = self.runner(item)
        key = item[0]
        if time.time() - start_time < self.timeout:
            self.outqueue.put(result)
            self.good_done[key] = True

    def cleanup_bad_threads(self):
        still_bad_threads = []
        for thread in self.bad_threads:
            thread.join(0)
            if thread.is_alive():
                still_bad_threads.append(thread)
        self.bad_threads = still_bad_threads

    def cleanup_good_threads(self):
        # move slow threads to bad threads
        still_good_threads = []
        for start_time, key, thread in self.good_threads:
            if key in self.good_done:
                thread.join(0)
                del self.good_done[key]
                continue
            if time.time() - start_time > self.timeout:
                self.outqueue.put((key, None, "timeout"))
                self.bad_threads.append(thread)
            else:
                still_good_threads.append((start_time, key, thread))
        self.good_threads = still_good_threads

    def provider(self):
        """Loops infinitely, if we need new values, try to get them
        1. clean up bad threads (join them)
        2. clean up good threads by moving the slow ones to bad threads
        3. start new threads if possible
        """
        while True:
            if self.outqueue.qsize() > self.out_queue_max:
                time.sleep(0.1)
                continue
            self.cleanup_bad_threads()
            self.cleanup_good_threads()
            # print(f"good: {len(self.good_threads)}, bad: {len(self.bad_threads)}, outqueue: {self.outqueue.qsize()}")
            if self.item_left and len(self.good_threads) < self.pool_size:
                try:
                    item = next(self.generator)
                    key = item[0]
                except StopIteration:
                    self.item_left = False
                    continue
                start_time = time.time()
                thread = Thread(target=self.call, args=(start_time, item))
                thread.start()
                self.good_threads.append((start_time, key, thread))
            else:
                if len(self.good_threads) == 0 and not self.item_left:
                    self.outqueue.put(None)
                    return
                time.sleep(0.1)
                continue

    def run(self):
        t = Thread(target=self.provider)
        t.start()
        while True:
            item = self.outqueue.get()
            if item is None:
                break
            yield item
        t.join(0)
```

but actually it doesn't seem to help that much. Something else must be slowing things down. |
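(For reference, this is roughly how the class above would be driven, assuming `download_image` and `example_urls` from the reproducing example are in scope; the parameter values below are guesses, not tuned.)

```python
def run_pool():
    pool = GoodBadPool(
        generator=iter(example_urls),                 # items to download
        runner=lambda row: download_image(row, 10),   # urllib downloader from the earlier snippet
        timeout=2,                                    # threads slower than 2s go to the bad pool
        pool_size=32,
        out_queue_max=100,
    )
    for result in pool.run():
        # good results are (key, img_stream, None, duration); timeouts are (key, None, "timeout")
        key, data, err = result[0], result[1], result[2]
        print(key, "ok" if err is None else err)


if __name__ == "__main__":
    run_pool()
```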
I think the only reasonable paths forward here are
|
2 new ideas:
|
```python
from pycurl import Curl
import pycurl
from io import BytesIO
import time


def download_image(row, timeout):
    """Download an image with pycurl"""
    key, url = row
    user_agent_string = "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:72.0) Gecko/20100101 Firefox/72.0"
    try:
        mycurl = Curl()
        mycurl.setopt(pycurl.SSL_VERIFYPEER, 0)
        mycurl.setopt(pycurl.SSL_VERIFYHOST, 0)
        mycurl.setopt(pycurl.TIMEOUT, timeout)
        mycurl.setopt(pycurl.URL, url)
        body = BytesIO()
        mycurl.setopt(pycurl.WRITEFUNCTION, body.write)
        mycurl.setopt(pycurl.USERAGENT, user_agent_string)
        mycurl.perform()
        val = body.getvalue()
        body.close()
        return key, val, None
    except Exception as e:  # pylint: disable=broad-except
        return key, None, str(e)


example_urls = [
    (12, 'http://www.herteldenbirname.com/wp-content/uploads/2014/05/Italia-Independent-Flocked-Aviator-Sunglasses-150x150.jpg'),
    (124, 'http://image.rakuten.co.jp/sneak/cabinet/shoes-03/cr-ucrocs5-a.jpg?_ex=128x128'),
    (146, 'http://www.slicingupeyeballs.com/wp-content/uploads/2009/05/stoneroses452.jpg'),
    (122, 'https://media.mwcradio.com/mimesis/2013-03/01/2013-03-01T153415Z_1_CBRE920179600_RTROPTP_3_TECH-US-GERMANY-EREADER_JPG_475x310_q85.jpg'),
    (282, 'https://8d1aee3bcc.site.internapcdn.net/00/images/media/5/5cfb2eba8f1f6244c6f7e261b9320a90-1.jpg'),
    (298, 'https://my-furniture.com.au/media/catalog/product/cache/1/small_image/295x295/9df78eab33525d08d6e5fb8d27136e95/a/u/au0019-stool-01.jpg'),
    (300, 'http://images.tastespotting.com/thumbnails/889506.jpg'),
    (330, 'https://www.infoworld.pk/wp-content/uploads/2016/02/Cool-HD-Valentines-Day-Wallpapers-480x300.jpeg'),
    (361, 'http://pendantscarf.com/image/cache/data/necklace/JW0013-(2)-150x150.jpg'),
    (408, 'https://www.solidrop.net/photo-6/animorphia-coloring-books-for-adults-children-drawing-book-secret-garden-style-relieve-stress-graffiti-painting-book.jpg'),
]

for item in example_urls:
    s = time.time()
    a = download_image(item, 2)
    print(time.time() - s)
```

important:
|
curl is not faster |
the tinyproxy solution is working!! instructions (with tinyproxy listening on 127.0.0.1:8888):

```python
import urllib.request

proxies = {'http': 'http://127.0.0.1:8888', 'https': 'https://127.0.0.1:8888'}
proxy_support = urllib.request.ProxyHandler(proxies)
opener = urllib.request.build_opener(proxy_support)
urllib.request.install_opener(opener)
```

in the download code, 2x faster |
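Put together with the earlier urllib snippet, the integration could look roughly like this (assuming tinyproxy is already running locally on port 8888, and that `download_image` and `example_urls` from the reproducing example are in scope):

```python
import urllib.request

# assumes tinyproxy (or any local HTTP proxy) is listening on 127.0.0.1:8888
proxies = {'http': 'http://127.0.0.1:8888', 'https': 'https://127.0.0.1:8888'}
opener = urllib.request.build_opener(urllib.request.ProxyHandler(proxies))
urllib.request.install_opener(opener)   # install once, before the download loop

for row in example_urls:
    print(download_image(row, 2))       # urllib downloader from the reproducing example
```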
but success rate is low... |
New idea: maintain a pool of fast domains and a pool of slow domains. |
New idea: keep track of domains that fail to resolve and don't even try to download those next time. Writing down the download time for each url in metadata can also help here for further analysis |
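A minimal sketch of what that bookkeeping could look like (all names and thresholds here are hypothetical, not from the codebase):

```python
from collections import defaultdict
from urllib.parse import urlparse

# hypothetical bookkeeping: skip domains that repeatedly fail to resolve,
# and record per-url durations so they can be written into the metadata later
failed_domains = set()
failure_counts = defaultdict(int)
durations = {}  # url -> seconds


def should_try(url):
    return urlparse(url).netloc not in failed_domains


def record_result(url, err, duration, max_failures=3):
    durations[url] = duration
    # "Name or service not known" is the usual DNS resolution error string on Linux
    if err is not None and "Name or service not known" in err:
        domain = urlparse(url).netloc
        failure_counts[domain] += 1
        if failure_counts[domain] >= max_failures:
            failed_domains.add(domain)
```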
to compute domain stats:

```python
import glob
from urllib.parse import urlparse

import pandas as pd

pq = list(glob.glob("*.parquet"))
dfs = [pd.read_parquet(p) for p in pq]
df = pd.concat(dfs)
df["domain"] = df["url"].apply(lambda url: urlparse(url).netloc)
df2 = df[["domain", "duration"]]
df2.groupby("domain").agg({'domain': 'size', 'duration': 'mean'}).sort_values("duration")[-10:]
```

but it seems like the slowest urls (up to 40s...) come from unique domains, among 10k items. Checking among more. |
Among 160k items there seems to be no clear correlation between domain and duration. |
OK, the only reasonable way forward here is to completely decouple the downloading from the rest, and then set up a clean benchmark along the lines of "here are 100 shards of 1000 items, how fast can you get them using whatever technology (other languages, libs, ...)" |
I implemented some new metrics and found that many urls time out after 20s, which clearly slows down everything.
Here are some examples:

```
Downloaded (12, 'http://www.herteldenbirname.com/wp-content/uploads/2014/05/Italia-Independent-Flocked-Aviator-Sunglasses-150x150.jpg') in 10.019284009933472
Downloaded (124, 'http://image.rakuten.co.jp/sneak/cabinet/shoes-03/cr-ucrocs5-a.jpg?_ex=128x128') in 10.01184344291687
Downloaded (146, 'http://www.slicingupeyeballs.com/wp-content/uploads/2009/05/stoneroses452.jpg') in 10.006474256515503
Downloaded (122, 'https://media.mwcradio.com/mimesis/2013-03/01/2013-03-01T153415Z_1_CBRE920179600_RTROPTP_3_TECH-US-GERMANY-EREADER_JPG_475x310_q85.jpg') in 10.241626739501953
Downloaded (282, 'https://8d1aee3bcc.site.internapcdn.net/00/images/media/5/5cfb2eba8f1f6244c6f7e261b9320a90-1.jpg') in 10.431355476379395
Downloaded (298, 'https://my-furniture.com.au/media/catalog/product/cache/1/small_image/295x295/9df78eab33525d08d6e5fb8d27136e95/a/u/au0019-stool-01.jpg') in 10.005694150924683
Downloaded (300, 'http://images.tastespotting.com/thumbnails/889506.jpg') in 10.007027387619019
Downloaded (330, 'https://www.infoworld.pk/wp-content/uploads/2016/02/Cool-HD-Valentines-Day-Wallpapers-480x300.jpeg') in 10.004335880279541
Downloaded (361, 'http://pendantscarf.com/image/cache/data/necklace/JW0013-(2)-150x150.jpg') in 10.00539231300354
Downloaded (408, 'https://www.solidrop.net/photo-6/animorphia-coloring-books-for-adults-children-drawing-book-secret-garden-style-relieve-stress-graffiti-painting-book.jpg') in 10.004313945770264
```
Let's try to implement a request timeout.
I tried #153, eventlet, and #260, and none of them can time out properly.
A good value for the timeout is 2s.