Skip to content

Commit

Permalink
Merge a6f4d7d into 68c39fd
Browse files Browse the repository at this point in the history
  • Loading branch information
eugriner committed Nov 13, 2017
2 parents 68c39fd + a6f4d7d commit 4fb2aa8
Show file tree
Hide file tree
Showing 4 changed files with 13 additions and 5 deletions.
2 changes: 2 additions & 0 deletions README.md
Expand Up @@ -47,6 +47,8 @@ Options
-g, --injected Scrape injected images
--proxy-server PROXY_SERVER
Proxy server to use
  --min-filesize MIN_FILESIZE
                        Lower limit on size of image in bytes; smaller images are skipped
  --max-filesize MAX_FILESIZE
                        Upper limit on size of image in bytes; larger images are skipped
--dump-urls Print the URLs of the images
Expand Down
1 change: 1 addition & 0 deletions README.rst
Expand Up @@ -63,6 +63,7 @@ Options
-s, --save-dir <path> Name of the folder to save the images
-g, --injected Scrape injected images
--formats [ [FORMATS ..]] Specify the formats of images to be scraped
  --min-filesize <size>     Lower limit on size of image in bytes (default: 0)
  --max-filesize <size>     Upper limit on size of image in bytes (default: 100000000)
--dump-urls Print the URLs of the images
--scrape-reverse Scrape the images in reverse order
Expand Down
4 changes: 2 additions & 2 deletions image_scraper/mains.py
Expand Up @@ -60,7 +60,7 @@ def console_main():
for img_url in scraper.images:
print(img_url)

status_flags = {'count': 0, 'percent': 0.0, 'failed': 0, 'over_max_filesize': 0}
status_flags = {'count': 0, 'percent': 0.0, 'failed': 0, 'under_min_or_over_max_filesize': 0}
widgets = ['Progress: ', Percentage(), ' ', Bar(marker=RotatingMarker()),
' ', ETA(), ' ', FileTransferSpeed()]
pbar = ProgressBar(widgets=widgets, maxval=100).start()
Expand All @@ -74,7 +74,7 @@ def console_main():
pool.shutdown(wait=True)
pbar.finish()
print("\nDone!\nDownloaded {0} images\nFailed: {1}\n".format(
status_flags['count']-status_flags['failed']-status_flags['over_max_filesize'],
status_flags['count']-status_flags['failed']-status_flags['under_min_or_over_max_filesize'],
status_flags['failed']))
return

11 changes: 8 additions & 3 deletions image_scraper/utils.py
Expand Up @@ -20,6 +20,7 @@ def __init__(self):
self.no_to_download = 0
self.format_list = []
self.download_path = "images"
self.min_filesize = 0
self.max_filesize = 100000000
self.dump_urls = False
self.scrape_reverse = False
Expand All @@ -46,6 +47,8 @@ def get_arguments(self):
action="store_true")
parser.add_argument('--proxy-server', type=str, default=None,
help="Proxy server to use")
parser.add_argument('--min-filesize', type=int, default=0,
help="Limit on size of image in bytes")
parser.add_argument('--max-filesize', type=int, default=100000000,
help="Limit on size of image in bytes")
parser.add_argument('--dump-urls', default=False,
Expand Down Expand Up @@ -75,6 +78,7 @@ def get_arguments(self):
self.use_ghost = args.injected
self.format_list = args.formats if args.formats else [
"jpg", "png", "gif", "svg", "jpeg"]
self.min_filesize = args.min_filesize
self.max_filesize = args.max_filesize
self.dump_urls = args.dump_urls
self.proxy_url = args.proxy_server
Expand All @@ -91,7 +95,7 @@ def get_arguments(self):
self.filename_pattern = args.filename_pattern
self.nthreads = args.nthreads
return (self.url, self.no_to_download, self.format_list,
self.download_path, self.max_filesize,
self.download_path, self.min_filesize, self.max_filesize,
self.dump_urls, self.scrape_reverse, self.use_ghost, self.filename_pattern)

def get_html(self):
Expand Down Expand Up @@ -192,7 +196,8 @@ def download_image(self, img_url):
except:
raise ImageDownloadError()

if img_url[-3:] == "svg" or int(img_request.headers['content-length']) < self.max_filesize:
if img_url[-3:] == "svg" or (int(img_request.headers['content-length']) > self.min_filesize and\
int(img_request.headers['content-length']) < self.max_filesize):
img_content = img_request.content
with open(os.path.join(self.download_path, img_url.split('/')[-1]), 'wb') as f:
byte_image = bytes(img_content)
Expand Down Expand Up @@ -224,7 +229,7 @@ def download_worker_fn(scraper, img_url, pbar, status_flags, status_lock):
if failed:
status_flags['failed'] += 1
elif size_failed:
status_flags['over_max_filesize'] += 1
status_flags['under_min_or_over_max_filesize'] += 1
status_flags['percent'] = status_flags[
'percent'] + old_div(100.0, scraper.no_to_download)
pbar.update(status_flags['percent'] % 100)
Expand Down

0 comments on commit 4fb2aa8

Please sign in to comment.