diff --git a/README.md b/README.md
index 3d83b01..aaf4f85 100644
--- a/README.md
+++ b/README.md
@@ -47,6 +47,8 @@ Options
 -g, --injected        Scrape injected images
 --proxy-server PROXY_SERVER
                       Proxy server to use
+--min-filesize MIN_FILESIZE
+                      Limit on size of image in bytes
 --max-filesize MAX_FILESIZE
                       Limit on size of image in bytes
 --dump-urls           Print the URLs of the images
diff --git a/README.rst b/README.rst
index 894b02d..290e3e6 100644
--- a/README.rst
+++ b/README.rst
@@ -63,6 +63,7 @@ Options
 -s, --save-dir        Name of the folder to save the images
 -g, --injected        Scrape injected images
 --formats [ [FORMATS ..]]  Specify the formats of images to be scraped
+--min-filesize        Limit on size of image in bytes (default: 0)
 --max-filesize        Limit on size of image in bytes (default: 100000000)
 --dump-urls           Print the URLs of the images
 --scrape-reverse      Scrape the images in reverse order
diff --git a/image_scraper/mains.py b/image_scraper/mains.py
index 794b319..7c849d5 100644
--- a/image_scraper/mains.py
+++ b/image_scraper/mains.py
@@ -60,7 +60,7 @@ def console_main():
         for img_url in scraper.images:
             print(img_url)
 
-    status_flags = {'count': 0, 'percent': 0.0, 'failed': 0, 'over_max_filesize': 0}
+    status_flags = {'count': 0, 'percent': 0.0, 'failed': 0, 'under_min_or_over_max_filesize': 0}
     widgets = ['Progress: ', Percentage(), ' ', Bar(marker=RotatingMarker()),
                ' ', ETA(), ' ', FileTransferSpeed()]
     pbar = ProgressBar(widgets=widgets, maxval=100).start()
@@ -74,7 +74,7 @@ def console_main():
     pool.shutdown(wait=True)
     pbar.finish()
     print("\nDone!\nDownloaded {0} images\nFailed: {1}\n".format(
-        status_flags['count']-status_flags['failed']-status_flags['over_max_filesize'],
+        status_flags['count']-status_flags['failed']-status_flags['under_min_or_over_max_filesize'],
         status_flags['failed']))
     return
 
diff --git a/image_scraper/utils.py b/image_scraper/utils.py
index b1724d8..65aaaf3 100644
--- a/image_scraper/utils.py
+++ b/image_scraper/utils.py
@@ -20,6 +20,7 @@ def __init__(self):
         self.no_to_download = 0
         self.format_list = []
         self.download_path = "images"
+        self.min_filesize = 0
         self.max_filesize = 100000000
         self.dump_urls = False
         self.scrape_reverse = False
@@ -46,6 +47,8 @@ def get_arguments(self):
                             action="store_true")
         parser.add_argument('--proxy-server', type=str, default=None,
                             help="Proxy server to use")
+        parser.add_argument('--min-filesize', type=int, default=0,
+                            help="Limit on size of image in bytes")
         parser.add_argument('--max-filesize', type=int, default=100000000,
                             help="Limit on size of image in bytes")
         parser.add_argument('--dump-urls', default=False,
@@ -75,6 +78,7 @@ def get_arguments(self):
         self.use_ghost = args.injected
         self.format_list = args.formats if args.formats else [
             "jpg", "png", "gif", "svg", "jpeg"]
+        self.min_filesize = args.min_filesize
         self.max_filesize = args.max_filesize
         self.dump_urls = args.dump_urls
         self.proxy_url = args.proxy_server
@@ -91,7 +95,7 @@ def get_arguments(self):
         self.filename_pattern = args.filename_pattern
         self.nthreads = args.nthreads
         return (self.url, self.no_to_download, self.format_list,
-                self.download_path, self.max_filesize,
+                self.download_path, self.min_filesize, self.max_filesize,
                 self.dump_urls, self.scrape_reverse, self.use_ghost, self.filename_pattern)
 
     def get_html(self):
@@ -192,7 +196,8 @@ def download_image(self, img_url):
         except:
             raise ImageDownloadError()
 
-        if img_url[-3:] == "svg" or int(img_request.headers['content-length']) < self.max_filesize:
+        if img_url[-3:] == "svg" or (int(img_request.headers['content-length']) > self.min_filesize and
+                int(img_request.headers['content-length']) < self.max_filesize):
             img_content = img_request.content
             with open(os.path.join(self.download_path, img_url.split('/')[-1]), 'wb') as f:
                 byte_image = bytes(img_content)
@@ -224,7 +229,7 @@ def download_worker_fn(scraper, img_url, pbar, status_flags, status_lock):
         if failed:
             status_flags['failed'] += 1
         elif size_failed:
-            status_flags['over_max_filesize'] += 1
+            status_flags['under_min_or_over_max_filesize'] += 1
         status_flags['percent'] = status_flags[
             'percent'] + old_div(100.0, scraper.no_to_download)
         pbar.update(status_flags['percent'] % 100)
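
For reference, a minimal standalone sketch of the size filter this patch adds to `download_image` in `image_scraper/utils.py`. The helper name and the example URL below are hypothetical; the real code reads the limits from `self.min_filesize` and `self.max_filesize` and takes the size from the `content-length` response header.

```python
def passes_size_filter(img_url, content_length, min_filesize=0, max_filesize=100000000):
    # Hypothetical helper mirroring the patched check: SVGs always pass,
    # other images must have a Content-Length strictly between the two limits.
    if img_url[-3:] == "svg":
        return True
    return min_filesize < int(content_length) < max_filesize

# A 2 KB PNG passes with the defaults, but is skipped once --min-filesize exceeds it.
print(passes_size_filter("http://example.com/pic.png", "2048"))                     # True
print(passes_size_filter("http://example.com/pic.png", "2048", min_filesize=4096))  # False
```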