diff --git a/processor/DownloadProcessor.py b/processor/DownloadProcessor.py
index 55f2335..8199831 100644
--- a/processor/DownloadProcessor.py
+++ b/processor/DownloadProcessor.py
@@ -1,23 +1,39 @@
 from .BaseProcessor import BaseProcessor
+from .Expander import Expander
 from multiprocessing import Pool
+import io
 import os
 import hashlib
 import urllib.request
 import time
 import base64
-
+import traceback
+from PIL import Image
 
 class DownloadProcessor(BaseProcessor):
-    def __init__(self, output_directory='./', process_count=os.cpu_count()):
+    """
+    The Expander interface takes a single PIL image as input and outputs
+    a list of zero or more PIL images.
+
+    The nested _Expander below is a "noop" implementation that can be
+    replaced with something more interesting (e.g. an object or face
+    extractor).
+
+    The expander must close the image whenever it is not returned back to
+    the caller. The caller must close all images returned by expand().
+    """
+    class _Expander(Expander):
+        def expand(self, image):
+            return [image]
+
+    def __init__(self, output_directory='./', process_count=os.cpu_count(), expander=_Expander()):
         self.output_directory = output_directory
         self.process_count = process_count
+        self.expander = expander
 
     def before_process(self, search_term):
-        #
         # Create folder for downloads based on search term.
-        #
         self.gs_raw_dirpath = os.path.join(self.output_directory, search_term.strip().replace(' ', '_'))
         if not os.path.exists(self.gs_raw_dirpath):
             os.makedirs(self.gs_raw_dirpath)
@@ -42,61 +58,58 @@ def download_single_image(self, params):
         self.download_fault = 0
         timeout = 1
-        if preview_url.startswith("http://") or preview_url.startswith("https://"):
-            try:
+        try:
+            if preview_url.startswith("http://") or preview_url.startswith("https://"):
                 response = urllib.request.urlopen(preview_url, data=None, timeout=timeout)
                 if response.headers['Content-Type'] == "image/jpeg":
-                    file_ext = ".jpg"
+                    file_ext = "jpg"
                 elif response.headers['Content-Type'] == "image/png":
-                    file_ext = ".png"
+                    file_ext = "png"
                 elif response.headers['Content-Type'] == "image/gif":
-                    file_ext = ".gif"
+                    file_ext = "gif"
                 else:
                     raise "image format not found"
                 data = response.read()  # a `bytes` object
-                if len(data) > 0:
+            elif preview_url.startswith("data:"):
+                if preview_url.startswith("data:image/jpeg"):
+                    file_ext = "jpg"
+                elif preview_url.startswith("data:image/png"):
+                    file_ext = "png"
+                elif preview_url.startswith("data:image/gif"):
+                    file_ext = "gif"
+                else:
+                    raise ValueError("image format not found")
+                preview_url = preview_url[preview_url.find(",") + 1:]
+                data = base64.standard_b64decode(preview_url)
+            # PIL's save() expects the full format name "jpeg", not "jpg".
+            image_format = file_ext
+            if file_ext == 'jpg':
+                image_format = 'jpeg'
+            if len(data) > 0:
+                images = self.expander.expand(Image.open(io.BytesIO(data)))
+                # Re-encode each expanded image; its bytes determine the MD5 filename.
+                for image in images:
+                    output = io.BytesIO()
+                    image.save(output, format=image_format)
+                    data = output.getvalue()
+                    output.close()
+                    image.close()
                     md5_key = hashlib.md5(data).hexdigest()
                     pic_prefix_str = '%s.%s' % (search_term, md5_key)
-                    temp_filename = pic_prefix_str + file_ext
+                    temp_filename = '%s.%s' % (pic_prefix_str, file_ext)
                     temp_filename_full_path = os.path.join(self.gs_raw_dirpath, temp_filename)
                     # If file exists, could be rare MD5 collision or, more likely,
                     # is the same exact image downloaded from a different location
                     if not os.path.exists(temp_filename_full_path):
                         info_txt_path = os.path.join(self.gs_raw_dirpath, search_term + '_info.txt')
-                        f = open(temp_filename_full_path, 'wb')  # save as test.gif
-                        # print(url_link)
-                        f.write(data)  # if have problem skip
-                        f.close()
-                        with open(info_txt_path, 'a') as f:
-                            f.write(pic_prefix_str + ': ' + original_url)
-                            f.write('\n')
-            except:
-                print('Problem with processing this data: ', original_url)
-                self.download_fault = 1
-                pass
-        elif preview_url.startswith("data:"):
-            if preview_url.startswith("data:image/jpeg"):
-                file_ext = ".jpg"
-            elif preview_url.startswith("data:image/png"):
-                file_ext = ".png"
-            elif preview_url.startswith("data:image/gif"):
-                file_ext = ".gif"
-            else:
-                raise "image format not found"
-            data = base64.standard_b64decode(preview_url)
-            if len(data) > 0:
-                md5_key = hashlib.md5(data).hexdigest()
-                pic_prefix_str = '%s.%s' % (search_term, md5_key)
-                temp_filename = pic_prefix_str + file_ext
-                temp_filename_full_path = os.path.join(self.gs_raw_dirpath, temp_filename)
-                if not os.path.exists(temp_filename_full_path):
-                    info_txt_path = os.path.join(self.gs_raw_dirpath, search_term + '_info.txt')
-                    with open(temp_filename_full_path, "wb") as fh:
-                        preview_url = preview_url[preview_url.find(",") + 1:]
-                        fh.write(base64.standard_b64decode(preview_url))
+                        with open(temp_filename_full_path, 'wb') as f:
+                            f.write(data)
                         with open(info_txt_path, 'a') as f:
                             f.write(pic_prefix_str + ': ' + original_url)
                             f.write('\n')
+        except Exception as ex:
+            print('Problem with processing this data: ', original_url)
+            traceback.print_exc()
+            print(ex)
+            self.download_fault = 1
 
     def after_process(self, search_term):
         thread_pool = Pool(processes=self.process_count)
diff --git a/processor/Expander.py b/processor/Expander.py
new file mode 100644
index 0000000..bb9dc4a
--- /dev/null
+++ b/processor/Expander.py
@@ -0,0 +1,6 @@
+from abc import ABC, abstractmethod
+
+class Expander(ABC):
+    @abstractmethod
+    def expand(self, image):
+        pass
diff --git a/setup.py b/setup.py
index 0645dff..2e35fb0 100644
--- a/setup.py
+++ b/setup.py
@@ -6,7 +6,7 @@
     sys.exit('Sorry, Python < 3.3 is not supported')
 
 setup(name='imagecrawler',
-      version='0.1',
+      version='0.2',
       description='Selenium Image Crawler',
      url='https://github.com/scirag/selenium-image-crawler',
      author='Şafak ÇIRAĞ',
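
An illustration of the new extension point (a sketch, not part of this patch): a custom Expander that keeps the original image and adds a 128x128 thumbnail. The class name ThumbnailExpander is hypothetical; the code relies only on Pillow's Image.copy() and Image.thumbnail() and follows the ownership contract documented on DownloadProcessor (the expander closes any image it does not return; the caller closes everything it gets back).

    from processor.Expander import Expander

    class ThumbnailExpander(Expander):
        """Hypothetical expander: emits the original image plus a thumbnail."""

        def expand(self, image):
            # Image.thumbnail() resizes in place, so thumbnail a copy.
            thumb = image.copy()
            thumb.thumbnail((128, 128))
            # Both images are returned, so neither is closed here; the
            # caller (DownloadProcessor) closes them after saving.
            return [image, thumb]

Wired in through the new constructor parameter: DownloadProcessor(expander=ThumbnailExpander()). Each returned image is re-encoded and saved under its own MD5-based filename, so the original and the thumbnail land side by side in the search-term directory.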