Skip to content

Commit

Permalink
New Expander interface
Browse files Browse the repository at this point in the history
Bumped version number to 0.2.

Consolidated processing of downloaded image data.

Convert downloaded data to PIL image to allow for transformations.
Also, MD5 checksum is computed on the final compressed output.

Added an Expander interface to provide an option to explode a downloaded
image into multiple images. It can also be used as a filter based on image
content.

Bug fix for extracting embedded MIME images. Restored line from original source
to start decoding from specific offset.

Signed-off-by: John Poplett <john.poplett@acm.org>
  • Loading branch information
John-Poplett committed Nov 21, 2017
1 parent de938f2 commit 40073c9
Show file tree
Hide file tree
Showing 3 changed files with 62 additions and 43 deletions.
97 changes: 55 additions & 42 deletions processor/DownloadProcessor.py
@@ -1,23 +1,39 @@
from .BaseProcessor import BaseProcessor
from .Expander import Expander
from multiprocessing import Pool
import io
import os
import hashlib
import urllib.request
import time
import base64

import traceback
from PIL import Image

class DownloadProcessor(BaseProcessor):

def __init__(self, output_directory='./', process_count=os.cpu_count()):
"""
The Expander interface takes a single PIL image as input and outputs
a list of zero or more PIL images.
This "noop" expander that can be replaced with something more interesting
(e.g. an object or face extractor).
The expander must close the image whenever it is not returned back to the
caller. The caller must close all images returned by this method.
"""
class _Expander(Expander):
def expand(self, image):
return [image]

def __init__(self, output_directory='./', process_count=None, expander=None):
    """Initialize the download processor.

    :param output_directory: base directory under which per-search-term
        download folders are created.
    :param process_count: size of the worker pool used after
        processing; defaults to ``os.cpu_count()``.
    :param expander: :class:`Expander` used to turn each downloaded
        image into zero or more output images; defaults to a no-op
        expander that returns the image unchanged.
    """
    self.output_directory = output_directory
    # Resolve defaults at call time rather than at class-definition
    # time: the original `process_count=os.cpu_count()` and
    # `expander=_Expander()` defaults were evaluated once when the
    # class was defined, so every instance shared one expander object.
    self.process_count = process_count if process_count is not None else os.cpu_count()
    self.expander = expander if expander is not None else self._Expander()

def before_process(self, search_term):
    """Create the download folder for *search_term* before processing.

    The folder is ``<output_directory>/<search_term>`` with spaces
    replaced by underscores; its path is remembered in
    ``self.gs_raw_dirpath`` for use by the download step.
    """

    #
    # Create folder for downloads based on search term.
    #
    self.gs_raw_dirpath = os.path.join(self.output_directory, search_term.strip().replace(' ', '_'))
    if not os.path.exists(self.gs_raw_dirpath):
        os.makedirs(self.gs_raw_dirpath)
Expand All @@ -42,61 +58,58 @@ def download_single_image(self, params):
self.download_fault = 0

timeout = 1
if preview_url.startswith("http://") or preview_url.startswith("https://"):
try:
try:
if preview_url.startswith("http://") or preview_url.startswith("https://"):
response = urllib.request.urlopen(preview_url, data=None, timeout=timeout)
if response.headers['Content-Type'] == "image/jpeg":
file_ext = ".jpg"
file_ext = "jpg"
elif response.headers['Content-Type'] == "image/png":
file_ext = ".png"
file_ext = "png"
elif response.headers['Content-Type'] == "image/gif":
file_ext = ".gif"
file_ext = "gif"
else:
raise "image format not found"
data = response.read() # a `bytes` object
if len(data) > 0:
elif preview_url.startswith("data:"):
if preview_url.startswith("data:image/jpeg"):
file_ext = "jpg"
elif preview_url.startswith("data:image/png"):
file_ext = "png"
elif preview_url.startswith("data:image/gif"):
file_ext = "gif"
else:
raise "image format not found"
preview_url = preview_url[preview_url.find(",") + 1:]
data = base64.standard_b64decode(preview_url)
image_format = file_ext
if file_ext == 'jpg':
image_format = 'jpeg'
if len(data) > 0:
images = self.expander.expand(Image.open(io.BytesIO(data)))
for image in images:
output = io.BytesIO()
image.save(output, format = image_format)
data = output.getvalue()
output.close()
image.close()
md5_key = hashlib.md5(data).hexdigest()
pic_prefix_str = '%s.%s' % (search_term, md5_key)
temp_filename = pic_prefix_str + file_ext
temp_filename = '%s.%s' % (pic_prefix_str, file_ext)
temp_filename_full_path = os.path.join(self.gs_raw_dirpath, temp_filename)
# If file exists, could be rare MD5 collision or, more likely,
# is the same exact image downloaded from a different location
if not os.path.exists(temp_filename_full_path):
info_txt_path = os.path.join(self.gs_raw_dirpath, search_term + '_info.txt')
f = open(temp_filename_full_path, 'wb') # save as test.gif
# print(url_link)
f.write(data) # if have problem skip
f.close()
with open(info_txt_path, 'a') as f:
f.write(pic_prefix_str + ': ' + original_url)
f.write('\n')
except:
print('Problem with processing this data: ', original_url)
self.download_fault = 1
pass
elif preview_url.startswith("data:"):
if preview_url.startswith("data:image/jpeg"):
file_ext = ".jpg"
elif preview_url.startswith("data:image/png"):
file_ext = ".png"
elif preview_url.startswith("data:image/gif"):
file_ext = ".gif"
else:
raise "image format not found"
data = base64.standard_b64decode(preview_url)
if len(data) > 0:
md5_key = hashlib.md5(data).hexdigest()
pic_prefix_str = '%s.%s' % (search_term, md5_key)
temp_filename = pic_prefix_str + file_ext
temp_filename_full_path = os.path.join(self.gs_raw_dirpath, temp_filename)
if not os.path.exists(temp_filename_full_path):
info_txt_path = os.path.join(self.gs_raw_dirpath, search_term + '_info.txt')
with open(temp_filename_full_path, "wb") as fh:
preview_url = preview_url[preview_url.find(",") + 1:]
fh.write(base64.standard_b64decode(preview_url))
with open(temp_filename_full_path, 'wb') as f:
f.write(data)
with open(info_txt_path, 'a') as f:
f.write(pic_prefix_str + ': ' + original_url)
f.write('\n')
except Exception as ex:
print('Problem with processing this data: ', original_url)
traceback.print_exc()
print(ex)
self.download_fault = 1

def after_process(self, search_term):
thread_pool = Pool(processes=self.process_count)
Expand Down
6 changes: 6 additions & 0 deletions processor/Expander.py
@@ -0,0 +1,6 @@
from abc import ABC, abstractmethod

class Expander(ABC):
    """Strategy interface that maps one PIL image to a list of images.

    Concrete subclasses implement :meth:`expand` to explode, filter,
    or pass through a downloaded image.
    """

    @abstractmethod
    def expand(self, image):
        """Return a list of zero or more images derived from *image*."""
2 changes: 1 addition & 1 deletion setup.py
Expand Up @@ -6,7 +6,7 @@
sys.exit('Sorry, Python < 3.3 is not supported')

setup(name='imagecrawler',
version='0.1',
version='0.2',
description='Selenium Image Crawler',
url='https://github.com/scirag/selenium-image-crawler',
author='Şafak ÇIRAĞ',
Expand Down

0 comments on commit 40073c9

Please sign in to comment.