
Commit

Merge pull request #58 from srirams6/dev
Moved all functions into a class.
ssundarraj committed Apr 1, 2015
2 parents 1f205d2 + 18335af commit c70e371
Showing 3 changed files with 177 additions and 142 deletions.
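In effect, the free functions in image_scraper/utils.py (get_arguments, get_html, get_img_list, process_download_path, download_image, process_links) become methods on a single ImageScraper instance that carries the shared state (url, images, download_path, and so on). A minimal sketch of the new call sequence, mirroring the refactored console_main in the diff below (the URL and format list here are illustrative and not part of the commit):

    from image_scraper.utils import ImageScraper

    scraper = ImageScraper()
    scraper.url = "http://example.com"   # normally filled in by scraper.get_arguments() from the CLI
    scraper.format_list = ["jpg", "png", "gif", "svg", "jpeg"]
    scraper.get_html()                   # sets scraper.page_html and scraper.page_url
    scraper.get_img_list()               # sets scraper.images
    scraper.process_download_path()      # validates or creates scraper.download_path ("images" by default)
    for img_url in scraper.images:
        scraper.download_image(img_url)  # raises ImageDownloadError / ImageSizeError on failure
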
45 changes: 19 additions & 26 deletions image_scraper/mains.py
@@ -4,47 +4,41 @@
 from past.utils import old_div
 import sys
 from .progressbar import *
-from .utils import (process_links, get_html, get_img_list,
-                    download_image, process_download_path,
-                    get_arguments)
+from .utils import ImageScraper
 from .exceptions import *


 def console_main():
-    URL, no_to_download, format_list, download_path, max_filesize, dump_urls, scrape_reverse, use_ghost = get_arguments()
+    scraper = ImageScraper()
+    scraper.get_arguments()
     print("\nImageScraper\n============\nRequesting page....\n")

     try:
-        page_html, page_url = get_html(URL, use_ghost)
+        scraper.get_html()
     except PageLoadError as e:
-        page_html = ""
-        page_url = ""
+        scraper.page_html = ""
+        scraper.page_url = ""
         print("Page failed to load. Status code: {0}".format(e.status_code))
         sys.exit()

-    images = get_img_list(page_html, page_url, format_list)
+    scraper.get_img_list()

-    if len(images) == 0:
+    if len(scraper.images) == 0:
         sys.exit("Sorry, no images found.")
-    if no_to_download is None:
-        no_to_download = len(images)
+    if scraper.no_to_download is None:
+        scraper.no_to_download = len(scraper.images)

-    print("Found {0} images: ".format(len(images)))
+    print("Found {0} images: ".format(len(scraper.images)))

     try:
-        process_download_path(download_path)
+        scraper.process_download_path()
     except DirectoryAccessError:
         print("Sorry, the directory can't be accessed.")
         sys.exit()
     except DirectoryCreateError:
         print("Sorry, the directory can't be created.")
         sys.exit()

-    if scrape_reverse:
-        images.reverse()
-
-    if dump_urls:
-        for img_url in images:
+    if scraper.dump_urls:
+        for img_url in scraper.images:
             print(img_url)

     count = 0
@@ -55,30 +49,29 @@ def console_main():
                ' ', ETA(), ' ', FileTransferSpeed()]
     pbar = ProgressBar(widgets=widgets, maxval=100).start()

-    for img_url in images:
-        if count == no_to_download:
+    for img_url in scraper.images:
+        if count == scraper.no_to_download:
             break
         try:
-            download_image(img_url, download_path, max_filesize)
+            scraper.download_image(img_url)
         except ImageDownloadError:
             failed += 1
         except ImageSizeError:
             over_max_filesize += 1

         count += 1
-        percent = percent + old_div(100.0, no_to_download)
+        percent = percent + old_div(100.0, scraper.no_to_download)
         pbar.update(percent % 100)

     pbar.finish()
     print("\nDone!\nDownloaded {0} images\nFailed: {1}\n".format(count-failed-over_max_filesize, failed))
     return


 def scrape_images(url, no_to_download=None,
                   format_list=["jpg", "png", "gif", "svg", "jpeg"],
                   download_path='images', max_filesize=100000000,
                   dump_urls=False, use_ghost=False):

+    # Broken due to wrapping in class. Need to fix!
     page_html, page_url = get_html(url, use_ghost)
     images = get_img_list(page_html, page_url, format_list)

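Note the new comment in scrape_images above: it still calls the removed module-level get_html and get_img_list, so the library entry point is left broken by this commit. One possible direction for a fix, sketched here purely as an illustration (it is not part of this commit, and it assumes attributes can simply be assigned on an ImageScraper instance):

    def scrape_images(url, no_to_download=None,
                      format_list=["jpg", "png", "gif", "svg", "jpeg"],
                      download_path='images', max_filesize=100000000,
                      dump_urls=False, use_ghost=False):
        # Hypothetical rewrite: route everything through one ImageScraper instance
        scraper = ImageScraper()
        scraper.url = url
        scraper.format_list = format_list
        scraper.download_path = download_path
        scraper.max_filesize = max_filesize
        scraper.use_ghost = use_ghost
        scraper.get_html()
        scraper.get_img_list()
        if dump_urls:
            for img_url in scraper.images:
                print(img_url)
        scraper.process_download_path()
        limit = no_to_download if no_to_download is not None else len(scraper.images)
        for img_url in scraper.images[:limit]:
            scraper.download_image(img_url)
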
243 changes: 136 additions & 107 deletions image_scraper/utils.py
@@ -9,123 +9,152 @@
 import re
 from image_scraper.exceptions import *

+class ImageScraper:
+    url = None
+    no_to_download = 0
+    format_list = []
+    download_path = "images"
+    max_filesize = 100000000
+    dump_urls = False
+    scrape_reverse = False
+    use_ghost = False
+    page_html = None
+    page_url = None
+    images = None

-def process_links(links, formats=["jpg", "png", "gif", "svg", "jpeg"]):
-    x = []
-    for l in links:
-        if os.path.splitext(l)[1][1:].strip().lower() in formats:
-            x.append(l)
-    return x
+    def __init__(self):
+        url = None
+        no_to_download = 0
+        format_list = []
+        download_path = "images"
+        max_filesize = 100000000
+        dump_urls = False
+        scrape_reverse = False
+        use_ghost = False
+        images = None

+    def get_arguments(self):
+        parser = argparse.ArgumentParser(
+            description='Downloads images from given URL')
+        parser.add_argument('url2scrape', nargs=1, help="URL to scrape")
+        parser.add_argument('-m', '--max-images', type=int, default=None,
+                            help="Limit on number of images\n")
+        parser.add_argument('-s', '--save-dir', type=str, default="images",
+                            help="Directory in which images should be saved")
+        parser.add_argument('-g', '--injected', help="Scrape injected images",
+                            action="store_true")
+        parser.add_argument('--max-filesize', type=int, default=100000000,
+                            help="Limit on size of image in bytes")
+        parser.add_argument('--dump-urls', default=False,
+                            help="Print the URLs of the images",
+                            action="store_true")
+        parser.add_argument('--formats', nargs="*", default=None,
+                            help="Specify formats in a list without any separator. This argument must be after the URL.")
+        parser.add_argument('--scrape-reverse', default=False,
+                            help="Scrape the images in reverse order",
+                            action="store_true")
+        args = parser.parse_args()
+        self.url = args.url2scrape[0]
+        if not re.match(r'^[a-zA-Z]+://', self.url):
+            self.url = 'http://' + self.url
+        self.no_to_download = args.max_images
+        save_dir = args.save_dir + '_{uri.netloc}'.format(
+            uri = urlparse(self.url))
+        if args.save_dir != "images":
+            save_dir = args.save_dir
+        self.download_path = os.path.join(os.getcwd(), save_dir)
+        self.use_ghost = args.injected
+        self.format_list = args.formats if args.formats else ["jpg", "png", "gif", "svg", "jpeg"]
+        self.max_filesize = args.max_filesize
+        self.dump_urls = args.dump_urls
+        self.scrape_reverse = args.scrape_reverse
+        return (self.url, self.no_to_download, self.format_list, self.download_path, self.max_filesize,
+                self.dump_urls, self.scrape_reverse, self.use_ghost)

-def get_arguments():
-    parser = argparse.ArgumentParser(
-        description='Downloads images from given URL')
-    parser.add_argument('url2scrape', nargs=1, help="URL to scrape")
-    parser.add_argument('-m', '--max-images', type=int, default=None,
-                        help="Limit on number of images\n")
-    parser.add_argument('-s', '--save-dir', type=str, default="images",
-                        help="Directory in which images should be saved")
-    parser.add_argument('-g', '--injected', help="Scrape injected images",
-                        action="store_true")
-    parser.add_argument('--max-filesize', type=int, default=100000000,
-                        help="Limit on size of image in bytes")
-    parser.add_argument('--dump-urls', default=False,
-                        help="Print the URLs of the images",
-                        action="store_true")
-    parser.add_argument('--formats', nargs="*", default=None,
-                        help="Specify formats in a list without any separator. This argument must be after the URL.")
-    parser.add_argument('--scrape-reverse', default=False,
-                        help="Scrape the images in reverse order",
-                        action="store_true")
-    args = parser.parse_args()
-    URL = args.url2scrape[0]
-    if not re.match(r'^[a-zA-Z]+://', URL):
-        URL = 'http://' + URL
-    no_to_download = args.max_images
-    save_dir = args.save_dir + '_{uri.netloc}'.format(
-        uri = urlparse(URL))
-    if args.save_dir != "images":
-        save_dir = args.save_dir
-    download_path = os.path.join(os.getcwd(), save_dir)
-    use_ghost = args.injected
-    format_list = args.formats if args.formats else ["jpg", "png", "gif", "svg", "jpeg"]
-    max_filesize = args.max_filesize
-    dump_urls = args.dump_urls
-    scrape_reverse = args.scrape_reverse
-    return (URL, no_to_download, format_list, download_path, max_filesize,
-            dump_urls, scrape_reverse, use_ghost)

+    def get_html(self):
+        if self.use_ghost:
+            self.url = urljoin("http://", self.url)
+            import selenium
+            import selenium.webdriver
+            driver = selenium.webdriver.PhantomJS(service_log_path=os.path.devnull)
+            driver.get(self.url)
+            page_html = driver.page_source
+            page_url = driver.current_url
+            driver.quit()
+        else:
+            try:
+                page = requests.get(self.url)
+                if page.status_code != 200:
+                    raise PageLoadError(page.status_code)
+            except requests.exceptions.MissingSchema:
+                self.url = "http://" + self.url
+                page = requests.get(self.url)
+                if page.status_code != 200:
+                    raise PageLoadError(page.status_code)
+            finally:
+                page_html = page.text
+                page_url = page.url
+        self.page_html = page_html
+        self.page_url = page_url
+        return (self.page_html, self.page_url)

-def process_download_path(download_path):
-    if os.path.exists(download_path):
-        if not os.access(download_path, os.W_OK):
-            raise DirectoryAccessError
-    elif os.access(os.path.dirname(download_path), os.W_OK):
-        os.makedirs(download_path)
-    else:
-        raise DirectoryCreateError
-    return True

+    def get_img_list(self):
+        tree = html.fromstring(self.page_html)
+        img = tree.xpath('//img/@src')
+        links = tree.xpath('//a/@href')
+        img_list = self.process_links(img)
+        img_links = self.process_links(links)
+        img_list.extend(img_links)
+        images = [urljoin(self.url, img_url) for img_url in img_list]
+        images = list(set(images))
+        self.images = images
+        if self.scrape_reverse:
+            self.images.reverse()
+        return self.images

-def get_html(URL, use_ghost):
-    if use_ghost:
-        URL = urljoin("http://", URL)
-        import selenium
-        import selenium.webdriver
-        driver = selenium.webdriver.PhantomJS(service_log_path=os.path.devnull)
-        driver.get(URL)
-        page_html = driver.page_source
-        page_url = driver.current_url
-        driver.quit()
-    else:
-        try:
-            page = requests.get(URL)
-        except requests.exceptions.MissingSchema:
-            URL = "http://" + URL
-            page = requests.get(URL)
-        finally:
-            if page.status_code != 200:
-                raise PageLoadError(page.status_code)
-            page_html = page.text
-            page_url = page.url
-    return (page_html, page_url)
+    def process_download_path(self):
+        if os.path.exists(self.download_path):
+            if not os.access(self.download_path, os.W_OK):
+                raise DirectoryAccessError
+        elif os.access(os.path.dirname(self.download_path), os.W_OK):
+            os.makedirs(self.download_path)
+        else:
+            raise DirectoryCreateError
+        return True

+    def download_image(self, img_url):
+        img_request = None
+        success_flag = True
+        size_success_flag = True
+        try:
+            img_request = requests.request('get', img_url, stream=True)
+            if img_request.status_code != 200:
+                raise ImageDownloadError(img_request.status_code)
+        except:
+            raise ImageDownloadError()

-def get_img_list(page_html, page_url, format_list):
-    tree = html.fromstring(page_html)
-    img = tree.xpath('//img/@src')
-    links = tree.xpath('//a/@href')
-    img_list = process_links(img, format_list)
-    img_links = process_links(links, format_list)
-    img_list.extend(img_links)
-    images = [urljoin(page_url, url) for url in img_list]
-    images = list(set(images))
-    return images
+        if img_url[-3:] == "svg" :
+            img_content = img_request.content
+            with open(os.path.join(self.download_path, img_url.split('/')[-1]), 'wb') as f:
+                byte_image = bytes(img_content)
+                f.write(byte_image)

+        elif int(img_request.headers['content-length']) < self.max_filesize :
+            img_content = img_request.content
+            with open(os.path.join(self.download_path, img_url.split('/')[-1]), 'wb') as f:
+                byte_image = bytes(img_content)
+                f.write(byte_image)
+        else:
+            raise ImageSizeError(img_request.headers['content-length'])
+        return True

-def download_image(img_url, download_path, max_filesize):
-    img_request = None
-    success_flag = True
-    size_success_flag = True
-    try:
-        img_request = requests.request('get', img_url, stream=True)
-        if img_request.status_code != 200:
-            raise ImageDownloadError(img_request.status_code)
-    except:
-        raise ImageDownloadError()

-    if img_url[-3:] == "svg" :
-        img_content = img_request.content
-        with open(os.path.join(download_path, img_url.split('/')[-1]), 'wb') as f:
-            byte_image = bytes(img_content)
-            f.write(byte_image)
+    def process_links(self, links):
+        x = []
+        for l in links:
+            if os.path.splitext(l)[1][1:].strip().lower() in self.format_list:
+                x.append(l)
+        return x

-    elif int(img_request.headers['content-length']) < max_filesize :
-        img_content = img_request.content
-        with open(os.path.join(download_path, img_url.split('/')[-1]), 'wb') as f:
-            byte_image = bytes(img_content)
-            f.write(byte_image)
-    else:
-        raise ImageSizeError(img_request.headers['content-length'])
-    return True
