Merge pull request #23 from srirams6/master
Refactor: move console_main and scrape_images into mains.py and add a --scrape-reverse option.
sananth12 committed Mar 3, 2015
2 parents a6fe49c + 9c01db0 commit 7cb8d83
Showing 3 changed files with 94 additions and 86 deletions.
86 changes: 1 addition & 85 deletions image_scraper/__init__.py
@@ -1,85 +1 @@
def console_main():
    import sys
    from progressbar import *
    from utils import (process_links, get_html, get_img_list, download_image,
                       process_download_path, get_arguments)
    URL, no_to_download, format_list, download_path, max_filesize, dump_urls, use_ghost = get_arguments()
    print "\nImageScraper\n============\nRequesting page....\n"

    page_html, page_url = get_html(URL, use_ghost)
    images = get_img_list(page_html, page_url, format_list)

    if len(images) == 0:
        sys.exit("Sorry, no images found.")
    if no_to_download == 0:
        no_to_download = len(images)

    print "Found %s images: " % len(images)

    download_path_flag, download_path_msg = process_download_path(download_path)
    if not download_path_flag:
        sys.exit(download_path_msg)

    if dump_urls:
        for img_url in images:
            print img_url

    count = 0
    percent = 0.0
    failed = 0
    over_max_filesize = 0
    widgets = ['Progress: ', Percentage(), ' ', Bar(marker=RotatingMarker()),
               ' ', ETA(), ' ', FileTransferSpeed()]
    pbar = ProgressBar(widgets=widgets, maxval=100).start()

    for img_url in images:
        flag, size_flag = download_image(img_url, download_path, max_filesize)
        if not flag:
            if not size_flag:
                failed += 1
            else:
                over_max_filesize += 1
        count += 1
        percent = percent + 100.0 / no_to_download
        pbar.update(percent % 100)
        if count == no_to_download:
            break

    pbar.finish()
    print "\nDone!\nDownloaded %s images" % (count-failed-over_max_filesize)
    return

def scrape_images(url, no_to_download=0, format_list=["jpg", "png", "gif", "svg", "jpeg"], download_path='images', max_filesize=100000000, dump_urls=False, use_ghost=False):
    import sys, os
    from utils import (process_links, get_html, get_img_list, download_image,
                       process_download_path, get_arguments)
    page_html, page_url = get_html(url, use_ghost)
    images = get_img_list(page_html, page_url, format_list)

    download_path = os.path.join(os.getcwd(), download_path)

    if len(images) == 0:
        return
    if no_to_download == 0:
        no_to_download = len(images)

    download_path_flag, download_path_msg = process_download_path(download_path)
    if not download_path_flag:
        sys.exit(download_path_msg)

    count = 0
    failed = 0
    over_max_filesize = 0

    for img_url in images:
        flag, size_flag = download_image(img_url, download_path, max_filesize)
        if not flag:
            if not size_flag:
                failed += 1
            else:
                over_max_filesize += 1
        count += 1
        if count == no_to_download:
            break
    return count, failed

from mains import console_main, scrape_images
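After this change, the package root holds only the re-export above, so existing callers can keep importing the public functions from image_scraper itself. A minimal sketch of imports that continue to work after the refactor (assuming the package is installed):

from image_scraper import console_main, scrape_images
# Equivalent, importing the new module directly:
from image_scraper.mains import scrape_images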
88 changes: 88 additions & 0 deletions image_scraper/mains.py
@@ -0,0 +1,88 @@
def console_main():
    import sys
    from progressbar import *
    from utils import (process_links, get_html, get_img_list, download_image,
                       process_download_path, get_arguments)
    URL, no_to_download, format_list, download_path, max_filesize, dump_urls, scrape_reverse, use_ghost = get_arguments()
    print "\nImageScraper\n============\nRequesting page....\n"

    page_html, page_url = get_html(URL, use_ghost)
    images = get_img_list(page_html, page_url, format_list)

    if len(images) == 0:
        sys.exit("Sorry, no images found.")
    if no_to_download == 0:
        no_to_download = len(images)

    print "Found %s images: " % len(images)

    download_path_flag, download_path_msg = process_download_path(download_path)
    if not download_path_flag:
        sys.exit(download_path_msg)

    if scrape_reverse:
        images.reverse()

    if dump_urls:
        for img_url in images:
            print img_url

    count = 0
    percent = 0.0
    failed = 0
    over_max_filesize = 0
    widgets = ['Progress: ', Percentage(), ' ', Bar(marker=RotatingMarker()),
               ' ', ETA(), ' ', FileTransferSpeed()]
    pbar = ProgressBar(widgets=widgets, maxval=100).start()

    for img_url in images:
        flag, size_flag = download_image(img_url, download_path, max_filesize)
        if not flag:
            if not size_flag:
                failed += 1
            else:
                over_max_filesize += 1
        count += 1
        percent = percent + 100.0 / no_to_download
        pbar.update(percent % 100)
        if count == no_to_download:
            break

    pbar.finish()
    print "\nDone!\nDownloaded %s images" % (count-failed-over_max_filesize)
    return

def scrape_images(url, no_to_download=0, format_list=["jpg", "png", "gif", "svg", "jpeg"], download_path='images', max_filesize=100000000, dump_urls=False, use_ghost=False):
    import sys, os
    from utils import (process_links, get_html, get_img_list, download_image,
                       process_download_path, get_arguments)
    page_html, page_url = get_html(url, use_ghost)
    images = get_img_list(page_html, page_url, format_list)

    download_path = os.path.join(os.getcwd(), download_path)

    if len(images) == 0:
        return
    if no_to_download == 0:
        no_to_download = len(images)

    download_path_flag, download_path_msg = process_download_path(download_path)
    if not download_path_flag:
        sys.exit(download_path_msg)

    count = 0
    failed = 0
    over_max_filesize = 0

    for img_url in images:
        flag, size_flag = download_image(img_url, download_path, max_filesize)
        if not flag:
            if not size_flag:
                failed += 1
            else:
                over_max_filesize += 1
        count += 1
        if count == no_to_download:
            break
    return count, failed
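For library use, a minimal sketch of calling scrape_images from mains.py directly; the URL, count, and formats below are illustrative, not part of the commit. Per the code above, the function returns None when no images are found and (count, failed) otherwise, where count includes failed and oversized attempts.

from image_scraper import scrape_images

# Illustrative call: fetch at most 10 jpg/png images from a hypothetical page
# into ./images (the relative path is joined with the current working directory).
result = scrape_images("http://example.com/gallery", no_to_download=10,
                       format_list=["jpg", "png"], download_path="images")
if result is not None:
    count, failed = result
    print "Attempted %s downloads, %s failed" % (count, failed)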

6 changes: 5 additions & 1 deletion image_scraper/utils.py
@@ -29,6 +29,9 @@ def get_arguments():
    parser.add_argument('--dump-urls', default=False,
                        help="Print the URLs of the images",
                        action="store_true")
    parser.add_argument('--scrape-reverse', default=False,
                        help="Scrape the images in reverse order",
                        action="store_true")
    args = parser.parse_args()
    URL = args.url2scrape[0]
    if not re.match(r'^[a-zA-Z]+://', URL):
@@ -42,8 +45,9 @@ def get_arguments():
    format_list = ["jpg", "png", "gif", "svg", "jpeg"]
    max_filesize = args.max_filesize
    dump_urls = args.dump_urls
    scrape_reverse = args.scrape_reverse
    return (URL, no_to_download, format_list, download_path, max_filesize,
            dump_urls, use_ghost)
            dump_urls, scrape_reverse, use_ghost)


def process_download_path(download_path):
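With this change, get_arguments() returns an 8-tuple, with scrape_reverse sitting between dump_urls and use_ghost, and console_main() in mains.py unpacks it accordingly. A rough sketch of exercising the new flag by simulating a command line (the program name and URL are illustrative, and the package and its dependencies are assumed to be installed):

import sys
from image_scraper.utils import get_arguments

# get_arguments() parses sys.argv, so fake an invocation that uses the new flag.
sys.argv = ["image-scraper", "--dump-urls", "--scrape-reverse", "http://example.com/gallery"]
(URL, no_to_download, format_list, download_path,
 max_filesize, dump_urls, scrape_reverse, use_ghost) = get_arguments()
print scrape_reverse  # True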
