
Commit

Merge pull request #58 from srirams6/dev
Moved all functions into a class.
ssundarraj committed Apr 1, 2015
2 parents 1f205d2 + 18335af commit c70e371
Showing 3 changed files with 177 additions and 142 deletions.
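In effect, the free functions in image_scraper/utils.py (get_arguments, get_html, get_img_list, process_download_path, download_image, process_links) become methods on a single ImageScraper instance that carries the shared state (url, images, download_path, and so on). A minimal sketch of the new call sequence, mirroring the refactored console_main in the diff below (the URL and format list here are illustrative and not part of the commit):

    from image_scraper.utils import ImageScraper

    scraper = ImageScraper()
    scraper.url = "http://example.com"   # normally filled in by scraper.get_arguments() from the CLI
    scraper.format_list = ["jpg", "png", "gif", "svg", "jpeg"]
    scraper.get_html()                   # sets scraper.page_html and scraper.page_url
    scraper.get_img_list()               # sets scraper.images
    scraper.process_download_path()      # validates or creates scraper.download_path ("images" by default)
    for img_url in scraper.images:
        scraper.download_image(img_url)  # raises ImageDownloadError / ImageSizeError on failure
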
45 changes: 19 additions & 26 deletions image_scraper/mains.py
@@ -4,47 +4,41 @@
 from past.utils import old_div
 import sys
 from .progressbar import *
-from .utils import (process_links, get_html, get_img_list,
-                    download_image, process_download_path,
-                    get_arguments)
+from .utils import ImageScraper
 from .exceptions import *


 def console_main():
-    URL, no_to_download, format_list, download_path, max_filesize, dump_urls, scrape_reverse, use_ghost = get_arguments()
+    scraper = ImageScraper()
+    scraper.get_arguments()
     print("\nImageScraper\n============\nRequesting page....\n")

     try:
-        page_html, page_url = get_html(URL, use_ghost)
+        scraper.get_html()
     except PageLoadError as e:
-        page_html = ""
-        page_url = ""
+        scraper.page_html = ""
+        scraper.page_url = ""
         print("Page failed to load. Status code: {0}".format(e.status_code))
         sys.exit()

-    images = get_img_list(page_html, page_url, format_list)
+    scraper.get_img_list()

-    if len(images) == 0:
+    if len(scraper.images) == 0:
         sys.exit("Sorry, no images found.")
-    if no_to_download is None:
-        no_to_download = len(images)
+    if scraper.no_to_download is None:
+        scraper.no_to_download = len(scraper.images)

-    print("Found {0} images: ".format(len(images)))
+    print("Found {0} images: ".format(len(scraper.images)))

     try:
-        process_download_path(download_path)
+        scraper.process_download_path()
     except DirectoryAccessError:
         print("Sorry, the directory can't be accessed.")
         sys.exit()
     except DirectoryCreateError:
         print("Sorry, the directory can't be created.")
         sys.exit()

-    if scrape_reverse:
-        images.reverse()
-
-    if dump_urls:
-        for img_url in images:
+    if scraper.dump_urls:
+        for img_url in scraper.images:
             print(img_url)

     count = 0
@@ -55,30 +49,29 @@ def console_main():
                ' ', ETA(), ' ', FileTransferSpeed()]
     pbar = ProgressBar(widgets=widgets, maxval=100).start()

-    for img_url in images:
-        if count == no_to_download:
+    for img_url in scraper.images:
+        if count == scraper.no_to_download:
             break
         try:
-            download_image(img_url, download_path, max_filesize)
+            scraper.download_image(img_url)
         except ImageDownloadError:
             failed += 1
         except ImageSizeError:
             over_max_filesize += 1

         count += 1
-        percent = percent + old_div(100.0, no_to_download)
+        percent = percent + old_div(100.0, scraper.no_to_download)
         pbar.update(percent % 100)

     pbar.finish()
     print("\nDone!\nDownloaded {0} images\nFailed: {1}\n".format(count-failed-over_max_filesize, failed))
     return


 def scrape_images(url, no_to_download=None,
                   format_list=["jpg", "png", "gif", "svg", "jpeg"],
                   download_path='images', max_filesize=100000000,
                   dump_urls=False, use_ghost=False):

+    # Broken due to wrapping in class. Need to fix!
     page_html, page_url = get_html(url, use_ghost)
     images = get_img_list(page_html, page_url, format_list)

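Note the new comment in scrape_images above: it still calls the removed module-level get_html and get_img_list, so the library entry point is left broken by this commit. One possible direction for a fix, sketched here purely as an illustration (it is not part of this commit, and it assumes attributes can simply be assigned on an ImageScraper instance):

    def scrape_images(url, no_to_download=None,
                      format_list=["jpg", "png", "gif", "svg", "jpeg"],
                      download_path='images', max_filesize=100000000,
                      dump_urls=False, use_ghost=False):
        # Hypothetical rewrite: route everything through one ImageScraper instance
        scraper = ImageScraper()
        scraper.url = url
        scraper.format_list = format_list
        scraper.download_path = download_path
        scraper.max_filesize = max_filesize
        scraper.use_ghost = use_ghost
        scraper.get_html()
        scraper.get_img_list()
        if dump_urls:
            for img_url in scraper.images:
                print(img_url)
        scraper.process_download_path()
        limit = no_to_download if no_to_download is not None else len(scraper.images)
        for img_url in scraper.images[:limit]:
            scraper.download_image(img_url)
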
243 changes: 136 additions & 107 deletions image_scraper/utils.py
@@ -9,123 +9,152 @@
 import re
 from image_scraper.exceptions import *

+class ImageScraper:
+    url = None
+    no_to_download = 0
+    format_list = []
+    download_path = "images"
+    max_filesize = 100000000
+    dump_urls = False
+    scrape_reverse = False
+    use_ghost = False
+    page_html = None
+    page_url = None
+    images = None

-def process_links(links, formats=["jpg", "png", "gif", "svg", "jpeg"]):
-    x = []
-    for l in links:
-        if os.path.splitext(l)[1][1:].strip().lower() in formats:
-            x.append(l)
-    return x
+    def __init__(self):
+        url = None
+        no_to_download = 0
+        format_list = []
+        download_path = "images"
+        max_filesize = 100000000
+        dump_urls = False
+        scrape_reverse = False
+        use_ghost = False
+        images = None

+    def get_arguments(self):
+        parser = argparse.ArgumentParser(
+            description='Downloads images from given URL')
+        parser.add_argument('url2scrape', nargs=1, help="URL to scrape")
+        parser.add_argument('-m', '--max-images', type=int, default=None,
+                            help="Limit on number of images\n")
+        parser.add_argument('-s', '--save-dir', type=str, default="images",
+                            help="Directory in which images should be saved")
+        parser.add_argument('-g', '--injected', help="Scrape injected images",
+                            action="store_true")
+        parser.add_argument('--max-filesize', type=int, default=100000000,
+                            help="Limit on size of image in bytes")
+        parser.add_argument('--dump-urls', default=False,
+                            help="Print the URLs of the images",
+                            action="store_true")
+        parser.add_argument('--formats', nargs="*", default=None,
+                            help="Specify formats in a list without any separator. This argument must be after the URL.")
+        parser.add_argument('--scrape-reverse', default=False,
+                            help="Scrape the images in reverse order",
+                            action="store_true")
+        args = parser.parse_args()
+        self.url = args.url2scrape[0]
+        if not re.match(r'^[a-zA-Z]+://', self.url):
+            self.url = 'http://' + self.url
+        self.no_to_download = args.max_images
+        save_dir = args.save_dir + '_{uri.netloc}'.format(
+            uri = urlparse(self.url))
+        if args.save_dir != "images":
+            save_dir = args.save_dir
+        self.download_path = os.path.join(os.getcwd(), save_dir)
+        self.use_ghost = args.injected
+        self.format_list = args.formats if args.formats else ["jpg", "png", "gif", "svg", "jpeg"]
+        self.max_filesize = args.max_filesize
+        self.dump_urls = args.dump_urls
+        self.scrape_reverse = args.scrape_reverse
+        return (self.url, self.no_to_download, self.format_list, self.download_path, self.max_filesize,
+                self.dump_urls, self.scrape_reverse, self.use_ghost)

-def get_arguments():
-    parser = argparse.ArgumentParser(
-        description='Downloads images from given URL')
-    parser.add_argument('url2scrape', nargs=1, help="URL to scrape")
-    parser.add_argument('-m', '--max-images', type=int, default=None,
-                        help="Limit on number of images\n")
-    parser.add_argument('-s', '--save-dir', type=str, default="images",
-                        help="Directory in which images should be saved")
-    parser.add_argument('-g', '--injected', help="Scrape injected images",
-                        action="store_true")
-    parser.add_argument('--max-filesize', type=int, default=100000000,
-                        help="Limit on size of image in bytes")
-    parser.add_argument('--dump-urls', default=False,
-                        help="Print the URLs of the images",
-                        action="store_true")
-    parser.add_argument('--formats', nargs="*", default=None,
-                        help="Specify formats in a list without any separator. This argument must be after the URL.")
-    parser.add_argument('--scrape-reverse', default=False,
-                        help="Scrape the images in reverse order",
-                        action="store_true")
-    args = parser.parse_args()
-    URL = args.url2scrape[0]
-    if not re.match(r'^[a-zA-Z]+://', URL):
-        URL = 'http://' + URL
-    no_to_download = args.max_images
-    save_dir = args.save_dir + '_{uri.netloc}'.format(
-        uri = urlparse(URL))
-    if args.save_dir != "images":
-        save_dir = args.save_dir
-    download_path = os.path.join(os.getcwd(), save_dir)
-    use_ghost = args.injected
-    format_list = args.formats if args.formats else ["jpg", "png", "gif", "svg", "jpeg"]
-    max_filesize = args.max_filesize
-    dump_urls = args.dump_urls
-    scrape_reverse = args.scrape_reverse
-    return (URL, no_to_download, format_list, download_path, max_filesize,
-            dump_urls, scrape_reverse, use_ghost)

+    def get_html(self):
+        if self.use_ghost:
+            self.url = urljoin("http://", self.url)
+            import selenium
+            import selenium.webdriver
+            driver = selenium.webdriver.PhantomJS(service_log_path=os.path.devnull)
+            driver.get(self.url)
+            page_html = driver.page_source
+            page_url = driver.current_url
+            driver.quit()
+        else:
+            try:
+                page = requests.get(self.url)
+                if page.status_code != 200:
+                    raise PageLoadError(page.status_code)
+            except requests.exceptions.MissingSchema:
+                self.url = "http://" + self.url
+                page = requests.get(self.url)
+                if page.status_code != 200:
+                    raise PageLoadError(page.status_code)
+            finally:
+                page_html = page.text
+                page_url = page.url
+        self.page_html = page_html
+        self.page_url = page_url
+        return (self.page_html, self.page_url)

-def process_download_path(download_path):
-    if os.path.exists(download_path):
-        if not os.access(download_path, os.W_OK):
-            raise DirectoryAccessError
-    elif os.access(os.path.dirname(download_path), os.W_OK):
-        os.makedirs(download_path)
-    else:
-        raise DirectoryCreateError
-    return True

+    def get_img_list(self):
+        tree = html.fromstring(self.page_html)
+        img = tree.xpath('//img/@src')
+        links = tree.xpath('//a/@href')
+        img_list = self.process_links(img)
+        img_links = self.process_links(links)
+        img_list.extend(img_links)
+        images = [urljoin(self.url, img_url) for img_url in img_list]
+        images = list(set(images))
+        self.images = images
+        if self.scrape_reverse:
+            self.images.reverse()
+        return self.images

-def get_html(URL, use_ghost):
-    if use_ghost:
-        URL = urljoin("http://", URL)
-        import selenium
-        import selenium.webdriver
-        driver = selenium.webdriver.PhantomJS(service_log_path=os.path.devnull)
-        driver.get(URL)
-        page_html = driver.page_source
-        page_url = driver.current_url
-        driver.quit()
-    else:
-        try:
-            page = requests.get(URL)
-        except requests.exceptions.MissingSchema:
-            URL = "http://" + URL
-            page = requests.get(URL)
-        finally:
-            if page.status_code != 200:
-                raise PageLoadError(page.status_code)
-            page_html = page.text
-            page_url = page.url
-    return (page_html, page_url)
+    def process_download_path(self):
+        if os.path.exists(self.download_path):
+            if not os.access(self.download_path, os.W_OK):
+                raise DirectoryAccessError
+        elif os.access(os.path.dirname(self.download_path), os.W_OK):
+            os.makedirs(self.download_path)
+        else:
+            raise DirectoryCreateError
+        return True

+    def download_image(self, img_url):
+        img_request = None
+        success_flag = True
+        size_success_flag = True
+        try:
+            img_request = requests.request('get', img_url, stream=True)
+            if img_request.status_code != 200:
+                raise ImageDownloadError(img_request.status_code)
+        except:
+            raise ImageDownloadError()

-def get_img_list(page_html, page_url, format_list):
-    tree = html.fromstring(page_html)
-    img = tree.xpath('//img/@src')
-    links = tree.xpath('//a/@href')
-    img_list = process_links(img, format_list)
-    img_links = process_links(links, format_list)
-    img_list.extend(img_links)
-    images = [urljoin(page_url, url) for url in img_list]
-    images = list(set(images))
-    return images
+        if img_url[-3:] == "svg" :
+            img_content = img_request.content
+            with open(os.path.join(self.download_path, img_url.split('/')[-1]), 'wb') as f:
+                byte_image = bytes(img_content)
+                f.write(byte_image)

+        elif int(img_request.headers['content-length']) < self.max_filesize :
+            img_content = img_request.content
+            with open(os.path.join(self.download_path, img_url.split('/')[-1]), 'wb') as f:
+                byte_image = bytes(img_content)
+                f.write(byte_image)
+        else:
+            raise ImageSizeError(img_request.headers['content-length'])
+        return True

-def download_image(img_url, download_path, max_filesize):
-    img_request = None
-    success_flag = True
-    size_success_flag = True
-    try:
-        img_request = requests.request('get', img_url, stream=True)
-        if img_request.status_code != 200:
-            raise ImageDownloadError(img_request.status_code)
-    except:
-        raise ImageDownloadError()

-    if img_url[-3:] == "svg" :
-        img_content = img_request.content
-        with open(os.path.join(download_path, img_url.split('/')[-1]), 'wb') as f:
-            byte_image = bytes(img_content)
-            f.write(byte_image)
+    def process_links(self, links):
+        x = []
+        for l in links:
+            if os.path.splitext(l)[1][1:].strip().lower() in self.format_list:
+                x.append(l)
+        return x

-    elif int(img_request.headers['content-length']) < max_filesize :
-        img_content = img_request.content
-        with open(os.path.join(download_path, img_url.split('/')[-1]), 'wb') as f:
-            byte_image = bytes(img_content)
-            f.write(byte_image)
-    else:
-        raise ImageSizeError(img_request.headers['content-length'])
-    return True
