In [2]:
# Settings for notebook
# Setting ast_node_interactivity to "all" asks IPython to display the value of
# every top-level expression in a cell, not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [3]:
try:
    import scrapy
except:
    !pip install scrapy
    import scrapy
from scrapy.crawler import CrawlerProcess

In [4]:
# define the fields for your item here
import scrapy


class DiscoverlifeItem(scrapy.Item):
    """One scraped record from a discoverlife.org details page.

    Fields are filled by ``DiscoverlifeSpider.paser_details`` except
    ``image_paths``, which ``MyImagesPipeline.item_completed`` sets after the
    image download.
    """
    # Text of the first table link on the details page, or None when the page
    # has fewer than two such links (DropNullBioNm discards None).
    biological_name = scrapy.Field()
    # Second comma-separated part of the "title" table row, stripped.
    common_name = scrapy.Field()
    # Raw text of the "latitude" table row.
    latitude = scrapy.Field()
    # Raw text of the "longitude" table row.
    longitude = scrapy.Field()
    # Raw text of the "date1 yyyymmdd" table row.
    date = scrapy.Field()
    # List holding one absolute image URL; consumed by MyImagesPipeline.
    image_urls = scrapy.Field()
    # Single string: 'images_2.0/' + path of the first successfully stored image.
    image_paths = scrapy.Field()
    # URL of the details page the record was scraped from.
    url = scrapy.Field()


In [5]:
# define the item pipelines
import scrapy
from scrapy.pipelines.images import ImagesPipeline
from scrapy.exceptions import DropItem

class MyImagesPipeline(ImagesPipeline):
    """Image pipeline that fetches an item's ``image_urls`` and records one stored path."""

    def get_media_requests(self, item, info):
        """Yield one download request for every URL in ``item['image_urls']``."""
        yield from map(scrapy.Request, item['image_urls'])

    def item_completed(self, results, item, info):
        """Store the first downloaded image path on the item.

        ``results`` is iterated as ``(ok, data)`` pairs; only pairs whose flag
        is truthy contribute ``data['path']``.

        Returns:
            The item, with ``image_paths`` set to ``'images_2.0/'`` followed by
            the first successful path.

        Raises:
            DropItem: when no download succeeded.
        """
        stored = [outcome['path'] for succeeded, outcome in results if succeeded]
        if stored:
            # Only the first stored image is kept, as a single string.
            item['image_paths'] = 'images_2.0/' + stored[0]
            return item
        raise DropItem("Item contains no images")

class DropNullBioNm(object):
    """Item pipeline that discards items without a biological name."""

    def process_item(self, item, spider):
        """Pass the item through unless its ``biological_name`` is missing or None.

        Args:
            item: mapping-like scraped item.
            spider: the spider that produced the item (unused).

        Returns:
            The same item object when a biological name is present.

        Raises:
            DropItem: when ``biological_name`` is None or was never set.
        """
        # Identity check: `== None` can be fooled by values overriding __eq__.
        # .get() makes an unset field drop the item instead of raising KeyError.
        if item.get('biological_name') is None:
            raise DropItem("Item empty %s" % item)
        return item


In [6]:
import scrapy
import logging


class DiscoverlifeSpider(scrapy.Spider):
    """Scrape moth records from discoverlife.org.

    Crawl order: the summary table in ``start_urls`` -> one photo listing per
    species that has enough photos -> one details page per listed record,
    from which a ``DiscoverlifeItem`` is built.
    """
    name = "discoverlife"
    custom_settings = {
        'LOG_LEVEL': logging.WARNING,
        'ITEM_PIPELINES': {
            '__main__.DropNullBioNm': 1, # pipeline to check that a biological name is present
            '__main__.MyImagesPipeline': 2 # pipeline to download images from the scraped site
        },
        'FEED_FORMAT':'csv',
        'FEED_URI': 'discoverlife_2.0.csv',
        'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
        'IMAGES_STORE': 'images_2.0',
        'DOWNLOAD_TIMEOUT': 672,
    }

    start_urls = ["https://www.discoverlife.org/moth/data/table2_33.9_-83.3.html"]

    # Site root prepended to the relative links and image paths found in pages.
    base_url = 'https://www.discoverlife.org'
    # Species whose photo total is below this value are not followed.
    min_photos = 100

    def parse(self, response):
        """Read the summary table and follow every species with enough photos.

        Yields:
            One request per qualifying species, handled by ``get_links``.
        """
        # photo totals from the summary table
        photo_counts = response.xpath('//table[@border="1"]/tr[@bgcolor]/td[@bgcolor and @align="right"]/text()').extract()
        # links to each species
        species_links = response.xpath('//table[@border="1"]/tr[@bgcolor]/td[1]/a/@href').extract()

        # zip pairs counts with links positionally and stops at the shorter
        # list, so a missing link can no longer raise IndexError mid-crawl.
        for raw_count, href in zip(photo_counts, species_links):
            # The total is the part before the first non-breaking space,
            # with thousands separators removed.
            count = int(raw_count.split('\xa0')[0].replace(',', ''))
            if count < self.min_photos:
                continue
            moth_name = href.split('=')[1] # species name is the first query value in the link
            details_link = f'{self.base_url}/mp/20p?&res=640&selected=1&name={moth_name}&see=name&xml=Moth;'
            yield response.follow(details_link, callback=self.get_links)

    def get_links(self, response):
        """Follow every record link on a species listing page.

        Yields:
            One request per details page, handled by ``paser_details``.
        """
        for url in response.xpath('//td[@valign="top"]/a[2]/@href').extract():
            yield response.follow(self.base_url + url, callback=self.paser_details)

    def paser_details(self, response):
        """Build a ``DiscoverlifeItem`` from a record details page.

        Extraction is best-effort: if any expected element is missing, the
        error is printed with the page URL and no item is yielded.
        """
        try:
            item = DiscoverlifeItem()
            # XPath template selecting the text of the cell that follows a
            # bold label cell in the details table.
            xpath = '//table/tr[td/b="{}"]/td/following-sibling::td[1]/text()'.format
            bio = response.xpath('//table/tr/td/a/text()').extract()

            item['image_urls'] = [self.base_url + response.xpath('//div[@align="center"]/a[1]/img/@src').extract()[0]]
            # Second text node of the "title" row, second comma-separated part.
            item['common_name'] = response.xpath(xpath('title')).extract()[1].split(',')[1].strip()
            item['latitude'] = response.xpath(xpath('latitude')).extract()[0]
            item['longitude'] = response.xpath(xpath('longitude')).extract()[0]
            item['date'] = response.xpath(xpath('date1 yyyymmdd')).extract()[0]
            # NOTE(review): the first link is used only when at least two links
            # exist; confirm that `> 1` (rather than `>= 1`) is intended.
            item['biological_name'] = bio[0] if len(bio) > 1 else None
            item['url'] = response.url

            yield item
        except Exception as e:
            # Report the page being parsed; the previous code referenced an
            # undefined `request`, which turned every failure into a NameError.
            print('An Error occured :: ', e, response.url)

In [None]:
# from scrapy.crawler import CrawlerProcess
import scrapy.crawler as crawler
from multiprocessing import Process, Queue
from twisted.internet import reactor

def run_spider(spider):
    """Run ``spider`` to completion in a child process and wait for it.

    The crawl and the Twisted reactor run inside a separate process;
    presumably this is so the cell can be re-run in the same kernel, since a
    stopped reactor cannot be started again (confirm).

    Args:
        spider: spider class (or name) handed to ``CrawlerRunner.crawl``.

    Raises:
        Exception: whatever exception the child reported, including a failure
            of the crawl itself.
        RuntimeError: if the child process exited without reporting a result.
    """
    from queue import Empty  # raised by Queue.get(timeout=...) when nothing arrives

    print('Processing.....')

    def f(q):
        try:
            failures = []
            runner = crawler.CrawlerRunner()
            deferred = runner.crawl(spider)
            # Remember a crawl failure instead of letting addBoth discard it.
            deferred.addErrback(failures.append)
            # Schedule the stop rather than calling it directly: if the crawl
            # already failed before reactor.run(), a direct reactor.stop()
            # raises and the reactor would then run forever.
            deferred.addBoth(lambda _: reactor.callLater(0, reactor.stop))
            reactor.run()
            if failures:
                raise failures[0].value
            q.put(None)
        except Exception as e:
            q.put(e)

    q = Queue()
    p = Process(target=f, args=(q,))
    p.start()

    # Wait for the child's report, but notice if the child died without one
    # (a plain q.get() would block forever in that case).
    while True:
        try:
            result = q.get(timeout=1)
            break
        except Empty:
            if p.is_alive():
                continue
            try:
                # The child may have reported just before exiting.
                result = q.get(timeout=1)
            except Empty:
                result = RuntimeError(
                    'crawler process exited with code %s without reporting a result'
                    % p.exitcode
                )
            break
    p.join()

    if result is not None:
        raise result
    print('Done!')


# Start the crawl; run_spider returns only after the child process has been joined.
run_spider(DiscoverlifeSpider)

Processing.....
