## Intro to Scrapy Selector

In [13]:
# Import Scrapy Libraries

from scrapy import Selector

In [22]:
# Import requests
import requests

# Create the string html containing the HTML source
html = requests.get( url ).content

# Create the Selector object sel from html
sel = Selector( text = html )

# Print out the number of elements in the HTML document
print( "There are 1020 elements in the HTML document.")
print( "You have found: ", len( sel.xpath('//*') ) )

NameError: name 'url' is not defined

## Creating a Spider

In [4]:
def inspect_class(c):
    newc = c()
    meths = dir(newc)
    if 'name' in meths:
        print("Your spider class name is:", newc.name)
    if 'from_crawler' in meths:
        print("It seems you have inherited methods from scrapy.Spider -- NICE!")
    else:
        print("Oh no! It doesn't seem that you are inheriting the methods from scrapy.Spider!!")

In [5]:
# Inheriting the Spider

# Import scrapy library
import scrapy

# Create the spider class
class YourSpider(scrapy.Spider):
    name = "your_spider"
  # start_requests method
    def start_requests(self):
        pass
  # parse method
    def parse(self, response):
        pass

# Inspect Your Class
inspect_class(YourSpider)

Your spider class name is: your_spider
It seems you have inherited methods from scrapy.Spider -- NICE!


In [9]:
def inspect_class( c ):
  newc = c()
  meths = dir( newc )
  if 'start_requests' in meths:
    print( "The start_requests method yields the following urls:" )
    for u in newc.start_requests():
      print(  "\t-", u )

In [10]:
# Import scrapy library
import scrapy

# Create the spider class
class YourSpider( scrapy.Spider ):
  name = "your_spider"
  # start_requests method
  def start_requests( self ):
    urls = ["https://www.datacamp.com", "https://scrapy.org"]
    for url in urls:
      yield url
  # parse method
  def parse( self, response ):
    pass
  
# Inspect Your Class
inspect_class( YourSpider )

The start_requests method yields the following urls:
	- https://www.datacamp.com
	- https://scrapy.org


In [11]:
def inspect_class( c ):
  newc = c()
  try:
    newc.start_requests()
  except:
    print( "Oh No! Something is wrong with the code! Keep trying." )

In [12]:
# Self referencing in classes
# Import scrapy library
import scrapy

# Create the spider class
class YourSpider( scrapy.Spider ):
  name = "your_spider"
  # start_requests method
  def start_requests( self ):
    self.print_msg( "Hello World!" )
  # parse method
  def parse( self, response ):
    pass
  # print_msg method
  def print_msg( self, msg ):
    print( "Calling start_requests in YourSpider prints out:", msg )
  
# Inspect Your Class
inspect_class( YourSpider )

Calling start_requests in YourSpider prints out: Hello World!


In [16]:
# Import scrapy library
import scrapy

# Create the spider class
class YourSpider( scrapy.Spider ):
  name = "your_spider"
  # start_requests method
  def start_requests( self ):
    yield scrapy.Request( url = "https://www.datacamp.com", callback = self.parse)
  # parse method
  def parse( self, response ):
    pass
  
# Inspect Your Class
inspect_class( YourSpider )

In [18]:
def inspect_spider( s ):
  news = s()
  try:
    req = list( news.start_requests() )[0]
    url = req.url
    html = requests.get( url ).content
    response = TextResponse( url = url, body = html, encoding = 'utf-8' )
    author_names = req.callback( response )
    print( 'You have collected the author names:')
    for a in author_names:
      print('\t-', a )
  except:
    print( 'Oh no! Something went wrong with the code. Keep trying!')


In [28]:
# Import the scrapy library
import scrapy

url_short = 'https://assets.datacamp.com/production/repositories/2560/datasets/19a0a26daa8d9db1d920b5d5607c19d6d8094b3b/all_short'

url_short = requests.get( url_short ).content

# Create the Spider class
class DCspider( scrapy.Spider ):
  name = 'dcspider'
  # start_requests method
  def start_requests( self ):
    yield scrapy.Request( url = url_short, callback = self.parse )
  # parse method
  def parse( self, response):
    # Create an extracted list of course author names
    author_names = response.css('p.course-block__author-name ::text').extract()
    # Here we will just return the list of Authors
    return author_names
  
# Inspect the spider
inspect_spider( DCspider )

Oh no! Something went wrong with the code. Keep trying!


In [29]:
# Import the scrapy library
import scrapy

# Create the Spider class
class DCdescr( scrapy.Spider ):
  name = 'dcdescr'
  # start_requests method
  def start_requests( self ):
    yield scrapy.Request( url = url_short, callback = self.parse )
  
  # First parse method
  def parse( self, response ):
    links = response.css( 'div.course-block > a::attr(href)' ).extract()
    # Follow each of the extracted links
    for link in links:
      yield response.follow( url = link, callback = self.parse_descr )

      
  # Second parsing method
  def parse_descr( self, response ):
    # Extract course description
    course_descr = response.css( 'p.course__description::text' ).extract_first()
    # For now, just yield the course description
    yield course_descr


# Inspect the spider
inspect_spider( DCdescr )

Oh no! Something went wrong with the code. Keep trying!


# Capstone Project

In [32]:
# Definition

def previewCourses( dc_dict, n = 3 ):
  crs_titles = list( dc_dict.keys() )
  print( "A preview of DataCamp Courses:")
  print("---------------------------------------\n")
  for t in crs_titles[:n]:
    print( "TITLE: %s" % t)
    for i,ct in enumerate(dc_dict[t]):
      print("\tChapter %d: %s" % (i+1,ct) )
    print("")

In [35]:
# Import scrapy
import scrapy

# Import the CrawlerProcess: for running the spider
from scrapy.crawler import CrawlerProcess

# Create the Spider class
class DC_Chapter_Spider(scrapy.Spider):
  name = "dc_chapter_spider"
  # start_requests method
  def start_requests(self):
    yield scrapy.Request(url = url_short,
                         callback = self.parse_front)
  # First parsing method
  def parse_front(self, response):
    course_blocks = response.css('div.course-block')
    course_links = course_blocks.xpath('./a/@href')
    links_to_follow = course_links.extract()
    for url in links_to_follow:
      yield response.follow(url = url,
                            callback = self.parse_pages)
  # Second parsing method
  def parse_pages(self, response):
    crs_title = response.xpath('//h1[contains(@class,"title")]/text()')
    crs_title_ext = crs_title.extract_first().strip()
    ch_titles = response.css('h4.chapter__title::text')
    ch_titles_ext = [t.strip() for t in ch_titles.extract()]
    dc_dict[ crs_title_ext ] = ch_titles_ext

# Initialize the dictionary **outside** of the Spider class
dc_dict = dict()

# Run the Spider
process = CrawlerProcess()
process.crawl(DC_Chapter_Spider)
process.start()

# Print a preview of courses
previewCourses(dc_dict)

2019-11-26 17:20:31 [scrapy.utils.log] INFO: Scrapy 1.8.0 started (bot: scrapybot)
2019-11-26 17:20:31 [scrapy.utils.log] INFO: Versions: lxml 4.4.1.0, libxml2 2.9.5, cssselect 1.0.3, parsel 1.5.1, w3lib 1.20.0, Twisted 19.10.0, Python 3.5.5 |Anaconda, Inc.| (default, Apr  7 2018, 04:52:34) [MSC v.1900 64 bit (AMD64)], pyOpenSSL 18.0.0 (OpenSSL 1.0.2t  10 Sep 2019), cryptography 2.2.2, Platform Windows-10-10.0.17763-SP0
2019-11-26 17:20:31 [scrapy.crawler] INFO: Overridden settings: {}
2019-11-26 17:20:31 [scrapy.extensions.telnet] INFO: Telnet Password: 92fa32c613bb2e5a
2019-11-26 17:20:31 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.logstats.LogStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.corestats.CoreStats']
2019-11-26 17:20:31 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.download

ReactorNotRestartable: 

In [37]:
# Import scrapy
import scrapy

# Import the CrawlerProcess: for running the spider
from scrapy.crawler import CrawlerProcess

# Create the Spider class
class DC_Description_Spider(scrapy.Spider):
  name = "dc_chapter_spider"
  # start_requests method
  def start_requests(self):
    yield scrapy.Request(url = url_short,
                         callback = self.parse_front)
  # First parsing method
  def parse_front(self, response):
    course_blocks = response.css('div.course-block')
    course_links = course_blocks.xpath('./a/@href')
    links_to_follow = course_links.extract()
    for url in links_to_follow:
      yield response.follow(url = url,
                            callback = self.parse_pages)
  # Second parsing method
  def parse_pages(self, response):
    # Create a SelectorList of the course titles text
    crs_title = response.xpath('//h1[contains(@class,"title")]/text()')
    # Extract the text and strip it clean
    crs_title_ext = crs_title.extract_first().strip()
    # Create a SelectorList of course descriptions text
    crs_descr = response.css('p.course__description::text')
    # Extract the text and strip it clean
    crs_descr_ext = crs_descr.extract_first().strip()
    # Fill in the dictionary
    dc_dict[crs_title_ext] = crs_descr_ext

# Initialize the dictionary **outside** of the Spider class
dc_dict = dict()

# Run the Spider
process = CrawlerProcess()
process.crawl(DC_Description_Spider)
process.start()

# Print a preview of courses
previewCourses(dc_dict)

2019-11-26 17:24:13 [scrapy.utils.log] INFO: Scrapy 1.8.0 started (bot: scrapybot)
2019-11-26 17:24:13 [scrapy.utils.log] INFO: Versions: lxml 4.4.1.0, libxml2 2.9.5, cssselect 1.0.3, parsel 1.5.1, w3lib 1.20.0, Twisted 19.10.0, Python 3.5.5 |Anaconda, Inc.| (default, Apr  7 2018, 04:52:34) [MSC v.1900 64 bit (AMD64)], pyOpenSSL 18.0.0 (OpenSSL 1.0.2t  10 Sep 2019), cryptography 2.2.2, Platform Windows-10-10.0.17763-SP0
2019-11-26 17:24:13 [scrapy.crawler] INFO: Overridden settings: {}
2019-11-26 17:24:13 [scrapy.extensions.telnet] INFO: Telnet Password: c696fb1fb3e91654
2019-11-26 17:24:13 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.logstats.LogStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.corestats.CoreStats']
2019-11-26 17:24:13 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.download

ReactorNotRestartable: 

In [38]:
# parse method
def parse(self, response):
  # Extracted course titles
  crs_titles = response.xpath('//h4[contains(@class,"block__title")]/text()').extract()
  # Extracted course descriptions
  crs_descrs = response.xpath('//p[contains(@class,"block__description")]/text()').extract()
  # Fill in the dictionary
  for crs_title, crs_descr in zip(crs_titles, crs_descrs):
    dc_dict[crs_title] = crs_descr

In [41]:
# Import scrapy
import scrapy

# Import the CrawlerProcess
from scrapy.crawler import CrawlerProcess

# Create the Spider class
class YourSpider(scrapy.Spider):
  name = 'yourspider'
  # start_requests method
  def start_requests( self ):
    yield scrapy.Request(url = url_short, callback = self.parse)
      
  def parse(self, response):
    # My version of the parser you wrote in the previous part
    crs_titles = response.xpath('//h4[contains(@class,"block__title")]/text()').extract()
    crs_descrs = response.xpath('//p[contains(@class,"block__description")]/text()').extract()
    for crs_title, crs_descr in zip( crs_titles, crs_descrs ):
      dc_dict[crs_title] = crs_descr
    
# Initialize the dictionary **outside** of the Spider class
dc_dict = dict()

# Run the Spider
process = CrawlerProcess()
process.crawl(YourSpider)
process.start()

# Print a preview of courses
previewCourses(dc_dict)

2019-11-26 17:27:21 [scrapy.utils.log] INFO: Scrapy 1.8.0 started (bot: scrapybot)
2019-11-26 17:27:21 [scrapy.utils.log] INFO: Versions: lxml 4.4.1.0, libxml2 2.9.5, cssselect 1.0.3, parsel 1.5.1, w3lib 1.20.0, Twisted 19.10.0, Python 3.5.5 |Anaconda, Inc.| (default, Apr  7 2018, 04:52:34) [MSC v.1900 64 bit (AMD64)], pyOpenSSL 18.0.0 (OpenSSL 1.0.2t  10 Sep 2019), cryptography 2.2.2, Platform Windows-10-10.0.17763-SP0
2019-11-26 17:27:21 [scrapy.crawler] INFO: Overridden settings: {}
2019-11-26 17:27:21 [scrapy.extensions.telnet] INFO: Telnet Password: c1b089c6b57a385d
2019-11-26 17:27:21 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.logstats.LogStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.corestats.CoreStats']
2019-11-26 17:27:21 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.download

ReactorNotRestartable: 