# Scrapy

- An open source and collaborative framework for extracting the data that we need

### Installation

In [1]:
# Install Scrapy into the environment of the running kernel.
# The explicit %pip magic does not depend on IPython's automagic setting.
%pip install scrapy

Note: you may need to restart the kernel to use updated packages.


In [3]:
# Create a new Scrapy project folder. General form of the command:
# !scrapy startproject projectname
# Afterwards go back and check the directory: a "myproject" folder should exist.
# Re-running this cell once the project exists fails with
# "Error: scrapy.cfg already exists in ...", as the recorded output below shows.
!scrapy startproject myproject

Error: scrapy.cfg already exists in /home/sneha/Downloads/DataScience-CodingBlocks/DataAcquisition_webCrawl_Scrapy/myproject


# Creating Spider

- follow this url for learning more about spider
- http://docs.scrapy.org/en/latest/intro/tutorial.html

- Go to myproject folder 
- Go to the spiders folder
- Create new quotes_spider.py file

- After writing code go to location of .py file
- Go to Downloads/DataScience-CodingBlocks/DataAcquisition_webCrawl_Scrapy/myproject/myproject
- Run scrapy crawl quotes
- we will see two html files created: quotes-1.html and quotes-2.html

# Extract data from  a given Web Page

- We have already done scraping of data using beautiful soup library

- this time we use scrapy

- go to /Downloads/DataScience-CodingBlocks/DataAcquisition_webCrawl_Scrapy/myproject
- run this command (scrapy shell 'http://quotes.toscrape.com/page/1/') on shell it will send url request and will return response
- we can check this by typing response , in output we can see <200 http://quotes.toscrape.com/page/1/>

- now we can use this response to fetch the title of url as mentioned above
- response.css('title') run this on shell , here title is tag
- it will return metadata (a list of selector objects), not the plain text
- we need title only so, response.css('title').getall()
- this will return a list
- we need first element response.css('title::text').getall()[0]
- or we can use get function instead of getall that by default returns first element
- response.css('title::text').get()

- we can fetch authors and tags for a particular quote
- quote=response.css('div.quote')[0]
- (do not call .get() here: it returns a plain string, and we still need a selector to call .css() on)
- title=quote.css('span.text::text').get()
- author=quote.css('small.author::text').get()
- tag=quote.css('a.tag::text').get()
- ::text is used to print the content in textual format
- span.text means span is an HTML tag and text is the class name; one can check this by doing inspect element over any quote
- for q in response.css('div.quote'):
-     text=q.css('span.text::text').get()
-     author=q.css('small.author::text').get()
-     tag=q.css('a.tag::text').getall()
- we can store this in a dictionary 
-    yield {
     "text": text,
     "author": author,
     "tag": tag
}

- we can add all this code in the quotes_spider.py file of the spiders folder
- we can run this using this command scrapy crawl quotes -o quotes.json
- we can even run all these commands on shell also using scrapy shell 'http://quotes.toscrape.com/page/1/'

In [5]:
import scrapy


class QuotesSpider(scrapy.Spider):
    """Scrape the quotes shown on the first two pages of quotes.toscrape.com.

    Run from the project folder with ``scrapy crawl quotes -o quotes.json``.
    """

    name = "quotes"

    def start_requests(self):
        """Yield one request per hard-coded start page; responses go to ``parse``."""
        urls = [
            'http://quotes.toscrape.com/page/1/',
            'http://quotes.toscrape.com/page/2/',
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        """Yield one dict (text, author, tags) for every quote on the page."""
        # Every quote sits in its own <div class="quote">; ::text selects the
        # text content of the matched element instead of the element itself.
        for q in response.css("div.quote"):
            yield {
                "text": q.css('span.text::text').get(),
                "author": q.css('small.author::text').get(),
                # A quote can carry several tags, so collect all of them.
                "tags": q.css('a.tag::text').getall(),
            }

In [6]:
# after this go to the myproject folder and check the json file

In [7]:
# if we want to crawl all the quotes from all the pages, not only the
# 1st and 2nd page, then we need to find the class of the next button on the 1st page and use the following code as shown below

import scrapy


class QuotesSpider(scrapy.Spider):
    """Scrape every quote on quotes.toscrape.com by following the "Next" link.

    Run from the project folder with
    ``scrapy crawl quotes_v2 -o all_quotes.json``.
    """

    # Matches the documented run command and keeps this spider from sharing
    # a name with the two-page "quotes" spider.
    name = "quotes_v2"

    def start_requests(self):
        """Request the first page only; later pages are reached via pagination."""
        urls = [
            'http://quotes.toscrape.com/page/1/',
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        """Yield the quotes of this page, then a request for the next page if any."""
        for q in response.css("div.quote"):
            yield {
                "text": q.css('span.text::text').get(),
                "author": q.css('small.author::text').get(),
                "tags": q.css('a.tag::text').getall(),
            }

        # 'li.next a::attr(href)' reads the href attribute of the anchor inside
        # <li class="next">, i.e. the URL of the next page. An equivalent way to
        # read it is response.css('li.next a').attrib["href"].
        next_page = response.css('li.next a::attr(href)').get()
        if next_page is not None:
            # The href is relative; resolve it against the current page URL.
            next_page = response.urljoin(next_page)
            # parse is called again for every following page; the last page
            # has no "next" link, which ends the crawl.
            yield scrapy.Request(next_page, callback=self.parse)
        
            
            
            
            
            
            

In [8]:
#run this command in second myproject folder
#scrapy crawl quotes_v2 -o all_quotes.json

# Bookstore scrapy challenge

In [None]:
import scrapy

class ShopsiteSpider(scrapy.Spider):
    """Scrape title, price, detail-page URL and image URL of every book on
    books.toscrape.com, following the pager from catalogue page 1 onwards.
    """

    name="bookstore"
    # Kept for backward compatibility; URLs are now resolved with
    # response.urljoin instead of being concatenated onto these bases.
    book_urls=[
        'http://books.toscrape.com/catalogue/'
    ]
    start_urls=[
        'http://books.toscrape.com/'
    ]

    def start_requests(self):
        """Request the first catalogue page; responses are handled by ``parse``."""
        urls=[
            'http://books.toscrape.com/catalogue/page-1.html'
        ]
        for url in urls:
            yield scrapy.Request(url=url,callback=self.parse)

    def parse(self,response):
        """Yield one dict per book on the page, then a request for the next page."""
        for b in response.css("article.product_pod"):
            book_href=b.css("div a::attr(href)").get()
            img_src=b.css("a img::attr(src)").get()
            yield{
                "text":b.css('h3 a::attr(title)').get(),
                "prod_price":b.css('p.price_color::text').get(),
                # urljoin resolves relative links (including leading "../")
                # against the page URL; a missing attribute stays None instead
                # of raising TypeError during string concatenation.
                "book_url":response.urljoin(book_href) if book_href else None,
                "img_url":response.urljoin(img_src) if img_src else None
            }
        next_page=response.css('ul.pager li.next a::attr(href)').get()
        if next_page is not None:
            # The pager href is relative; turn it into an absolute URL.
            next_page=response.urljoin(next_page)
            # parse is called for each following page until there is no "next" link.
            yield scrapy.Request(next_page,callback=self.parse)
        

In [5]:
# Quick check of how to pull the page number out of a catalogue URL.
# Indexing the file name with [-6] only works for single-digit pages
# ("page-10.html"[-6] is "0"), so strip the fixed "page-" prefix and
# ".html" suffix instead.
page_url="http://books.toscrape.com/catalogue/page-1.html"
page_id=page_url.split("/")[4][len("page-"):-len(".html")]
print(page_id)

1


import scrapy

class ShopsiteSpider(scrapy.Spider):
    """Scrape image URL, title and price of every book on books.toscrape.com.

    The items are meant to be exported with ``-o book.json``; once the crawl
    has finished, ``convertToJson`` can turn that file into ``book.csv``.
    """

    name="bookstore"
    book_urls=[
        'http://books.toscrape.com/catalogue/'
    ]
    start_urls=[
        'http://books.toscrape.com/'
    ]

    def start_requests(self):
        """Request the first catalogue page; responses are handled by ``parse``."""
        urls=[
            'http://books.toscrape.com/catalogue/page-1.html'
        ]
        for url in urls:
            yield scrapy.Request(url=url,callback=self.parse)

    def convertToJson(self):
        """Convert ``book.json`` (a list of flat dicts) into ``book.csv``.

        Despite its name, this converts JSON *to* CSV. Call it only after the
        crawl has finished: while the spider is running, the JSON feed file is
        still incomplete and cannot be parsed.
        """
        # Local imports: the cell that defines this class only imports scrapy.
        import csv
        import json

        fileInput = 'book.json'
        fileOutput = 'book.csv'
        with open(fileInput) as inputFile:
            data = json.load(inputFile)
        # newline='' is what the csv module expects for files it writes to;
        # without it, blank rows appear on platforms that translate newlines.
        with open(fileOutput, 'w', newline='') as outputFile:
            output = csv.writer(outputFile)
            if data:
                output.writerow(data[0].keys())  # header row
                for row in data:
                    output.writerow(row.values())  # values row

    def parse(self,response):
        """Yield one dict per book on the page, then a request for the next page."""
        for b in response.css("article.product_pod"):
            yield{
                "image_url":b.css("a img::attr(src)").get(),
                "book_title":b.css('h3 a::attr(title)').get(),
                "product_price":b.css('p.price_color::text').get(),
            }
        next_page=response.css('ul.pager li.next a::attr(href)').get()
        if next_page is not None:
            # The pager href is relative; turn it into an absolute URL.
            next_page=response.urljoin(next_page)
            # parse is called for each following page until there is no "next" link.
            yield scrapy.Request(next_page,callback=self.parse)
        # The JSON-to-CSV conversion is deliberately not triggered here: parse
        # runs once per page while the feed file is still being written.
        

In [None]:
import json
import csv

# Convert the crawled book records from JSON to CSV.
with open('book.json') as json_file:
    data=json.load(json_file)

# A Scrapy JSON feed (`-o book.json`) is a plain list of items, while a
# hand-wrapped file may nest that list under "book_details"; accept both.
book_data=data['book_details'] if isinstance(data,dict) else data

# `with` closes the file even if a row fails to serialise; newline='' is
# what the csv module expects for files it writes to.
with open('book_file.csv','w',newline='') as data_file:
    csv_writer=csv.writer(data_file)
    for row_index,b in enumerate(book_data):
        if row_index == 0:
            # The keys of the first record become the header row.
            csv_writer.writerow(b.keys())
        csv_writer.writerow(b.values())



In [None]:
import scrapy


class ShopsiteSpider(scrapy.Spider):
    """Scrape image URL, title and price of every book on books.toscrape.com,
    following the pager from catalogue page 1 onwards.
    """

    name="bookstore"
    book_urls=[
        'http://books.toscrape.com/catalogue/'
    ]
    start_urls=[
        'http://books.toscrape.com/'
    ]

    def start_requests(self):
        """Request the first catalogue page; responses are handled by ``parse``."""
        urls=[
            'http://books.toscrape.com/catalogue/page-1.html'
        ]
        for url in urls:
            yield scrapy.Request(url=url,callback=self.parse)

    def parse(self,response):
        """Yield one dict per book on the page, then a request for the next page."""
        for b in response.css("article.product_pod"):
            yield{
                "image_url":b.css("a img::attr(src)").get(),
                # The title is emitted unmodified. Titles containing commas
                # must not be wrapped in quotes by hand: a CSV writer quotes
                # such fields itself, and hand-added quotes would be escaped
                # again and end up inside the data (and in JSON output too).
                "book_title":b.css('h3 a::attr(title)').get(),
                "product_price":b.css('p.price_color::text').get(),
            }
        next_page=response.css('ul.pager li.next a::attr(href)').get()
        if next_page is not None:
            # The pager href is relative; turn it into an absolute URL.
            next_page=response.urljoin(next_page)
            # parse is called for each following page until there is no "next" link.
            yield scrapy.Request(next_page,callback=self.parse)
        
        
        

In [None]:
import scrapy
class BookSpider(scrapy.Spider):
    """Collect image URL, title and price for each book in the catalogue,
    walking the pager from page 1 until no "next" link remains.
    """

    name = "books_spider"

    def start_requests(self):
        """Emit the request for the first catalogue page."""
        first_pages = ["http://books.toscrape.com/catalogue/page-1.html"]
        for page_url in first_pages:
            yield scrapy.Request(url=page_url, callback=self.parse)

    def parse(self, response):
        """Emit one item per product card, then the request for the next page."""
        for product in response.css("article.product_pod"):
            yield {
                'image_url': product.css("div.image_container a img::attr(src)").get(),
                'book_title': product.css("h3 a::attr(title)").get(),
                'product_price': product.css("div.product_price p.price_color::text").get(),
            }

        next_href = response.css('ul.pager li.next a::attr(href)').get()
        if next_href is None:
            # Last page reached: nothing more to follow.
            return
        yield scrapy.Request(response.urljoin(next_href), callback=self.parse)
                        

In [None]:
import scrapy

class pepperfry(scrapy.Spider):
    """Search pepperfry.com for a fixed list of furniture categories and
    scrape title, price and savings from the first product pages of each.
    """

    name="pepperfrySpider"
    # Root output folder; one sub-folder per search term is created below it.
    BASE_DIR='./Pepperfry_data/'
    # Maximum number of product pages followed per search term.
    MAX_CNT=20

    def start_requests(self):
        """Create the output folder for each search term and request its
        search-result page, passing the folder name along in ``meta``.
        """
        # Local import: the cell that defines this class only imports scrapy.
        import os

        BASE_URL="https://www.pepperfry.com/site_product/search?q="

        items=["two seater sofa","bench","book cases","coffee table",
              "dining set","queen beds","arm chairs","chest drawers",
              "garden seating","bean bags","king beds"]

        for item in items:
            # The search URL uses dashes between words; the folder keeps the
            # search term unchanged.
            query_string='-'.join(item.split(' '))
            dir_name=item
            os.makedirs(self.BASE_DIR+dir_name,exist_ok=True)

            request=scrapy.Request(url=BASE_URL+query_string,callback=self.parse,
                                   dont_filter=True)
            request.meta['dir_name']=dir_name
            yield request

    def parse(self,response,**meta):
        """Follow at most ``MAX_CNT`` product links from a search-result page.

        ``**meta`` is accepted for backward compatibility and not used; the
        folder name travels in ``response.meta['dir_name']``.
        """
        product_urls=response.xpath('//div/div/div/a[@p=0]/@href').extract()
        for url in product_urls[:self.MAX_CNT]:
            request=scrapy.Request(url=url,callback=self.parse_item,dont_filter=True)
            request.meta['dir_name']=response.meta['dir_name']
            yield request

    def parse_item(self,response,**meta):
        """Yield title, price and savings of one product page as a dict.

        A field is ``None`` when its node is missing from the page.
        ``**meta`` is accepted for backward compatibility and not used.
        """
        item_title=self._first_text(response,'//div/div/div/h1/text()')
        # NOTE(review): the class list was split across two lines in the
        # original (a syntax error); it is assumed to be separated by single
        # spaces - confirm against the live page markup.
        item_price=self._first_text(
            response,
            '//div/div/div/p/b[@class="pf-orange-color pf-large font-20 pf-primary-color"]/text()')
        item_savings=self._first_text(
            response,'//p[@class="pf-margin-0 pf-bold-txt font-13"]/text()')
        # TODO: extract the item description. The original XPath was left
        # empty, and an empty expression is not a valid XPath.
        yield {
            "dir_name":response.meta.get('dir_name'),
            "item_title":item_title,
            "item_price":item_price,
            "item_savings":item_savings,
        }

    @staticmethod
    def _first_text(response,xpath):
        """Return the stripped first match of ``xpath``, or None if nothing matches."""
        value=response.xpath(xpath).get()
        return value.strip() if value is not None else None
            
    
            
            

























