# Starting with Scrapy 

BeautifulSoup and lxml are libraries for parsing HTML and XML. Scrapy is an application framework for writing web spiders that crawl web sites and extract data from them.

In [108]:
import scrapy
import requests
from scrapy.http import TextResponse
import urllib.request, urllib.parse, urllib.error
import numpy as np


### XPath is a query language for selecting nodes in XML and HTML documents



<h5>How to use commands in console</h5>
<img src="x_console.png" alt="Drawing" style="width: 400px;"/>


* Using //p will select all the p elements and //a for all the links
* To find all the links under div we'll use $x('//div//a')
* You can also select just the text by using the text() function, e.g. $x('//div//a/text()')
* There are dozens of XPath functions, such as not(), contains(), and starts-with(); you can find them in the online documentation (http://www.w3schools.com/xsl/xpath_functions.asp).

In [6]:
class QuotesSpider(scrapy.Spider):
    """Spider that requests two quotes.toscrape.com pages and saves each one as an HTML file."""

    name = "grand"

    def start_requests(self):
        """Yield one request per seed URL, routing every response to ``parse``."""
        seed_urls = (
            'http://quotes.toscrape.com/page/1/',
            'http://quotes.toscrape.com/page/2/',
        )
        # A request's callback is called once its response has been downloaded,
        # with the downloaded Response object as its first argument.
        yield from (
            scrapy.Request(url=seed_url, callback=self.parse)
            for seed_url in seed_urls
        )

    def parse(self, response):
        """Write the raw response body to ``quotes-<page>.html`` and log the file name."""
        # The URLs end in ".../page/<n>/", so the second-to-last segment is the page number.
        url_segments = response.url.split("/")
        page = url_segments[-2]
        filename = 'quotes-%s.html' % page
        with open(filename, 'wb') as out_file:
            out_file.write(response.body)
        self.log('Saved file %s' % filename)

In [125]:
def parse_raw_headers(raw):
    """Parse a block of ``name: value`` lines into a dict of request headers.

    Blank lines are skipped. Each line is split on the first ``': '`` only, so a
    value that itself contains ``': '`` is kept intact instead of being truncated.

    Args:
        raw: Multi-line string with one ``name: value`` header per line.

    Returns:
        dict mapping header name to header value (both str).

    Raises:
        ValueError: if a non-blank line has no ``': '`` separator.
    """
    parsed = {}
    for line in raw.splitlines():
        if not line.strip():
            continue
        name, separator, value = line.partition(': ')
        if not separator:
            raise ValueError('Malformed header line (expected "name: value"): %r' % line)
        parsed[name] = value
    return parsed


# Request headers pasted as plain text, one "name: value" pair per line.
# NOTE(review): the cookie below looks like a captured browser session; avoid committing
# live session cookies. TODO: load it from an environment variable instead.
raw_headers = '''accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9
accept-encoding: gzip, deflate, br
accept-language: en-US,en;q=0.9
cache-control: max-age=0
cookie: prov=6ee1b1c0-a975-57c1-b806-d6f02acdc078; _ga=GA1.2.1770530859.1579667732; __qca=P0-709205387-1579667731930; __gads=ID=7abeba23853bfab5:T=1579667732:S=ALNI_MZQpLsyZugJqQyXRBUJPOkgNjsDbw; sgt=id=57291405-92ae-4989-a741-f989cf2c6903; _gid=GA1.2.1065622312.1595226703; arp_scroll_position=136; acct=t=JE1j0YL6mlLl7%2bJDAZ0zaTJQ%2bUidPKXE&s=xcKiH08vYJFwXKLlol2HcBhRPB70gUZR
dnt: 1
referer: https://www.google.com/
sec-fetch-dest: document
sec-fetch-mode: navigate
sec-fetch-site: cross-site
sec-fetch-user: ?1
upgrade-insecure-requests: 1
user-agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36'''

# `conv` is kept as the list of raw header lines so any cell that still reads it keeps working.
conv = raw_headers.split('\n')

headers = parse_raw_headers(raw_headers)


In [144]:
# Inspect the parsed request headers. The displayed dict includes the cookie value,
# so clear this cell's output before sharing the notebook.
headers

{'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
 'accept-encoding': 'gzip, deflate, br',
 'accept-language': 'en-US,en;q=0.9',
 'cache-control': 'max-age=0',
 'cookie': 'prov=6ee1b1c0-a975-57c1-b806-d6f02acdc078; _ga=GA1.2.1770530859.1579667732; __qca=P0-709205387-1579667731930; __gads=ID=7abeba23853bfab5:T=1579667732:S=ALNI_MZQpLsyZugJqQyXRBUJPOkgNjsDbw; sgt=id=57291405-92ae-4989-a741-f989cf2c6903; _gid=GA1.2.1065622312.1595226703; arp_scroll_position=136; acct=t=JE1j0YL6mlLl7%2bJDAZ0zaTJQ%2bUidPKXE&s=xcKiH08vYJFwXKLlol2HcBhRPB70gUZR',
 'dnt': '1',
 'referer': 'https://www.google.com/',
 'sec-fetch-dest': 'document',
 'sec-fetch-mode': 'navigate',
 'sec-fetch-site': 'cross-site',
 'sec-fetch-user': '?1',
 'upgrade-insecure-requests': '1',
 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36'}

In [151]:
# Fetch the listing page with the browser-like headers. The timeout keeps the cell from
# hanging forever on an unresponsive server, and raise_for_status() surfaces HTTP errors
# instead of silently parsing an error page.
res = requests.get(
    'https://www.magicbricks.com/propertyDetails/2-BHK-1030-Sq-ft-Multistorey-Apartment-FOR-Sale-Thanisandra-Main-Road-in-Bangalore&id=4d423231313036303433?sem=Y',
    headers=headers,
    timeout=30,
)
res.raise_for_status()

# Wrap the downloaded HTML in a Scrapy TextResponse so it can be queried with .xpath().
response = TextResponse(res.url, body=res.text, encoding='utf-8')

In [181]:
# Absolute path to the <a> element inside the container whose id is "thirdFoldDisplay";
# .extract() returns the matched elements serialized as HTML strings.
link_xpath = '//*[@id="thirdFoldDisplay"]/div[1]/div[2]/span/a'
response.xpath(link_xpath).extract()

['<a href="https://www.magicbricks.com//sai-kalyan-builders-&amp;-developers-pvt.-ltd.-buid-4d423737333231" target="_blank">\nSai Kalyan Builders &amp; Developers Pvt. Ltd.\n</a>']

In [213]:
# Probe positional predicates 1..19 on every element that carries target="_blank".
# `//node[n]` keeps nodes that are the n-th matching child of *their own parent*, not the
# n-th match in the whole document; write `(//node)[n]` to index in document order.
for i in range(1, 20):
    nth_text_xpath = '//*[@target="_blank"][{}]/text()'.format(i)
    print(i, response.xpath(nth_text_xpath).extract())

1 ['\nThanisandra Main Road, Bangalore\n', '\nSai Kalyan Builders & Developers Pvt. Ltd.\n', ' Ultima Smart Homes ', '\n', '\n', '\n', ' ', '\n', '\n', '\n', '\n', '\n', '\n', 'Ultima Smart Homes', '\nSai Kalyan Builders & Developers Pvt. Ltd.\n', 'Download\nBrochure', 'View All', 'View All', 'Sitemap', 'Terms & Conditions', 'Privacy Policy', 'Blog', 'Careers', 'Testimonials', 'Help Center', 'Sales Enquiry']
2 []
3 []
4 []
5 []
6 []
7 []
8 []
9 []
10 []
11 []
12 []
13 []
14 []
15 []
16 []
17 []
18 []
19 []


In [199]:
# Serialize every element that is the first target="_blank" child of its parent,
# then pick the third match (index 2) from the resulting Python list.
first_blank_targets = response.xpath('//*[@target="_blank"][1]').extract()
first_blank_targets[2]

'<a href="https://www.magicbricks.com//sai-kalyan-builders-&amp;-developers-pvt.-ltd.-buid-4d423737333231" target="_blank">\nSai Kalyan Builders &amp; Developers Pvt. Ltd.\n</a>'

# Note visual tags can't be used with attribute tags
XPath is practically used for locating XML nodes.

General Syntax:

xpath=//tag[@attribute='value']

// : Select matching nodes anywhere in the document, regardless of their position (a single "." selects the current node).

tag: Tagname of the particular node. Also, "*" is for searching any tag in the xml structure

@: Select attribute.

attribute: Attribute name of the node.

value: Value of the attribute.

In [239]:
# Text nodes below the <a> element; without .extract() the result is a list of Selector objects.
text_xpath = '//*[@id="thirdFoldDisplay"]/div[1]/div[2]/span/a//text()'
response.xpath(text_xpath)

# Tip: parenthesise a nested path to treat all of its matches as one group, and chain .re()
# to filter the selected text with a regular expression, e.g.:
# ' '.join(response.xpath('(//*[@id="thirdFoldDisplay"]//*[@target="_blank"])//text()').re('[\.a-zA-Z]+'))

[<Selector xpath='//*[@id="thirdFoldDisplay"]/div[1]/div[2]/span/a//text()' data='\nSai Kalyan Builders & Developers Pvt...'>]

In [235]:
# Show the request headers again. The displayed dict includes the cookie value,
# so clear this cell's output before sharing the notebook.
headers

{'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
 'accept-encoding': 'gzip, deflate, br',
 'accept-language': 'en-US,en;q=0.9',
 'cache-control': 'max-age=0',
 'cookie': 'prov=6ee1b1c0-a975-57c1-b806-d6f02acdc078; _ga=GA1.2.1770530859.1579667732; __qca=P0-709205387-1579667731930; __gads=ID=7abeba23853bfab5:T=1579667732:S=ALNI_MZQpLsyZugJqQyXRBUJPOkgNjsDbw; sgt=id=57291405-92ae-4989-a741-f989cf2c6903; _gid=GA1.2.1065622312.1595226703; arp_scroll_position=136; acct=t=JE1j0YL6mlLl7%2bJDAZ0zaTJQ%2bUidPKXE&s=xcKiH08vYJFwXKLlol2HcBhRPB70gUZR',
 'dnt': '1',
 'referer': 'https://www.google.com/',
 'sec-fetch-dest': 'document',
 'sec-fetch-mode': 'navigate',
 'sec-fetch-site': 'cross-site',
 'sec-fetch-user': '?1',
 'upgrade-insecure-requests': '1',
 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36'}

You create a new project by running "scrapy startproject properties" in a terminal.