In [82]:
from urllib.parse import urljoin

import pandas as pd
import requests
from scrapy import Selector
from scrapy.http import Response

In [83]:
# Sample HTML document used by the selector examples below: a <div id='uid'> holding two
# <p> tags (class-1 and class-2), followed by a third <p> that carries both classes.
html = '''
<html>
    <body>
        <div id='uid'>
            <p class='class-1'>Hello World!</p>
            <p class='class-2'>Enjoy DataCamp!</p>
        </div>
        <p class='class-1 class-2'>
        Thanks for Watching!
        </p>
    </body>
</html>
'''

In [84]:
# Wrap the sample HTML string in a Selector so it can be queried with XPath/CSS.
sel = Selector(text=html)

In [85]:
# Select the <p> tags whose class attribute is exactly "class-1" (no additional classes).
sel.xpath('//p[@class="class-1"]')

[<Selector xpath='//p[@class="class-1"]' data='<p class="class-1">Hello World!</p>\n ...'>]

In [86]:
# Select any tag, whatever its name, whose id attribute equals "uid".
sel.xpath('//*[@id="uid"]')

[<Selector xpath='//*[@id="uid"]' data='<div id="uid">\n            <p class="...'>]

In [87]:
# Select the second <p> child of the <div> whose id is "uid".
sel.xpath('//div[@id="uid"]/p[2]')

[<Selector xpath='//div[@id="uid"]/p[2]' data='<p class="class-2">Enjoy DataCamp!</p...'>]

# `contains()` returns every tag whose attribute value contains the given text; in the example below we search for

In [88]:
# all tags whose class attribute contains the substring "class-1"
sel.xpath('//*[contains(@class,"class-1")]')

[<Selector xpath='//*[contains(@class,"class-1")]' data='<p class="class-1">Hello World!</p>\n ...'>,
 <Selector xpath='//*[contains(@class,"class-1")]' data='<p class="class-1 class-2">\n        T...'>]

# Here we fetch only the tags whose class attribute is exactly class-1, with no other classes

In [89]:
# Exact match on the class attribute: a tag carrying any extra class is not selected.
sel.xpath('//*[@class="class-1"]')

[<Selector xpath='//*[@class="class-1"]' data='<p class="class-1">Hello World!</p>\n ...'>]

# Here we return the class attribute of the second p tag inside the div (the result is empty if it has no class)

In [90]:
# Return the value of the class attribute of the second <p> under /html/body/div.
sel.xpath('/html/body/div/p[2]/@class')

[<Selector xpath='/html/body/div/p[2]/@class' data='class-2'>]

# Another example: request the id attribute of the second p tag; this returns [] because that p has no id attribute

In [91]:
# Ask for the id attribute of the second <p> under /html/body/div; nothing matches, so the list is empty.
sel.xpath('/html/body/div/p[2]/@id')

[]

### Extracting Data from a SelectorList

### Use the extract() method

In [92]:
# Serialize every matched <p> selector; extract() returns a list with one string per match.
sel.xpath("//p").extract()

['<p class="class-1">Hello World!</p>\n            <p class="class-2">Enjoy DataCamp!</p>\n        </div>\n        <p class="class-1 class-2">\n        Thanks for Watching!\n        </p>\n    </body>\n</html>\n',
 '<p class="class-2">Enjoy DataCamp!</p>\n        </div>\n        <p class="class-1 class-2">\n        Thanks for Watching!\n        </p>\n    </body>\n</html>\n',
 '<p class="class-1 class-2">\n        Thanks for Watching!\n        </p>\n    </body>\n</html>\n']

### We can use extract_first() to get the first element of the list

In [93]:
# Return only the first match as a single string instead of a list.
sel.xpath("//p").extract_first()

'<p class="class-1">Hello World!</p>\n            <p class="class-2">Enjoy DataCamp!</p>\n        </div>\n        <p class="class-1 class-2">\n        Thanks for Watching!\n        </p>\n    </body>\n</html>\n'

In [94]:
# NOTE(review): this cell repeats the previous extract_first() call verbatim — confirm whether it is still needed.
sel.xpath("//p").extract_first()

'<p class="class-1">Hello World!</p>\n            <p class="class-2">Enjoy DataCamp!</p>\n        </div>\n        <p class="class-1 class-2">\n        Thanks for Watching!\n        </p>\n    </body>\n</html>\n'

### Using requests to scrape the content of a URL

In [95]:
# Download the Wikipedia article and wrap its markup in a Selector for querying.
url = 'https://en.wikipedia.org/wiki/Web_scraping'
# Bound the wait (in seconds) so a stalled connection cannot hang the notebook forever;
# requests applies no timeout unless one is passed explicitly.
response = requests.get(url, timeout=10)
# Fail loudly on a 4xx/5xx answer instead of silently parsing an error page,
# which would only surface later as empty selector results.
response.raise_for_status()
html = response.content
sel = Selector(text=html)

In [96]:
# url = 'https://en.wikipedia.org/wiki/Web_scraping'
# response = sel.response
# response.follow(url=url)

In [97]:
# Select the <div> elements whose class attribute is exactly "div-col".
# NOTE(review): presumably this is the "See also" list of the article — confirm against the live
# markup, since any other div with that exact class would be matched as well.
see_also_div =sel.xpath('//div[@class="div-col"]')

In [98]:
# Get the text of every anchor tag (that has an href) in the See also section.
# The text is gathered anchor by anchor so there is exactly one title per link: a flat
# 'a::text' query emits one item per direct text node, so an anchor whose label is wrapped
# in markup (or split into several text nodes) would leave the titles and the hrefs
# with different lengths and break the DataFrame construction further down.
title = [
    ''.join(anchor.xpath('.//text()').extract())
    for anchor in see_also_div.css('a[href]')
]

In [99]:
# Get the href attribute value of every anchor tag in the See also section.
links = see_also_div.css('a::attr(href)').extract()

In [100]:
# Turn every href from the See also section into an absolute URL on en.wikipedia.org.
# urljoin prefixes root-relative paths (the usual "/wiki/..." form) exactly as plain
# concatenation would, but it also leaves already-absolute links untouched and completes
# protocol-relative ("//host/...") ones instead of producing a malformed address.
# NOTE: this rebinds `url` from the page address (a string) to the list of resolved links.
url = [urljoin('https://en.wikipedia.org', href) for href in links]

In [101]:
# Save the scraped topics and their links both as a CSV file and as an Excel sheet.
df = pd.DataFrame({'Scrapping Topic':title,'Wikipedia Scrapping Topic link':url})
df.to_csv('Scrapping.csv', index=False, encoding='utf-8')
# to_excel() takes no `encoding` argument in pandas >= 2.0 (passing it raises TypeError),
# so it is omitted here.
df.to_excel('Scrapping.xlsx', index=False)