# Working with web scraping

### 1. Fetching web pages with requests

In [None]:
import requests

url = 'https://example.com'
# Always pass a timeout: without one, requests can wait indefinitely on a stalled server.
response = requests.get(url, timeout=10)
response.raise_for_status()  # Fail fast on 4xx/5xx instead of parsing an error page
html = response.text  # Response body decoded to str

### 2. Parsing HTML with BeautifulSoup

In [None]:
from bs4 import BeautifulSoup

# Build a parse tree from the downloaded markup using the 'html.parser' backend.
soup = BeautifulSoup(markup=html, features='html.parser')
pretty_html = soup.prettify()  # Pretty-printed version of the HTML
print(pretty_html)

### 3. Navigating the HTML tree

In [None]:
# soup.title is None when the page has no <title> tag, so guard before reading .text
title = soup.title.text if soup.title is not None else None  # Page title, or None
headings = soup.find_all('h1')  # List of all <h1> tags

### 4. Using CSS selectors

In [None]:
articles = soup.select('div.article')  # All <div> elements that have the class 'article'

### 5. Extracting data from tags

In [None]:
for article in articles:
    heading_tag = article.h2  # First <h2> inside the article, or None
    link_tag = article.a  # First <a> inside the article, or None
    if heading_tag is None or link_tag is None:
        continue  # Skip malformed articles instead of raising on a missing tag
    title = heading_tag.text  # Text inside the <h2> tag
    link = link_tag.get('href')  # 'href' attribute of the <a> tag (None if absent)
    print(title, link)

### 6. Handling relative URLs

In [None]:
from urllib.parse import urljoin

# Collect the href of every link on the page; hrefs may be relative ('/about', 'page2.html').
relative_urls = [a['href'] for a in soup.find_all('a', href=True)]
# Resolve each href against the URL the page was fetched from;
# hrefs that are already absolute pass through unchanged.
absolute_urls = [urljoin(url, link) for link in relative_urls]

### 7. Scraping content across multiple pages

In [None]:
base_url = "https://example.com/page/"
for page in range(1, 6):  # Pages 1 through 5
    page_url = f"{base_url}{page}"
    # Timeout so a single stalled page cannot hang the whole loop.
    response = requests.get(page_url, timeout=10)
    response.raise_for_status()  # Stop on a failed page rather than processing an error body
    # Process each page's content

### 8. Scraping data from AJAX requests

In [None]:
# Find the URL of the AJAX request (using browser's developer tools) and fetch it
ajax_url = 'https://example.com/ajax_endpoint'
ajax_response = requests.get(ajax_url, timeout=10)
# Check the status first: an HTML error page would otherwise fail inside .json()
# with a decode error that hides the real cause.
ajax_response.raise_for_status()
data = ajax_response.json()  # Assuming the response is JSON

### 9. Using regex in web scraping

In [None]:
import re

# Inside a character class '|' is a literal pipe, not alternation, so the TLD
# class is written [A-Za-z] to avoid matching strings such as "user@host.c|m".
emails = re.findall(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', html)

### 10. Checking robots.txt for scraping permissions

In [None]:
from urllib.robotparser import RobotFileParser

# Passing the robots.txt location to the constructor has the same effect as set_url().
robots_url = 'https://example.com/robots.txt'
rp = RobotFileParser(robots_url)
rp.read()  # Download and parse the robots.txt rules
# True if the rules for the generic '*' user agent allow fetching `url`.
can_scrape = rp.can_fetch('*', url)

### 11. Using sessions and cookies

In [None]:
# A Session keeps cookies between requests, so cookies received from the
# first request are sent again with the second one.
login_url = 'https://example.com/login'
protected_url = 'https://example.com/protected_page'

session = requests.Session()
session.get(login_url)
session.cookies.set('key', 'value')  # Set cookies, if needed
response = session.get(protected_url)

### 12. Scraping with browser automation (Selenium)

In [None]:
from selenium import webdriver

browser = webdriver.Chrome()
try:
    browser.get('https://example.com')
    content = browser.page_source  # Page source as currently held by the browser
    # Parse and extract data using BeautifulSoup, etc.
finally:
    # Always shut the browser down, even if loading the page fails,
    # so no browser process is left running.
    browser.quit()

### 13. Error handling in web scraping

In [None]:
# Report connection problems, timeouts and HTTP error statuses instead of
# letting the exception propagate.
try:
    response = requests.get(url=url, timeout=5)
    # Turn 4xx/5xx responses into an exception so they reach the handler below.
    response.raise_for_status()
except requests.RequestException as e:  # Base class of the exceptions requests raises
    print(f"Error: {e}")

### 14. Asynchronous web scraping

In [None]:
import aiohttp
import asyncio


async def fetch(url, session=None):
    """Download *url* and return the response body as text.

    Args:
        url: Address to request.
        session: Optional ``aiohttp.ClientSession`` to reuse. When omitted, a
            temporary session is created and closed for this single request.

    Returns:
        The decoded response body.
    """
    if session is None:
        async with aiohttp.ClientSession() as own_session:
            return await fetch(url, own_session)
    async with session.get(url) as response:
        return await response.text()


async def fetch_all(urls):
    """Fetch every URL in *urls* concurrently and return the bodies in the same order."""
    # One shared session for all requests, so connections can be reused
    # instead of opening a new session per URL.
    async with aiohttp.ClientSession() as session:
        return await asyncio.gather(*(fetch(url, session) for url in urls))


urls = ['https://example.com/page1', 'https://example.com/page2']
# asyncio.run creates and closes the event loop itself; calling
# asyncio.get_event_loop() without a running loop is deprecated.
pages = asyncio.run(fetch_all(urls))

### 15. Data storage

In [None]:
import csv

# encoding='utf-8' so non-ASCII titles do not fail on platforms whose default
# encoding is narrower; newline='' lets the csv module control line endings.
with open('output.csv', 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(['Title', 'URL'])
    for article in articles:
        # `articles` holds parsed tags, so article['title'] would look up an HTML
        # attribute named "title"; read the heading text and link target instead.
        heading_tag = article.h2
        link_tag = article.a
        if heading_tag is None or link_tag is None:
            continue  # Skip articles without a heading or a link
        writer.writerow([heading_tag.text, link_tag.get('href', '')])