<a href="https://colab.research.google.com/github/rkt024/google_collab/blob/main/Web_Scraping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


# Architecture
![Code Implementation](https://www.kdnuggets.com/wp-content/uploads/garg_stepbystep_guide_web_scraping_python_beautiful_soup_1.png)

## Environment Setup
```bash
# For Mac/Linux:
python3 -m venv venv
source venv/bin/activate

# For Windows (Command Prompt or PowerShell):
python -m venv venv
venv\Scripts\activate
```

### Install necessary modules

In [None]:
!pip install requests
!pip install bs4

Collecting bs4
  Downloading bs4-0.0.2-py2.py3-none-any.whl.metadata (411 bytes)
Downloading bs4-0.0.2-py2.py3-none-any.whl (1.2 kB)
Installing collected packages: bs4
Successfully installed bs4-0.0.2


In [None]:
import requests

url = "https://quotes.toscrape.com/"

# The timeout keeps an unresponsive server from hanging the cell forever, and
# raise_for_status() turns 4xx/5xx replies into an exception instead of letting
# an error page be treated as the page we asked for.
res = requests.get(url, timeout=10)
res.raise_for_status()

# Raw response body as bytes.
htmlData = res.content

# Preview only the first 500 bytes; printing the whole document floods the output.
print(htmlData[:500])

b'<!DOCTYPE html>\n<html lang="en">\n<head>\n\t<meta charset="UTF-8">\n\t<title>Quotes to Scrape</title>\n    <link rel="stylesheet" href="/static/bootstrap.min.css">\n    <link rel="stylesheet" href="/static/main.css">\n    \n    \n</head>\n<body>\n    <div class="container">\n        <div class="row header-box">\n            <div class="col-md-8">\n                <h1>\n                    <a href="/" style="text-decoration: none">Quotes to Scrape</a>\n                </h1>\n            </div>\n            <div class="col-md-4">\n                <p>\n                \n                    <a href="/login">Login</a>\n                \n                </p>\n            </div>\n        </div>\n    \n\n<div class="row">\n    <div class="col-md-8">\n\n    <div class="quote" itemscope itemtype="http://schema.org/CreativeWork">\n        <span class="text" itemprop="text">\xe2\x80\x9cThe world as we have created it is a process of our thinking. It cannot be changed without changing our thinki

In [None]:
from bs4 import BeautifulSoup

# Turn the downloaded markup into a navigable tree with the "html.parser"
# backend, then print the tree's indented rendering.
parsedData = BeautifulSoup(markup=htmlData, features="html.parser")
print(parsedData.prettify())

In [None]:
from bs4 import BeautifulSoup
import requests
import csv

# Scrape the first page of quotes.toscrape.com and save (quote, author) rows
# to quotes.csv.

# Making a GET request to the webpage to be scraped. HTTPS matches the URL used
# by the other cells, and the timeout keeps an unresponsive server from hanging
# the notebook indefinitely.
try:
    page_response = requests.get("https://quotes.toscrape.com", timeout=10)
except requests.RequestException as exc:
    # Connection errors and timeouts never produce a response object.
    page_response = None
    print(f"Request failed: {exc}")

# Check if the GET request was successful before parsing the data
if page_response is not None and page_response.status_code == 200:
    soup = BeautifulSoup(page_response.text, "html.parser")

    # Find all quote containers
    quote_containers = soup.find_all("div", class_="quote")

    # Lists to store quotes and authors
    quotes = []
    authors = []

    # Loop through each quote container and extract the quote and author
    for quote_div in quote_containers:
        quote_tag = quote_div.find("span", class_="text")
        author_tag = quote_div.find("small", class_="author")

        # find() returns None when the element is absent. Skip incomplete
        # containers instead of crashing on `.text`, and append both values
        # together so the two lists always stay aligned.
        if quote_tag is None or author_tag is None:
            continue

        quotes.append(quote_tag.text)
        authors.append(author_tag.text)

    # Combine quotes and authors into a list of tuples
    data = list(zip(quotes, authors))

    # Save the data to a CSV file. newline="" lets the csv module control line
    # endings; UTF-8 is set explicitly because the quote text is not pure ASCII.
    with open("quotes.csv", "w", newline="", encoding="utf-8") as file:
        writer = csv.writer(file)
        # Write the header
        writer.writerow(["Quote", "Author"])
        # Write the data
        writer.writerows(data)

    print("Data saved to quotes.csv")
elif page_response is not None:
    print(f"Failed to retrieve the webpage. Status code: {page_response.status_code}")

Data saved to quotes.csv


In [None]:
import requests
from bs4 import BeautifulSoup
from requests.exceptions import RequestException
from time import sleep

def scrape_quotes_improved(delay=1, timeout=5, max_pages=None):
    """Collect the quote texts from the paginated quotes.toscrape.com listing.

    Pages are requested in order starting at ``/page/1/``. Crawling stops at
    the first page that contains no quotes, the first page without a "next"
    link, once ``max_pages`` pages have been fetched, or on the first failed
    request.

    Args:
        delay: Seconds to pause between consecutive page requests.
        timeout: Timeout in seconds passed to ``requests.get`` for each page.
        max_pages: Maximum number of pages to fetch; ``None`` means no limit.

    Returns:
        A list of quote strings in page order. If a request fails, the error
        is printed and the quotes gathered so far are returned.
    """
    page = 1
    all_quotes = []
    headers = {'User-Agent': 'Mozilla/5.0'}

    while max_pages is None or page <= max_pages:
        url = f"https://quotes.toscrape.com/page/{page}/"
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
            # Raises HTTPError (a RequestException) on an unsuccessful status
            # code, so HTTP errors are reported by the handler below as well.
            response.raise_for_status()

            soup = BeautifulSoup(response.text, 'html.parser')
            quotes = [quote.text for quote in soup.find_all('span', class_='text')]
            if not quotes:
                break
            all_quotes.extend(quotes)

            next_btn = soup.select_one('li.next > a')
            if not next_btn:
                break  # No more pages

            page += 1
            # Pause only when another page is actually going to be requested.
            if max_pages is None or page <= max_pages:
                sleep(delay)
        except RequestException as e:
            print(f"Request failed: {e}")
            break
    return all_quotes

# Run the paginated scraper and print every collected quote on its own line.
quotes = scrape_quotes_improved()
if quotes:
    print("\n".join(quotes))