In [2]:
import csv
import time
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup


##### Define the base URL of the website

In [3]:

# Root address of the site being scraped; pagination links are resolved
# relative to this value.
base_url = "http://quotes.toscrape.com"

# URL of the page to fetch next. The scraping loop advances it page by page
# and sets it to None once there is no further page.
current_url = base_url


##### Scrape every page and store the scraped data in a CSV file

In [7]:
# Scrape every page of the site, starting at `base_url` and following the
# "next" pagination link until there is none. Each (author, quote, tag)
# combination is written as one row to quotes.csv and also collected in
# `quotes_list` (as dicts keyed 'Author', 'Quote', 'Tags') for later cells.

# Seconds to wait for the server before giving up; without a timeout
# `requests.get` can block forever on an unresponsive host.
REQUEST_TIMEOUT_SECONDS = 10

# Always restart from the first page. The loop below leaves `current_url`
# set to None when it finishes, so without this reset re-running the cell
# would scrape nothing and truncate quotes.csv to just its header.
current_url = base_url

# Initialize an empty list to collect the quotes
quotes_list = []

# Tracks whether the loop ended because of a request error, so the final
# message does not claim success after a partial scrape.
scrape_failed = False

with open('quotes.csv', 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(['author', 'quote', 'tag_name'])  # CSV header

    while current_url:
        print(f"Scraping {current_url}")
        try:
            response = requests.get(current_url, timeout=REQUEST_TIMEOUT_SECONDS)
            response.raise_for_status()  # Check for HTTP errors
        except requests.exceptions.RequestException as e:
            print(f"Error retrieving {current_url}: {e}")
            scrape_failed = True
            break

        soup = BeautifulSoup(response.text, 'html.parser')
        quotes = soup.find_all('div', class_='quote')

        for quote in quotes:
            # Extract quote text. Only surrounding whitespace is stripped;
            # any quotation marks that are part of the text are kept.
            text_node = quote.find('span', class_='text')
            text = text_node.get_text(strip=True) if text_node else ''

            # Extract author name
            author_node = quote.find('small', class_='author')
            author = author_node.get_text(strip=True) if author_node else ''

            # Extract tags (list of strings)
            tags = [tag.get_text(strip=True) for tag in quote.find_all('a', class_='tag')]

            # Write one row per tag; a quote without tags still gets a single
            # row with an empty tag so it is not dropped from the output.
            for tag in tags or ['']:
                writer.writerow([author, text, tag])
                # Add to the quotes_list for the second CSV writing
                quotes_list.append({'Author': author, 'Quote': text, 'Tags': tag})

        # Check for next page
        next_button = soup.find('li', class_='next')
        # Require an anchor that actually carries an href; a malformed
        # pagination element ends the crawl instead of raising.
        next_link = next_button.find('a', href=True) if next_button else None
        if next_link:
            # urljoin handles both absolute-path ("/page/2/") and relative
            # ("page/2/") hrefs, unlike plain string concatenation.
            current_url = urljoin(base_url, next_link['href'])
            time.sleep(1)  # Polite delay between requests
        else:
            current_url = None  # Exit loop if no more pages

if scrape_failed:
    print("Scraping stopped early because of an error. Partial data saved to quotes.csv.")
else:
    print("Scraping complete. Data saved to quotes.csv.")


Scraping http://quotes.toscrape.com
Scraping http://quotes.toscrape.com/page/2/
Scraping http://quotes.toscrape.com/page/3/
Scraping http://quotes.toscrape.com/page/4/
Scraping http://quotes.toscrape.com/page/5/
Scraping http://quotes.toscrape.com/page/6/
Scraping http://quotes.toscrape.com/page/7/
Scraping http://quotes.toscrape.com/page/8/
Scraping http://quotes.toscrape.com/page/9/
Scraping http://quotes.toscrape.com/page/10/
Scraping complete. Data saved to quotes.csv.


In [8]:
# Dump the collected quote records to CSV via DictWriter.
# NOTE(review): this targets the same path ("quotes.csv") that the scraping
# cell already wrote, so it overwrites that file -- now with the header
# Author/Quote/Tags and a UTF-8 BOM (utf-8-sig), presumably so spreadsheet
# tools detect the encoding. Confirm which of the two outputs is wanted.
csv_filename = "quotes.csv"
fieldnames = ["Author", "Quote", "Tags"]  # must match the keys of each dict in quotes_list

with open(csv_filename, "w", newline="", encoding="utf-8-sig") as file:
    writer = csv.DictWriter(file, fieldnames=fieldnames)
    writer.writeheader()
    # One row per dict accumulated in quotes_list.
    writer.writerows(quotes_list)

print(f"Quotes have been scraped and saved to {csv_filename}")

Quotes have been scraped and saved to quotes.csv
