In [3]:
# Data_collection

import requests
from bs4 import BeautifulSoup
import time
import csv
import json
import random
import xml.etree.ElementTree as ET


# Safe HTTP request

def safe_request(url, retries=3, delay=2):
    """Fetch *url* with an HTTP GET, retrying when the request fails.

    Args:
        url: Address to request.
        retries: Maximum number of attempts. With 0 (or less) no request
            is made and ``None`` is returned immediately.
        delay: Seconds to wait between two consecutive attempts.

    Returns:
        The ``requests.Response`` of the first attempt that completes with
        a non-error status, or ``None`` when every attempt fails.
    """
    headers = {"User-Agent": "Mozilla/5.0"}
    for attempt in range(1, retries + 1):
        try:
            response = requests.get(url, headers=headers, timeout=10)
            # Turn 4xx/5xx statuses into exceptions so they are retried too.
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"Attempt {attempt} failed: {e}")
            # Pause only when another attempt follows; sleeping after the
            # final failure would just delay returning None to the caller.
            if attempt < retries:
                time.sleep(delay)
        else:
            return response
    return None



# Scrape books.toscrape.com 

BASE_URL = "http://books.toscrape.com/catalogue/page-{}.html"
all_books = []
pages_scraped = 0  # pages that were actually fetched and parsed


for page_num in range(1, 21):
    url = BASE_URL.format(page_num)
    print(f"Scraping page {page_num}...")

    response = safe_request(url)
    if not response:
        print(f"  Failed to scrape page {page_num}")
        continue

    # Hand BeautifulSoup the raw bytes so it detects the document's own
    # encoding. response.text depends on requests' charset guess, which can
    # mis-decode the currency sign in prices when no charset is sent.
    soup = BeautifulSoup(response.content, "lxml")
    books = soup.select("article.product_pod")

    for book in books:
        title = book.h3.a["title"]
        price = book.select_one(".price_color").text.strip()
        stock = book.select_one(".availability").text.strip()
        # Second entry of the first <p>'s class list is stored as the rating.
        rating = book.p["class"][1]

        all_books.append({
            "title": title,
            "price": price,
            "stock": stock,
            "rating": rating,
        })

    pages_scraped += 1
    print(f"  Found {len(books)} books on page {page_num}")
    time.sleep(random.uniform(1, 3))  # polite delay between pages

# Report the pages that really succeeded instead of a hard-coded total.
print(f" Scraped {len(all_books)} books from {pages_scraped} pages")


# Save books to CSV + JSON
if all_books:
    # Column order follows the keys of the first scraped record.
    book_columns = list(all_books[0])

    with open("books.csv", "w", newline="", encoding="utf-8") as csv_file:
        book_writer = csv.DictWriter(csv_file, fieldnames=book_columns)
        book_writer.writeheader()
        for book_row in all_books:
            book_writer.writerow(book_row)

    # Same records again, this time as an indented JSON array.
    with open("books.json", "w", encoding="utf-8") as json_file:
        json_file.write(json.dumps(all_books, indent=4))



# Scrape e-commerce site

shop_items = []
base_url = "https://webscraper.io"

response = safe_request("https://webscraper.io/test-sites/e-commerce/allinone")
if response:
    soup = BeautifulSoup(response.text, "lxml")

    # Absolute URLs of every category linked from the landing page
    category_links = []
    for cat_anchor in soup.select(".category-link"):
        category_links.append(base_url + cat_anchor["href"])

    for cat_link in category_links:
        cat_resp = safe_request(cat_link)
        if not cat_resp:
            continue
        cat_soup = BeautifulSoup(cat_resp.text, "lxml")

        # The page's first <h1> is used as the category label
        cat_heading = cat_soup.select_one("h1")
        if cat_heading:
            category_name = cat_heading.text.strip()
        else:
            category_name = "Unknown"

        # Absolute URLs of the subcategories listed on the category page
        sub_links = []
        for sub_anchor in cat_soup.select(".subcategory-link"):
            sub_links.append(base_url + sub_anchor["href"])

        for sub_link in sub_links:
            sub_resp = safe_request(sub_link)
            if not sub_resp:
                continue
            sub_soup = BeautifulSoup(sub_resp.text, "lxml")

            sub_heading = sub_soup.select_one("h1")
            if sub_heading:
                subcategory_name = sub_heading.text.strip()
            else:
                subcategory_name = "Unknown"

            # One record per ".thumbnail" element on the subcategory page
            for product in sub_soup.select(".thumbnail"):
                title_tag = product.select_one(".title")
                name = title_tag["title"].strip()
                price = product.select_one(".price").text.strip()
                link = base_url + title_tag["href"]

                record = {
                    "name": name,
                    "price": price,
                    "link": link,
                    "category": category_name,
                    "subcategory": subcategory_name,
                }
                shop_items.append(record)

            time.sleep(random.uniform(1, 2))  # polite delay

print(f" Scraped {len(shop_items)} shop products")


# Save shop data
if shop_items:
    # Column order follows the keys of the first scraped product.
    shop_columns = list(shop_items[0])

    with open("shop.csv", "w", newline="", encoding="utf-8") as csv_file:
        shop_writer = csv.DictWriter(csv_file, fieldnames=shop_columns)
        shop_writer.writeheader()
        for shop_row in shop_items:
            shop_writer.writerow(shop_row)

    # Same records again, this time as an indented JSON array.
    with open("shop.json", "w", encoding="utf-8") as json_file:
        json_file.write(json.dumps(shop_items, indent=4))



# Parse BBC RSS feed

rss_items = []
rss_url = "http://feeds.bbci.co.uk/news/rss.xml"
response = safe_request(rss_url)

if response:
    try:
        # Parse the raw bytes so the XML parser can detect the encoding itself.
        root = ET.fromstring(response.content)
    except ET.ParseError as e:
        # A malformed feed should not abort the rest of the script.
        print(f"Could not parse RSS feed: {e}")
    else:
        for item in root.findall(".//item"):
            # findtext() returns the default when the child element is
            # missing, so one incomplete <item> no longer raises and
            # discards the whole feed.
            title = item.findtext("title", default="N/A")
            link = item.findtext("link", default="N/A")
            pub_date = item.findtext("pubDate", default="N/A")
            rss_items.append({"title": title, "link": link, "date": pub_date})

print(f" Parsed {len(rss_items)} RSS items")


# Save RSS feed
if rss_items:
    # Column order follows the keys of the first parsed feed entry.
    rss_columns = list(rss_items[0])

    with open("rss.csv", "w", newline="", encoding="utf-8") as csv_file:
        rss_writer = csv.DictWriter(csv_file, fieldnames=rss_columns)
        rss_writer.writeheader()
        for rss_row in rss_items:
            rss_writer.writerow(rss_row)

    # Same records again, this time as an indented JSON array.
    with open("rss.json", "w", encoding="utf-8") as json_file:
        json_file.write(json.dumps(rss_items, indent=4))
        

Scraping page 1...
  Found 20 books on page 1
Scraping page 2...
  Found 20 books on page 2
Scraping page 3...
  Found 20 books on page 3
Scraping page 4...
  Found 20 books on page 4
Scraping page 5...
  Found 20 books on page 5
Scraping page 6...
  Found 20 books on page 6
Scraping page 7...
  Found 20 books on page 7
Scraping page 8...
  Found 20 books on page 8
Scraping page 9...
  Found 20 books on page 9
Scraping page 10...
  Found 20 books on page 10
Scraping page 11...
  Found 20 books on page 11
Scraping page 12...
  Found 20 books on page 12
Scraping page 13...
  Found 20 books on page 13
Scraping page 14...
  Found 20 books on page 14
Scraping page 15...
  Found 20 books on page 15
Scraping page 16...
  Found 20 books on page 16
Scraping page 17...
  Found 20 books on page 17
Scraping page 18...
  Found 20 books on page 18
Scraping page 19...
  Found 20 books on page 19
Scraping page 20...
  Found 20 books on page 20
 Scraped 400 books from 20 pages
 Scraped 147 shop product