# In [1]:
# Data_processing

import requests, time, random, re
from bs4 import BeautifulSoup
import pandas as pd
import xml.etree.ElementTree as ET


# Helper functions

def safe_request(url):
    """Fetch *url* with a browser-like User-Agent, best-effort.

    Args:
        url: Absolute URL to GET.

    Returns:
        The ``requests.Response`` on success, or ``None`` when the request
        itself failed (connection error, timeout, invalid URL, ...).
        The HTTP status is not checked here; callers test the result with
        ``if not resp``.
    """
    headers = {"User-Agent": "Mozilla/5.0"}
    try:
        return requests.get(url, headers=headers, timeout=10)
    except requests.RequestException:
        # Only swallow request-level failures; a bare ``except`` would also
        # trap KeyboardInterrupt/SystemExit and make a long crawl impossible
        # to interrupt, besides hiding genuine programming errors.
        return None

def clean_price(value):
    """Convert a price-like value to a float.

    Every character that is not a digit or a dot is removed from
    ``str(value)`` (currency symbols, thousands separators, whitespace),
    and the remainder is parsed as a float.

    Args:
        value: Anything whose string form may contain a price.

    Returns:
        The parsed float, or ``None`` when nothing numeric remains or the
        remaining text is not a valid number (e.g. ``"."`` or ``"1.2.3"``).
    """
    digits = re.sub(r"[^\d.]", "", str(value))
    if not digits:
        return None
    try:
        return float(digits)
    except ValueError:
        # Stray dots can survive the filter; treat them like "no price"
        # instead of aborting the whole scrape.
        return None


# Books

# Collect [title, price, stock, rating, category] for every book on the
# first 20 catalogue listing pages, then load the rows into a DataFrame.
books_data = []
for page in range(1, 21):
    resp = safe_request(f"http://books.toscrape.com/catalogue/page-{page}.html")
    if not resp:
        continue  # best-effort: skip listing pages that could not be fetched
    # Parse the raw bytes so BeautifulSoup detects the document encoding
    # itself; resp.text can mis-decode non-ASCII characters (e.g. the pound
    # sign or accented titles) when the response declares no charset.
    soup = BeautifulSoup(resp.content, "lxml")
    for book in soup.select("article.product_pod"):
        title = book.h3.a["title"]

        # Missing markup yields None for that field instead of aborting
        # the whole crawl with an AttributeError.
        price_tag = book.select_one(".price_color")
        price = clean_price(price_tag.text) if price_tag is not None else None
        stock_tag = book.select_one(".availability")
        stock = stock_tag.text.strip() if stock_tag is not None else None

        # The rating is the second CSS class on the card's first <p>
        # (presumably something like "star-rating Three" - verify against
        # the page markup).
        rating_classes = book.p.get("class", []) if book.p is not None else []
        rating = rating_classes[1] if len(rating_classes) > 1 else None

        # get category from detail page: it is read from the third
        # breadcrumb link there.
        link = "http://books.toscrape.com/catalogue/" + book.h3.a["href"].replace("../", "")
        cat_resp = safe_request(link)
        category = None
        if cat_resp:
            cat_soup = BeautifulSoup(cat_resp.content, "lxml")
            bc = cat_soup.select("ul.breadcrumb li a")
            if len(bc) >= 3:
                category = bc[2].text.strip()
        books_data.append([title, price, stock, rating, category])
    # Random pause between listing pages to avoid hammering the server.
    time.sleep(random.uniform(1, 2))

books_df = pd.DataFrame(books_data, columns=["title", "price", "stock", "rating", "category"])


# E-commerce site

# Walk category -> subcategory -> product cards on the test shop and
# collect [name, price, category, subcategory] rows into a DataFrame.
shop_data = []
base_url = "https://webscraper.io/test-sites/e-commerce/allinone"
resp = safe_request(base_url)
if resp:
    soup = BeautifulSoup(resp.text, "lxml")
    categories = soup.select("div.sidebar-nav ul li a.category-link")
    for cat in categories:
        cat_name = cat.text.strip()
        cat_url = "https://webscraper.io" + cat["href"]
        cat_resp = safe_request(cat_url)
        if not cat_resp:
            continue  # best-effort: skip categories that failed to load
        # Subcategory links are only looked up on the category's own page.
        cat_soup = BeautifulSoup(cat_resp.text, "lxml")
        subcats = cat_soup.select("div.sidebar-nav ul li ul li a.subcategory-link")
        for sub in subcats:
            sub_name = sub.text.strip()
            sub_url = "https://webscraper.io" + sub["href"]
            sub_resp = safe_request(sub_url)
            if not sub_resp:
                continue
            sub_soup = BeautifulSoup(sub_resp.text, "lxml")
            for p in sub_soup.select(".thumbnail"):
                title_tag = p.select_one(".title")
                price_tag = p.select_one(".price")
                if title_tag is None or price_tag is None:
                    # Skip incomplete cards rather than crash the crawl.
                    continue
                # NOTE(review): the visible link text appears to be
                # shortened with an ellipsis for long names; prefer the
                # "title" attribute when present and fall back to the text.
                name = (title_tag.get("title") or title_tag.text).strip()
                price = clean_price(price_tag.text)
                shop_data.append([name, price, cat_name, sub_name])
            # Random pause between subcategory pages.
            time.sleep(random.uniform(1, 2))

shop_df = pd.DataFrame(shop_data, columns=["name", "price", "category", "subcategory"])


# RSS feed

# Download the RSS feed and collect [title, link, date] for every <item>.
rss_data = []
rss_url = "http://feeds.bbci.co.uk/news/rss.xml"
resp = safe_request(rss_url)
if resp:
    try:
        root = ET.fromstring(resp.content)
    except ET.ParseError:
        # A malformed or non-XML body leaves the feed empty instead of
        # aborting the script, consistent with the best-effort fetching.
        root = None
    if root is not None:
        for item in root.findall(".//item"):
            row = []
            # Every field is optional: a missing child element becomes None
            # (previously only pubDate was guarded).
            for tag in ("title", "link", "pubDate"):
                node = item.find(tag)
                row.append(node.text if node is not None else None)
            rss_data.append(row)

rss_df = pd.DataFrame(rss_data, columns=["title", "link", "date"])


# Report the (rows, columns) shape of each scraped DataFrame.
for label, frame in (("Books:", books_df), ("Shop:", shop_df), ("RSS:", rss_df)):
    print(label, frame.shape)


# Output:
# Books: (400, 5)
# Shop: (147, 4)
# RSS: (31, 3)
