<table>
    <tr>
      <td>Minería de Datos y Paradigma Big Data (<b>MIN</b>) - Facultad de Informática - UCM
      </td>
      <td>
      <img src="https://biblioteca.ucm.es/data/cont/media/www/pag-88746//escudo.jpg" alt="Escudo UCM" width="50" />
      </td>
     </tr>
</table>

## Web Scraping con **BeautifulSoup** 
### Pablo C. Cañizares
Este cuaderno reproduce, con *requests* y *BeautifulSoup*, las acciones del ejemplo original con Selenium sobre `https://scrapeme.live/shop/`:

1. Cargar la página de **Shop**.
2. **Extraer** la cuadrícula de productos (nombre y precio).



In [None]:

# !pip install requests beautifulsoup4 lxml

In [None]:
# Pip requirements of this notebook. `requests` is imported by the scraping
# cells, so it is checked/installed here together with BeautifulSoup.
modules = ["requests", "beautifulsoup4"]

import sys
import os.path
from subprocess import check_call
import importlib
import os

def instala(modules):
    """Ensure every pip requirement in ``modules`` is importable, installing missing ones.

    Args:
        modules: Iterable of pip requirement strings, e.g. ``"beautifulsoup4"``,
            ``"pkg[extra]"`` or ``"pkg==1.2"``.

    Returns:
        None. Progress is reported on stdout; a failed installation is reported
        and the loop continues with the next requirement.
    """
    # Local imports: the cell only does `import importlib`, which does not
    # guarantee that the `importlib.util` submodule has been loaded.
    import importlib.util
    from subprocess import CalledProcessError

    # Distributions whose importable package name differs from the pip name.
    import_names = {"beautifulsoup4": "bs4"}

    print("Instalando módulos")
    for m in modules:
        # For the import check, drop extras ("[...]") and any version specifier.
        bare = m
        for sep in ("[", "==", ">=", "<=", "~=", "!=", "<", ">"):
            bare = bare.split(sep, 1)[0]
        bare = bare.strip()
        mi = import_names.get(bare.lower(), bare)

        try:
            found = importlib.util.find_spec(mi) is not None
        except (ImportError, ValueError):
            # Raised for dotted names with a missing parent or for empty names.
            found = False

        if found:
            print(m," encontrado")
            continue

        print(m," No encontrado, instalando...",end="")
        cmd = [sys.executable, "-m", "pip", "install"]
        # pip rejects `--user` inside a virtual environment, so only request a
        # per-user install when the interpreter is not running in one.
        if sys.prefix == getattr(sys, "base_prefix", sys.prefix):
            cmd.append("--user")
        cmd.append(m)
        try:
            check_call(cmd)
            # Let the import system notice the freshly installed package.
            importlib.invalidate_caches()
            print("¡hecho!")
        except (CalledProcessError, OSError):
            print("¡Problema al instalar ",m,"! ¿seguro que el módulo existe?",sep="")

    print("¡Terminado!")

# Check (and install when missing) the requirements listed in `modules`.
instala(modules=modules)

Instalando módulos
¡hecho!
¡Terminado!


In [None]:

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlencode
from dataclasses import dataclass
from typing import List, Tuple, Optional
import re



@dataclass
class Product:
    """One product entry scraped from the shop grid.

    Attributes:
        name: Product title text.
        price_text: Price as rendered on the page (not parsed to a number).
        url: Value of the product link's ``href``.
        product_id: Product identifier, when one is available.
    """

    name: str
    price_text: str
    url: str
    # Used for add-to-cart (WooCommerce); stays None when no id is supplied.
    product_id: Optional[str] = None

def extract_grid_products(soup: BeautifulSoup) -> List[Product]:
    """Collect the products shown in the main grid (``ul.products li.product``).

    Args:
        soup: Parsed document to search.

    Returns:
        One ``Product`` per grid item that has both a title and a product
        link, in document order. The price text is ``""`` when the item has no
        ``span.price``; ``product_id`` is None when the add-to-cart button or
        its ``data-product_id`` attribute is absent.
    """
    products: List[Product] = []
    for item in soup.select("ul.products li.product"):
        title = item.select_one("h2.woocommerce-loop-product__title")
        link = item.select_one("a.woocommerce-LoopProduct-link")
        # Items lacking a title or a link are skipped entirely.
        if not title or not link:
            continue

        price = item.select_one("span.price")
        if price:
            price_text = price.get_text(" ", strip=True)
        else:
            price_text = ""

        # In WooCommerce the add-to-cart button usually carries data-product_id.
        button = item.select_one("a.add_to_cart_button")
        product_id = button.get("data-product_id") if button else None

        products.append(
            Product(
                name=title.get_text(strip=True),
                price_text=price_text,
                url=link.get("href"),
                product_id=product_id,
            )
        )
    return products

def pretty_list(products: List[Product]) -> List[str]:
    """Format each product as ``"<name>: <price_text>"``.

    Args:
        products: Products to format.

    Returns:
        One string per product, in the same order as ``products``.
    """
    # The return annotation uses typing.List (as the rest of the file does)
    # rather than the builtin `list[...]`, so defining this function keeps
    # working even if the name `list` has been rebound in the kernel.
    return [f"{p.name}: {p.price_text}" for p in products]


## Cargar **Shop** y extraer la cuadrícula de productos

In [None]:
import requests 

# Download the shop page, parse it and show the first products of the grid.
try:
    URL = "http://scrapeme.live/shop/"
    # requests has no default timeout; without one the Timeout handler below
    # could never run and the cell could hang indefinitely.
    r = requests.get(URL, timeout=10)
    # Convert 4xx/5xx responses into HTTPError so the handler below can fire.
    r.raise_for_status()
    soup = BeautifulSoup(r.content, 'html.parser')
    print(soup.title)
    products = extract_grid_products(soup)
    print(f"Productos encontrados: {len(products)}")
    # Not named `list`: that would shadow the builtin for the rest of the kernel session.
    product_lines = pretty_list(products)[:12]
    print(product_lines)
except requests.exceptions.Timeout:
    print("Timed out waiting for page to load")
except requests.exceptions.HTTPError as e:
    print(f"HTTP error: {e.response.status_code}")
except requests.exceptions.RequestException as e:
    # Any other request failure (DNS, refused connection, ...).
    print(f"Request failed: {e}")



<title>Products – ScrapeMe</title>
Productos encontrados: 16
['Bulbasaur: £ 63.00', 'Ivysaur: £ 87.00', 'Venusaur: £ 105.00', 'Charmander: £ 48.00', 'Charmeleon: £ 165.00', 'Charizard: £ 156.00', 'Squirtle: £ 130.00', 'Wartortle: £ 123.00', 'Blastoise: £ 76.00', 'Caterpie: £ 73.00', 'Metapod: £ 148.00', 'Butterfree: £ 162.00']
