In [123]:
# Importar librerias
from   bs4 import BeautifulSoup
import requests

from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.firefox.service import Service as FirefoxService
from selenium.webdriver.edge.service import Service as EdgeService
from selenium.webdriver.chrome.options import Options as ChromeOptions
from selenium.webdriver.firefox.options import Options as FirefoxOptions
from selenium.webdriver.edge.options import Options as EdgeOptions
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import os

import re

# Variables globables
HEADER = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36'
DEFAULT_UTILS_VERBOSE = True

In [124]:
def get_http_response(url, headers=None, response_type='page', verbose=DEFAULT_UTILS_VERBOSE, debug=False, timeout=10, retry_attempts=3):
    """
    Obtiene la respuesta HTML de una URL.
    Puede aceptar headers personalizados; si no se proporcionan, utiliza unos por defecto.
    La función retorna el resultado HTML como un objeto BeautifulSoup o texto plano.

    Args:
        url (str): La URL de la página web.
        headers (dict, optional): Headers para la solicitud HTTP.
        response_type (str, optional): Tipo de respuesta ('page' para BeautifulSoup, 'text' para texto plano).
        verbose (bool, optional): Si es True, imprime información detallada.
        debug (bool, optional): Si es True, imprime la respuesta HTTP completa.
        timeout (int, optional): Tiempo máximo de espera para la solicitud HTTP en segundos.
        retry_attempts (int, optional): Número de intentos de reintentos en caso de fallo.

    Returns:
        BeautifulSoup object or str: Dependiendo de response_type, retorna un objeto BeautifulSoup o texto plano.

    Raises:
        ValueError: Si response_type no es 'page' o 'text'.
        RuntimeError: Si ocurre un error durante la solicitud HTTP.
    """
    # Validación de parámetros
    if not isinstance(url, str):
        raise ValueError("La URL debe ser una cadena de caracteres.")
    if headers is not None and not isinstance(headers, dict):
        raise ValueError("Headers debe ser un diccionario.")
    if response_type not in ['page', 'text']:
        raise ValueError("response_type debe ser 'page' o 'text'.")
    if not isinstance(verbose, bool):
        raise ValueError("verbose debe ser un valor booleano.")
    if not isinstance(debug, bool):
        raise ValueError("debug debe ser un valor booleano.")
    if not isinstance(timeout, (int, float)):
        raise ValueError("timeout debe ser un número.")
    if not isinstance(retry_attempts, int) or retry_attempts < 0:
        raise ValueError("El número de intentos de reintentos debe ser un entero no negativo.")

    # Definimos los headers por defecto
    if headers is None:
        headers = {
            'user-agent': HEADER
        }

    attempts = 0
    while attempts <= retry_attempts:
        try:
            # Realizamos una solicitud a la página web con timeout
            response = requests.get(url, headers=headers, timeout=timeout)

            # Debug: Imprimimos la respuesta completa si debug es True
            if debug:
                print(f'HTTP response: {response}')

            # Analizamos el contenido HTML de la página web utilizando BeautifulSoup
            page = BeautifulSoup(response.content, 'html.parser')

            # Verbose: Imprimimos información detallada si verbose es True
            if verbose:
                msg  = f'URL [{url}], '
                msg += f'HTTP status [{response.ok}], '
                msg += f'HTTP code [{response.status_code}]'
                print(msg)

            # Validación de la respuesta
            if response.ok:
                if response_type == 'text':
                    return response.text
                else:
                    return page
            else:
                if verbose:
                    msg  = f'URL [{url}], '
                    msg += f'HTTP status [{response.ok}], '
                    msg += f'HTTP code [{response.status_code}], '
                    msg += f'Message [ERROR! Ocurrió un error inesperado al cargar la URL seleccionada]'
                    print(msg)
                return None

        except requests.RequestException as e:
            if verbose:
                logger.error(f'ERROR! Ocurrió un error al realizar la solicitud HTTP para la URL [{url}]. Error: [{e}]')

            # Incrementamos el contador de intentos y esperamos antes de reintentar
            attempts += 1
            if attempts <= retry_attempts:
                time.sleep(1)  # Esperamos 1 segundo antes de realizar el siguiente intento

    # Si llegamos aquí, significa que todos los intentos de reintentos fallaron
    print(f"No se pudo obtener la respuesta HTTP para la URL [{url}] después de {retry_attempts} intentos.")
    return None

def calculate_average(price_string):
    """
    Calcula el promedio de dos precios en una cadena o devuelve el precio único si solo hay uno.
    
    Parámetros:
    price_string (str): Cadena que contiene uno o dos precios con la moneda.

    Retorna:
    str: El promedio de los precios o el precio único, formateado con la moneda.
    """
    # Extraer los números de la cadena
    numbers = re.findall(r'\d+\.\d+', price_string)
    
    # Extraer la moneda de la cadena
    currency = re.findall(r'[A-Z]+', price_string)[0]
    
    # Calcular el promedio o devolver el único precio
    if len(numbers) == 1:
        # Si solo hay un número, usar ese número como promedio
        average = float(numbers[0])
    else:
        # Si hay dos números, calcular el promedio
        number1 = float(numbers[0])
        number2 = float(numbers[1])
        average = round( (number1 + number2) / 2, 3 )
    
    return currency, average

In [28]:
# Defino la URL
topic = 'juguetes'
#url = 'https://www.amazon.com/s?k={}/'.format( topic.replace(' ','+') )
url = 'https://www.ebay.com/sch/i.html?_from=R40&_nkw=juguetes'
url

'https://www.ebay.com/sch/i.html?_from=R40&_nkw=juguetes'

In [31]:
# Pido el contenido HTML
response = get_http_response(url)
#response

URL [https://www.ebay.com/sch/i.html?_from=R40&_nkw=juguetes], HTTP status [True], HTTP code [200]


In [128]:
products = response.find_all('li', class_='s-item')

In [140]:
product = products[2]
product_info = product.find('div', class_='s-item__info')

product_id = int( product.get('id').replace('item',''), 16)
product_name = product_info.find('div', class_='s-item__title').text
description = ''
installments = 1
price_str = product_info.find('span', class_='s-item__price').text
price, currency =  calculate_average(price_str)
ranking = 0
rating = 0
rating_count = 0
platform = 'ebay'
store = product_info.find('span', class_='s-item__itemLocation').text.replace('de ','')
is_best_seller = bool( product_info.find('div', class_='s-item__details-section--secondary').text )
is_promoted = 'kexu191' in product_info.find('span', class_='s-item__sep').find('span').get('style')
url = f'https://www.ebay.com/itm/{product_id}'

print('product_id:', product_id)
print('product_name:', product_name)
print('description:', description)
print('price:', product_price)
print('currency:', currency)
print('installments:', installments)
print('rating:', rating)
print('rating_count:', rating_count)
print('platform:', platform)
print('store:', store)
print('is_best_seller:', is_best_seller)
print('is_promoted:', is_promoted)
print('url:', url)

product_id: 387078449262
product_name: 2024 POP MART Dimoo By Your Side Serie Caja Ciega Confirmada Figura Juguetes Lo último 
description: 
product_price: USD
currency: 82.83
installments: 1
rating: 0
rating_count: 0
platform: ebay
store: China
is_best_seller: False
is_promoted: True
url: https://www.ebay.com/itm/387078449262


In [139]:
product = products[5]
product_info = product.find('div', class_='s-item__info')

product_id = int( product.get('id').replace('item',''), 16)
product_name = product_info.find('div', class_='s-item__title').text
description = ''
installments = 1
price_str = product_info.find('span', class_='s-item__price').text
price, currency =  calculate_average(price_str)
ranking = 0
rating = 0
rating_count = 0
platform = 'ebay'
store = product_info.find('span', class_='s-item__itemLocation').text.replace('de ','')
is_best_seller = bool( product_info.find('div', class_='s-item__details-section--secondary').text )
is_promoted = 'kexu191' in product_info.find('span', class_='s-item__sep').find('span').get('style')
url = f'https://www.ebay.com/itm/{product_id}'

print('product_id:', product_id)
print('product_name:', product_name)
print('description:', description)
print('price:', product_price)
print('currency:', currency)
print('installments:', installments)
print('rating:', rating)
print('rating_count:', rating_count)
print('platform:', platform)
print('store:', store)
print('is_best_seller:', is_best_seller)
print('is_promoted:', is_promoted)
print('url:', url)

product_id: 403391508121
product_name: Anillo Roblox Toy Code Celebrity Series 2 Callmehbob Crown, Royale High Callmehbob
description: 
product_price: USD
currency: 12.99
installments: 1
rating: 0
rating_count: 0
platform: ebay
store: Estados Unidos
is_best_seller: True
is_promoted: True
url: https://www.ebay.com/itm/403391508121


In [None]:
## Uso un driver
driver_path = r'C:\Users\santi\OneDrive\Desktop\latinframe_soft\drivers'
chrome_options = ChromeOptions()
chrome_options.add_argument("--disable-usb-device-detection")
chrome_options.add_experimental_option('excludeSwitches', ['enable-logging'])
driver = webdriver.Chrome(service=ChromeService(os.path.join(driver_path, 'chromedriver.exe')), options=chrome_options)
driver.get(url)