In [None]:
import json
import time
import os
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup

# Script version, reported once at start-up so each run's console output
# records which revision of the scraper produced it.
VERSION = "1.0.7"
print(f"Ejecutando script de scraping - Versión {VERSION}")

# Selenium configuration
def setup_selenium(chromedriver_path="/usr/local/bin/chromedriver"):
    """Create a headless Chrome WebDriver.

    Args:
        chromedriver_path: Filesystem path of the chromedriver executable.
            Defaults to the location this script has always used, so
            existing callers keep working without arguments.

    Returns:
        The ``webdriver.Chrome`` instance. The caller owns it and must call
        ``quit()`` when done.
    """
    options = Options()
    # Command-line switches handed to Chrome: new headless mode, no GPU,
    # no sandbox, no /dev/shm usage, and a fixed desktop-sized viewport.
    for flag in (
        "--headless=new",
        "--disable-gpu",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--window-size=1920,1080",
    ):
        options.add_argument(flag)
    service = Service(chromedriver_path)
    return webdriver.Chrome(service=service, options=options)

# Load the links from a file
def load_links(file_path):
    """Return the non-blank lines of ``file_path``, stripped of whitespace.

    A missing file is not an error: an empty list is returned instead.
    """
    collected = []
    if os.path.exists(file_path):
        with open(file_path, "r", encoding="utf-8") as handle:
            for raw_line in handle:
                cleaned = raw_line.strip()
                if cleaned:
                    collected.append(cleaned)
    return collected

# Save one link to a file
def append_link(link, file_path):
    """Append ``link`` as a new line at the end of ``file_path``.

    The file is created if it does not exist yet.
    """
    with open(file_path, mode="a", encoding="utf-8") as sink:
        sink.writelines([link + "\n"])

# Save data as JSON, one record per line (append mode)
def append_to_json(data, output_file):
    """Append ``data`` to ``output_file`` as a single JSON line.

    Non-ASCII characters are written as-is (``ensure_ascii=False``) and the
    file is opened in UTF-8.

    Args:
        data: Any JSON-serialisable object.
        output_file: Path of the JSON-lines file; created if missing.

    Raises:
        TypeError: If ``data`` is not JSON-serialisable. Nothing is written
            to the file in that case.
    """
    # Serialise fully before touching the file: json.dump() streams chunks,
    # so a failure half-way through would leave a truncated, unparseable
    # record in the output.
    record = json.dumps(data, ensure_ascii=False)
    with open(output_file, "a", encoding="utf-8") as f:
        f.write(record + "\n")

# Extract the data of a single question
def _node_text(node):
    """Return the newline-separated, stripped text of ``node`` ("" if None)."""
    if node is None:
        return ""
    return node.get_text(strip=True, separator="\n")


def extract_question_data(driver, url):
    """Load ``url`` in ``driver`` and scrape the question fields from it.

    Args:
        driver: Selenium WebDriver used to fetch and render the page.
        url: Address of the question page.

    Returns:
        A dict with the keys ``title``, ``url``, ``question_content``,
        ``accepted_answer``, ``tags`` and ``date``. A field whose element is
        not found in the page is ``""`` (``[]`` for ``tags``).

    Raises:
        Whatever ``driver.get`` raises; only the wait for ``<main>`` is
        treated as best effort.
    """
    driver.get(url)
    try:
        WebDriverWait(driver, 15).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "main"))
        )
    except Exception:
        # Best effort: parse whatever has loaded so far. Catching Exception
        # rather than using a bare except lets Ctrl-C / SystemExit stop the
        # run instead of being reported as a timeout.
        print(f"Timeout esperando a que cargue la página: {url}")

    soup = BeautifulSoup(driver.page_source, "html.parser")

    heading = soup.find("h1")
    title = heading.get_text(strip=True) if heading is not None else ""

    # The first "div.content" inside <main> is taken as the question body.
    question_content = _node_text(soup.select_one("main div.content"))

    # NOTE(review): the selector targets itemprop='suggestedAnswer' although
    # the field is called "accepted_answer" — confirm against the page markup.
    accepted_answer = _node_text(
        soup.select_one("li[itemprop='suggestedAnswer'] div.content")
    )

    tags = [tag.get_text(strip=True) for tag in soup.select(".tags .tag")]

    # First <local-time> element in the document; its "datetime" attribute
    # is stored verbatim.
    date_element = soup.select_one("local-time")
    date_posted = date_element.get("datetime") if date_element is not None else ""

    return {
        "title": title,
        "url": url,
        "question_content": question_content,
        "accepted_answer": accepted_answer,
        "tags": tags,
        "date": date_posted
    }

# Save the updated list of links
def save_links(links, file_path):
    """Overwrite ``file_path`` with ``links``, one link per line."""
    with open(file_path, mode="w", encoding="utf-8") as sink:
        sink.writelines(entry + "\n" for entry in links)

# Save the run summary
def save_summary(start_time, total_links, processed_count, error_count):
    """Print the run summary and write it to ``summary.log``.

    The elapsed time is measured from ``start_time`` (a ``time.time()``
    timestamp) to the moment of the call and reported as whole hours,
    minutes and seconds. ``summary.log`` is overwritten in the current
    working directory.
    """
    elapsed = time.time() - start_time
    hrs, leftover = divmod(elapsed, 3600)
    mins, secs = divmod(leftover, 60)

    report_lines = [
        "Resumen de ejecución:",
        f"Total de links: {total_links}",
        f"Procesados exitosamente: {processed_count}",
        f"Errores: {error_count}",
        f"Tiempo total: {int(hrs)}h {int(mins)}m {int(secs)}s",
    ]
    # Every line, including the last one, is newline-terminated.
    summary = "".join(line + "\n" for line in report_lines)

    print(summary)
    with open("summary.log", "w", encoding="utf-8") as log_file:
        log_file.write(summary)

# Main: scrape every pending link, recording progress after each one so an
# interrupted run can be resumed by simply starting the script again.
if __name__ == "__main__":
    input_file = "question_links.txt"
    processed_file = "processed_links.txt"
    errors_file = "error_links.txt"
    output_file = "questions_data.json"

    all_links = load_links(input_file)
    processed_links = set(load_links(processed_file))
    error_links = set(load_links(errors_file))

    # Skip anything a previous run already handled, whether it succeeded
    # or failed.
    links = [link for link in all_links if link not in processed_links and link not in error_links]

    driver = setup_selenium()

    total_links = len(links)
    processed_count = 0
    error_count = 0

    print(f"Total de links pendientes: {total_links}")

    start_time = time.time()

    try:
        for index, link in enumerate(links, start=1):
            # The elapsed time covers the links finished so far (index - 1),
            # and the current link still counts as remaining work.
            completed = index - 1
            elapsed_time = time.time() - start_time
            avg_time_per_link = elapsed_time / completed if completed else 0
            remaining_links = total_links - completed
            estimated_time_remaining = avg_time_per_link * remaining_links / 3600  # hours

            print(f"Cargando {index} de {total_links} ({estimated_time_remaining:.2f} Hrs estimadas restantes): {link}")
            try:
                question_data = extract_question_data(driver, link)
                append_to_json(question_data, output_file)
                append_link(link, processed_file)
                processed_count += 1
            except Exception as e:
                # One failing page must not abort the whole run; record it
                # so it is not retried automatically.
                print(f"Error en {link}: {e}")
                append_link(link, errors_file)
                error_count += 1
    finally:
        # Always release the browser and write the summary, including when
        # the run is interrupted (e.g. Ctrl-C).
        try:
            driver.quit()
        finally:
            save_summary(start_time, total_links, processed_count, error_count)


Ejecutando script de scraping - Versión 1.0.3
Total de links: 21766
Cargando 1 de 21766: https://learn.microsoft.com/en-us/answers/questions/1013832/no-finding-sklearn-pandas-import-dataframemapper
[DEBUG] Contenido de la pregunta:
 Hi,
I'm trying to execute  Running experiments in Azure Machine Learning exercise which is part of exercise files found at
https://aka.ms/mslearn-dp090
.
I keep getting an error {ModuleNotFoundError: No module named 'sklearn_pandas'} so it doesn't recognize sklearn_pandas. Any idea about how to resolve this?
Cargando 2 de 21766: https://learn.microsoft.com/en-us/answers/questions/1001297/adf-data-preview-error-at-source-this-endpoint-doe
[DEBUG] Contenido de la pregunta:
 What could be the real reason for this error - This endpoint does not support BlobStorageEvents or SoftDelete. Please disable these account features if you would like to use this endpoint.", 409, HEAD
Error:
Spark job failed: { "text/plain": "{\"runId\":\"5b80d2d4-f942-47b1-abc5-f68d8330dc

KeyboardInterrupt: 