In [10]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import json
import time

def setup_selenium():
    """
    Build a headless Chrome WebDriver for scraping.

    The browser is started with the flags listed in ``chrome_flags`` and is
    driven by the chromedriver binary at ``/usr/local/bin/chromedriver``.

    Returns:
        WebDriver: Configured Chrome WebDriver instance. The caller is
        responsible for calling ``quit()`` on it.
    """
    chrome_flags = (
        "--headless",  # Run in headless mode
        "--disable-gpu",
        "--no-sandbox",
        "--disable-dev-shm-usage",
    )

    chrome_options = Options()
    for flag in chrome_flags:
        chrome_options.add_argument(flag)

    chromedriver = Service(executable_path="/usr/local/bin/chromedriver")
    return webdriver.Chrome(service=chromedriver, options=chrome_options)

def fetch_question_details(driver, question_link):
    """
    Load a single question page and pull out its title, body and accepted answer.

    Any exception raised while loading or parsing the page is caught, reported
    on stdout, and turned into a placeholder record whose text fields are all
    the string ``"Error"``.

    Args:
        driver (WebDriver): Selenium WebDriver instance used to load the page.
        question_link (str): URL of the specific question page.
    Returns:
        dict: Keys ``title``, ``question_text``, ``accepted_answer`` and
        ``link``. Fields whose element is not found on the page carry a
        "No ..." placeholder string instead.
    """
    def text_or(node, fallback):
        # Stripped text of a parsed element, or the fallback when it is absent.
        return node.get_text(strip=True) if node else fallback

    try:
        driver.get(question_link)
        title_present = EC.presence_of_element_located((By.CLASS_NAME, "thread-title"))
        WebDriverWait(driver, 10).until(title_present)
        time.sleep(2)  # Additional wait for content to load

        page = BeautifulSoup(driver.page_source, "html.parser")

        heading = page.find("h1")
        body = page.find("div", class_="thread-body")
        answer = page.find("div", class_="accepted-answer")

        details = {}
        details["title"] = text_or(heading, "No Title")
        details["question_text"] = text_or(body, "No Question Text")
        details["accepted_answer"] = text_or(answer, "No Accepted Answer")
        details["link"] = question_link
        return details
    except Exception as e:
        print(f"Error fetching details from {question_link}: {e}")
        failure = dict.fromkeys(("title", "question_text", "accepted_answer"), "Error")
        failure["link"] = question_link
        return failure

def scrape_questions(base_url, output_file="azure_questions.json", error_file="errors.txt"):
    """
    Scrape Azure questions from the given URL and save them to a file.

    Starting at ``base_url``, each listing page is loaded, every question link
    on it is visited with ``fetch_question_details``, and the "next" link is
    followed until there is none, a page yields no question links, or the
    next link points at a page that was already visited.

    Whatever was collected is written out even when scraping fails part-way
    (for example when a listing page times out), so a late failure no longer
    discards earlier pages. Exceptions are reported on stdout and not
    re-raised; the WebDriver is always shut down.

    Args:
        base_url (str): URL to start scraping from.
        output_file (str): Path to save the scraped questions (JSON list).
        error_file (str): Path to save the links whose details could not be
            fetched, one per line. Only written when there is at least one.
    """
    # Imported locally so this function does not rely on the file's import block.
    from urllib.parse import urljoin

    driver = setup_selenium()
    all_questions = []
    error_links = []
    visited_pages = set()
    page_url = base_url
    current_page = 1
    finished = False

    try:
        try:
            while True:
                print(f"Scraping page {current_page}: {page_url}")
                visited_pages.add(page_url)

                driver.get(page_url)
                WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.CLASS_NAME, "thread-title"))
                )
                time.sleep(2)  # Additional wait for content to load

                soup = BeautifulSoup(driver.page_source, "html.parser")

                # Resolve hrefs against the current page so that absolute,
                # root-relative and query-only links all become valid URLs.
                question_links = [
                    urljoin(page_url, anchor["href"])
                    for anchor in soup.find_all("a", class_="thread-title", href=True)
                ]

                if not question_links:
                    print("No more questions found. Stopping.")
                    break

                print(f"Found {len(question_links)} questions on page {current_page}.")

                for idx, link in enumerate(question_links, start=1):
                    print(f"Scraping question {idx} of {len(question_links)} on page {current_page}: {link}")

                    question_details = fetch_question_details(driver, link)
                    # fetch_question_details signals failure with title "Error".
                    if question_details["title"] == "Error":
                        error_links.append(link)
                    else:
                        all_questions.append(question_details)

                # Check for "Next" button to navigate to the next page
                next_button = soup.find("a", class_="next", href=True)
                if not next_button:
                    print("No more pages to scrape.")
                    break

                next_url = urljoin(page_url, next_button["href"])
                if next_url in visited_pages:
                    # A "next" link that loops back would otherwise never terminate.
                    print("Next page was already visited. Stopping.")
                    break

                page_url = next_url
                current_page += 1

            finished = True
        finally:
            # Runs on both success and failure. After a failure, write only if
            # something was collected, so an early crash does not overwrite an
            # existing output file with an empty list.
            if finished or all_questions or error_links:
                # Save all questions to a JSON file
                with open(output_file, "w", encoding="utf-8") as f:
                    json.dump(all_questions, f, ensure_ascii=False, indent=4)

                # Save error links to a file
                if error_links:
                    with open(error_file, "w", encoding="utf-8") as f:
                        f.writelines(f"{error_link}\n" for error_link in error_links)

                print(f"Scraped {len(all_questions)} questions successfully.")
                if error_links:
                    print(f"Encountered {len(error_links)} errors. Links saved to {error_file}.")

    except Exception as e:
        print(f"Error during scraping: {e}")
    finally:
        driver.quit()

# Script entry point: scrape the Azure tag listing, filtered to questions
# that have an accepted answer, using the default output file.
if __name__ == "__main__":
    start_url = "https://learn.microsoft.com/en-us/answers/tags/133/azure?filterby=withacceptedanswer"
    scrape_questions(start_url)


Scraping page 1: https://learn.microsoft.com/en-us/answers/tags/133/azure?filterby=withacceptedanswer
Error during scraping: Message: 
Stacktrace:
0   chromedriver                        0x0000000102dc1568 chromedriver + 6088040
1   chromedriver                        0x0000000102db917a chromedriver + 6054266
2   chromedriver                        0x0000000102858540 chromedriver + 415040
3   chromedriver                        0x00000001028aa0a0 chromedriver + 749728
4   chromedriver                        0x00000001028aa2f1 chromedriver + 750321
5   chromedriver                        0x00000001028fa764 chromedriver + 1079140
6   chromedriver                        0x00000001028d041d chromedriver + 906269
7   chromedriver                        0x00000001028f7a19 chromedriver + 1067545
8   chromedriver                        0x00000001028d01c3 chromedriver + 905667
9   chromedriver                        0x000000010289c05a chromedriver + 692314
10  chromedriver                       