In [8]:
import json
import random
import re
import time

import pandas as pd
import undetected_chromedriver as uc
from bs4 import BeautifulSoup
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# --- [Core Functions] ---

def parse_case_details(driver, url):
    try:
        driver.get(url)
        
        # Wait for the main document content to load
        try:
            WebDriverWait(driver, 25).until(EC.presence_of_element_located((By.ID, "documentContent")))
        except:
            print(f"    ⚠️ Page loading slow: {url}")
            time.sleep(5)

        soup = BeautifulSoup(driver.page_source, 'html.parser')
        
        judge_name = "Unknown"
        heard_date = "Unknown"
        released_date = "Unknown"

        # 1. Parse Judge Name
        judge_target = soup.find(lambda tag: tag.name == "p" and re.search(r'Before\s+Justice', tag.get_text()))
        if judge_target:
            full_text = judge_target.get_text(separator=" ", strip=True)
            clean_text = " ".join(full_text.split()) 
            if "Justice" in clean_text:
                judge_name = clean_text.split("Justice")[-1].strip()

        # 2. Parse Heard Date
        heard_pattern = re.compile(r"(Heard\s+on|Heard:|Date\s+of\s+hearing:)", re.I)
        heard_tag = soup.find(lambda tag: tag.name == "p" and heard_pattern.search(tag.get_text()))
        if heard_tag:
            full_text = heard_tag.get_text(separator=" ", strip=True)
            clean_text = " ".join(full_text.split())
            if "on" in clean_text.lower():
                heard_date = clean_text.split("on")[-1].strip()
            elif ":" in clean_text:
                heard_date = clean_text.split(":")[-1].strip()

        # 3. Parse Released Date (Specific Target)
        date_label_div = soup.find("div", class_="col-3", string=re.compile(r"Date:", re.I))
        if date_label_div:
            date_value_div = date_label_div.find_next_sibling("div", class_="col")
            if date_value_div:
                released_date = date_value_div.get_text(strip=True)

        return {
            "Judge": judge_name,
            "Heard_Date": heard_date,
            "Released_Date": released_date
        }
        
    except Exception as e:
        print(f"    ❌ Error parsing {url}: {e}")
        return None

def main_process(limit=20, output_path="canlii_final_report_20.csv"):
    """Scrape up to ``limit`` "R. v." cases from the 2025 ONCJ listing into a CSV.

    Opens a Chrome session, pauses so a CAPTCHA can be solved manually,
    downloads the JSON case list, keeps the entries whose ``styleOfCause``
    starts with "R. v.", and calls ``parse_case_details`` on each until
    ``limit`` cases have been parsed successfully. Cases that fail to parse
    do not count towards the limit.

    Args:
        limit: Maximum number of successfully parsed cases to collect.
        output_path: Destination CSV file for the collected rows.

    Returns:
        None. Results are written to ``output_path``; any exception raised
        after the browser has started is printed rather than propagated, and
        the browser is always shut down.
    """
    options = uc.ChromeOptions()
    # Adding stability arguments to prevent the 'target window closed' error
    for flag in (
        '--no-sandbox',
        '--disable-dev-shm-usage',
        '--disable-popup-blocking',
        '--start-maximized',
    ):
        options.add_argument(flag)

    # Initialize driver with subprocess mode for better stability on MacOS/Windows
    driver = uc.Chrome(options=options, version_main=144, use_subprocess=True)

    try:
        print("1. Initializing session. Solve CAPTCHA manually if prompted.")
        driver.get("https://www.canlii.org/en/on/oncj/nav/date/2025/")
        time.sleep(15)  # Extra time for manual verification

        print("2. Fetching API Case List...")
        api_url = "https://www.canlii.org/on/oncj/nav/date/2025/items"
        driver.get(api_url)
        time.sleep(7)

        # The browser renders the JSON response as page text; read it back
        # from <body> and decode it.
        raw_json = driver.find_element(By.TAG_NAME, "body").text
        data = json.loads(raw_json)
        cases_df = pd.DataFrame(data)

        criminal_cases = cases_df[cases_df['styleOfCause'].str.startswith("R. v.", na=False)]

        final_results = []

        for _, row in criminal_cases.iterrows():
            # Guards the first iteration when limit <= 0.
            if len(final_results) >= limit:
                break

            case_url = "https://www.canlii.org" + row['url']
            print(f"[{len(final_results) + 1}/{limit}] Scraping: {row['styleOfCause']}")

            details = parse_case_details(driver, case_url)
            if details:
                details['Case_Title'] = row['styleOfCause']
                details['URL'] = case_url
                final_results.append(details)

            # Stop before the delay once the quota is met, so the run does
            # not end with a pointless 8-14 s wait.
            if len(final_results) >= limit:
                break

            # Anti-bot delay
            time.sleep(random.uniform(8, 14))

        if final_results:
            result_df = pd.DataFrame(final_results)
            result_df.to_csv(output_path, index=False, encoding='utf-8-sig')
            print(f"\n✅ Success! Data saved to '{output_path}'.")
        else:
            print("\n⚠️ No cases were scraped; nothing was saved.")

    except Exception as e:
        # Top-level boundary: report and fall through to cleanup.
        print(f"❌ Critical System Error: {e}")
    finally:
        # Quit ensures all ghost processes are killed
        driver.quit()

# Script entry point: run the scraper for 20 cases when executed directly.
if __name__ == "__main__":
    main_process(limit=20)

1. Initializing session. Solve CAPTCHA manually if prompted.
2. Fetching API Case List...
[1/20] Scraping: R. v. M.T.
    ⚠️ Page loading slow: https://www.canlii.org/en/on/oncj/doc/2025/2025oncj671/2025oncj671.html
[2/20] Scraping: R. v. J.G.
    ⚠️ Page loading slow: https://www.canlii.org/en/on/oncj/doc/2025/2025oncj700/2025oncj700.html
[3/20] Scraping: R. v. Laguerre
    ⚠️ Page loading slow: https://www.canlii.org/en/on/oncj/doc/2025/2025oncj694/2025oncj694.html
[4/20] Scraping: R. v. Khosa
    ⚠️ Page loading slow: https://www.canlii.org/en/on/oncj/doc/2025/2025oncj693/2025oncj693.html
[5/20] Scraping: R. v. Lachance
    ⚠️ Page loading slow: https://www.canlii.org/en/on/oncj/doc/2025/2025oncj690/2025oncj690.html
[6/20] Scraping: R. v. Eagen
    ⚠️ Page loading slow: https://www.canlii.org/en/on/oncj/doc/2025/2025oncj687/2025oncj687.html
[7/20] Scraping: R. v. Burnett
    ⚠️ Page loading slow: https://www.canlii.org/en/on/oncj/doc/2025/2025oncj686/2025oncj686.html
[8/20] Scraping