First attempt using Selenium

In [None]:
# Install Selenium and webdriver
# Step 1: install the Selenium Python package with pip.
!pip install selenium
# Step 2: refresh the apt package index before installing system packages.
!apt-get update # to update ubuntu to correctly run apt install
# Step 3: install the chromium-chromedriver system package.
!apt install chromium-chromedriver
# Step 4: copy the chromedriver binary into /usr/bin
# (presumably so Selenium can find it on PATH — the source path depends on the image; verify).
!cp /usr/lib/chromium-browser/chromedriver /usr/bin

In [None]:
import time

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Command-line switches handed to Chrome, applied in this order
CHROME_ARGUMENTS = (
    '--headless',               # No visible browser window
    '--no-sandbox',             # Bypass OS security model
    '--disable-dev-shm-usage',  # Overcome limited resource problems
    '--disable-gpu',            # Applicable for windows os and headless mode
)

# Search form of the ArianeWeb application
SEARCH_PAGE_URL = "https://www.conseil-etat.fr/arianeweb/#/recherche"

# Build the browser options, then start the WebDriver session
options = webdriver.ChromeOptions()
for chrome_argument in CHROME_ARGUMENTS:
    options.add_argument(chrome_argument)

driver = webdriver.Chrome(options=options)

# Load the search page in the freshly started browser
driver.get(SEARCH_PAGE_URL)


In [None]:
# Locators of the two <input> toggles inside the search form
first_toggle_locator = (By.XPATH, '//*[@id="content"]/div[1]/form/div[2]/div/div[1]/div[1]/label/input')
second_toggle_locator = (By.XPATH, '//*[@id="content"]/div[1]/form/div[2]/div/div[2]/div[1]/label/input')

# A single explicit wait (10 second timeout) is shared by both lookups
wait = WebDriverWait(driver, 10)

# Block until the first input is clickable, then click it
first_toggle = wait.until(EC.element_to_be_clickable(first_toggle_locator))
first_toggle.click()

# Same for the second input; it is only looked up after the first click
second_toggle = wait.until(EC.element_to_be_clickable(second_toggle_locator))
second_toggle.click()


In [None]:
# Locate the form's textarea once it becomes clickable (10 second timeout)
text_area_locator = (By.XPATH, '//*[@id="content"]/div[1]/form/div[1]/div/div/textarea')
text_area = WebDriverWait(driver, 10).until(
    EC.element_to_be_clickable(text_area_locator)
)

# Empty the field first so the query is not appended to existing text, then type it
text_area.clear()
text_area.send_keys("test")

In [None]:
# Submit the form: wait (up to 10 seconds) for the button to be clickable, then click it
search_button_locator = (By.XPATH, '//*[@id="content"]/div[1]/form/div[4]/div/button')
search_button = WebDriverWait(driver, 10).until(
    EC.element_to_be_clickable(search_button_locator)
)
search_button.click()

In [None]:
# Initialize an empty list to hold all scraped data.
# The scraping loop below appends one entry per result row; each entry is the
# list of cell texts returned by extract_data_from_row for that row.
all_data = []

# Function to extract and return data from a row
def extract_data_from_row(row):
    """Return the whitespace-stripped text of each ``td`` cell of a table row.

    Args:
        row: An object exposing ``find_all`` (the caller passes the elements
            returned by ``soup.select``).

    Returns:
        A list of strings, one per element yielded by ``row.find_all('td')``,
        in the order ``find_all`` returns them. Empty if the row has no
        ``td`` cells.
    """
    return [td.get_text(strip=True) for td in row.find_all('td')]

# Main scraping loop
#
# Scrapes every result row of the current page into all_data, then follows the
# 'Next' control until there is none or the page stops changing. The driver is
# always quit at the end, so the browser cells must be re-run before this one
# can be executed again.
ROW_SELECTOR = "tr[ng-repeat-start]"  # CSS selector matching the rows of interest
PAGE_TIMEOUT_SECONDS = 10             # Upper bound for each explicit wait


def _scrape_visible_rows(page_html):
    """Parse page_html and return the extracted cell texts of every row matching ROW_SELECTOR."""
    page_soup = BeautifulSoup(page_html, 'html.parser')
    return [extract_data_from_row(row) for row in page_soup.select(ROW_SELECTOR)]


def _rows_differ_from(previous_rows):
    """Build a wait condition that holds once the rendered rows are non-empty and differ from previous_rows."""
    def _condition(active_driver):
        current_rows = _scrape_visible_rows(active_driver.page_source)
        return bool(current_rows) and current_rows != previous_rows
    return _condition


try:
    while True:
        # Wait for the dynamic content to load
        WebDriverWait(driver, PAGE_TIMEOUT_SECONDS).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, ROW_SELECTOR))
        )

        # Parse a snapshot of the page with BeautifulSoup and keep its rows
        page_rows = _scrape_visible_rows(driver.page_source)
        all_data.extend(page_rows)

        # Attempt to find and click the 'Next' button, if it exists.
        # NOTE(review): the XPath only matches a button whose text contains
        # 'Next' or whose aria-label is 'Next' — confirm this is how the site
        # labels its pagination control.
        next_button = driver.find_elements(By.XPATH, "//button[contains(text(), 'Next') or @aria-label='Next']")
        if not next_button:
            break  # Exit loop if there's no 'Next' button
        next_button[0].click()

        # The rows of the page just scraped are still in the DOM right after
        # the click, so a plain presence check would pass immediately and the
        # same page could be scraped twice. Wait until the rendered rows
        # actually change; if they never do (e.g. the control is inert on the
        # last page), stop instead of looping forever on duplicates.
        try:
            WebDriverWait(driver, PAGE_TIMEOUT_SECONDS).until(_rows_differ_from(page_rows))
        except TimeoutException:
            break

finally:
    driver.quit()  # Ensure the driver is quit at the end

# Labels assigned to the cells of each scraped row, left to right
column_names = ['Rank', 'DecisionType', 'Section', 'Role', 'Date', 'CaseNumber', 'Category']  # Adjust columns as needed

# Turn the accumulated rows into a pandas DataFrame and preview its first rows
df = pd.DataFrame(data=all_data, columns=column_names)
print(df.head())

# Here you may save the dataframe to a file or perform other operations as required.


  Rank DecisionType         Section              Role        Date  \
0    1       Arrêts       CAA DOUAI           Chambre  05/03/2024   
1    2    Décisions  Conseil dÉtat          3-8  CHR  26/02/2024   
2    3    Décisions  Conseil dÉtat          3-8  CHR  15/02/2024   
3    4    Décisions  Conseil dÉtat  Juge des référés  14/02/2024   
4    5    Décisions  Conseil dÉtat         10-9  CHR  09/02/2024   

    CaseNumber Category  
0    22DA01934       C+  
1       469858        B  
2       454475        B  
3       491005        C  
4  472346(...)        C  


In [None]:
# Display the scraped DataFrame as the cell's output (last expression of the cell).
df

Unnamed: 0,Rank,DecisionType,Section,Role,Date,CaseNumber,Category
0,1,Arrêts,CAA DOUAI,Chambre,05/03/2024,22DA01934,C+
1,2,Décisions,Conseil dÉtat,3-8 CHR,26/02/2024,469858,B
2,3,Décisions,Conseil dÉtat,3-8 CHR,15/02/2024,454475,B
3,4,Décisions,Conseil dÉtat,Juge des référés,14/02/2024,491005,C
4,5,Décisions,Conseil dÉtat,10-9 CHR,09/02/2024,472346(...),C
5,6,Décisions,Conseil dÉtat,Juge des référés,10/01/2024,490477,C
6,7,Décisions,Conseil dÉtat,Juge des référés,21/12/2023,489990,C
7,8,Décisions,Conseil dÉtat,5-6 CHR,21/12/2023,473466,C
8,9,Décisions,Conseil dÉtat,5ème CHS,21/12/2023,470132,C
9,10,Décisions,Conseil dÉtat,6-5 CHR,18/12/2023,451947,C


In [None]:
# Number of rows in the scraped DataFrame.
len(df)

50

Second attempt using requests against the site's JSON search endpoint

In [None]:
import requests

In [None]:
# Search endpoint queried directly, without a browser
url = "https://www.conseil-etat.fr/xsearch?"

# Seconds to wait for the server before giving up; without a timeout,
# requests can block indefinitely and hang the cell.
REQUEST_TIMEOUT_SECONDS = 30

# Query-string parameters.
# NOTE(review): "SkipCount"/"SkipFrom" look like page size and offset — confirm
# against the endpoint; only the documents of this single response are counted.
params = {
    "advanced": "1",
    "type": "json",
    "SourceStr4": "AW_DCA",
    "synonyms": "true",
    "scmode": "smart",
    "SkipCount": "50",
    "SkipFrom": "0",
    "sort": "SourceDateTime1.desc,SourceStr5.desc",
    "add.text": "1729 du code général des impôts déchargé"
}

# Send the POST request (parameters travel in the query string).
# Raises a requests timeout exception if the server does not answer in time.
response = requests.post(url, params=params, timeout=REQUEST_TIMEOUT_SECONDS)

# Counter of documents whose "SourceStr12" field equals "Rejet"
count_rejet = 0

# Only parse the body when the request succeeded
if response.status_code == 200:
  # Decode the response body as JSON
  data = response.json()

  # The counter stays at 0 when the "Documents" field is absent
  if "Documents" in data:
    # Count the documents whose "SourceStr12" is present and equal to "Rejet"
    count_rejet = sum(
      1 for document in data["Documents"]
      if document.get("SourceStr12") == "Rejet"
    )

  # Report the number of "Rejet" documents
  print(f"Nombre de 'Rejet' : {count_rejet}")
else:
  print(f"Erreur lors de la requête : {response.status_code}")

Nombre de 'Rejet' : 24
