# Scraping Sentenze Appalti

In [3]:
%pip install requests beautifulsoup4

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: C:\Users\burre\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


### Importiamo le librerie

In [4]:
import os
import re
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup

### Definiamo le principali funzioni per lo scraping delle pagine di SentenzeAppalti

In [None]:
# Base URL of the website. It must keep its trailing slash: paginated URLs
# are built by appending 'page/<n>/' directly to this string.
base_url = 'https://www.sentenzeappalti.it/'

# Function to scrape articles within the specified structure
def get_article_tags_and_links(page_url, timeout=30):
    """Collect the articles on a listing page whose text mentions d.lgs 36/2023.

    The page is expected to nest its articles inside
    ``div#main-content`` > ``div#primary`` > ``div#content``.

    Args:
        page_url: URL of the listing page to fetch.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook.

    Returns:
        A list with one dict per matching article, holding the keys
        ``'url'`` (href of the link inside the article's ``<h1>``),
        ``'tags'`` (the CSS classes of the ``<article>`` element) and
        ``'link_tags'`` (hrefs found in ``span.tag-links`` that contain
        ``"36-2023"``). An empty list is returned, after printing the reason,
        when the page cannot be fetched or lacks the expected containers.
    """
    # Compiled once per call: the pattern is the same for every article
    pattern = re.compile(r'\b\w*d[\-\.]?lgs[\-\.]?\s*36[\-\.]?\/[\-\.]?2023\w*\b')

    try:
        # Send request to the page
        response = requests.get(page_url, timeout=timeout)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')

        # Descend through the nested container divs, reporting which one is missing
        container = soup
        for div_id in ('main-content', 'primary', 'content'):
            container = container.find('div', id=div_id)
            if container is None:
                print(f"Error scraping {page_url}: div with id '{div_id}' not found")
                return []

        # Extract article links and associated tags
        relevant_articles = []
        for article in container.find_all('article'):
            if not pattern.search(article.get_text()):
                continue

            # A single malformed article must not discard the rest of the page
            title = article.find('h1')
            link = title.find('a', href=True) if title is not None else None
            if link is None:
                print(f"Skipping an article without a title link on {page_url}")
                continue

            tag_links_span = article.find('span', class_='tag-links')
            if tag_links_span:
                link_tags = [
                    a['href']
                    for a in tag_links_span.find_all('a', href=True)
                    if "36-2023" in a['href']
                ]
            else:
                link_tags = []

            relevant_articles.append({
                'url': link['href'],                # URL of the article
                'tags': article.get('class', []),   # Article classes as tag proxies
                'link_tags': link_tags              # List of extracted links (or empty)
            })

        return relevant_articles

    except Exception as e:
        # Best effort: a failing page is reported and contributes no articles
        print(f"Error scraping {page_url}: {e}")
        return []

# Download the article from the URL provided
def download_article(url, download_dir=None, timeout=30):
    """Fetch an article page and save the text of its paragraphs to a .txt file.

    The text is taken from every ``<p>`` inside ``div.entry-content`` of the
    page's first ``<article>`` element, one paragraph per line. The file is
    named after the last path segment of ``url``; if that name is already
    taken, ``_1``, ``_2``, ... is appended so nothing is overwritten.

    Args:
        url: Address of the article page.
        download_dir: Directory to write into. Defaults to the
            ``downloaded_articles_sentenzeAppalti`` folder on the user's Desktop.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The path of the file written, or ``None`` when the article could not
        be downloaded or its content was not found. Errors are printed, not
        raised.
    """
    try:
        # Create download directory (on the Desktop unless told otherwise)
        if download_dir is None:
            download_dir = os.path.join(os.path.expanduser("~/Desktop"), 'downloaded_articles_sentenzeAppalti')
        os.makedirs(download_dir, exist_ok=True)

        # Make the request to the URL
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Guard against pages without an <article> element
        article = soup.find('article')
        entry_content = article.find('div', class_='entry-content') if article is not None else None

        if entry_content is None:
            print(f"Div with class 'entry-content' not found for {url}.")
            return None

        # Extract text from all <p> elements
        article_content = '\n'.join(p.get_text().replace('\n', ' ') for p in entry_content.find_all('p'))

        # Last non-empty path segment, with or without a trailing slash and
        # ignoring any query string or fragment
        slug = urlparse(url).path.rstrip('/').rsplit('/', 1)[-1]
        base_file_name = slug or 'article'
        file_name = f"{base_file_name}.txt"
        file_path = os.path.join(download_dir, file_name)

        # Check if the file already exists
        counter = 1
        while os.path.exists(file_path):
            file_name = f"{base_file_name}_{counter}.txt"
            file_path = os.path.join(download_dir, file_name)
            counter += 1

        # Save the article content to a file
        with open(file_path, 'w', encoding='utf-8') as file:
            file.write(article_content)
        print(f"Downloaded and saved: {file_path}")
        return file_path
    except requests.HTTPError as http_err:
        print(f"HTTP error occurred while downloading {url}: {http_err}")
    except requests.RequestException as req_err:
        print(f"Request error occurred while downloading {url}: {req_err}")
    except Exception as e:
        print(f"Error downloading {url}: {e}")
    return None

# Main function to scrape the homepage and subsequent pages for relevant articles
def scrape_articles(first_page=1, last_page=4, fetch_page=None, site_url=None):
    """Scrape a range of listing pages and return the relevant articles found.

    Page 1 is fetched from the site root; page ``n`` (n > 1) from
    ``<site_url>page/<n>/``. The URL is derived from the page number being
    logged, so the two never drift apart: previously the URL was computed
    after the fetch, which requested ``page/1/`` right after the root and
    never requested the last page of the range.

    Args:
        first_page: First page number to scrape (inclusive).
        last_page: Last page number to scrape (inclusive).
        fetch_page: Callable taking a page URL and returning a list of
            article dicts with a ``'url'`` key. Defaults to
            ``get_article_tags_and_links``.
        site_url: Root URL ending with a slash. Defaults to ``base_url``.

    Returns:
        The article dicts from all pages, in page order, keeping only the
        first occurrence of each article URL.
    """
    if fetch_page is None:
        fetch_page = get_article_tags_and_links
    if site_url is None:
        site_url = base_url

    documents = []
    seen_urls = set()

    for page_number in range(first_page, last_page + 1):
        print(f"Scraping page {page_number}...")
        if page_number == 1:
            # Start with the homepage or first page
            current_page = site_url
        else:
            current_page = f'{site_url}page/{page_number}/'

        for article in fetch_page(current_page):
            # The same article may be listed on more than one page; keep it once
            if article['url'] in seen_urls:
                continue
            seen_urls.add(article['url'])
            documents.append(article)

    return documents

### Eseguiamo lo scraping degli articoli di SentenzeAppalti con tag "36-2023"

In [6]:
# Run the scraper, then download every article it collected.
# Each entry in `articles` is a dict; its 'url' key holds the article link.
articles = scrape_articles()
for article in articles:
    download_article(article['url'])

Scraping page 1...
Scraping page 2...
Scraping page 3...
Scraping page 4...
Downloaded and saved: C:\Users\burre/Desktop\downloaded_articles_sentenzeAppalti\incentivi-funzioni-tecniche-calcolo-sull-importo-a-base-delle-procedure-al-netto-delle-eventuali-opzioni.txt
Downloaded and saved: C:\Users\burre/Desktop\downloaded_articles_sentenzeAppalti\rotazione-gestore-uscente-nuovamente-affidatario-in-raggruppamento-temporaneo-con-altri-operator-economici.txt
Downloaded and saved: C:\Users\burre/Desktop\downloaded_articles_sentenzeAppalti\affidamento-diretto-sopra-5-000-euro-ed-obbligo-utilizzo-mepa-equivalente-la-piattaforma-approvvigionamento-digitale-pad-della-stazione-appaltante.txt
Downloaded and saved: C:\Users\burre/Desktop\downloaded_articles_sentenzeAppalti\riesame-delle-offerte-da-parte-della-commissione-giudicatrice-quando-e-consentita.txt
Downloaded and saved: C:\Users\burre/Desktop\downloaded_articles_sentenzeAppalti\attivita-incentivabili-per-i-lavori-pubblici.txt
Downloaded an