In [None]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
from io import StringIO
import time

# --- Listing scrape configuration -------------------------------------------
# The website lists the companies on 1,045 pages. With a 20-second pause
# between requests a full run takes several hours, so it is practical to work
# in batches by narrowing FIRST_PAGE / LAST_PAGE (e.g. 1-100, then 101-200,
# ... up to 1045). Both bounds are 1-based page numbers and inclusive.
LISTING_BASE_URL = 'https://www.fatturatoitalia.it/regione/friuli-venezia-giulia'
TOTAL_PAGES = 1045
FIRST_PAGE = 1
LAST_PAGE = TOTAL_PAGES
LISTING_DELAY_SECONDS = 20    # pause after every request
LISTING_TIMEOUT_SECONDS = 30  # without a timeout a stalled connection blocks forever


def listing_page_url(page):
    """Return the URL of listing page `page` (1-based).

    The first page is served at the bare region URL; every later page appends
    its page number as the last path segment.
    """
    if page == 1:
        return LISTING_BASE_URL
    return f'{LISTING_BASE_URL}/{page}'


def parse_listing_table(html):
    """Parse one listing page into a DataFrame with an extra 'Link' column.

    Parameters
    ----------
    html : str
        HTML source of a listing page.

    Returns
    -------
    pandas.DataFrame
        The first <table> of the page (first row used as header) plus a
        'Link' column holding the href of the link found in the first cell
        of each row.

    Raises
    ------
    ValueError
        If the page has no <table>, pandas cannot read it, or the number of
        links found does not match the number of parsed rows.
    """
    html_table = BeautifulSoup(html, 'html.parser').find('table')
    if html_table is None:
        raise ValueError("no <table> element found")

    # Collect the href of the link in the first <td> of every row; rows
    # without a <td> or without a link in it contribute nothing.
    first_column_links = []
    for row in html_table.find_all('tr'):
        cell = row.find('td')
        if cell:
            link = cell.find('a')
            if link:
                first_column_links.append(link.get('href'))

    # pandas expects a file-like object rather than a raw HTML string.
    df = pd.read_html(StringIO(str(html_table)), header=0)[0]

    # Fail with a clear message instead of pandas' generic length error.
    if len(first_column_links) != len(df):
        raise ValueError(
            f"found {len(first_column_links)} links for {len(df)} table rows"
        )

    df['Link'] = first_column_links
    return df


# This list will store DataFrames containing links (one per listing page)
dataframe_list = []

for page in range(FIRST_PAGE, LAST_PAGE + 1):
    url = listing_page_url(page)
    print(url)
    try:
        # The context manager closes the response even when the status check raises.
        with requests.get(url, timeout=LISTING_TIMEOUT_SECONDS) as response:
            response.raise_for_status()  # Raise an exception if the request was unsuccessful
            html = response.text

        dataframe_list.append(parse_listing_table(html))
    except requests.exceptions.RequestException as e:
        print(f"Error during request to {url}: {e}")
    except ValueError as e:
        # A single malformed page must not abort the whole run.
        print(f"Could not parse listing page {url}: {e}")

    time.sleep(LISTING_DELAY_SECONDS)  # Wait before the next request

# Concatenate all DataFrames into a single DataFrame
if dataframe_list:
    concatenated_df = pd.concat(dataframe_list, ignore_index=True)
else:
    # pd.concat raises on an empty list; keep an empty frame with the 'Link'
    # column so code that reads concatenated_df.Link still works.
    concatenated_df = pd.DataFrame(columns=['Link'])
    print("No listing pages could be scraped.")



In [None]:
# Display the combined listing table built by the scraping loop.
concatenated_df

In [None]:
# --- Company detail scrape configuration -------------------------------------
COMPANY_DELAY_SECONDS = 20    # pause after every request
COMPANY_TIMEOUT_SECONDS = 30  # without a timeout a stalled connection blocks forever

# IMPORTANT: if working in batches, change the filename for each batch to
# avoid overwriting the previous one.
OUTPUT_CSV = 'companies_batch_0_100.csv'

# Zero-based positions of each field among the page's <div class="col-xs-7">
# elements (so 5 is the sixth such element). To collect additional
# information, add another column name and position here.
COMPANY_FIELD_POSITIONS = {
    'Address': 1,
    'City': 2,
    'Province': 3,
    'VAT Number': 5,
}
COMPANY_COLUMNS = ['Company Name', *COMPANY_FIELD_POSITIONS]


def extract_company_info(html):
    """Extract the company fields from one company page.

    Parameters
    ----------
    html : str or bytes
        HTML source of a company page.

    Returns
    -------
    pandas.DataFrame
        A single-row DataFrame with the columns listed in COMPANY_COLUMNS.
        The company name is the text of the first <h1>; the other fields are
        the stripped text of the <div class="col-xs-7"> elements at the
        positions given in COMPANY_FIELD_POSITIONS.

    Raises
    ------
    ValueError
        If the page has no <h1> or too few <div class="col-xs-7"> elements.
    """
    soup = BeautifulSoup(html, 'html.parser')

    heading = soup.find('h1')
    if heading is None:
        raise ValueError("no <h1> element found")

    # Query the document once and index into the result for every field.
    value_cells = soup.find_all('div', class_='col-xs-7')
    required = max(COMPANY_FIELD_POSITIONS.values()) + 1
    if len(value_cells) < required:
        raise ValueError(
            f"expected at least {required} 'col-xs-7' elements, found {len(value_cells)}"
        )

    data = {'Company Name': [heading.get_text().strip()]}
    for column, position in COMPANY_FIELD_POSITIONS.items():
        data[column] = [value_cells[position].get_text().strip()]

    return pd.DataFrame(data)


# This list will store DataFrames containing company information (one row each)
company_info_list = []

for url in concatenated_df.Link:
    try:
        # The context manager closes the response once its body has been read.
        with requests.get(url, timeout=COMPANY_TIMEOUT_SECONDS) as response:
            print(url)
            status_code = response.status_code
            content = response.content

        # Only successfully retrieved pages are parsed.
        if status_code == 200:
            company_df = extract_company_info(content)
            print(company_df)
            company_info_list.append(company_df)
        else:
            print("Unable to retrieve the webpage.")
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")
    except ValueError as e:
        # A page with an unexpected layout must not abort the whole run.
        print(f"Could not parse {url}: {e}")

    time.sleep(COMPANY_DELAY_SECONDS)  # Wait before the next request

# Concatenate all company information DataFrames into a single DataFrame
if company_info_list:
    final_company_info_df = pd.concat(company_info_list, ignore_index=True)
    # Save the batch as CSV
    final_company_info_df.to_csv(OUTPUT_CSV, index=False)
else:
    # pd.concat raises on an empty list; keep an empty frame with the expected
    # columns and do not overwrite any existing CSV with an empty file.
    final_company_info_df = pd.DataFrame(columns=COMPANY_COLUMNS)
    print("No company information was collected; CSV not written.")