In [1]:
import requests
from bs4 import BeautifulSoup
import time

In [2]:
# Base URL components
base_url = 'https://guide.michelin.com'
search_url = 'https://guide.michelin.com/en/it/restaurants/page/{}'

# List to store restaurant URLs (order of discovery is kept for later cells)
restaurant_urls = []
# Companion set so duplicate detection does not rescan the whole list each time
seen_urls = set()

# Loop through each listing page
for page in range(1, 101):
    try:
        # A timeout keeps one stalled connection from hanging the whole scrape
        response = requests.get(search_url.format(page), timeout=30)
    except requests.exceptions.RequestException as e:
        print(f"Failed to retrieve page {page}: {e}")
        continue

    if response.status_code != 200:
        print(f"Failed to retrieve page {page}")
        continue

    # Parse the page HTML
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find all restaurant links
    restaurants = soup.find_all('a', class_='link')  # Update 'link' if needed

    if not restaurants:  # Stop if no restaurants are found
        print(f"No restaurants found on page {page}. Ending scrape.")
        break

    # Extract and store each restaurant URL
    for restaurant in restaurants:
        url = restaurant.get('href')

        # An anchor without an href yields None; skip it and any non-restaurant link
        if not url or '/restaurant/' not in url:
            continue

        # Relative links need the site root prepended; absolute ones are kept as-is
        if not url.startswith(('http://', 'https://')):
            url = base_url + url

        # Record each restaurant URL only once
        if url not in seen_urls:
            seen_urls.add(url)
            restaurant_urls.append(url)

    print(f"Page {page} processed; total URLs collected: {len(restaurant_urls)}")
    time.sleep(1)  # Pause to avoid overloading the server


Page 1 processed; total URLs collected: 20
Page 2 processed; total URLs collected: 40
Page 3 processed; total URLs collected: 60
Page 4 processed; total URLs collected: 80
Page 5 processed; total URLs collected: 100
Page 6 processed; total URLs collected: 120
Page 7 processed; total URLs collected: 140
Page 8 processed; total URLs collected: 160
Page 9 processed; total URLs collected: 180
Page 10 processed; total URLs collected: 200
Page 11 processed; total URLs collected: 220
Page 12 processed; total URLs collected: 240
Page 13 processed; total URLs collected: 260
Page 14 processed; total URLs collected: 280
Page 15 processed; total URLs collected: 300
Page 16 processed; total URLs collected: 320
Page 17 processed; total URLs collected: 340
Page 18 processed; total URLs collected: 360
Page 19 processed; total URLs collected: 380
Page 20 processed; total URLs collected: 400
Page 21 processed; total URLs collected: 420
Page 22 processed; total URLs collected: 440
Page 23 processed; tota

In [4]:
# Echo the collected restaurant links, one per output line, as a quick visual check
for url in restaurant_urls:
    print(f"{url}")

https://guide.michelin.com/en/campania/gragnano/restaurant/o-me-o-il-mare
https://guide.michelin.com/en/abruzzo/popoli_1845563/restaurant/donevandro
https://guide.michelin.com/en/piemonte/alba/restaurant/ape-vino-e-cucina
https://guide.michelin.com/en/campania/sorrento/restaurant/da-bob-cook-fish
https://guide.michelin.com/en/basilicata/matera/restaurant/da-mo
https://guide.michelin.com/en/sardegna/cagliari/restaurant/sa-domu-sarda
https://guide.michelin.com/en/sicilia/palermo/restaurant/charleston
https://guide.michelin.com/en/toscana/bibbiena/restaurant/il-tirabuscio262517
https://guide.michelin.com/en/emilia-romagna/cesenatico/restaurant/la-buca130947
https://guide.michelin.com/en/campania/marina-di-casal-velino/restaurant/alessandro-feo
https://guide.michelin.com/en/lombardia/cervesina/restaurant/dama-1213583
https://guide.michelin.com/en/campania/napoli/restaurant/il-ristorante-alain-ducasse-napoli
https://guide.michelin.com/en/emilia-romagna/noceto_1827072/restaurant/palazzo-utin

In [5]:
# Show the whole list in its Python literal form
print(repr(restaurant_urls))

['https://guide.michelin.com/en/campania/gragnano/restaurant/o-me-o-il-mare', 'https://guide.michelin.com/en/abruzzo/popoli_1845563/restaurant/donevandro', 'https://guide.michelin.com/en/piemonte/alba/restaurant/ape-vino-e-cucina', 'https://guide.michelin.com/en/campania/sorrento/restaurant/da-bob-cook-fish', 'https://guide.michelin.com/en/basilicata/matera/restaurant/da-mo', 'https://guide.michelin.com/en/sardegna/cagliari/restaurant/sa-domu-sarda', 'https://guide.michelin.com/en/sicilia/palermo/restaurant/charleston', 'https://guide.michelin.com/en/toscana/bibbiena/restaurant/il-tirabuscio262517', 'https://guide.michelin.com/en/emilia-romagna/cesenatico/restaurant/la-buca130947', 'https://guide.michelin.com/en/campania/marina-di-casal-velino/restaurant/alessandro-feo', 'https://guide.michelin.com/en/lombardia/cervesina/restaurant/dama-1213583', 'https://guide.michelin.com/en/campania/napoli/restaurant/il-ristorante-alain-ducasse-napoli', 'https://guide.michelin.com/en/emilia-romagna/

In [7]:
# Persist the collected links as plain text, one URL per line
with open('restaurants_urls.txt', 'w') as file:
    file.writelines(f"{url}\n" for url in restaurant_urls)

print("URLs have been saved to restaurants_urls.txt")

URLs have been saved to restaurants_urls.txt


In [8]:
# Iterate over the URLs to fetch HTML content and save it.
# Files are numbered by the URL's 1-based position in restaurant_urls, so a
# failed download leaves a gap rather than shifting later file names.
for index, url in enumerate(restaurant_urls, start=1):
    try:
        # Bound the wait so a single stalled connection cannot hang the cell
        response = requests.get(url, timeout=30)

        # Check if the request was successful
        if response.status_code == 200:
            # Save the HTML content to a file named by the restaurant index
            file_name = f'restaurant_{index}.html'
            with open(file_name, 'w', encoding='utf-8') as file:
                file.write(response.text)
            print(f"HTML for {url} has been saved to {file_name}")
        else:
            print(f"Failed to retrieve {url}. Status code: {response.status_code}")

    except requests.exceptions.RequestException as e:
        print(f"Error downloading {url}: {e}")

    # Pause between requests to avoid overloading the server
    time.sleep(1)

HTML for https://guide.michelin.com/en/campania/gragnano/restaurant/o-me-o-il-mare has been saved to restaurant_1.html
HTML for https://guide.michelin.com/en/abruzzo/popoli_1845563/restaurant/donevandro has been saved to restaurant_2.html
HTML for https://guide.michelin.com/en/piemonte/alba/restaurant/ape-vino-e-cucina has been saved to restaurant_3.html
HTML for https://guide.michelin.com/en/campania/sorrento/restaurant/da-bob-cook-fish has been saved to restaurant_4.html
HTML for https://guide.michelin.com/en/basilicata/matera/restaurant/da-mo has been saved to restaurant_5.html
HTML for https://guide.michelin.com/en/sardegna/cagliari/restaurant/sa-domu-sarda has been saved to restaurant_6.html
HTML for https://guide.michelin.com/en/sicilia/palermo/restaurant/charleston has been saved to restaurant_7.html
HTML for https://guide.michelin.com/en/toscana/bibbiena/restaurant/il-tirabuscio262517 has been saved to restaurant_8.html
HTML for https://guide.michelin.com/en/emilia-romagna/cese

In [38]:
import os

In [12]:
# Function to save HTML files for each page
def save_html_by_page(restaurant_urls, page_number, per_page=20, timeout=30):
    """Download one page's worth of restaurant HTML into a ``page_<n>`` folder.

    Args:
        restaurant_urls: Full, ordered list of restaurant URLs.
        page_number: 1-based page to download. It selects the slice
            ``restaurant_urls[(page_number - 1) * per_page : page_number * per_page]``.
        per_page: Number of URLs grouped into one page folder.
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            cannot block the run indefinitely.

    Returns:
        None. Each successful response is written to
        ``page_<page_number>/restaurant_<k>.html``, where ``k`` is the URL's
        1-based position in ``restaurant_urls``. Non-200 responses and request
        errors are reported with ``print`` and skipped.
    """
    # Create a directory for the current page (no error if it already exists)
    folder_name = f'page_{page_number}'
    os.makedirs(folder_name, exist_ok=True)

    # Select the URLs that belong to this page
    start_index = (page_number - 1) * per_page
    page_urls = restaurant_urls[start_index:start_index + per_page]

    # Number files by position in the full list, not by position within the page
    for index, url in enumerate(page_urls, start=start_index + 1):
        try:
            response = requests.get(url, timeout=timeout)

            if response.status_code == 200:
                file_name = os.path.join(folder_name, f'restaurant_{index}.html')

                # Save the HTML content to the file immediately after downloading
                with open(file_name, 'w', encoding='utf-8') as file:
                    file.write(response.text)
                print(f"HTML for {url} has been saved to {file_name}")
            else:
                print(f"Failed to retrieve {url}. Status code: {response.status_code}")

        except requests.exceptions.RequestException as e:
            print(f"Error downloading {url}: {e}")

# Loop through only the pages that actually hold URLs (20 per page, matching
# save_html_by_page's default), so no empty folders are created and no URL is
# left out when the list size differs from 100 full pages.
page_count = (len(restaurant_urls) + 19) // 20
for page_number in range(1, page_count + 1):
    save_html_by_page(restaurant_urls, page_number)

HTML for https://guide.michelin.com/en/campania/gragnano/restaurant/o-me-o-il-mare has been saved to page_1/restaurant_1.html
HTML for https://guide.michelin.com/en/abruzzo/popoli_1845563/restaurant/donevandro has been saved to page_1/restaurant_2.html
HTML for https://guide.michelin.com/en/piemonte/alba/restaurant/ape-vino-e-cucina has been saved to page_1/restaurant_3.html
HTML for https://guide.michelin.com/en/campania/sorrento/restaurant/da-bob-cook-fish has been saved to page_1/restaurant_4.html
HTML for https://guide.michelin.com/en/basilicata/matera/restaurant/da-mo has been saved to page_1/restaurant_5.html
HTML for https://guide.michelin.com/en/sardegna/cagliari/restaurant/sa-domu-sarda has been saved to page_1/restaurant_6.html
HTML for https://guide.michelin.com/en/sicilia/palermo/restaurant/charleston has been saved to page_1/restaurant_7.html
HTML for https://guide.michelin.com/en/toscana/bibbiena/restaurant/il-tirabuscio262517 has been saved to page_1/restaurant_8.html
HT

KeyboardInterrupt: 