In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import requests

from tqdm.notebook import tqdm

import time

import asyncio
import aiohttp
import random
import nest_asyncio
import aiofiles

import os

In [2]:
# get connection: request the first listing page.
# The timeout stops the cell hanging forever on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of parsing an error page.
cnt = requests.get("https://guide.michelin.com/en/it/restaurants", timeout=30)
cnt.raise_for_status()

In [3]:
# Parse the raw response body with the lxml parser
soup = BeautifulSoup(cnt.content, "lxml")

In [None]:
# save html
# The context manager closes the file even if the write fails; explicit UTF-8
# keeps non-ASCII characters from depending on the OS default encoding.
with open("source.html", "w", encoding="utf-8") as f:
    f.write(soup.prettify())

In [170]:
# Take the first <ul> element whose class is 'pagination'
ul = soup.find_all('ul', class_='pagination')[0]

In [171]:
# Absolute URL for every pagination link except the last anchor in the list
page_urls = [
    'https://guide.michelin.com' + anchor['href']
    for anchor in ul.find_all('a')[:-1]
]

In [None]:
# Add missing pages, will cause URL duplications!!!
# page_urls[9:] is a copy, so appending to page_urls inside the loop does not
# affect the iteration. NOTE(review): the offset 9 presumably skips pages whose
# pagination links were already collected from the first page -- confirm.
for url in tqdm(page_urls[9:]):

    # get connection; time out instead of hanging and stop on an HTTP error page
    cnt = requests.get(url, timeout=30)
    cnt.raise_for_status()

    # get content
    soup = BeautifulSoup(cnt.content, features="lxml")

    # find all ul tags, where class = 'pagination'
    ul = soup.find_all('ul', {'class': 'pagination'})[0]

    # collect every pagination link found on this page
    page_urls.extend(
        'https://guide.michelin.com' + element['href']
        for element in ul.find_all('a')
    )

    # pause between requests
    time.sleep(3)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10.0), HTML(value='')))




In [181]:
# Drop duplicate URLs (set-based, so the resulting order is arbitrary)
page_urls = [*set(page_urls)]

In [186]:
# Order the listing URLs by the number in their last path segment
def _page_number(url):
    """Return the last '/'-separated segment as an int, or 1 if it is not all digits."""
    last_segment = url.split('/')[-1]
    if last_segment.isdigit():
        return int(last_segment)
    return 1

page_urls = sorted(page_urls, key=_page_number)

In [None]:
# Visit every listing page and collect the link of each restaurant card
restaurant_urls = []

# enumerate() supplies the 1-based page counter used in the progress message
for page, url in enumerate(tqdm(page_urls), start=1):

    # get connection; time out instead of hanging and stop on an HTTP error page
    cnt = requests.get(url, timeout=30)
    cnt.raise_for_status()

    # get content
    soup = BeautifulSoup(cnt.content, features="lxml")

    # find all div tags, where class = card__menu selection-card box-placeholder js-restaurant__list_item js-match-height js-map
    div = soup.find_all('div', {'class': 'card__menu selection-card box-placeholder js-restaurant__list_item js-match-height js-map'})
    print(f"# of restaurants on page {page} is {len(div)}")

    # the first <a> inside each card holds the restaurant's relative URL
    restaurant_urls.extend(
        'https://guide.michelin.com' + element.select_one('a')['href']
        for element in div
    )

    # pause between requests
    time.sleep(3)

HBox(children=(HTML(value=''), FloatProgress(value=0.0), HTML(value='')))

# of restaurants on page 1 is 20
# of restaurants on page 2 is 20
# of restaurants on page 3 is 20
# of restaurants on page 4 is 20
# of restaurants on page 5 is 20
# of restaurants on page 6 is 20
# of restaurants on page 7 is 20
# of restaurants on page 8 is 20
# of restaurants on page 9 is 20
# of restaurants on page 10 is 20
# of restaurants on page 11 is 20
# of restaurants on page 12 is 20
# of restaurants on page 13 is 20
# of restaurants on page 14 is 20
# of restaurants on page 15 is 20
# of restaurants on page 16 is 20
# of restaurants on page 17 is 20
# of restaurants on page 18 is 20
# of restaurants on page 19 is 20
# of restaurants on page 20 is 20
# of restaurants on page 21 is 20
# of restaurants on page 22 is 20
# of restaurants on page 23 is 20
# of restaurants on page 24 is 20
# of restaurants on page 25 is 20
# of restaurants on page 26 is 20
# of restaurants on page 27 is 20
# of restaurants on page 28 is 20
# of restaurants on page 29 is 20
# of restaurants on pag

In [190]:
# Total vs. unique URL count; equal numbers mean no restaurant URL was collected twice
len(restaurant_urls), len(set(restaurant_urls))

(1983, 1983)

In [None]:
# Persist the collected restaurant URLs, one per line
with open("restaurant_urls.txt", "w") as out_file:
    out_file.writelines(url + "\n" for url in restaurant_urls)

In [7]:
# Load restaurant URLs (one per line), stripping surrounding whitespace and newlines
with open("restaurant_urls.txt", "r") as url_file:
    restaurant_urls = list(map(str.strip, url_file))

In [None]:
# Apply nest_asyncio to allow async calls in Jupyter:
# the notebook kernel already runs an event loop, and this patch lets
# asyncio.run() be called from inside it.
nest_asyncio.apply()

# Function to fetch HTML content from a single URL
async def fetch_html(url: str, n_rest: int):
    """Download one page and save its prettified HTML to disk.

    The file is written to ``page {p}/restaurant_{n_rest + 1}.html`` with
    ``p = n_rest // 20 + 1``, so the saved files are grouped 20 per folder.

    Args:
        url: Address of the page to download.
        n_rest: Zero-based index of the URL; determines the folder and file name.

    Returns:
        None when the page was downloaded and written successfully, otherwise
        the URL itself (the error is printed) so the caller can collect failures.
    """
    # Pick one of a few browser-like User-Agent strings at random
    headers = {
        "User-Agent": random.choice([
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36",
        ])
    }

    try:
        # Random delay BEFORE the request is sent, so it actually spaces the
        # requests out instead of holding an already-open connection idle
        await asyncio.sleep(random.uniform(1, 2))

        # Create an asynchronous session to send the HTTP GET request
        async with aiohttp.ClientSession() as session:
            async with session.get(url, headers=headers) as response:
                # Treat 4xx/5xx answers as failures so error/block pages are
                # reported instead of being saved as restaurant data
                response.raise_for_status()

                # Read the response body as text
                html = await response.text()

        # The connection is released at this point; parsing and disk I/O
        # below no longer keep it open.

        # Beautify the HTML content using BeautifulSoup
        text = BeautifulSoup(html, features='lxml').prettify()

        # Create directories to store HTML files (organized by page number)
        # Here, we group the HTML files into pages of 20
        page = n_rest // 20 + 1
        os.makedirs(f"page {page}", exist_ok=True)

        # Write the formatted HTML; explicit UTF-8 so non-ASCII characters do
        # not depend on the platform's default encoding
        async with aiofiles.open(f"page {page}/restaurant_{n_rest+1}.html", "w", encoding="utf-8") as f:
            await f.write(text)

        return None  # Return None when the task is successfully completed

    # If there's an error (e.g., network issue, invalid URL, HTTP error status),
    # print the error and return the URL
    except Exception as e:
        print(f'For {url} error: {e}')
        return url

# Function to fetch HTML from a list of URLs asynchronously
async def fetch_all_html(urls: list, max_concurrency: int = 10):
    """Fetch and save every URL concurrently and report the ones that failed.

    Args:
        urls: URLs to download; each URL's position in the list is passed to
            ``fetch_html`` as its index.
        max_concurrency: Upper bound on the number of ``fetch_html`` calls
            running at the same time.

    Returns:
        List of the URLs for which ``fetch_html`` returned the URL (its failure
        signal) rather than None; empty when everything succeeded.
    """
    # Cap the number of simultaneous downloads instead of firing all at once
    semaphore = asyncio.Semaphore(max_concurrency)

    async def fetch_with_limit(url: str, index: int):
        # Wait for a free slot, then run the real fetch
        async with semaphore:
            return await fetch_html(url, index)

    # Gather all tasks and execute them concurrently (results keep input order)
    results = await asyncio.gather(
        *(fetch_with_limit(url, index) for index, url in enumerate(urls))
    )

    # fetch_html yields None on success and the URL on failure
    return [failed_url for failed_url in results if failed_url is not None]

In [None]:
# Run the asynchronous HTML fetching function over every loaded restaurant URL
# (asyncio.run drives the coroutine to completion before the cell returns)
asyncio.run(fetch_all_html(restaurant_urls))