# Generating Data

In [None]:
import pandas as pd

from bs4 import BeautifulSoup

from tqdm.notebook import tqdm

import re

import os

In [2]:
def extract_price_and_cuisine(input_string):
    # Extract price range (sequence of '€')
    price_range = re.search(r'€+', input_string)
    price_range = price_range.group() if price_range else ''
    
    # Extract cuisine type (after last '·')
    cuisine = input_string.split('·')[-1].strip()
    
    return price_range, cuisine

In [None]:
os.makedirs(f"restaurants_parsed_data", exist_ok=True)
folders = list(range(1,101))
rest_n = 1

for fol in tqdm(folders):

    # get files in page i folder and order them based on retaurants
    files = os.listdir(f'page {fol}')
    files = sorted(files, key=lambda x: int(re.search(r'(\d+)', x).group()))

    for rest in files:
        # Load restraurant URLs
        with open(f'page {fol}' + '/' + rest, "r") as file:
            html = file.read()
        
        # MakeH HTML BeautifulSoup object
        html = BeautifulSoup(html, features='lxml')

        # Restaurant name
        restaurantName = html.select_one('h1', {'class' : 'data-sheet__title'}).text.strip()

        if restaurantName == '':
            print(f'restaurantName is missing in {rest}')

        # Restaurant location
        location = html.find_all('div', {'class' : 'data-sheet__block--text'})[0].text.strip()
        location = location.split(',')

        address = location[0]
        city = location[1]
        postalCode = location[2]
        country = location[3]
        
        if address == '':
            print(f'address is missing in {rest}')
        if city == '':
            print(f'city is missing in {rest}')
        if postalCode == '':
            print(f'postalCode is missing in {rest}')
        if country == '':
            print(f'country is missing in {rest}')
        
        # Restaurant characteristics
        attributes = html.find_all('div', {'class' : 'data-sheet__block--text'})[1].text.strip()
        # attributes = attributes.split()

        # priceRange = attributes[0]
        # cuisineType = attributes[-2] + attributes[-1]
        priceRange, cuisineType = extract_price_and_cuisine(attributes)

        if priceRange == '':
            print(f'priceRange is missing in {rest}')
        if cuisineType == '':
            print(f'cuisineType is missing in {rest}')


        # Restaurant desctiption
        if len(html.find_all('div', {'class' : 'data-sheet__description'})) != 0:
            description = html.find_all('div', {'class' : 'data-sheet__description'})[0].text.strip()
        else:
            print(f'description is missing in {rest}')
            description = ''
            

        # Restaurant services
        services = html.find_all('div', {'class' : 'col col-12 col-lg-6'})

        for ser in services:
            if len(ser.find_all('div')) == 0 :
                facilitiesServices = [x.text.strip() for x in ser.find_all('li')]
            else:
                creditCards = [x['data-src'].split('/')[-1].split('-')[0].capitalize() for x in ser.find_all('img')]
        
        if len(facilitiesServices) == 0:
            print(f'facilitiesServices is missing in {rest}')
        if len(creditCards) == 0:
            print(f'creditCards is missing in {rest}')

        # Restaurant phone number
        if len(html.find_all('a', {'data-event' : 'CTA_tel'})) !=0:
            phoneNumber = html.find_all('a', {'data-event' : 'CTA_tel'})[0]['href'].replace('tel:', '')
        else:
            print(f'phoneNumber is missing in {rest}')
            phoneNumber = ''

        # Restaurant website
        if len(html.find_all('a', {'data-event' : 'CTA_website'})) != 0:
            website = html.find_all('a', {'data-event' : 'CTA_website'})[0]['href']
        else:
            print(f'website is missing in {rest}')
            website = ''

        # Make dictionary
        restaurant_data = {
                "restaurantName": restaurantName,                # string
                "address": address,                              # string
                "city": city,                                    # string
                "postalCode": postalCode,                        # string
                "country": country,                              # string
                "priceRange": priceRange,                        # string
                "cuisineType": cuisineType,                      # string
                "description": description,                      # string
                "facilitiesServices": ",".join(facilitiesServices),        # list of strings, actually string (impossible to save list in tsv file)
                "creditCards": ",".join(creditCards),                      # list of strings, actually string (impossible to save list in tsv file)
                "phoneNumber": phoneNumber,                      # string
                "website": website                               # string
            }
        
        # List of restaurant data 
        data = [restaurant_data]

        # Save as TSV
        df = pd.DataFrame(data)
        df.to_csv(f"restaurants_parsed_data/restaurant_{rest_n}.tsv", sep="\t", index=False, header=False)

        rest_n += 1

HBox(children=(HTML(value=''), FloatProgress(value=0.0), HTML(value='')))

website is missing in restaurant_22.html
website is missing in restaurant_73.html
website is missing in restaurant_80.html
website is missing in restaurant_87.html
website is missing in restaurant_125.html
website is missing in restaurant_129.html
website is missing in restaurant_144.html
website is missing in restaurant_159.html
website is missing in restaurant_169.html
website is missing in restaurant_178.html
website is missing in restaurant_192.html
website is missing in restaurant_228.html
website is missing in restaurant_261.html
website is missing in restaurant_278.html
website is missing in restaurant_282.html
website is missing in restaurant_288.html
website is missing in restaurant_305.html
website is missing in restaurant_310.html
website is missing in restaurant_320.html
website is missing in restaurant_324.html
website is missing in restaurant_326.html
website is missing in restaurant_354.html
website is missing in restaurant_394.html
website is missing in restaurant_404.h

# Check Generated Data

In [25]:
pd.set_option('display.max_rows', None)

In [34]:
# List of file paths (or you can use a pattern to match all TSV files)
file_paths = os.listdir('restaurants_parsed_data')  # Replace with your actual file names or paths
file_paths = ['restaurants_parsed_data/' + i for i in file_paths]

# Read all files and store them in a list of DataFrames
dfs = [pd.read_csv(file, sep='\t', header=None, names = ['restaurantName', 'address', 'city', 'postalCode', 'country', 'priceRange', 'cuisineType',
                                                             'description', 'facilitiesServices', 'creditCards', 'phoneNumber', 'website']) for file in tqdm(file_paths)]

# Concatenate all DataFrames into one
combined_df = pd.concat(dfs, ignore_index=True)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1982.0), HTML(value='')))




In [35]:
combined_df.head(10)

Unnamed: 0,restaurantName,address,city,postalCode,country,priceRange,cuisineType,description,facilitiesServices,creditCards,phoneNumber,website
0,O Me O Il Mare,Via Roma 45/47,Gragnano,80054,Italy,€€€€,"Italian Contemporary, Modern Cuisine",After many years’ experience in Michelin-starr...,"Air conditioning,Interesting wine list,Wheelch...","Amex,Dinersclub,Mastercard,Visa",+39 081 620 0550,http://omeoilmare.com
1,Alessandro Feo,via Angelo Lista 24,Marina di Casal Velino,84040,Italy,€€,"Campanian, Seafood",In a beautiful stone-vaulted building (an old ...,"Air conditioning,Terrace,Wheelchair access","Amex,Dinersclub,Mastercard,Visa",+39 328 893 7083,https://www.alessandrofeoristorante.it/
2,Lalibera,via Pertinace 24/a,Alba,12051,Italy,€€,"Piedmontese, Traditional Cuisine","A modern, designer-style restaurant staffed by...","Air conditioning,Interesting wine list","Amex,Mastercard,Visa",+39 0173 293155,https://lalibera.com/
3,Antica Macelleria Cecchini - Solociccia,via Chiantigiana 5,Panzano,50022,Italy,€€,Meats and Grills,One of the most famous butchers in Italy has n...,"Air conditioning,Bring your own bottle,Restaur...","Amex,Mastercard,Visa",+39 055 852020,https://www.dariocecchini.com
4,Brindo,via Libertà 18,Cusago,20047,Italy,€,Lombardian,The service is characterized by a cohesive and...,Air conditioning,"Amex,Mastercard,Visa",+39 02 9039 4429,https://www.brindo.it
5,Edelweiss,Località Crodo,Viceno,28862,Italy,€,"Country cooking, Seasonal Cuisine",A real bastion of local cuisine for over 60 ye...,"Car park,Garden or park,Wheelchair access","Amex,Dinersclub,Mastercard,Visa",+39 0324 618791,https://www.albergoedelweiss.com/ristorante/
6,Ceresio 7,via Ceresio 7,Milan,20154,Italy,€€€,Modern Cuisine,This designer-style restaurant is housed on th...,"Air conditioning,Great view,Terrace","Amex,Dinersclub,Mastercard,Visa",+39 02 3103 9221,https://www.ceresio7.com
7,Impronta d'Acqua,via Aurelia 2121,Cavi di Lavagna,16030,Italy,€€€,"Italian Contemporary, Creative",Situated on the straight road skirting the sea...,"Air conditioning,Restaurant offering vegetaria...","Amex,Dinersclub,Mastercard,Visa",+39 375 529 1077,https://www.improntadacqua.com/
8,Imàgo,piazza Trinità dei Monti 6,Rome,187,Italy,€€€€,Italian Contemporary,The location of this rooftop restaurant could ...,"Air conditioning,Great view,Interesting wine list","Amex,Unionpay,Dinersclub,Jcb,Maestrocard,Maste...",+39 06 6993 4726,https://www.hotelhasslerroma.com/it/ristoranti...
9,L'Aia dei Cappellani,contrada Maurino,Trecchina,85049,Italy,€,"Country cooking, Traditional Cuisine",In the dining room there are old photos and eq...,"Air conditioning,Car park,Terrace","Mastercard,Visa",+39 0973 826937,https://www.laiadeicappellani.com/
