In [92]:
import re

import numpy as np
import pandas as pd
import plotly.express as px
from bs4 import BeautifulSoup

# Restaurants

In [2]:
# Load the saved HTML dumps. The exported 'Unnamed: 0' column is discarded and
# the remaining frame is squeezed, which (judging by the indexes displayed in
# the next cells) yields a Series of HTML strings.
resto_soup = pd.read_csv('restaurant_soup').drop('Unnamed: 0', axis='columns').squeeze()
resto_soup2 = pd.read_csv('restaurant_soup2.csv').drop('Unnamed: 0', axis='columns').squeeze()

In [3]:
# Labels of the second soup series (one label per restaurant, see output).
resto_soup2.index

Index(['Cerveseria Catalana', 'My Fucking Restaurant', 'Ciutat Comtal',
       'Tapeo', 'Arume', 'Taquerías Tamarindo', 'Vinitus', 'Somorrostro',
       'Cañete', 'Firebug',
       ...
       'Chez Cocó', 'Portolés', 'Bo de Gracia', 'Margherita', 'Bar Nolla',
       'Casa Dorita', 'Palmito Beach Bar', 'Takumi Sapporo Ramen Kitchen',
       'Bosque Palermo', 'Son Hao'],
      dtype='object', length=207)

In [4]:
# Labels of the first soup series (string offsets '0', '10', '20', ..., see output).
resto_soup.index


Index(['0', '10', '20', '30', '40', '50', '60', '70', '80', '90', '100', '110',
       '120', '130', '140', '150', '160', '170', '180', '190', '200', '210',
       '220', '230'],
      dtype='object')

#### Name

In [5]:
# Restaurant names: each entry of resto_soup is parsed on its own and the text
# of every tag matching the name selector is collected, in document order.
name = [
    name_tag.text
    for page_html in resto_soup
    for name_tag in BeautifulSoup(page_html, 'html.parser').select('.css-1egxyvc .css-1m051bw')
]

In [6]:
# Count how often each name occurs; a count above 1 means the same name was
# scraped more than once (duplicates).
pd.Series(name).value_counts()

La Paradeta        5
Macchina           2
Pirineu en Boca    2
Kamasot            2
Obe Restaurant     2
                  ..
Cecconi’s          1
Ugot Bruncherie    1
Arume              1
Hummus             1
El Venezia         1
Length: 207, dtype: int64

#### Price range

In [7]:
# Price tier of every listing container found in resto_soup.
# A container without a price tag contributes NaN, so the list keeps exactly
# one entry per container.
price_range = []
container_class = 'container__09f24__mpR8_ hoverable__09f24__wQ_on margin-t3__09f24__riq4X margin-b3__09f24__l9v5d padding-t3__09f24__TMrIW padding-r3__09f24__eaF7p padding-b3__09f24__S8R2d padding-l3__09f24__IOjKY border--top__09f24__exYYb border--right__09f24__X7Tln border--bottom__09f24___mg5X border--left__09f24__DMOkM border-color--default__09f24__NPAKY'
for page_html in resto_soup:
    soup = BeautifulSoup(page_html, 'html.parser')
    for card in soup.find_all(class_=container_class):
        # The header sits in the element that follows the card's third nested
        # <div>. `next_sibling` is the supported spelling of the deprecated
        # BS3-style alias `nextSibling`.
        header_text = card.div.div.div.next_sibling.div
        price = header_text.select_one('.css-1s7bx9e')
        price_range.append(price.text if price is not None else np.nan)


In [8]:
# Spot-check: print every scraped name next to its price tier.
for pair in zip(name, price_range):
    print(*pair)


Cerveseria Catalana €€
My Fucking Restaurant €€€
Ciutat Comtal €€
Tapeo €€
Arume €€
Taquerías Tamarindo €
Vinitus €€
Somorrostro €€
Cañete €€€
Firebug €€
El Nacional €€
Firebug €€
Tucco Real Food Born €
Disfrutar €€€€
Tosca €€
Brunch & Cake €€
Can Paixano €
El Pintxo de Petritxol €€
Boa-Bao nan
El Asador de Aranda €€€
El Asador de Aranda €€€
Restaurant La Tasqueta de Blai €
Telefèric €€
Arcano €€€
Tickets €€€€
El Jardín del Edén nan
Cera 23 €€
A Tu Bola €
Guell €€
Bar Mut €€€
Buenos Aires €€
Le Romane nan
La Pepita €€
Bar Mut €€€
Cachitos €€
2254 €€€
Federal €
A Tu Bola €
Lizarran €
Momo €€
Paradiso €€
Sensi €€
Alsur Café €
2254 €€€
Maitea €€
Federal €
Betlem €€
Llamber €€€
Micu Maku €
O’Toxo 3 Hermanos €€
Bilbao Berria €€
La Cuina de Laietana €€
Macchina €
Bo de Boqueria nan
Caravelle €€
La Bodegueta €€€
Raó nan
Terrace la Isabela €€€
El Vaso de Oro €€
9 Nine €€€
Paco Alcalde €€
Xian nan
Casa Alfonso €€
La Caravana €
Mosquito €
Nubar nan
Quimet & Quimet €€
La Bombeta €€
Billy Brunch n

#### Rating

In [9]:
# Rating of every listing container found in resto_soup.
# The rating is the first whitespace-separated token of the rating element's
# 'aria-label' attribute; a container without that element contributes NaN.
rating = []
container_class = 'container__09f24__mpR8_ hoverable__09f24__wQ_on margin-t3__09f24__riq4X margin-b3__09f24__l9v5d padding-t3__09f24__TMrIW padding-r3__09f24__eaF7p padding-b3__09f24__S8R2d padding-l3__09f24__IOjKY border--top__09f24__exYYb border--right__09f24__X7Tln border--bottom__09f24___mg5X border--left__09f24__DMOkM border-color--default__09f24__NPAKY'
for page_html in resto_soup:
    soup = BeautifulSoup(page_html, 'html.parser')
    for card in soup.find_all(class_=container_class):
        # `next_sibling` is the supported spelling of the deprecated BS3-style
        # alias `nextSibling`.
        header_text = card.div.div.div.next_sibling.div
        rat = header_text.select_one('.overflow--hidden__09f24___ayzG')
        rating.append(rat['aria-label'].split()[0] if rat is not None else np.nan)
        

In [10]:
# Spot-check: print every scraped name next to its rating.
for pair in zip(name, rating):
    print(*pair)

Cerveseria Catalana 4.5
My Fucking Restaurant 4.5
Ciutat Comtal 4
Tapeo 4.5
Arume 4.5
Taquerías Tamarindo 4.5
Vinitus 4.5
Somorrostro 4.5
Cañete 4.5
Firebug 4.5
El Nacional 4
Firebug 4.5
Tucco Real Food Born 4.5
Disfrutar 5
Tosca 4.5
Brunch & Cake 4
Can Paixano 4.5
El Pintxo de Petritxol 4.5
Boa-Bao 4.5
El Asador de Aranda 4.5
El Asador de Aranda 4.5
Restaurant La Tasqueta de Blai 4.5
Telefèric 4
Arcano 4.5
Tickets 4.5
El Jardín del Edén 4.5
Cera 23 4.5
A Tu Bola 4.5
Guell 4.5
Bar Mut 4.5
Buenos Aires 4.5
Le Romane 5
La Pepita 4.5
Bar Mut 4.5
Cachitos 4
2254 4.5
Federal 4.5
A Tu Bola 4.5
Lizarran 4
Momo 4.5
Paradiso 4.5
Sensi 4.5
Alsur Café 4
2254 4.5
Maitea 4.5
Federal 4.5
Betlem 4.5
Llamber 4.5
Micu Maku 4.5
O’Toxo 3 Hermanos 4.5
Bilbao Berria 4
La Cuina de Laietana 4.5
Macchina 4.5
Bo de Boqueria 5
Caravelle 4.5
La Bodegueta 4
Raó 4.5
Terrace la Isabela 4.5
El Vaso de Oro 4.5
9 Nine 5
Paco Alcalde 4.5
Xian 4.5
Casa Alfonso 4
La Caravana 5
Mosquito 4
Nubar 4.5
Quimet & Quimet 4.5
La 

#### Location

In [11]:
# Number of entries in the second soup series.
resto_soup2.shape

(207,)

In [12]:

# One address string per entry of resto_soup2: the text of every address
# fragment on the page, joined with '--' (empty string when none is found).
location = [
    '--'.join(
        fragment.text
        for fragment in BeautifulSoup(page_html, 'html.parser').select('address .raw__09f24__T4Ezm')
    )
    for page_html in resto_soup2
]


In [13]:
# Display the full list of joined address strings.
location

['Carrer de Mallorca, 236--08008 Barcelona--Spain',
 'Carrer Nou de la Rambla, 35--08001 Barcelona--Spain',
 'Rambla de Catalunya, 18--08007 Barcelona--Spain',
 'Carrer de Montcada, 29--08003 Barcelona--Spain',
 "Carrer d'En Botella, 11--08007 Barcelona--Spain",
 'Carrer de Aragón, 236--08007 Barcelona--Spain',
 'Carrer Del Consell de Cent, 333--08007 Barcelona--Spain',
 'Carrer de Sant Carles, 11--08003 Barcelona--Spain',
 'Carrer de la Unió, 17--08001 Barcelona--Spain',
 'Passeig de Sant Joan, 23--08010 Barcelona--Spain',
 'Passeig de Gracia, 24--08007 Barcelona--Spain',
 'Carrer del Consolat de Mar, 23--08003 Barcelona--Spain',
 'Carrer de Villarroel, 163--08036 Barcelona--Spain',
 'Carrer de Sant Pere Més Alt, 8--08003 Barcelona--Spain',
 "Carrer d'Enric Granados, 145--08008 Barcelona--Spain",
 'Carrer de la Reina Cristina, 7--08003 Barcelona--Spain',
 'Carrer de Petritxol, 9--08002 Barcelona--Spain',
 'Plaça del Dr. Letamendi, 1--08007 Barcelona--Spain',
 'Calle Pau Claris, 70--08

#### Website

In [14]:
# Website text shown on each page of resto_soup2; NaN when the selector finds
# nothing. NOTE(review): the captured text is the displayed label, which the
# output shows can be shortened with '…' — confirm whether the full URL is needed.
website = []
for page_html in resto_soup2:
    link = BeautifulSoup(page_html, 'html.parser').select_one('.css-na3oda+ .css-1p9ibgf .css-1um3nx')
    website.append(np.nan if link is None else link.text)


In [15]:
# Display the full list of scraped website labels.
website

['http://cerveceriacatalana.food…',
 'http://www.myfuckingrestaurant…',
 nan,
 'http://www.tapeoborn.cat',
 'http://arumerestaurant.com/',
 nan,
 nan,
 'http://www.restaurantesomorros…',
 'http://www.barcanete.com',
 'http://www.firebugbarcelona.co…',
 'http://www.elnacionalbcn.com',
 'http://www.tuccorealfood.com',
 'http://en.disfrutarbarcelona.c…',
 nan,
 'https://brunchandcake.com',
 'http://www.canpaixano.com',
 'http://www.elpintxodepetritxol…',
 'http://www.boabao.es',
 'http://asadordearanda.com/cont…',
 'https://www.latasquetadeblai.c…',
 'http://www.teleferic.es',
 nan,
 'http://www.ticketsbar.es',
 'http://www.jardindeleden.es',
 'http://www.cera23.com',
 'http://www.atubolarest.com/',
 'https://www.guelltapasbarcelon…',
 'http://www.barmut.com/',
 'http://www.bairesbcn.com',
 'http://www.leromane.com',
 'http://www.lapepitabcn.com',
 'http://www.cachitosrambla.com',
 'http://www.restaurante2254.es/',
 'http://www.federalcafe.es/',
 'http://www.lizarran.es',
 'http://momobar

#### Number of photos

In [16]:

# Number of photos advertised on each page of resto_soup2.
# The count is the digit run captured from 'See <N> photos' in the photo
# header text (kept as a string); NaN when the header is missing or the text
# does not match.
nb_photos = []
photo_header_class = 'photo-header-content-container__09f24__jDLBB border-color--default__09f24__NPAKY'
# A compiled regex replaces the former one-element Series + str.extract round
# trip; the captured value is the same.
photo_count_pattern = re.compile(r'See (\d+) photos')
for page_html in resto_soup2:
    photo_header = BeautifulSoup(page_html, 'html.parser').find(class_=photo_header_class)
    found = photo_count_pattern.search(photo_header.text) if photo_header is not None else None
    nb_photos.append(found.group(1) if found else np.nan)
    

In [17]:
# Spot-check: number of photos scraped for each restaurant.
# The loop variable is not called `name`, so the `name` list built in the
# "Name" section is not overwritten by a single string.
for resto_name, photo in zip(resto_soup2.index, nb_photos):
    print(resto_name, photo)

Cerveseria Catalana 3852
My Fucking Restaurant 235
Ciutat Comtal 3450
Tapeo 1114
Arume 1600
Taquerías Tamarindo 326
Vinitus 1075
Somorrostro 905
Cañete 1156
Firebug 258
El Nacional 1326
Tucco Real Food Born 614
Disfrutar 1340
Tosca 404
Brunch & Cake 85
Can Paixano 419
El Pintxo de Petritxol 184
Boa-Bao 20
El Asador de Aranda 105
Restaurant La Tasqueta de Blai 576
Telefèric 416
Arcano nan
Tickets 2292
El Jardín del Edén 43
Cera 23 1224
A Tu Bola 132
Guell 155
Bar Mut 294
Buenos Aires 97
Le Romane 27
La Pepita 1065
Cachitos 237
2254 99
Federal 136
Lizarran 78
Momo 46
Paradiso 280
Sensi nan
Alsur Café 662
Maitea 272
Betlem 161
Llamber 262
Micu Maku 99
O’Toxo 3 Hermanos 191
Bilbao Berria 288
La Cuina de Laietana 39
Macchina 126
Bo de Boqueria 22
Caravelle 444
La Bodegueta 71
Raó 146
Terrace la Isabela 17
El Vaso de Oro 359
9 Nine 146
Paco Alcalde 120
Xian 32
Casa Alfonso 166
La Caravana 15
Mosquito 172
Nubar 30
Quimet & Quimet 1380
La Bombeta 138
Billy Brunch 104
Citizen Café 124
Pinotxo B

#### Opening hours

In [18]:
# Opening hours in long format: one record per (restaurant, day) pair.
# 'opening' and 'closing' hold the first '<h:mm XX> - <h:mm XX>' range found in
# the day's time text; both are NaN when the text contains no such range.
opening_hours = {'name': [], 'day': [], 'opening': [], 'closing': []}
# One pattern with two groups yields the same values as the two separate
# extractions used before, without building a Series for every row.
hours_pattern = re.compile(r'(\d+:\d+ \w+) - (\d+:\d+ \w+)')
# The loop variable is `resto_name` (not `name`) so the `name` list built in
# the "Name" section is left intact.
for page_html, resto_name in zip(resto_soup2, resto_soup2.index):
    soup = BeautifulSoup(page_html, 'html.parser')
    days_container = soup.select('.day-of-the-week__09f24__JJea_')
    time_container = soup.select('.no-wrap__09f24__c3plq.css-1p9ibgf')
    # Days and time slots are paired positionally; zip stops at the shorter list.
    for day_tag, time_tag in zip(days_container, time_container):
        found = hours_pattern.search(time_tag.text)
        opening, closing = found.groups() if found else (np.nan, np.nan)

        opening_hours['name'].append(resto_name)
        opening_hours['day'].append(day_tag.text)
        opening_hours['opening'].append(opening)
        opening_hours['closing'].append(closing)

In [19]:
# Some restaurants open twice a day; only the first time range was captured
# because the second one was hard to extract.
# Ideas for the analysis: average number of opening days, and whether places
# open on weekends.
opening_hours_df = pd.DataFrame(opening_hours)

# Print every record as a plain list.
for _, record in opening_hours_df.iterrows():
    print(list(record))



['Cerveseria Catalana', 'Mon', '9:00 AM', '1:30 AM']
['Cerveseria Catalana', 'Tue', '9:00 AM', '1:30 AM']
['Cerveseria Catalana', 'Wed', '9:00 AM', '1:30 AM']
['Cerveseria Catalana', 'Thu', '9:00 AM', '1:30 AM']
['Cerveseria Catalana', 'Fri', '9:00 AM', '1:30 AM']
['Cerveseria Catalana', 'Sat', '9:00 AM', '1:30 AM']
['Cerveseria Catalana', 'Sun', '9:00 AM', '1:30 AM']
['My Fucking Restaurant', 'Mon', '5:00 PM', '12:30 AM']
['My Fucking Restaurant', 'Tue', '5:00 PM', '12:30 AM']
['My Fucking Restaurant', 'Wed', '5:00 PM', '12:30 AM']
['My Fucking Restaurant', 'Thu', '5:00 PM', '12:30 AM']
['My Fucking Restaurant', 'Fri', '5:00 PM', '12:30 AM']
['My Fucking Restaurant', 'Sat', '5:00 PM', '12:30 AM']
['My Fucking Restaurant', 'Sun', '5:00 PM', '12:30 AM']
['Ciutat Comtal', 'Mon', '8:00 AM', '1:30 AM']
['Ciutat Comtal', 'Tue', '8:00 AM', '1:30 AM']
['Ciutat Comtal', 'Wed', '8:00 AM', '1:30 AM']
['Ciutat Comtal', 'Thu', '8:00 AM', '1:30 AM']
['Ciutat Comtal', 'Fri', '8:00 AM', '1:30 AM']
['

#### Amenities

#### Restaurant dataset

# Hotels

In [51]:
# Load the saved hotel HTML dumps, discard the exported 'Unnamed: 0' column and
# squeeze the remaining frame, then show both indexes.
# NOTE(review): the first index printed contains labels ending in '.1', which
# looks like repeated column names in the CSV — confirm whether those entries
# are duplicates.
hotel_soup = pd.read_csv('hotel_soup.csv').drop('Unnamed: 0', axis='columns').squeeze()
hotel_soup2 = pd.read_csv('hotel_soup2.csv').drop('Unnamed: 0', axis='columns').squeeze()
print(hotel_soup2.index)
print(hotel_soup.index)

Index(['Barceló Raval', 'Hotel 1898', 'W Barcelona',
       'Hotel Continental Barcelona', 'Pulitzer Barcelona',
       'Le Méridien Barcelona', 'Hotel Cotton House', 'Hotel Ayre Roselon',
       'Hotel Majestic', 'Renaissance Barcelona Hotel',
       ...
       'Catalonia Barcelona Golf.1', 'Mihlton.1', 'Ilunion Almirante.1',
       'Hotel Peninsular.1', 'Banys Oriental.1', 'Hotel Capri by Fraser.1',
       'Hotel Granados 83.1', 'Hotel Oasis.1', 'Ámister Art Hotel.1',
       'Hotel Regina.1'],
      dtype='object', length=259)
Index(['0', '10', '20', '30', '40', '50', '60', '70', '80', '90', '100', '110',
       '120', '130', '140', '150', '160', '170', '180', '190', '200', '210',
       '220', '230'],
      dtype='object')


#### Name

In [52]:
# Hotel names: each entry of hotel_soup is parsed on its own and the text of
# every tag matching the name selector is collected, in document order.
name = [
    name_tag.text
    for page_html in hotel_soup
    for name_tag in BeautifulSoup(page_html, 'html.parser').select('.css-1egxyvc .css-1m051bw')
]


In [53]:
# Count how often each hotel name occurs; a count above 1 means the same name
# was scraped more than once (duplicates).
pd.Series(name).value_counts()

St. Christopher’s Inn                         2
Four Points by Sheraton Barcelona Diagonal    2
OD Barcelona                                  2
Chic&Basic Velvet                             2
Acta BCN 40                                   2
                                             ..
K+K Hotel Picasso Barcelona                   1
Hostalin Barcelona                            1
Eurostars Grand Marina                        1
H10 Metropolitan                              1
Hotel Barcelona Universal                     1
Length: 234, dtype: int64

#### Price range

In [54]:
# Price tier of every listing container found in hotel_soup.
# A container without a price tag contributes NaN, so the list keeps exactly
# one entry per container.
price_range = []
container_class = 'container__09f24__mpR8_ hoverable__09f24__wQ_on margin-t3__09f24__riq4X margin-b3__09f24__l9v5d padding-t3__09f24__TMrIW padding-r3__09f24__eaF7p padding-b3__09f24__S8R2d padding-l3__09f24__IOjKY border--top__09f24__exYYb border--right__09f24__X7Tln border--bottom__09f24___mg5X border--left__09f24__DMOkM border-color--default__09f24__NPAKY'
for page_html in hotel_soup:
    soup = BeautifulSoup(page_html, 'html.parser')
    for card in soup.find_all(class_=container_class):
        # `next_sibling` is the supported spelling of the deprecated BS3-style
        # alias `nextSibling`.
        header_text = card.div.div.div.next_sibling.div
        price = header_text.select_one('.css-1s7bx9e')
        price_range.append(price.text if price is not None else np.nan)

# Spot-check: print every scraped name next to its price tier.
for n, price in zip(name, price_range):
    print(n, price)

Barceló Raval €€
Hotel 1898 €€€
W Barcelona €€€€
Hotel Continental Barcelona €
Pulitzer Barcelona €€
Le Méridien Barcelona €€€
Hotel Cotton House €€€
Hotel Ayre Roselon €
Hotel Majestic €€€
Renaissance Barcelona Hotel €€€
Villa Emilia €€
Hotel Arts Barcelona nan
Mandarin Oriental €€€€
Hotel Jazz €€
K+K Hotel Picasso Barcelona nan
Hotel Europark €€
Grand Hotel Central nan
Hotel Barcelona 1882 nan
Alma Barcelona €€€
Novotel Barcelona City €€
Casa Fuster €€€€
The Barcelona EDITION €€€
Hotel Barcelona Universal €€
Hotel NH Collection Barcelona Gran Hotel Calderón €€€
H10 Madison nan
Claris €€€
Alexandra Barcelona Hotel €€€
Hilton Barcelona Hotel €€€
Hotel Duquesa de Cardona €€€
Hotel Mercer €€€
Casa Bonay €€
El Palace €€€€
Hotel Praktik €€
Hotel U232 €€
Soho House €€€€
Hotel Catalonia Born €€
Hotel Condes de Barcelona €€€
Hotel Sofia nan
Olivia Balmes €€€
Colonial €€
Best Western Premier Hotel Dante €€
Arc la Rambla €€
Vividora nan
Hotel Colón €€€
H10 Port Vell €€
Barcelona Airport Hotel n

#### Rating

In [55]:
# Rating of every listing container found in hotel_soup.
# The rating is the first whitespace-separated token of the rating element's
# 'aria-label' attribute; a container without that element contributes NaN.
rating = []
container_class = 'container__09f24__mpR8_ hoverable__09f24__wQ_on margin-t3__09f24__riq4X margin-b3__09f24__l9v5d padding-t3__09f24__TMrIW padding-r3__09f24__eaF7p padding-b3__09f24__S8R2d padding-l3__09f24__IOjKY border--top__09f24__exYYb border--right__09f24__X7Tln border--bottom__09f24___mg5X border--left__09f24__DMOkM border-color--default__09f24__NPAKY'
for page_html in hotel_soup:
    soup = BeautifulSoup(page_html, 'html.parser')
    for card in soup.find_all(class_=container_class):
        # `next_sibling` is the supported spelling of the deprecated BS3-style
        # alias `nextSibling`.
        header_text = card.div.div.div.next_sibling.div
        rat = header_text.select_one('.overflow--hidden__09f24___ayzG')
        rating.append(rat['aria-label'].split()[0] if rat is not None else np.nan)

# Spot-check: print every scraped name next to its rating.
for n, rat in zip(name, rating):
    print(n, rat)

Barceló Raval 4.5
Hotel 1898 4.5
W Barcelona 4
Hotel Continental Barcelona 4.5
Pulitzer Barcelona 4
Le Méridien Barcelona 4
Hotel Cotton House 4.5
Hotel Ayre Roselon 4.5
Hotel Majestic 4
Renaissance Barcelona Hotel 4
Villa Emilia 5
Hotel Arts Barcelona 4
Mandarin Oriental 4
Hotel Jazz 4.5
K+K Hotel Picasso Barcelona 4.5
Hotel Europark 4.5
Grand Hotel Central 4
Hotel Barcelona 1882 4.5
Alma Barcelona 4.5
Novotel Barcelona City 4
Casa Fuster 4.5
The Barcelona EDITION 4
Hotel Barcelona Universal 4
Hotel NH Collection Barcelona Gran Hotel Calderón 4
H10 Madison 5
Claris 4
Alexandra Barcelona Hotel 4
Hilton Barcelona Hotel 3.5
Hotel Duquesa de Cardona 4.5
Hotel Mercer 4.5
Casa Bonay 4.5
El Palace 4.5
Hotel Praktik 4.5
Hotel U232 4.5
Soho House 5
Hotel Catalonia Born 4
Hotel Condes de Barcelona 4
Hotel Sofia 5
Olivia Balmes 4.5
Colonial 3.5
Best Western Premier Hotel Dante 4
Arc la Rambla 4
Vividora 5
Hotel Colón 4
H10 Port Vell 5
Barcelona Airport Hotel 2.5
Hotel España Ramblas 3.5
Hotel Oh

#### Reviews

In [59]:
# Review-count text of every listing container found in hotel_soup.
# A container without a review-count element contributes NaN.
reviews = []
container_class = 'container__09f24__mpR8_ hoverable__09f24__wQ_on margin-t3__09f24__riq4X margin-b3__09f24__l9v5d padding-t3__09f24__TMrIW padding-r3__09f24__eaF7p padding-b3__09f24__S8R2d padding-l3__09f24__IOjKY border--top__09f24__exYYb border--right__09f24__X7Tln border--bottom__09f24___mg5X border--left__09f24__DMOkM border-color--default__09f24__NPAKY'
for page_html in hotel_soup:
    soup = BeautifulSoup(page_html, 'html.parser')
    for card in soup.find_all(class_=container_class):
        # `next_sibling` is the supported spelling of the deprecated BS3-style
        # alias `nextSibling`.
        header_text = card.div.div.div.next_sibling.div
        rev = header_text.select_one('.reviewCount__09f24__tnBk4')
        reviews.append(rev.text if rev is not None else np.nan)

# Spot-check: print every scraped name next to its review count.
for n, rev in zip(name, reviews):
    print(n, rev)

Barceló Raval 56
Hotel 1898 76
W Barcelona 176
Hotel Continental Barcelona 12
Pulitzer Barcelona 42
Le Méridien Barcelona 84
Hotel Cotton House 47
Hotel Ayre Roselon 7
Hotel Majestic 40
Renaissance Barcelona Hotel 53
Villa Emilia 25
Hotel Arts Barcelona 103
Mandarin Oriental 40
Hotel Jazz 22
K+K Hotel Picasso Barcelona 20
Hotel Europark 46
Grand Hotel Central 38
Hotel Barcelona 1882 6
Alma Barcelona 29
Novotel Barcelona City 13
Casa Fuster 24
The Barcelona EDITION 21
Hotel Barcelona Universal 41
Hotel NH Collection Barcelona Gran Hotel Calderón 24
H10 Madison 16
Claris 31
Alexandra Barcelona Hotel 45
Hilton Barcelona Hotel 46
Hotel Duquesa de Cardona 32
Hotel Mercer 12
Casa Bonay 15
El Palace 20
Hotel Praktik 18
Hotel U232 29
Soho House 5
Hotel Catalonia Born 11
Hotel Condes de Barcelona 15
Hotel Sofia 6
Olivia Balmes 28
Colonial 4
Best Western Premier Hotel Dante 9
Arc la Rambla 9
Vividora 2
Hotel Colón 48
H10 Port Vell 9
Barcelona Airport Hotel 24
Hotel España Ramblas 17
Hotel Ohla 2

#### Location

In [26]:
# Number of entries in the second hotel soup series.
hotel_soup2.shape

(259,)

In [27]:
# One address string per entry of hotel_soup2: the text of every address
# fragment on the page, joined with '--' (empty string when none is found).
location = [
    '--'.join(
        fragment.text
        for fragment in BeautifulSoup(page_html, 'html.parser').select('address .raw__09f24__T4Ezm')
    )
    for page_html in hotel_soup2
]

In [30]:
# Spot-check: print each hotel label next to its joined address.
for pair in zip(hotel_soup2.index, location):
    print(*pair)

Barceló Raval Rambla del Raval, 17 - 21--08001 Barcelona--Spain
Hotel 1898 La Rambla, 109--08002 Barcelona--Spain
W Barcelona Plaça de la Rosa dels Vents, 1--08039 Barcelona--Spain
Hotel Continental Barcelona Paseo la Rambla, 138--08002 Barcelona--Spain
Pulitzer Barcelona Carrer Bergara, 8--08002 Barcelona--Spain
Le Méridien Barcelona La Rambla 111--Pintor Fortuny, 4-6--08002 Barcelona--Spain
Hotel Cotton House Gran Via de les Corts Catalanes, 670--08010 Barcelona--Spain
Hotel Ayre Roselon Rossello 390--08025--08025 Barcelona--Spain
Hotel Majestic Passeig de Gracia, 68--08007 Barcelona--Spain
Renaissance Barcelona Hotel Pau Claris, 122--08009 Barcelona--Spain
Villa Emilia Carrer de Calàbria, 115--08015 Barcelona--Spain
Hotel Arts Barcelona Marina 19-21--08005 Barcelona--Spain
Mandarin Oriental Passeig de Gràcia, 38-40--08007 Barcelona--Spain
Hotel Jazz Pelai, 3, Eixample--08001 Barcelona--Spain
K+K Hotel Picasso Barcelona Passeig de Picasso, 26--08003 Barcelona--Spain
Hotel Europark Ca

#### Website

In [31]:
# Website text shown on each page of hotel_soup2; NaN when the selector finds
# nothing. NOTE(review): the captured text is the displayed label, which the
# output shows can be shortened with '…' — confirm whether the full URL is needed.
website = []
for page_html in hotel_soup2:
    link = BeautifulSoup(page_html, 'html.parser').select_one('.css-na3oda+ .css-1p9ibgf .css-1um3nx')
    website.append(np.nan if link is None else link.text)



In [32]:
# Spot-check: print each hotel label next to its website label.
for pair in zip(hotel_soup2.index, website):
    print(*pair)

Barceló Raval https://www.barcelo.com/es
Hotel 1898 http://www.hotel1898.com
W Barcelona https://www.marriott.com/hotel…
Hotel Continental Barcelona https://www.hotelcontinental.c…
Pulitzer Barcelona http://www.hotelpulitzer.es
Le Méridien Barcelona https://www.marriott.com/hotel…
Hotel Cotton House http://www.hotelcottonhouse.co…
Hotel Ayre Roselon http://www.ayrehoteles.com
Hotel Majestic http://www.hotelmajestic.es
Renaissance Barcelona Hotel https://www.marriott.com/hotel…
Villa Emilia https://www.hotelvillaemilia.c…
Hotel Arts Barcelona https://www.ritzcarlton.com/en…
Mandarin Oriental http://www.mandarinoriental.es…
Hotel Jazz http://www.hoteljazz.com/
K+K Hotel Picasso Barcelona http://www.kkhotels.com/picass…
Hotel Europark http://www.hoteleuropark.com
Grand Hotel Central http://www.grandhotelcentral.c…
Hotel Barcelona 1882 http://www.hotelbarcelona1882.…
Alma Barcelona http://www.almabarcelona.com
Novotel Barcelona City http://www.novotel.com/gb/hote…
Casa Fuster http://www.ho

#### Number of photos

In [131]:
# Number of photos advertised on each page of hotel_soup2.
# The count is the digit run captured from 'See <N> photos' in the photo
# header text (kept as a string); NaN when the header is missing or the text
# does not match.
nb_photos = []
photo_header_class = 'photo-header-content-container__09f24__jDLBB border-color--default__09f24__NPAKY'
# A compiled regex replaces the former one-element Series + str.extract round
# trip; the captured value is the same.
photo_count_pattern = re.compile(r'See (\d+) photos')
for page_html in hotel_soup2:
    photo_header = BeautifulSoup(page_html, 'html.parser').find(class_=photo_header_class)
    found = photo_count_pattern.search(photo_header.text) if photo_header is not None else None
    nb_photos.append(found.group(1) if found else np.nan)
    



In [None]:
# Spot-check: print each hotel label next to its photo count.
for pair in zip(hotel_soup2.index, nb_photos):
    print(*pair)

#### Opening hours
Most hotels are open 24 hours a day, so opening hours would not be informative here.
We skip them for now.

#### Amenities

#### Hotel dataset

In [158]:
# Assemble the fields scraped from the search-results pages into one frame;
# the four lists are aligned by position.
hotel_df = pd.DataFrame(dict(name=name, rating=rating, reviews=reviews, price_range=price_range))

In [159]:
# Overview: first five rows of the hotel frame.
hotel_df.head()

Unnamed: 0,name,rating,reviews,price_range
0,Barceló Raval,4.5,56,€€
1,Hotel 1898,4.5,76,€€€
2,W Barcelona,4.0,176,€€€€
3,Hotel Continental Barcelona,4.5,12,€
4,Pulitzer Barcelona,4.0,42,€€


In [160]:
# Column dtypes and non-null counts before cleaning.
hotel_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 240 entries, 0 to 239
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   name         240 non-null    object
 1   rating       221 non-null    object
 2   reviews      221 non-null    object
 3   price_range  175 non-null    object
dtypes: object(4)
memory usage: 7.6+ KB


In [161]:
# Missing values per column, largest count first.
hotel_df.isna().sum().sort_values(ascending=False)

price_range    65
reviews        19
rating         19
name            0
dtype: int64

In [139]:
# Rows that repeat an earlier row in every column.
hotel_df.loc[hotel_df.duplicated()]

Unnamed: 0,name,rating,reviews,price_range
114,Gaudi Hotel Barcelona,2.5,15,
150,St. Christopher’s Inn,4.0,20,
190,Four Points by Sheraton Barcelona Diagonal,4.0,26,€€
210,Acta BCN 40,5.0,2,
211,OD Barcelona,5.0,5,
230,Chic&Basic Velvet,1.0,1,


In [162]:
# Drop rows that exactly repeat an earlier row. The frame is modified in place,
# so `hotel_df` itself shrinks and keeps its original row labels.
hotel_df.drop_duplicates(inplace=True)

#### Cleaning

In [163]:
# Price range (price per person).
# Based on information from Yelp and Quora:
#   $ = under $10, $$ = $11-30, $$$ = $31-60, $$$$ = over $61
# The thresholds are scaled by the exchange rate below and truncated to whole
# numbers; the labels in `mapper` use those truncated values.
# NOTE(review): with 1 EUR = 1.11234 USD, converting USD amounts to EUR would
# mean dividing by the rate, while this cell multiplies. The computation and the
# labels are kept unchanged because the labels are already part of the data —
# confirm the intended direction before changing them.

# 1 EUR = 1.11234 USD
Euro = (pd.Series([10, 11, 30, 31, 60, 61]) * 1.11234).astype('int64')
print(Euro)
mapper = {'€':'under 11', "€€":'[12 - 33]', '€€€':'[34 - 66]', '€€€€':'over 67'}

# `replace` rewrites only the raw '€' symbols and leaves every other value
# (NaN, or labels written by an earlier run) untouched, so re-running this cell
# is a no-op. `.map(mapper)` would turn an already-mapped column into all NaN.
hotel_df['price_range'] = hotel_df['price_range'].replace(mapper)


0    11
1    12
2    33
3    34
4    66
5    67
dtype: int64


In [164]:
# Cast rating and review count from scraped text to float (NaN stays NaN),
# then summarise the numeric columns.
for column in ('rating', 'reviews'):
    hotel_df[column] = hotel_df[column].astype(float)

hotel_df.describe()

Unnamed: 0,rating,reviews
count,215.0,215.0
mean,3.909302,14.669767
std,0.804431,18.538361
min,1.0,1.0
25%,3.5,4.0
50%,4.0,9.0
75%,4.5,18.0
max,5.0,176.0


In [165]:
# Check the cleaning: dtypes and non-null counts after deduplication and casts.
hotel_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 234 entries, 0 to 239
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   name         234 non-null    object 
 1   rating       215 non-null    float64
 2   reviews      215 non-null    float64
 3   price_range  174 non-null    object 
dtypes: float64(2), object(2)
memory usage: 9.1+ KB


#### Analysis

In [166]:
# Histogram of hotel ratings.
px.histogram(hotel_df, x='rating')

In [None]:

# Display the whole hotel frame.
hotel_df

In [167]:
# Histogram of the number of reviews per hotel.
px.histogram(hotel_df, x='reviews')

In [168]:
# Box plot of review counts, one box per rating value.
px.box(hotel_df, x='rating', y='reviews')

In [169]:
# The 20 hotels with the most reviews, most-reviewed first.
hotel_df.sort_values('reviews', ascending=False).iloc[:20]

Unnamed: 0,name,rating,reviews,price_range
2,W Barcelona,4.0,176.0,over 67
11,Hotel Arts Barcelona,4.0,103.0,
5,Le Méridien Barcelona,4.0,84.0,[34 - 66]
1,Hotel 1898,4.5,76.0,[34 - 66]
50,Hilton Diagonal Mar,4.0,67.0,[34 - 66]
0,Barceló Raval,4.5,56.0,[12 - 33]
9,Renaissance Barcelona Hotel,4.0,53.0,[34 - 66]
43,Hotel Colón,4.0,48.0,[34 - 66]
6,Hotel Cotton House,4.5,47.0,[34 - 66]
27,Hilton Barcelona Hotel,3.5,46.0,[34 - 66]


In [173]:
# Starting point for normalising rating against review count:
# raw review counts by row label, coloured by rating.
px.scatter(hotel_df, y='reviews', color='rating')

In [176]:
# Rating divided by number of reviews.
hotel_df['rating_reviews'] = hotel_df['rating'] / hotel_df['reviews']

# Min-max normalisation of that ratio: (x - min) / (max - min).
minimum = hotel_df['rating_reviews'].min()
maximum = hotel_df['rating_reviews'].max()
hotel_df['normalized_rating_reviews'] = (hotel_df['rating_reviews'] - minimum) / (maximum - minimum)

In [178]:
# Normalised rating/reviews ratio against review count, coloured by rating.
px.scatter(hotel_df, x='reviews', y='normalized_rating_reviews', color='rating')

# Pubs

In [36]:
# Load the saved pub HTML dumps, discard the exported 'Unnamed: 0' column and
# squeeze the remaining frame, then show both indexes.
pub_soup = pd.read_csv('pub_soup.csv').drop('Unnamed: 0', axis='columns').squeeze()
pub_soup2 = pd.read_csv('pub_soup2.csv').drop('Unnamed: 0', axis='columns').squeeze()
print(pub_soup2.index)
print(pub_soup.index)

Index(['Scobies Irish Pub', 'H1898 Rooftopbar', 'L’Ovella Negra',
       'Bobby’s Free', 'Madame George', 'Nevermind', 'The Michael Collins',
       'Bar Rubí', 'Old Fashioned', 'The Bollocks',
       ...
       'Tripode', 'La Pubilla del Taulat', 'Piacere Caffe', 'D9', 'Vanguard',
       'El Ciclista', 'That’s the Way', 'Pipa', 'Sifó', 'Kè?'],
      dtype='object', length=232)
Index(['0', '10', '20', '30', '40', '50', '60', '70', '80', '90', '100', '110',
       '120', '130', '140', '150', '160', '170', '180', '190', '200', '210',
       '220', '230'],
      dtype='object')


#### Name

In [37]:
# Pub names: each entry of pub_soup is parsed on its own and the text of every
# tag matching the name selector is collected, in document order.
name = [
    name_tag.text
    for page_html in pub_soup
    for name_tag in BeautifulSoup(page_html, 'html.parser').select('.css-1egxyvc .css-1m051bw')
]


In [38]:
# Count how often each pub name occurs; a count above 1 means the same name
# was scraped more than once (duplicates).
pd.Series(name).value_counts()

Bloomsday Literary Pub        2
The Cottage                   2
The Fastnet                   2
El Rouge                      2
Cheers                        2
                             ..
Barna Brew                    1
Bodegon del Norte - Marzan    1
Ina                           1
Never More                    1
Hemingway                     1
Length: 232, dtype: int64

#### Price range

In [39]:
# Price tier of every listing container found in pub_soup.
# A container without a price tag contributes NaN, so the list keeps exactly
# one entry per container.
price_range = []
container_class = 'container__09f24__mpR8_ hoverable__09f24__wQ_on margin-t3__09f24__riq4X margin-b3__09f24__l9v5d padding-t3__09f24__TMrIW padding-r3__09f24__eaF7p padding-b3__09f24__S8R2d padding-l3__09f24__IOjKY border--top__09f24__exYYb border--right__09f24__X7Tln border--bottom__09f24___mg5X border--left__09f24__DMOkM border-color--default__09f24__NPAKY'
for page_html in pub_soup:
    soup = BeautifulSoup(page_html, 'html.parser')
    for card in soup.find_all(class_=container_class):
        # `next_sibling` is the supported spelling of the deprecated BS3-style
        # alias `nextSibling`.
        header_text = card.div.div.div.next_sibling.div
        price = header_text.select_one('.css-1s7bx9e')
        price_range.append(price.text if price is not None else np.nan)

# Spot-check: print every scraped name next to its price tier.
for n, price in zip(name, price_range):
    print(n, price)

Scobies Irish Pub €
H1898 Rooftopbar €€
L’Ovella Negra €
Bobby’s Free €
Madame George €
Nevermind €
The Michael Collins €€
Bar Rubí €
Old Fashioned €€
The Bollocks €
Biercab €
Flaherty’s €
Dow Jones €€
El Bosc de les Fades €€
Obama €€
Tandem €€
Hemingway €€€
Samba Brasil €€
The George Payne €€
Snooker €€
Limerick nan
Otto Zutz €€
My Bar €€
McCarthy’s nan
Garage Beer Co €
Chill Bar €
JazzMan €€
Polaroid €
George & Dragon €€€
La Cinemateca €€
La Pepita €€
London Bar €
Marsella €
The Black Horse €€
The Philharmonic €€
Elephanta €€
La Birreria €
First Bar €
Hogan’s Australian Pub €€
The Black Lion €
The Lime House €€
Bonavida €
Dublin Sports €€
Musical María €€
Dunne’s Irish Bar €€
Xixbar €€
Gato Negro €
Lennox nan
Le Standard nan
Belushi’s €
Craft Barcelona €
Stoke Bar €€
Fizz Barcelona nan
Bloomsday Irish Pub nan
Barna Brew €
Pippermint €€
Obama Gastropub nan
Oviso €
Touch Music Karaoke €€
Sor Rita €
La Barrica €
Arcano €€€
Bar Zodiaco €
Big Bang Bar €
Maestró €€
Belchica €€
Xpressing Un

#### Rating

In [40]:
# Rating of every listing container found in pub_soup.
# The rating is the first whitespace-separated token of the rating element's
# 'aria-label' attribute; a container without that element contributes NaN.
rating = []
container_class = 'container__09f24__mpR8_ hoverable__09f24__wQ_on margin-t3__09f24__riq4X margin-b3__09f24__l9v5d padding-t3__09f24__TMrIW padding-r3__09f24__eaF7p padding-b3__09f24__S8R2d padding-l3__09f24__IOjKY border--top__09f24__exYYb border--right__09f24__X7Tln border--bottom__09f24___mg5X border--left__09f24__DMOkM border-color--default__09f24__NPAKY'
for page_html in pub_soup:
    soup = BeautifulSoup(page_html, 'html.parser')
    for card in soup.find_all(class_=container_class):
        # `next_sibling` is the supported spelling of the deprecated BS3-style
        # alias `nextSibling`.
        header_text = card.div.div.div.next_sibling.div
        rat = header_text.select_one('.overflow--hidden__09f24___ayzG')
        rating.append(rat['aria-label'].split()[0] if rat is not None else np.nan)

# Spot-check: print every scraped name next to its rating.
for n, rat in zip(name, rating):
    print(n, rat)

Scobies Irish Pub 4.5
H1898 Rooftopbar 4
L’Ovella Negra 4
Bobby’s Free 4.5
Madame George 5
Nevermind 4.5
The Michael Collins 4
Bar Rubí 4.5
Old Fashioned 5
The Bollocks 4
Biercab 4.5
Flaherty’s 4
Dow Jones 3.5
El Bosc de les Fades 4
Obama 3.5
Tandem 5
Hemingway 5
Samba Brasil 4
The George Payne 3.5
Snooker 4
Limerick 4
Otto Zutz 3.5
My Bar 4.5
McCarthy’s 4
Garage Beer Co 4.5
Chill Bar 4.5
JazzMan 4.5
Polaroid 4
George & Dragon 3.5
La Cinemateca 4
La Pepita 4.5
London Bar 4
Marsella 4
The Black Horse 4
The Philharmonic 4
Elephanta 4.5
La Birreria 4.5
First Bar 4.5
Hogan’s Australian Pub 4
The Black Lion 3.5
The Lime House 4
Bonavida 4.5
Dublin Sports 3.5
Musical María 4.5
Dunne’s Irish Bar 3.5
Xixbar 4.5
Gato Negro 4.5
Lennox 4
Le Standard 4.5
Belushi’s 3.5
Craft Barcelona 4
Stoke Bar 4
Fizz Barcelona 4.5
Bloomsday Irish Pub 5
Barna Brew 4.5
Pippermint 3
Obama Gastropub 3.5
Oviso 4
Touch Music Karaoke 4.5
Sor Rita 4.5
La Barrica 3
Arcano 4.5
Bar Zodiaco 4
Big Bang Bar 4
Maestró 4
Belchi

#### Location

In [41]:
# Show the dimensions of pub_soup2; the cells below parse one HTML document per element of it.
pub_soup2.shape

(232,)

In [42]:
#### Location
# For every page in pub_soup2, take the text of each address fragment and glue
# the fragments together with '--' (an empty string results when none is found).
location = []
for row in pub_soup2:
    soup = BeautifulSoup(row, 'html.parser')
    address_parts = [node.text for node in soup.select('address .raw__09f24__T4Ezm')]
    location.append('--'.join(address_parts))

In [43]:
# Eyeball check: print each index label of pub_soup2 next to its joined address.
for pub, address in zip(pub_soup2.index, location):
    print(pub, address)

Scobies Irish Pub Ronda de la Universidad, 8--08007 Barcelona--Spain
H1898 Rooftopbar Les Rambles, 109--08002 Barcelona--Spain
L’Ovella Negra Carrer de les Sitges, 5--08001 Barcelona--Spain
Bobby’s Free Carrer de Pau Claris, 85--08010 Barcelona--Spain
Madame George Carrer de Pujades, 179--08005 Barcelona--Spain
Nevermind Carrer dels Tallers, 68--08001 Barcelona--Spain
The Michael Collins Plaça de la Sagrada Família, 4--08013 Barcelona--Spain
Bar Rubí Carrer Banys Vells, 6--08003 Barcelona--Spain
Old Fashioned Carrer de Santa Teresa, 1--08012 Barcelona--Spain
The Bollocks Carrer Ample, 46--08002 Barcelona--Spain
Biercab Carrer de Muntaner, 55--08011 Barcelona--Spain
Flaherty’s Plaça de Joaquim Xirau, s/n--08002 Barcelona--Spain
Dow Jones Carrer del Bruc, 97--08009 Barcelona--Spain
El Bosc de les Fades Passatge de la Banca, 5--08002 Barcelona--Spain
Obama Gran Via de les Corts Catalanes, 603--08007 Barcelona--Spain
Tandem Carrer d'Aribau, 86--08036 Barcelona--Spain
Hemingway Carrer de Mu

#### Website

In [44]:
#### Website
# One entry per page in pub_soup2: the text of the first element matching the
# selector, or NaN when the page has no such element.
website = []
for row in pub_soup2:
    soup = BeautifulSoup(row, 'html.parser')
    link = soup.select_one('.css-na3oda+ .css-1p9ibgf .css-1um3nx')
    website.append(np.nan if link is None else link.text)



In [45]:
# Eyeball check: print each index label of pub_soup2 next to the extracted website text.
for pub, url_text in zip(pub_soup2.index, website):
    print(pub, url_text)

Scobies Irish Pub http://scobiesirishpub.com/ind…
H1898 Rooftopbar http://www.hotel1898.com/es/
L’Ovella Negra http://www.ovellanegra.com
Bobby’s Free http://www.bobbysfree.com/en/
Madame George http://www.madamegeorgebar.com
Nevermind nan
The Michael Collins http://www.michaelcollinspubs.…
Bar Rubí nan
Old Fashioned http://cocktailsbarcelona.oldf…
The Bollocks http://www.bollocksbcn.com
Biercab http://biercab.com/
Flaherty’s http://www.pflaherty.com/barce…
Dow Jones http://www.dowjonesbar.com
El Bosc de les Fades http://www.museocerabcn.com/bo…
Obama http://www.obamabcn.com
Tandem nan
Hemingway http://www.hemingwaybcn.com
Samba Brasil nan
The George Payne http://www.thegeorgepayne.com
Snooker http://www.snookerbarcelona.co…
Limerick nan
Otto Zutz http://www.ottozutz.com
My Bar http://mybarcelona.es/
McCarthy’s http://mccarthysbarbarcelona.c…
Garage Beer Co nan
Chill Bar http://www.chillbarcelona.com
JazzMan http://www.jazzmanbcn.com
Polaroid http://www.polaroidbar.es/
George & Dragon 

#### Number of photos

In [46]:
#### Number of photos
# One entry per page in pub_soup2: the digits captured from the photo header
# text, or NaN when the header is absent or its text does not match the pattern.
nb_photos = []
photo_header_class = 'photo-header-content-container__09f24__jDLBB border-color--default__09f24__NPAKY'
for row in pub_soup2:
    soup = BeautifulSoup(row, 'html.parser')
    photo_header = soup.find(class_=photo_header_class)
    if photo_header is None:
        nb_photos.append(np.nan)
        continue
    # The captured group is stored as extracted; no numeric conversion is applied here.
    header_text = pd.Series(photo_header.text)
    nb_photos.append(header_text.str.extract(r'See (\d+) photos').squeeze())
    



In [47]:
# Eyeball check: print each index label of pub_soup2 next to its extracted photo count.
for pub, photo_count in zip(pub_soup2.index, nb_photos):
    print(pub, photo_count)

Scobies Irish Pub 17
H1898 Rooftopbar 14
L’Ovella Negra 50
Bobby’s Free 42
Madame George 35
Nevermind 26
The Michael Collins 47
Bar Rubí 71
Old Fashioned 218
The Bollocks 22
Biercab 215
Flaherty’s 69
Dow Jones 56
El Bosc de les Fades 100
Obama 66
Tandem 53
Hemingway 45
Samba Brasil 4
The George Payne 40
Snooker 9
Limerick 6
Otto Zutz 18
My Bar 6
McCarthy’s nan
Garage Beer Co nan
Chill Bar 213
JazzMan 32
Polaroid 36
George & Dragon nan
La Cinemateca 5
La Pepita 1065
London Bar 56
Marsella 55
The Black Horse 8
The Philharmonic 6
Elephanta 41
La Birreria 5
First Bar 21
Hogan’s Australian Pub 34
The Black Lion 23
The Lime House 11
Bonavida 31
Dublin Sports 7
Musical María 5
Dunne’s Irish Bar 17
Xixbar 199
Gato Negro 20
Lennox 9
Le Standard nan
Belushi’s 26
Craft Barcelona 45
Stoke Bar 36
Fizz Barcelona nan
Bloomsday Irish Pub 4
Barna Brew 14
Pippermint 31
Obama Gastropub 8
Oviso 70
Touch Music Karaoke 38
Sor Rita 51
La Barrica 15
Arcano 555
Bar Zodiaco 25
Big Bang Bar 43
Maestró 119
Belchi

#### Opening hours


In [48]:
# Long-format record of opening hours: one entry per (pub, weekday) pair,
# stored as four parallel lists that are later turned into a DataFrame.
opening_hours = {'name':list(), 'day':list(), 'opening':list(), 'closing':list()}

# A single pattern with two capture groups (opening, closing) replaces the two
# separate extractions of the same match that were previously run per row.
hours_pattern = r'(\d+:\d+ \w+) - (\d+:\d+ \w+)'

# The loop variable is `pub_name`, not `name`, so the notebook-level `name`
# list used by other cells (e.g. `zip(name, rating)`) is not overwritten with
# a single string when this cell runs.
for row, pub_name in zip(pub_soup2, pub_soup2.index):
    soup = BeautifulSoup(row, 'html.parser')
    days_container = soup.select('.day-of-the-week__09f24__JJea_')
    time_container = soup.select('.no-wrap__09f24__c3plq.css-1p9ibgf')
    # Day labels and time cells are paired positionally; zip stops at the shorter list.
    for day_tag, time_tag in zip(days_container, time_container):
        # One-row frame with two columns; both are NaN when the text does not
        # match the "<time> - <time>" pattern.
        hours = pd.Series(time_tag.text).str.extract(hours_pattern)

        opening_hours['name'].append(pub_name)
        opening_hours['day'].append(day_tag.text)
        opening_hours['opening'].append(hours.iat[0, 0])
        opening_hours['closing'].append(hours.iat[0, 1])

In [49]:
# Assemble the collected lists into a DataFrame, then print every row as a plain list.
opening_hours_df = pd.DataFrame(opening_hours)

for _, record in opening_hours_df.iterrows():
    print(list(record))

['Scobies Irish Pub', 'Mon', '5:00 PM', '2:30 AM']
['Scobies Irish Pub', 'Tue', '5:00 PM', '2:30 AM']
['Scobies Irish Pub', 'Wed', '5:00 PM', '2:30 AM']
['Scobies Irish Pub', 'Thu', '5:00 PM', '2:30 AM']
['Scobies Irish Pub', 'Fri', '2:00 PM', '3:00 AM']
['Scobies Irish Pub', 'Sat', '2:00 PM', '3:00 AM']
['Scobies Irish Pub', 'Sun', '5:00 PM', '2:30 AM']
['L’Ovella Negra', 'Mon', '9:00 AM', '3:00 AM']
['L’Ovella Negra', 'Tue', '9:00 AM', '3:00 AM']
['L’Ovella Negra', 'Wed', '9:00 AM', '3:00 AM']
['L’Ovella Negra', 'Thu', '9:00 AM', '3:00 AM']
['L’Ovella Negra', 'Fri', '9:00 AM', '3:00 AM']
['L’Ovella Negra', 'Sat', '5:00 PM', '3:00 AM']
['L’Ovella Negra', 'Sun', '5:00 PM', '3:00 AM']
['Bobby’s Free', 'Mon', nan, nan]
['Bobby’s Free', 'Tue', nan, nan]
['Bobby’s Free', 'Wed', '7:00 PM', '3:00 AM']
['Bobby’s Free', 'Thu', '7:00 PM', '3:00 AM']
['Bobby’s Free', 'Fri', '7:00 PM', '3:00 AM']
['Bobby’s Free', 'Sat', '7:00 PM', '3:00 AM']
['Bobby’s Free', 'Sun', '7:00 PM', '3:00 AM']
['Madame 

#### Amenities

#### Pub dataset