In [1]:
from bs4 import BeautifulSoup
import requests
import datetime
import numpy as np

In [2]:
#find information for all hotel rates from tomorrow until the next 2 weeks.
today = datetime.date.today()
#tomorrow
date_now = today + datetime.timedelta(days=1) 
#3days from tomorrow
date_end = date_now + datetime.timedelta(days=30)


dates=[]
day_of_week = []
titles=[]
links=[]
stars=[]
locations = []
current_prices = []
original_prices = []
discount_percentages = []
ratings = []
total_reviews = []

while date_now <= date_end:
    str_date = str(date_now.strftime("%Y-%m-%d"))
    URL = "https://en.dayuse.com.hk/s/hong-kong?page=1&checkinDate="+str_date
    html = requests.get(URL)
    soup = BeautifulSoup(html.text, "html.parser")
    page=1


    while True:
        # scrape each page
        for i in soup.find_all(class_="js-search-results-cardhotel"):
            dates.append(date_now)
            day_of_week.append(date_now.strftime('%A'))
            
            #find hotel title and link
            titles.append(i.a["title"].lower())
            links.append(i.a["href"])
            
            # find hotel stars
            star = i.find(class_ = 'js-search-cardhotel-stars')
            stars.append(star.img["alt"][-1])
            
            #location
            location = i.find(class_="js-search-cardhotel-poi")
            locations.append(location.text)
            
            #find price
            price = i.find(class_="js-search-cardhotel-pricing-information-dayuse")
            current_prices.append(int(str(price.text).replace(",","").replace("HK$","")))
            
            #get original price
            if i.find(class_="js-search-cardhotel-pricing-information-comparative-price"):
                original_price = i.find(class_="js-search-cardhotel-pricing-information-comparative-price")
                original_prices.append(int(str(original_price.text).replace(",","").replace("HK$","")))
            else: 
                #just add current price as no discount offered
                original_prices.append(int(str(price.text).replace(",","").replace("HK$",""))) 
                
            #percentage discount    
            if i.find(class_='js-search-cardhotel-pricing-information-discount-percentage'):
                discount_per = i.find(class_='js-search-cardhotel-pricing-information-discount-percentage')
                discount_percentages.append(int(str(discount_per.text).replace("%","").replace("-","")))
            else:
                discount_percentages.append(0)
                
            #rating
            if i.find(class_='js-search-cardhotel-reviews-averageRating'):
                rating = i.find(class_='js-search-cardhotel-reviews-averageRating')
                ratings.append(float(str(rating.text).replace("/5","")))
            else:
                ratings.append(np.nan)
                
            #total_reviews
            if i.find(class_="js-search-cardhotel-reviews-count"):
                review_count = i.find(class_="js-search-cardhotel-reviews-count")
                total_reviews.append(int((review_count.text).split()[0]))
            else:
                total_reviews.append(np.nan)
            

        # going to the next page to scrape
        nextexist = soup.find(class_="inline fill-current transform rotate-180")

        if nextexist:
            page+=1
            nextlink = "https://en.dayuse.com.hk/s/hong-kong?page="+str(page)+"&checkinDate="+str_date
            subhtml = requests.get(nextlink)
            soup = BeautifulSoup(subhtml.text, "html.parser")
        else:
            break
            
    date_now += datetime.timedelta(days=1)

In [3]:
#Data Visualization
import pandas as pd

d = {'Date': dates, 
     'Day_of_week':day_of_week,
     'Hotel_name': titles, 
     'Link': links,
     "Stars": stars, 
     "Location": locations,
     "Current_price_HKD" : current_prices,
     "Original_prices_HKD": original_prices,
     "Discount_percentage": discount_percentages,
     "Rating": ratings,
     "Total_reviews": total_reviews,
    }
df = pd.DataFrame(data=d)

In [4]:
df.head()

Unnamed: 0,Date,Day_of_week,Hotel_name,Link,Stars,Location,Current_price_HKD,Original_prices_HKD,Discount_percentage,Rating,Total_reviews
0,2021-11-09,Tuesday,the kimberley hotel,https://en.dayuse.com.hk/hotels/hong-kong/the-...,4,Tsim Sha Tsui,350,900,62,4.3,19.0
1,2021-11-09,Tuesday,holiday inn golden mile hong kong,https://en.dayuse.com.hk/hotels/hong-kong/holi...,4,Tsim Sha Tsui,398,600,34,4.4,104.0
2,2021-11-09,Tuesday,the kowloon hotel,https://en.dayuse.com.hk/hotels/hong-kong/the-...,4,Tsim Sha Tsui,380,800,53,4.2,87.0
3,2021-11-09,Tuesday,holiday inn express causeway bay,https://en.dayuse.com.hk/hotels/hong-kong/holi...,3,Causeway Bay,380,550,31,4.4,47.0
4,2021-11-09,Tuesday,ozo wesley hong kong,https://en.dayuse.com.hk/hotels/hong-kong/ozo-...,4,Wan Chai,370,1000,63,4.3,56.0


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3173 entries, 0 to 3172
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Date                 3173 non-null   object 
 1   Day_of_week          3173 non-null   object 
 2   Hotel_name           3173 non-null   object 
 3   Link                 3173 non-null   object 
 4   Stars                3173 non-null   object 
 5   Location             3173 non-null   object 
 6   Current_price_HKD    3173 non-null   int64  
 7   Original_prices_HKD  3173 non-null   int64  
 8   Discount_percentage  3173 non-null   int64  
 9   Rating               2476 non-null   float64
 10  Total_reviews        2476 non-null   float64
dtypes: float64(2), int64(3), object(6)
memory usage: 272.8+ KB


In [8]:
len(links)

3173

In [9]:
#expanding our dataset to get amenities list for each hotel
reallinks=[]
for i in links:
  reallinks.append(i[:-23])

In [11]:
len(reallinks)

3173

In [12]:
setlinks=list(set(reallinks))

In [13]:
len(setlinks)

113

In [14]:
settitles=list(set(titles))

In [15]:
setlinks.sort()

In [16]:
settitles.sort()

In [33]:
#get all hotel descriptions and addresses:
description=[]
address = []
for i in setlinks:
    URL = i
    subsubhtml = requests.get(URL)
    soup = BeautifulSoup(subsubhtml.text, "html.parser")
    descr = soup.find(id='hotel-description')
    description.append(descr.text)
    
    addr = soup.find(class_='hidden md:block mt-5').find(class_='paragraph')
    address.append(addr.text)

In [37]:
#Hotel desc_df
hotel_data = {'Hotel_name': settitles, "Address": address,"Description": description}
hotel_desc_df = pd.DataFrame(data=hotel_data)

In [38]:
df = df.merge(hotel_desc_df, on="Hotel_name")

In [40]:
#get all amenities for each hotel
amenitieslist=[]
for i in setlinks:
    URL = i
    subsubhtml = requests.get(URL)
    soup = BeautifulSoup(subsubhtml.text, "html.parser")
    amenities=[]
    for i in soup.find(class_="flex flex-wrap py-2").find_all(class_="flex-none flex items-center pr-2 mb-3 w-1/3 text-sm"):
        amenities.append(i.img["alt"])
    amenitieslist.append(amenities)

In [42]:
#get list of amenities
allamenities=[]
for i in amenitieslist:
  for j in i:
    allamenities.append(j)

In [43]:
#unique amenities
allamenities=list(set(allamenities))

In [44]:
bins=[]
for l in amenitieslist:
    lbins=[]
    for k in allamenities:
        if l.count(k)==0:
            lbins.append(0)
        else:
            lbins.append(1)
    bins.append(lbins)


In [45]:
amenitydf=pd.DataFrame(allamenities)

In [46]:
amenitydf

Unnamed: 0,0
0,Free WiFi
1,Rooftop bar
2,Rooftop pool
3,Fitness room
4,Parking close by
5,Designated smoking area
6,Concierge Service
7,Indoor pool
8,Express dry cleaning
9,Meeting room


In [47]:
amenitydf.set_index(0, inplace=True)

In [48]:
amenitydf

Free WiFi
Rooftop bar
Rooftop pool
Fitness room
Parking close by
Designated smoking area
Concierge Service
Indoor pool
Express dry cleaning
Meeting room
Bar


In [49]:
#Why i+2?
for i in range(len(bins)):
    amenitydf[i]=bins[i]


  amenitydf[i]=bins[i]


In [51]:
transposeddf=amenitydf.transpose()

In [52]:
transposeddf

Unnamed: 0,Free WiFi,Rooftop bar,Rooftop pool,Fitness room,Parking close by,Designated smoking area,Concierge Service,Indoor pool,Express dry cleaning,Meeting room,...,Airport shuttle (fees apply),Spa,Pet friendly,Parking (fees apply),WiFi (fees apply),Terrace,Massage,Jacuzzi,Steam room,Valet Parking
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,1,1,1,0,1,0,0,0,1,...,0,0,0,1,0,0,0,0,0,0
3,1,0,0,1,1,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
4,1,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
108,1,0,0,0,0,0,0,0,0,1,...,1,0,0,0,0,1,0,0,0,0
109,1,0,0,1,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
110,1,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
111,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [53]:
transposeddf["Hotel_name"]=settitles

In [54]:
finaldf=df.merge(transposeddf, on="Hotel_name")

In [55]:
finaldf

Unnamed: 0,Date,Day_of_week,Hotel_name,Link,Stars,Location,Current_price_HKD,Original_prices_HKD,Discount_percentage,Rating,...,Airport shuttle (fees apply),Spa,Pet friendly,Parking (fees apply),WiFi (fees apply),Terrace,Massage,Jacuzzi,Steam room,Valet Parking
0,2021-11-09,Tuesday,the kimberley hotel,https://en.dayuse.com.hk/hotels/hong-kong/the-...,4,Tsim Sha Tsui,350,900,62,4.3,...,0,0,0,0,0,0,0,0,0,0
1,2021-11-10,Wednesday,the kimberley hotel,https://en.dayuse.com.hk/hotels/hong-kong/the-...,4,Tsim Sha Tsui,350,900,62,4.3,...,0,0,0,0,0,0,0,0,0,0
2,2021-11-11,Thursday,the kimberley hotel,https://en.dayuse.com.hk/hotels/hong-kong/the-...,4,Tsim Sha Tsui,350,900,62,4.3,...,0,0,0,0,0,0,0,0,0,0
3,2021-11-12,Friday,the kimberley hotel,https://en.dayuse.com.hk/hotels/hong-kong/the-...,4,Tsim Sha Tsui,450,900,50,4.3,...,0,0,0,0,0,0,0,0,0,0
4,2021-11-13,Saturday,the kimberley hotel,https://en.dayuse.com.hk/hotels/hong-kong/the-...,4,Tsim Sha Tsui,450,900,50,4.3,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3168,2021-12-02,Thursday,popway hotel,https://en.dayuse.com.hk/hotels/hong-kong/popw...,4,Tsim Sha Tsui,595,650,9,,...,0,0,0,0,0,0,0,0,0,0
3169,2021-12-06,Monday,popway hotel,https://en.dayuse.com.hk/hotels/hong-kong/popw...,4,Tsim Sha Tsui,595,650,9,,...,0,0,0,0,0,0,0,0,0,0
3170,2021-12-07,Tuesday,popway hotel,https://en.dayuse.com.hk/hotels/hong-kong/popw...,4,Tsim Sha Tsui,595,650,9,,...,0,0,0,0,0,0,0,0,0,0
3171,2021-12-08,Wednesday,popway hotel,https://en.dayuse.com.hk/hotels/hong-kong/popw...,4,Tsim Sha Tsui,595,650,9,,...,0,0,0,0,0,0,0,0,0,0


In [56]:
df["Date"]=pd.to_datetime(df.Date)

In [57]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3173 entries, 0 to 3172
Data columns (total 13 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   Date                 3173 non-null   datetime64[ns]
 1   Day_of_week          3173 non-null   object        
 2   Hotel_name           3173 non-null   object        
 3   Link                 3173 non-null   object        
 4   Stars                3173 non-null   object        
 5   Location             3173 non-null   object        
 6   Current_price_HKD    3173 non-null   int64         
 7   Original_prices_HKD  3173 non-null   int64         
 8   Discount_percentage  3173 non-null   int64         
 9   Rating               2476 non-null   float64       
 10  Total_reviews        2476 non-null   float64       
 11  Address              3173 non-null   object        
 12  Description          3173 non-null   object        
dtypes: datetime64[ns](1), float64(2),

In [58]:
!pip install selenium
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait



In [59]:
driver = webdriver.Chrome('./chromedriver')
sub_reviews_list=[]
for i in setlinks:
  driver.get(i)
  subhtml = driver.page_source
  soup=BeautifulSoup(subhtml,"html.parser")
  sub_review = []
  if soup.find("aside", class_="px-4 py-2 pb-10 w-full md:max-w-screen-xl md:px-16 m-auto"): 
    for i in soup.find("aside", class_="px-4 py-2 pb-10 w-full md:max-w-screen-xl md:px-16 m-auto").find_all("div", class_= 'flex justify-between'):
      sub_review.append(i.find_all('div')[1].text)
  else:
    for i in range(8):
      sub_review.append(np.nan)
  sub_reviews_list.append(sub_review)

  driver = webdriver.Chrome('./chromedriver')


In [71]:
sub_reviews_list

[['3.5/5', '4.0/5', '3.0/5', '2.5/5', '3.0/5', '4.5/5', '3.0/5', '3.5/5'],
 [nan, nan, nan, nan, nan, nan, nan, nan],
 ['4.6/5', '4.2/5', '4.4/5', '4.3/5', '4.1/5', '4.0/5', '4.2/5', '4.3/5'],
 [nan, nan, nan, nan, nan, nan, nan, nan],
 ['4.4/5', '4.4/5', '4.1/5', '4.3/5', '4.5/5', '4.0/5', '3.7/5', '4.2/5'],
 [nan, nan, nan, nan, nan, nan, nan, nan],
 ['4.3/5', '4.3/5', '4.0/5', '4.3/5', '4.3/5', '4.3/5', '4.0/5', '3.8/5'],
 ['3.5/5', '4.1/5', '2.5/5', '4.0/5', '4.2/5', '3.0/5', '3.6/5', '3.4/5'],
 ['4.2/5', '4.7/5', '4.0/5', '4.5/5', '4.3/5', '4.4/5', '3.9/5', '4.2/5'],
 ['4.6/5', '4.6/5', '4.6/5', '4.8/5', '4.8/5', '4.5/5', '4.5/5', '4.3/5'],
 ['4.4/5', '4.6/5', '4.4/5', '4.5/5', '4.5/5', '4.4/5', '4.3/5', '4.3/5'],
 ['4.7/5', '4.9/5', '4.6/5', '4.9/5', '4.6/5', '4.5/5', '4.6/5', '4.4/5'],
 ['4.1/5', '4.2/5', '3.9/5', '4.0/5', '3.8/5', '3.9/5', '3.7/5', '3.8/5'],
 [nan, nan, nan, nan, nan, nan, nan, nan],
 ['4.8/5', '4.6/5', '4.6/5', '4.7/5', '4.8/5', '4.3/5', '4.6/5', '4.6/5'],
 [n

In [72]:
Cleanliness = []
Check_in = []
Room_quality = []
Reception = []
Access = []
Value_for_money = []
Amenities = []
Tranquility = []
Category_names = ['Hotel Name','Cleanliness','Check_in','Room_quality','Reception','Access','Value_for_money','Amenities','Tranquility']

for i in sub_reviews_list:
    Cleanliness.append(i[0])
    Check_in.append(i[1])
    Room_quality.append(i[2])
    Reception.append(i[3])
    Access.append(i[4])
    Value_for_money.append(i[5])
    Amenities.append(i[6])
    Tranquility.append(i[7])
        
print(Tranquility)



['3.5/5', nan, '4.3/5', nan, '4.2/5', nan, '3.8/5', '3.4/5', '4.2/5', '4.3/5', '4.3/5', '4.4/5', '3.8/5', nan, '4.6/5', nan, '4.1/5', nan, nan, '3.3/5', '4.0/5', '4.2/5', '3.8/5', '4.2/5', '3.8/5', '4.3/5', '4.2/5', '4.3/5', '4.6/5', '4.3/5', '4.2/5', nan, '3.2/5', '3.3/5', '4.8/5', '4.0/5', nan, nan, '3.3/5', '4.6/5', nan, '4.6/5', '3.8/5', '3.6/5', '4.2/5', '4.0/5', '4.4/5', '3.9/5', '4.4/5', '4.1/5', nan, nan, '4.6/5', '4.3/5', '4.3/5', '4.2/5', '4.3/5', '3.7/5', '4.3/5', '4.8/5', '4.0/5', nan, '3.7/5', '4.3/5', nan, '3.5/5', '4.4/5', '4.5/5', '4.3/5', '4.2/5', nan, '4.8/5', nan, '4.2/5', '3.7/5', '4.2/5', nan, '3.5/5', '4.4/5', nan, nan, nan, nan, nan, '4.4/5', '4.0/5', '5.0/5', '4.0/5', '4.0/5', '3.9/5', '4.3/5', '4.2/5', '3.5/5', '4.2/5', '4.3/5', '4.3/5', '4.8/5', '4.4/5', '4.5/5', '3.9/5', '4.3/5', '4.5/5', '3.9/5', '5.0/5', '3.9/5', '4.3/5', nan, '4.1/5', '4.3/5', nan, '3.4/5', nan, nan]


In [74]:
temp = pd.DataFrame({'Hotel_name': settitles,
                    'Cleanliness': Cleanliness,
                    "Check_in": Check_in,
                    "Room_quality": Room_quality,
                    "Reception": Reception,
                    "Access": Access,
                    "Value_for_money": Value_for_money,
                    "Amenities": Amenities,
                    "Tranquility":Tranquility})

In [75]:
temp

Unnamed: 0,Hotel_name,Cleanliness,Check_in,Room_quality,Reception,Access,Value_for_money,Amenities,Tranquility
0,acesite knutsford hotel,3.5/5,4.0/5,3.0/5,2.5/5,3.0/5,4.5/5,3.0/5,3.5/5
1,akvo hotel,,,,,,,,
2,alva hotel by royal,4.6/5,4.2/5,4.4/5,4.3/5,4.1/5,4.0/5,4.2/5,4.3/5
3,apartment kapok,,,,,,,,
4,attitude on granville,4.4/5,4.4/5,4.1/5,4.3/5,4.5/5,4.0/5,3.7/5,4.2/5
...,...,...,...,...,...,...,...,...,...
108,xi hotel,4.3/5,4.3/5,4.5/5,3.8/5,4.8/5,4.5/5,4.3/5,4.3/5
109,y loft,,,,,,,,
110,yha mei ho house,4.0/5,4.0/5,3.6/5,3.9/5,3.7/5,3.9/5,3.3/5,3.4/5
111,yi serviced apartments,,,,,,,,


In [77]:
import pandas as pd

temp2 = temp.rename(columns={0:'Hotel_name',1:'Cleanliness',2:'Check-in',3:'Room Quality',4:'Reception',5:'Access',6:'Value_for_money',7:'Amenities',8:'Tranquility'})

temp2


Unnamed: 0,Hotel_name,Cleanliness,Check_in,Room_quality,Reception,Access,Value_for_money,Amenities,Tranquility
0,acesite knutsford hotel,3.5/5,4.0/5,3.0/5,2.5/5,3.0/5,4.5/5,3.0/5,3.5/5
1,akvo hotel,,,,,,,,
2,alva hotel by royal,4.6/5,4.2/5,4.4/5,4.3/5,4.1/5,4.0/5,4.2/5,4.3/5
3,apartment kapok,,,,,,,,
4,attitude on granville,4.4/5,4.4/5,4.1/5,4.3/5,4.5/5,4.0/5,3.7/5,4.2/5
...,...,...,...,...,...,...,...,...,...
108,xi hotel,4.3/5,4.3/5,4.5/5,3.8/5,4.8/5,4.5/5,4.3/5,4.3/5
109,y loft,,,,,,,,
110,yha mei ho house,4.0/5,4.0/5,3.6/5,3.9/5,3.7/5,3.9/5,3.3/5,3.4/5
111,yi serviced apartments,,,,,,,,


In [88]:
finaldf_1=finaldf.merge(temp2, on="Hotel_name")

finaldf_1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3173 entries, 0 to 3172
Data columns (total 49 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Date                          3173 non-null   object 
 1   Day_of_week                   3173 non-null   object 
 2   Hotel_name                    3173 non-null   object 
 3   Link                          3173 non-null   object 
 4   Stars                         3173 non-null   object 
 5   Location                      3173 non-null   object 
 6   Current_price_HKD             3173 non-null   int64  
 7   Original_prices_HKD           3173 non-null   int64  
 8   Discount_percentage           3173 non-null   int64  
 9   Rating                        2476 non-null   float64
 10  Total_reviews                 2476 non-null   float64
 11  Address                       3173 non-null   object 
 12  Description                   3173 non-null   object 
 13  Fre

In [79]:
import re
findaldf_2 = finaldf_1.replace('/5',"",regex=True)

In [89]:
findaldf_2['Cleanliness'] = pd.to_numeric(findaldf_2['Cleanliness'])
findaldf_2['Check_in']= pd.to_numeric(findaldf_2['Check_in'])
findaldf_2['Room_quality']= pd.to_numeric(findaldf_2['Room_quality'])
findaldf_2['Reception']= pd.to_numeric(findaldf_2['Reception'])
findaldf_2['Access']= pd.to_numeric(findaldf_2['Access'])
findaldf_2['Value_for_money']= pd.to_numeric(findaldf_2['Value_for_money'])
findaldf_2['Amenities']= pd.to_numeric(findaldf_2['Amenities'])
findaldf_2['Tranquility']= pd.to_numeric(findaldf_2['Tranquility'])
findaldf_2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3173 entries, 0 to 3172
Data columns (total 49 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Date                          3173 non-null   object 
 1   Day_of_week                   3173 non-null   object 
 2   Hotel_name                    3173 non-null   object 
 3   Link                          3173 non-null   object 
 4   Stars                         3173 non-null   object 
 5   Location                      3173 non-null   object 
 6   Current_price_HKD             3173 non-null   int64  
 7   Original_prices_HKD           3173 non-null   int64  
 8   Discount_percentage           3173 non-null   int64  
 9   Rating                        2476 non-null   float64
 10  Total_reviews                 2476 non-null   float64
 11  Address                       3173 non-null   object 
 12  Description                   3173 non-null   object 
 13  Fre

In [143]:
findaldf_2.to_csv('DayUse_Webscrape_DataFile_exported_on_8th_Nov_2021.csv')