In [19]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from urllib.parse import urlencode, quote_plus

---
### 爬蟲
---

In [20]:
# Columns of the frame returned by booking_crawler, in output order.
_HOTEL_COLUMNS = ['name', 'location', 'price', 'rating', 'distance', 'comments']
# Step applied to the `offset` query parameter from one result page to the next.
_RESULTS_PER_PAGE = 25
# Upper bound (seconds) for a single HTTP request so a stalled connection cannot hang the cell.
_REQUEST_TIMEOUT_SECONDS = 30


def _build_search_url(location, checkin_date, checkout_date):
    """Return the Booking.com (zh-tw) search-results URL for one stay."""
    base_url = "https://www.booking.com/searchresults.zh-tw.html"

    # Search parameters sent in the query string.
    params = {
        'ss': location,                  # destination text
        'checkin': checkin_date,         # check-in date
        'checkout': checkout_date,       # check-out date
        'lang': 'zh-tw',                 # language
        'sb': '1',                        # search box enabled
        'src_elem': 'sb',                 # search source element
        'src': 'index',                   # search source
        'dest_id': '',                    # destination ID (left empty here)
        'dest_type': 'city',              # destination type
        'group_adults': '2',              # number of adults
        'no_rooms': '1',                  # number of rooms
        'group_children': '0',            # number of children
        'sb_travel_purpose': 'leisure',   # travel purpose
    }

    return f"{base_url}?{urlencode(params, quote_via=quote_plus)}"


def _card_text(card, tag, **criteria):
    """Return the stripped text of the first matching element in `card`, or None if absent."""
    element = card.find(tag, **criteria)
    return element.text.strip() if element is not None else None


def _parse_property_card(card):
    """Turn one property card into a record dict, or None when it cannot be used.

    Cards without a name, price or distance are skipped because the cleaning
    and plotting steps need those values. Location, rating and comments are
    optional: a missing rating becomes NaN, the other two become None.
    """
    name = _card_text(card, "div", attrs={'data-testid': 'title'})
    price = _card_text(card, "span", attrs={'data-testid': 'price-and-discounted-price'})
    distance = _card_text(card, "span", attrs={'data-testid': 'distance'})
    if name is None or price is None or distance is None:
        return None

    rating_text = _card_text(card, "div", class_="a3b8729ab1 d86cee9b25")
    try:
        rating = float(rating_text)
    except (TypeError, ValueError):
        # No score element on the card, or its text is not a plain number.
        rating = float("nan")

    return {
        'name': name,
        'location': _card_text(card, "span", attrs={'data-testid': 'address'}),
        'price': price.replace('\xa0', ' '),  # non-breaking space -> regular space
        'rating': rating,
        'distance': distance,
        'comments': _card_text(card, "div", class_="a3b8729ab1 e6208ee469 cb2cbb3ccb"),
    }


def _scrape_search_results(url, num_pages):
    """Fetch up to `num_pages` result pages of `url` and return the parsed hotels."""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.113 Safari/537.36',
        'Accept-Language': 'en-GB,en-US;q=0.9,en;q=0.8',
    }

    records = []
    for page in range(num_pages):
        if page:
            # Pause between consecutive requests to avoid hammering the site.
            time.sleep(1)

        page_url = f"{url}&offset={page * _RESULTS_PER_PAGE}"
        response = requests.get(page_url, headers=headers, timeout=_REQUEST_TIMEOUT_SECONDS)
        # Fail loudly on 4xx/5xx instead of parsing an error page into zero hotels.
        response.raise_for_status()

        soup = BeautifulSoup(response.text, "html.parser")
        cards = soup.find_all("div", {'data-testid': 'property-card'})
        if not cards:
            # A page without any property card: stop requesting further pages.
            break

        for card in cards:
            record = _parse_property_card(card)
            if record is not None:
                records.append(record)

    # Build the frame once; `columns` keeps the schema even when nothing was found.
    return pd.DataFrame(records, columns=_HOTEL_COLUMNS)


def booking_crawler(location, checkin_date, checkout_date, num_pages=5):
    """Scrape Booking.com search results for a destination and stay.

    Args:
        location: Destination text placed in the `ss` search parameter.
        checkin_date: Check-in date string (the notebook prompts for YYYY-MM-DD).
        checkout_date: Check-out date string (the notebook prompts for YYYY-MM-DD).
        num_pages: Maximum number of result pages to request.

    Returns:
        A DataFrame with the columns name, location, price, rating, distance
        and comments, with duplicate rows removed and a fresh 0..n-1 index.
        `price`, `distance` and `comments` hold the raw text from the page.

    Raises:
        requests.RequestException: If a request times out, fails, or returns an
            HTTP error status.
    """
    search_url = _build_search_url(location, checkin_date, checkout_date)
    hotels = _scrape_search_results(search_url, num_pages=num_pages)
    return hotels.drop_duplicates().reset_index(drop=True)

In [21]:
# Ask for the destination and both dates (in that order), then crawl with them.
prompts = ("請輸入地點：", "請輸入入住日期（YYYY-MM-DD）：", "請輸入退房日期（YYYY-MM-DD）：")
location_input, checkin_date_input, checkout_date_input = [input(prompt) for prompt in prompts]

result_df = booking_crawler(location_input, checkin_date_input, checkout_date_input)

---
### 資料清理
---

In [125]:
# Store the free-text columns explicitly as strings.
for text_column in ('name', 'comments'):
    result_df[text_column] = result_df[text_column].astype(str)

# Strip the currency prefix and thousands separators, then store the price as an integer.
price_digits = result_df['price'].str.replace('TWD ', '').str.replace(',', '')
result_df['price'] = price_digits.astype(int)

In [126]:
# Remove the leading '距中心 ' label so only "<number> <unit>" remains.
distance_text = result_df["distance"]
result_df["distance"] = distance_text.str.replace('距中心 ', '')

In [None]:
# Convert every "<number> <unit>" string to kilometres. The values are collected
# in a list and assigned as a whole column: element-wise chained assignment
# (result_df["distance"][i] = ...) is not guaranteed to write back to the frame
# and also requires a 0..n-1 label index.
distance_km = []
for distance_text in result_df["distance"]:
    tokens = distance_text.split(" ")
    amount = float(tokens[0])
    # '公尺' marks a value in metres; any other unit is kept as kilometres.
    if len(tokens) > 1 and '公尺' in tokens[1]:
        amount = amount * 0.001
    distance_km.append(amount)
result_df["distance"] = distance_km

In [128]:
# Store the converted distances with a float dtype.
distance_values = result_df["distance"]
result_df["distance"] = distance_values.astype(float)

In [140]:
# Display the dtype of every column in result_df.
result_df.dtypes

name         object
location     object
price         int32
rating      float64
distance    float64
comments     object
dtype: object

---
### 將資料處理過程包成函式
---

In [22]:
def _distance_to_km(value):
    """Convert one distance cell to kilometres as a float.

    Text such as '距中心 500 公尺' or '1.2 公里' is parsed as "<number> <unit>";
    a unit containing '公尺' (metres) is scaled by 0.001, anything else is kept
    as is. Values that are not strings are treated as already converted.
    """
    if not isinstance(value, str):
        return float(value)
    tokens = value.replace('距中心 ', '').split(' ')
    amount = float(tokens[0])
    if len(tokens) > 1 and '公尺' in tokens[1]:
        return amount * 0.001
    return amount


def preprocess_data(df):
    """Clean the scraped hotel frame in place and return it.

    - 'name' and 'comments' are cast to str.
    - 'price' loses its 'TWD ' prefix and thousands separators and becomes int.
    - 'distance' loses its '距中心 ' prefix and becomes a float in kilometres.

    The function is safe to call on a frame that has already been cleaned:
    numeric 'price' and 'distance' columns are only re-cast, not re-parsed.

    Args:
        df: DataFrame with the columns 'name', 'price', 'comments', 'distance'.
            Any index is accepted.

    Returns:
        The same DataFrame object, modified in place.
    """
    # Cast the 'name' column to str.
    df['name'] = df['name'].astype(str)

    # Remove 'TWD ' and the commas from 'price', then cast to int.
    price = df['price']
    if not pd.api.types.is_numeric_dtype(price):
        price = price.str.replace('TWD ', '', regex=False).str.replace(',', '', regex=False)
    df['price'] = price.astype(int)

    # Cast the 'comments' column to str.
    df['comments'] = df['comments'].astype(str)

    # Convert 'distance' to kilometres. Whole-column assignment replaces the
    # element-wise chained assignment, which is not guaranteed to write back
    # to the frame and only works with a 0..n-1 label index.
    df['distance'] = df['distance'].map(_distance_to_km).astype(float)

    return df


In [None]:
# preprocess_data cleans the frame it is given and returns it; rebind the result
# explicitly and preview only the first rows instead of rendering every hotel.
result_df = preprocess_data(result_df)
result_df.head()

---
### 資料視覺化
---

In [24]:
import plotly.express as px

In [25]:
# Scatter plot with price on x and distance on y, one point per row of
# result_df, coloured by rating; hovering shows the hotel name, price and rating.
scatter_options = dict(
    data_frame=result_df,
    x='price',
    y='distance',
    color='rating',
    hover_name='name',
    hover_data=['price', 'rating'],
    title="Hotel Information",
    labels={"price": "Price (TWD)", "distance": "Distance (km)"},
)
graph1 = px.scatter(**scatter_options)
graph1.show()

---
### Dash
---