# Checklist:
## 1. Missing Data
## 2. Regex Structure
## 3. Add the word "restaurant" to "name"
## 4. Paginate: Different Pages

In [1]:
!pip install playwright


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
# Run the Playwright CLI's `install` command in a shell (per the Playwright docs this
# downloads the browser binaries the library drives; needed once per environment).
!playwright install

In [3]:
from playwright.async_api import async_playwright

In [4]:
# Start the async Playwright driver; the notebook's top-level `await` is used directly.
playwright = await async_playwright().start()
# headless=False launches Chromium with a visible window so the scrape can be watched.
browser = await playwright.chromium.launch(headless=False)
# Single page (tab) that the following cells navigate and query.
page = await browser.new_page()

In [5]:
# Navigate the shared page to the Fantuan ordering site; as the last expression,
# the value returned by goto() is shown as the cell output.
await page.goto("https://www.fantuanorder.com/")

<Response url='https://www.fantuanorder.com/' request=<Request url='https://www.fantuanorder.com/' method='GET'>>

In [6]:
# Capture the page's current HTML as a string for manual inspection.
html = await page.content()
# Bare expression: echoes the full markup as the cell output (can be very large).
html



In [7]:
# Extract restaurant elements
# Wait for the page content to load
await page.wait_for_selector("#scrollableDiv")

# Continuously click "Load More" button until all content is loaded
while True:
    try:
        load_more_button = await page.query_selector(".loadMore .btn")
        if load_more_button:
            #print("Clicking 'Load More' button...")
            await load_more_button.click()
            await page.wait_for_timeout(2000)  # Wait for new content to load
        else:
            break  # No "Load More" button found
    except Exception as e:
        print("No more content to load or an error occurred:", e)
        break

# Extract restaurant elements
restaurant_elements = await page.query_selector_all(".ant-list-item")

restaurant_data = []

# Iterate over each restaurant and extract details
for restaurant in restaurant_elements:
    # Full text for raw processing later
    raw_name_element = await restaurant.query_selector(".name")
    raw_name = await raw_name_element.inner_text() if raw_name_element else "No name"

    # Coupon or offer
    coupon_element = await restaurant.query_selector(".sign")
    coupon = await coupon_element.inner_text() if coupon_element else "No Coupon"

    # Rating
    rate_element = await restaurant.query_selector(".rate")
    rating = await rate_element.inner_text() if rate_element else "No Rating"

    # Delivery info
    state_label_element = await restaurant.query_selector(".stateLabel")
    delivery_info = await state_label_element.inner_text() if state_label_element else "No delivery info"

    # Append raw data to the list
    restaurant_data.append({
        "raw_name": raw_name,
        "Coupon": coupon,
        "Rating": rating,
        "raw_delivery_info": delivery_info,
    })

    # Print raw extracted information 
    print(f"Raw Name: {raw_name}")
    print(f"Coupon: {coupon}")
    print(f"Rating: {rating}")
    print(f"Raw Delivery Info: {delivery_info}")
    print("-" * 80)

Raw Name: $13 OFF | Shanghai Time (Midtown)
Coupon: $11.5 off $40 rookie coupon
Rating: 4.7
Raw Delivery Info: Over 73min • Delivery $4.49 • average $23 • Chinese
--------------------------------------------------------------------------------
Raw Name: Tasty Hand-Pulled Noodles 2 (Midtown)
Coupon: $11.5 off $40 rookie coupon
Rating: 4.7
Raw Delivery Info: 49-59min • Delivery $2.99 • average $25 • Chinese / Noodles
--------------------------------------------------------------------------------
Raw Name: $13 OFF | Hutaoli (Midtown)
Coupon: Specials 100% off
Rating: 4.4
Raw Delivery Info: 62-72min • Delivery $4.49 • Chinese / Spicy
--------------------------------------------------------------------------------
Raw Name: HEYTEA (Broadway)
Coupon: $11.5 off $40 rookie coupon
Rating: 4.8
Raw Delivery Info: 60-70min • Delivery $4.49 • average $9 • Bubble Tea
--------------------------------------------------------------------------------
Raw Name: $13 OFF | The Best Sichuan (Midtown)
Coupo

In [8]:
import pandas as pd

# Turn the scraped records into a tidy table and save it as CSV.
#
# Input : `restaurant_data`, a list of dicts with the keys listed in RAW_COLUMNS.
# Output: `df` with the columns Coupon, Rating, Name, Location, Delivery Time,
#         Delivery Fee, Average Price and Food Type, also written to
#         "fantuan_data.csv".

RAW_COLUMNS = ["raw_name", "Coupon", "Rating", "raw_delivery_info"]

# Optional "<promo> | " prefix, then the name, then an optional trailing "(Location)".
# The location group is optional so a restaurant without one still keeps its name.
NAME_PATTERN = r"^(?:.*\|\s*)?(.+?)(?:\s*\(([^()]+)\))?$"
# Last bullet-separated segment, provided it holds no "$" (which would make it a
# price segment rather than a cuisine). Unlike [\w\s]+, this accepts "/" so
# values such as "Chinese / Noodles" are captured.
FOOD_TYPE_PATTERN = r"•\s*([^•$]+)$"


def _as_dollars(amounts):
    """Prefix each extracted amount with "$"; missing amounts become ""."""
    return amounts.map(lambda value: f"${value}" if pd.notna(value) else "")


# Load raw data into a DataFrame; naming the columns keeps the cell working
# (with an empty result) even when nothing was scraped.
df = pd.DataFrame(restaurant_data, columns=RAW_COLUMNS)

# Trim stray whitespace/newlines so the `$` anchors in the patterns line up.
raw_name = df["raw_name"].str.strip()
raw_info = df["raw_delivery_info"].str.strip()

# Extract the clean name and location from `raw_name`
df[["Name", "Location"]] = raw_name.str.extract(NAME_PATTERN)

# Extract the delivery time, fee, average price, and food type from `raw_delivery_info`
# (expand=False returns a Series, which is what a single-column assignment expects)
df["Delivery Time"] = raw_info.str.extract(r"(\d+-\d+min|Over \d+min)", expand=False)
df["Delivery Fee"] = _as_dollars(raw_info.str.extract(r"Delivery \$([\d.]+)", expand=False))
df["Average Price"] = _as_dollars(raw_info.str.extract(r"average \$([\d.]+)", expand=False))
df["Food Type"] = raw_info.str.extract(FOOD_TYPE_PATTERN, expand=False).str.strip()

# Drop the raw fields if no longer needed
df = df.drop(columns=["raw_name", "raw_delivery_info"])

# Display the DataFrame
print(df)

# Save the data to CSV
df.to_csv("fantuan_data.csv", index=False)

                          Coupon Rating                         Name  \
0    $11.5 off $40 rookie coupon    4.7                Shanghai Time   
1    $11.5 off $40 rookie coupon    4.7  Tasty Hand-Pulled Noodles 2   
2              Specials 100% off    4.4                      Hutaoli   
3    $11.5 off $40 rookie coupon    4.8                       HEYTEA   
4              Specials 100% off    4.7             The Best Sichuan   
..                           ...    ...                          ...   
184  $11.5 off $40 rookie coupon    4.8             Bings and Noodle   
185  $11.5 off $40 rookie coupon    4.7                Ajisen Rammen   
186  $11.5 off $40 rookie coupon    4.5     Mountain House Manhattan   
187             Specials 77% off    4.4       108 Food Dried Hot Pot   
188  $11.5 off $40 rookie coupon    4.6         Debutea Peach Summer   

     Location Delivery Time Delivery Fee Average Price   Food Type  
0     Midtown    Over 73min        $4.49           $23     Chinese