## DEPENDENCIES

In [73]:
from tools import *

import os
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Access the API key
tripadvisor_api_key = os.getenv("TRIPADVISOR_API_KEY")
rapid_api_key = os.getenv("RAPID_API_KEY")

## Sub
rapid_api_key_1 = os.getenv("RAPID_API_KEY_ALT_1")
rapid_api_key_2 = os.getenv("RAPID_API_KEY_ALT_2")
rapid_api_key_3 = os.getenv("RAPID_API_KEY_ALT_3")
rapid_api_key_4 = os.getenv("RAPID_API_KEY_ALT_4")
rapid_api_key_5 = os.getenv("RAPID_API_KEY_ALT_5")


In [7]:
import json

# Filepath to the JSON file
file_path = "data/vietnam-provinces.json"

# Open and read the JSON file
with open(file_path, "r", encoding="utf-8") as file:
    data = json.load(file)

provinces = [data[i]['NameEn'] for i in range(len(data))]

In [5]:
import requests

url = "https://tripadvisor-scraper.p.rapidapi.com/restaurants/list"

querystring = {"query":provinces[1],"page":"1"}

headers = {
	"x-rapidapi-key": "f01c3f6968msh4a12cf8c0c8f3c2p1d9e36jsne6d66dae12ae",
	"x-rapidapi-host": "tripadvisor-scraper.p.rapidapi.com"
}

response = requests.get(url, headers=headers, params=querystring)

In [9]:
response.raise_for_status()  # Raise an error for bad responses
data = response.json()
len(data['results'])

30

## RAPID-API REQUESTS

In [40]:
import os
import json
import requests

def save_restaurants_by_province(api_key, province_name,page_number=1):
    url = "https://tripadvisor-scraper.p.rapidapi.com/restaurants/list"
    querystring = {"query": province_name, "page": page_number}
    headers = {
        "x-rapidapi-key": api_key,
        "x-rapidapi-host": "tripadvisor-scraper.p.rapidapi.com"
    }
    response = requests.get(url, headers=headers, params=querystring)
    response.raise_for_status()
    data = response.json()

    # Ensure the directory exists
    output_dir = os.path.join("data", "rapid_restaurant_id")
    os.makedirs(output_dir, exist_ok=True)

    # Save the whole response data to file named after the province
    output_path = os.path.join(output_dir, f"{province_name}.json")
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
    print(f"Saved response data to {output_path}")

In [None]:
for province in provinces:
    save_restaurants_by_province(rapid_api_key, province)

## UNDERSTAND RECORDS

In [32]:
import os
import json
import pandas as pd

folder_path = "data/rapid_restaurant_id"
records = []

for file_name in os.listdir(folder_path):
    if file_name.endswith(".json"):
        file_path = os.path.join(folder_path, file_name)
        with open(file_path, "r", encoding="utf-8") as f:
            data = json.load(f)
            total_page = data.get("total_pages")
            total_items_count = data.get("total_items_count")
            records.append({
                "file_name": file_name.split('.')[0],  # Remove the .json extension
                "total_page": total_page,
                "total_items_count": total_items_count
            })

df = pd.DataFrame(records)


In [33]:
df.sort_values("total_items_count", ascending=False, inplace=True)
df.reset_index(drop=True, inplace=True)
df.head(10)

Unnamed: 0,file_name,total_page,total_items_count
0,Ho Chi Minh,173,5182
1,Ha Tinh,173,5182
2,Ha Noi,120,3592
3,Da Nang,55,1645
4,Dak Nong,55,1645
5,Quang Nam,46,1354
6,Khanh Hoa,32,941
7,Kien Giang,25,731
8,Lam Dong,20,577
9,Son La,19,551


In [34]:
df.total_page.sum(), df.total_items_count.sum()

(889, 25610)

In [39]:
df[df.total_page>2]

Unnamed: 0,file_name,total_page,total_items_count
0,Ho Chi Minh,173,5182
1,Ha Tinh,173,5182
2,Ha Noi,120,3592
3,Da Nang,55,1645
4,Dak Nong,55,1645
5,Quang Nam,46,1354
6,Khanh Hoa,32,941
7,Kien Giang,25,731
8,Lam Dong,20,577
9,Son La,19,551


## SCRAPE WHOLE DATA

In [None]:
import json
province_name = "Ho Chi Minh"
file_path = f"data/rapid_restaurant_id/{province_name}.json"

with open(file_path, "r", encoding="utf-8") as f:
    data = json.load(f)

data['total_pages']

save_restaurants_by_province(rapid_api_key, province_name, page_number=2)

173

173

In [None]:
import os
import json
import requests
import pandas as pd

def save_all_restaurants_to_csv(api_key, province_name):
    url = "https://tripadvisor-scraper.p.rapidapi.com/restaurants/list"
    headers = {
        "x-rapidapi-key": api_key,
        "x-rapidapi-host": "tripadvisor-scraper.p.rapidapi.com"
    }
    # Get the first page to find total_pages
    querystring = {"query": province_name, "page": 1}
    response = requests.get(url, headers=headers, params=querystring)
    response.raise_for_status()
    data = response.json()
    total_pages = data.get("total_pages", 1)
    all_results = []

    # Collect results from all pages
    for page_number in range(1, total_pages + 1):
        querystring = {"query": province_name, "page": page_number}
        response = requests.get(url, headers=headers, params=querystring)
        response.raise_for_status()
        page_data = response.json()
        results = page_data.get("results", [])
        all_results.extend(results)
        print(f"Fetched page {page_number}/{total_pages} for {province_name}")

    # Convert to DataFrame and save as CSV
    if all_results:
        df = pd.DataFrame(all_results)
        if "restaurant_id" in df.columns:
            df.set_index("restaurant_id", inplace=True)
        output_dir = os.path.join("data", "rapid_restaurant_id")
        os.makedirs(output_dir, exist_ok=True)
        output_path = os.path.join(output_dir, f"{province_name}.csv")
        df.to_csv(output_path, encoding="utf-8-sig")
        print(f"Saved all restaurants for {province_name} to {output_path}")
    else:
        print(f"No results found for {province_name}")


Fetched page 1/3 for Nghe An
Fetched page 2/3 for Nghe An
Fetched page 3/3 for Nghe An
Saved all restaurants for Nghe An to data\rapid_restaurant_id\Nghe An.csv


In [74]:
rapid_api_key_5

'7331240ffbmsh28012578f471242p112fadjsn89ed4163ed04'

In [76]:
# Example usage:
save_all_restaurants_to_csv(rapid_api_key_5, "Ho Chi Minh")

Fetched page 1/173 for Ho Chi Minh
Fetched page 2/173 for Ho Chi Minh
Fetched page 3/173 for Ho Chi Minh
Fetched page 4/173 for Ho Chi Minh
Fetched page 5/173 for Ho Chi Minh
Fetched page 6/173 for Ho Chi Minh
Fetched page 7/173 for Ho Chi Minh
Fetched page 8/173 for Ho Chi Minh
Fetched page 9/173 for Ho Chi Minh
Fetched page 10/173 for Ho Chi Minh
Fetched page 11/173 for Ho Chi Minh
Fetched page 12/173 for Ho Chi Minh
Fetched page 13/173 for Ho Chi Minh
Fetched page 14/173 for Ho Chi Minh
Fetched page 15/173 for Ho Chi Minh
Fetched page 16/173 for Ho Chi Minh
Fetched page 17/173 for Ho Chi Minh
Fetched page 18/173 for Ho Chi Minh
Fetched page 19/173 for Ho Chi Minh
Fetched page 20/173 for Ho Chi Minh
Fetched page 21/173 for Ho Chi Minh
Fetched page 22/173 for Ho Chi Minh
Fetched page 23/173 for Ho Chi Minh
Fetched page 24/173 for Ho Chi Minh
Fetched page 25/173 for Ho Chi Minh
Fetched page 26/173 for Ho Chi Minh
Fetched page 27/173 for Ho Chi Minh
Fetched page 28/173 for Ho Chi Minh
F

## HO CHI MINH CITY

In [80]:
df = pd.read_csv("data/rapid_restaurant_id/Ho Chi Minh.csv", encoding="utf-8-sig",index_col=0)
df.shape

(6045, 15)

In [81]:
df.head(2).T

Unnamed: 0,0,1
id,26794670,26681281
name,Phở Việt Nam - Phạm Hồng Thái,Baieta Saigon
link,https://www.tripadvisor.com/Restaurant_Review-...,https://www.tripadvisor.com/Restaurant_Review-...
reviews,200,3
rating,4.8,4.3
price_range_usd,$,$$ - $$$
is_sponsored,True,True
menu_link,http://phovietnam.vn/,
reservation_link,,
featured_image,https://dynamic-media-cdn.tripadvisor.com/medi...,https://dynamic-media-cdn.tripadvisor.com/medi...
