In [18]:
from __future__ import annotations

import json
import time
from typing import TYPE_CHECKING

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

if TYPE_CHECKING:
    from typing import Any
    from typing import TypedDict

    from selenium.webdriver.remote.webelement import WebElement

In [49]:
if TYPE_CHECKING:
    # These classes are only seen by static type checkers; the guarded block
    # is skipped at runtime, so the names exist purely for annotations.
    class Star(TypedDict):
        """Star rating attached to one review."""
        label: str  # rating text taken from the aria-label, e.g. "5 stars"
        value: int  # integer parsed out of the label

    class Review(TypedDict):
        """One scraped review record, as appended to ``dataReviews``."""
        name: str  # reviewer's display name
        review: str  # review text
        contrib: str  # reviewer contribution summary, e.g. "2 reviews"
        humanized_timestamp: str  # relative age as displayed, e.g. "a year ago"
        stars: Star

In [4]:
# dataReviews: list[Review] = []

In [5]:
def noValueBuild(value: Any, tag: str) -> Any:
    """Return ``value`` unchanged, or a ``"[no_<tag>]"`` placeholder when it is falsy.

    Args:
        value: The scraped value to check. Every falsy value (``None``, ``""``,
            ``0``, an empty container) is treated as missing.
        tag: Label embedded in the placeholder; ``"author"`` yields
            ``"[no_author]"``.

    Returns:
        ``value`` itself when it is truthy, otherwise the placeholder string.
    """
    if not value:
        return "[no_{}]".format(tag)
    return value

def getSingleValueInt(value: int | str | list | None, defaultValue: int = 0) -> int:
    """Extract a single integer from ``value``.

    Args:
        value: An ``int`` (returned as is), a string such as ``"5 stars"``
            (its first all-digit, whitespace-separated token is used), or a
            list whose items are ints, strings or nested lists (the first
            integer found wins).
        defaultValue: Returned when ``value`` is falsy (``None``, ``""``,
            ``0``, ``[]``), holds no integer, or has an unsupported type.

    Returns:
        The extracted integer, or ``defaultValue``.
    """
    # Falsy input (including the int 0) maps to the default.
    if not value:
        return defaultValue
    found = _firstInt(value)
    return defaultValue if found is None else found


def _firstInt(value: object) -> int | None:
    """Return the first integer found in ``value``, or ``None`` if there is none."""
    if isinstance(value, int):
        return value
    if isinstance(value, str):
        for token in value.split():
            if not token.isdigit():
                continue
            try:
                return int(token)
            except ValueError:
                # str.isdigit() accepts characters (e.g. superscript digits)
                # that int() rejects; skip those tokens.
                continue
        return None
    if isinstance(value, list):
        for item in value:
            found = _firstInt(item)
            if found is not None:
                return found
    return None

In [51]:
# Place whose reviews are scraped; swap in one of the commented-out URLs to
# target a different place.
# url = "https://www.google.com/maps/place/Pulau+Kumala/@-0.4298672,116.9991186,17z/data=!4m6!3m5!1s0x2df67bd6b64d962d:0xa9a5004b431c4365!8m2!3d-0.4298672!4d116.9991186!16s%2Fg%2F11l1qh4dw9?entry=ttu&g_ep=EgoyMDI0MTIxMS4wIKXMDSoASAFQAw%3D%3D"
# url = "https://www.google.com/maps/place/Jembatan+Penyeberangan+Pulau+Kumala+Tenggarong/@-0.4217344,116.9946724,16z/data=!4m8!3m7!1s0x2df67ace57e958ef:0xa9fb2adcb21c8264!8m2!3d-0.4233419!4d116.9944925!9m1!1b1!16s%2Fg%2F11cm9wv3mx?entry=ttu&g_ep=EgoyMDI1MDEwOC4wIKXMDSoASAFQAw%3D%3D";
# url = "https://www.google.com/maps/place/Yuni+riadi+rental+%26+carter/@-0.3637248,117.1131,15z/data=!4m8!3m7!1s0x2df677aff999ce35:0x77a0d612984ffcac!8m2!3d-0.3675768!4d117.1169017!9m1!1b1!16s%2Fg%2F11t25k373w?entry=ttu&g_ep=EgoyMDI1MDEwOC4wIKXMDSoASAFQAw%3D%3D";
# url = "https://www.google.com/maps/place/London+Bridge/@51.5079145,-0.0903026,17z/data=!4m8!3m7!1s0x4876035159bb13c5:0xa61e28267c3563ac!8m2!3d51.5078788!4d-0.0877321!9m1!1b1!16zL20vMHA3N2c?entry=ttu&g_ep=EgoyMDI1MDEwOC4wIKXMDSoASAFQAw%3D%3D"
url = "https://www.google.com/maps/place/Pondok+Pesantren+Nurul+Islam/@-0.357561,117.1100628,15.25z/data=!4m8!3m7!1s0x2df67704e119afcf:0x197440c70fa945db!8m2!3d-0.3726706!4d117.1155268!9m1!1b1!16s%2Fg%2F11d_wql4w0?entry=ttu&g_ep=EgoyMDI1MDEwOC4wIKXMDSoASAFQAw%3D%3D"

# Seconds to sleep between scroll steps; plain sleeps are used here instead of
# WebDriverWait.
TIME_TO_WAIT = 2

# NOTE(review): "en_EN" is not a standard locale tag (compare "en-US" /
# "en-GB") -- confirm Chrome honours it rather than silently falling back.
browser_locale = "en_EN"

# Chrome profile preference for geolocation.
# Presumably 2 means "block" the location prompt -- TODO confirm.
prefs = {"profile.default_content_setting_values.geolocation": 2}

options = Options()
options.add_experimental_option("prefs", prefs)
options.add_argument("--incognito")
options.add_argument("--lang={}".format(browser_locale))

# Pass the options by keyword so they reach the `options` parameter no matter
# where it sits in the positional order of the installed Selenium release.
driver = webdriver.Chrome(options=options)

driver.get(url)
# Element lookups made through Selenium poll for up to one second.
driver.implicitly_wait(1)

# This lookup goes through Selenium, so the implicit wait applies to it and
# the reviews panel gets up to a second to appear before the script continues.
dom_review_pane = driver.find_elements(by=By.CSS_SELECTOR, value="[aria-label='Refine reviews']")

# Collected review records (filled in by the extraction step).
dataReviews: list[Review] = []

# Scrollable element that holds the review list: the second child of the
# element with role="main" (None when that element is not on the page).
dom_review_container: WebElement | None = driver.execute_script("""
    return document.querySelector('[role="main"]')?.children[1]
""")

# scrollTop can be fractional while scrollHeight and offsetHeight are rounded
# integers, so an exact "scrollTop < scrollHeight - offsetHeight" comparison may
# never become false. Treat anything within this many pixels as "at the bottom".
SCROLL_TOLERANCE_PX = 1

if dom_review_container:
    print('review container', dom_review_container.get_property(name="scrollHeight"))
    while True:
        # Pause so content added by the previous scroll has time to load
        # before the container is measured again.
        time.sleep(TIME_TO_WAIT)
        scroll_top = dom_review_container.get_property(name="scrollTop")
        scroll_height = dom_review_container.get_property(name="scrollHeight")
        offset_height = dom_review_container.get_property(name="offsetHeight")

        remaining_px = scroll_height - offset_height - scroll_top
        if remaining_px <= SCROLL_TOLERANCE_PX:
            # Bottom reached and nothing new was appended: stop scrolling.
            break

        # Jump to the current bottom; the browser clamps the value to the
        # maximum scroll position.
        driver.execute_script("""
            return arguments[0].scrollTop = arguments[1]
        """, dom_review_container, scroll_height)
        print('scrolling...', dom_review_container.get_property(name="scrollTop"), dom_review_container.get_property(name="scrollHeight"))

print("Done scrolling.")


# Two ways to find the header area above the review list: the element labelled
# "Refine reviews", or the grandparent of the "Sort reviews" control.
dom_review_pane_by_rr = driver.execute_script("""
    return document.querySelector('[aria-label="Refine reviews"]')
""")
dom_review_pane_by_cta_sort = driver.execute_script("""
    return document.querySelector('[aria-label="Sort reviews"]')?.parentElement?.parentElement
""")

# Prefer the "Refine reviews" match and fall back to the sort-control ancestor.
review_pane_anchor = dom_review_pane_by_rr or dom_review_pane_by_cta_sort

# The element right after the anchor is the one the review cards are read from.
if not review_pane_anchor:
    dom_review_pane_sibling = None
else:
    dom_review_pane_sibling = driver.execute_script("""
        return arguments[0].nextElementSibling
    """, review_pane_anchor)

# Start from an empty list so records are not carried over from an earlier run.
dataReviews = []

if dom_review_container and dom_review_pane_sibling:
    # Every direct child carrying a data-review-id attribute is one review card.
    dom_reviews = driver.execute_script("""
        return arguments[0].querySelectorAll(":scope > [data-review-id]")
    """, dom_review_pane_sibling)

    # Nothing to extract when the query produced no review cards.
    if dom_reviews:
        for review in dom_reviews:
            # Fields stay None when the matching node is missing; they are
            # turned into "[no_...]" placeholders when the record is built.
            authorName, authorContrib, authorReview, ariaStars, authorTimestamp = None, None, None, None, None

            # Root node of the card's content: first child of the nested
            # [data-review-id] element.
            src = driver.execute_script("""
                return arguments[0].querySelector("[data-review-id]")?.firstElementChild
            """, review)
            if src:
                # get author's name
                authorName = driver.execute_script("""
                    return arguments[0].children[1]?.querySelector("[data-review-id]")?.children[0]?.textContent
                """, src)

                # get author's contrib
                authorContrib = driver.execute_script("""
                    return arguments[0].children[1]?.querySelector("[data-review-id]")?.children[1]?.textContent
                """, src)

                # get stars aria-label
                ariaStars = driver.execute_script("""
                    return arguments[0].children[3]?.firstElementChild?.children[0]?.getAttribute("aria-label")
                """, src)

                # get humanized_timestamp value
                authorTimestamp = driver.execute_script("""
                    return arguments[0].children[3]?.firstElementChild?.children[1]?.textContent
                """, src)

                # get review
                com_reviews = driver.execute_script("""
                    return arguments[0].children[3]?.children[1]?.querySelector("[id]")?.children
                """, src)
                if com_reviews and len(com_reviews) > 0:
                    if len(com_reviews) > 1:
                        # A second child is taken to mean the text is
                        # truncated; click its first child to expand it.
                        # NOTE(review): the text is read right after the click
                        # with no wait -- confirm the expanded text is already
                        # in the DOM at that point.
                        print('truncated.', authorName)
                        driver.execute_script("""
                            return arguments[0].firstElementChild?.click()
                        """, com_reviews[1])
                    authorReview = driver.execute_script("""
                        return arguments[0].textContent
                    """, com_reviews[0])

            # A record is appended for every card, even when `src` was missing
            # (all fields then become placeholders).
            dataReviews.append({
                "name": noValueBuild(authorName, "author"),
                "review": noValueBuild(authorReview, "review"),
                "contrib": noValueBuild(authorContrib, "contrib"),
                "humanized_timestamp": noValueBuild(authorTimestamp, "humanized_timestamp"),
                "stars": {
                    "label": noValueBuild(ariaStars, "stars"),
                    "value": getSingleValueInt(ariaStars, 0),
                }
            })

# Pretty-print the collected records; keys keep their insertion order.
reviews_json = json.dumps(dataReviews, indent=2, sort_keys=False)
print(reviews_json)

# Close the browser session.
# NOTE(review): this line is not reached if an earlier statement in the cell
# raises, which leaves the browser open -- consider a try/finally.
driver.quit()

review container 2665
scrolling... 1989.5999755859375 2665
scrolling... 3577.60009765625 4253
scrolling... 4192 4867
Done scrolling.
[
  {
    "name": "Sri Plahesti",
    "review": "Alhamdulillah... Barokalloh... Islamic boarding school prints Tahfiz Qu'ran.... \ud83e\udd32\ud83e\udd32\ud83d\udcaa \u2026",
    "contrib": "2 reviews",
    "humanized_timestamp": "a year ago",
    "stars": {
      "label": "5 stars",
      "value": 5
    }
  },
  {
    "name": "Iwan _SMK PURWAJAYA",
    "review": "The place is OK, the dormitory is clean and comfortable. Lots of durian trees :)",
    "contrib": "Local Guide \u00b7 30 reviews \u00b7 108 photos",
    "humanized_timestamp": "6 years ago",
    "stars": {
      "label": "5 stars",
      "value": 5
    }
  },
  {
    "name": "Bukuan ID",
    "review": "days",
    "contrib": "Local Guide \u00b7 16 reviews \u00b7 55 photos",
    "humanized_timestamp": "3 years ago",
    "stars": {
      "label": "5 stars",
      "value": 5
    }
  },
  {
    "name

In [44]:
import pandas as pd

In [None]:
# Flatten the review records into a table; the nested "stars" dict becomes
# the dotted columns "stars.label" and "stars.value".
df_sources = pd.json_normalize(data=dataReviews)
df_sources