# Instructions
- Ensure no deduplication
- Ensure that when re-run old csv with data is kept 
- Ensure that csv contains time the data is scraped

# Left to scrape

In [1]:
# https://megatix.in.th/events?category=31
# https://www.savaya.com/
# https://ra.co/events/id/bali
# jakarta has issues scraping

#https://www.singaporeexpo.com.sg/what-s-on/events-expo?utm_source=google_pmax_sitelink&utm_medium=social_ads&utm_campaign=eventpackages-2526&utm_content=cny2026_general&gad_source=1&gad_campaignid=23175866147&gbraid=0AAAABBnFXuJ1_a_rRdCD1OzIMMiCFAgK_&gclid=CjwKCAiA_dDIBhB6EiwAvzc1cI6Q8DekR41jwb9VBhRaHME2P3lrb1gre2Apdfbq5tl-9aU5F7dkFRoCW8AQAvD_BwE
#https://www.thestar.sg/events
#https://www.songkick.com/venues/4360044-pasir-panjang-power-station
#https://phuket.cafedelmar.com/events
#https://www.musinsagarage.com/program
#https://www.wanderlochhall.com/15
#https://www.musinsagarage.com/program

# Singapore (Completed)

In [2]:
# COMBINED: Eventbrite + Ticketmaster + Megatix (Featured)
# NO DEDUP ‚Äî KEEP ALL EVENTS
# pip install tabulate

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from tabulate import tabulate
import pandas as pd
import time
import os

driver = webdriver.Chrome()
driver.set_window_size(1400, 1000)

# ---------- small helpers (added) ----------
def slow_scroll_page(driver, max_loops=200, pause=0.8):
    """
    Incrementally scrolls the page and stops when scrollHeight stops growing
    for several consecutive loops. Gentle pause to let lazy-load content render.
    """
    prev_h = 0
    no_growth = 0
    for _ in range(max_loops):
        driver.execute_script("window.scrollBy(0, Math.max(500, window.innerHeight*0.85));")
        time.sleep(pause)
        h = driver.execute_script("return Math.max(document.body.scrollHeight, document.documentElement.scrollHeight);")
        if h <= prev_h:
            no_growth += 1
            if no_growth >= 4:
                break
        else:
            no_growth = 0
            prev_h = h

def try_click_load_more_by_text(possible_texts=("Load more","Show more","See more","More")):
    """
    Best-effort clicker for generic 'Load more' buttons if present on screen.
    Safe to call even if not present.
    """
    xp = "//*[self::button or self::a][" + " or ".join([f"contains(translate(normalize-space(.), 'ABCDEFGHIJKLMNOPQRSTUVWXYZ','abcdefghijklmnopqrstuvwxyz'), '{t.lower()}')" for t in possible_texts]) + "]"
    try:
        btns = driver.find_elements(By.XPATH, xp)
        for b in btns:
            try:
                driver.execute_script("arguments[0].scrollIntoView({block:'center'});", b)
                time.sleep(0.3)
                driver.execute_script("arguments[0].click();", b)
                time.sleep(1.2)
                return True
            except:
                pass
    except:
        pass
    return False

all_rows = []

# -------------------- 1) EVENTBRITE (with auto-scroll + load more) --------------------
try:
    url = 'https://www.eventbrite.sg/d/singapore--singapore/singapore/?subcategories=3006%2C3025&page=1'
    driver.get(url)

    LIST_X = "//*[@id='root']/div/div[2]/div/div/div/div[1]/div/main/div/div/div/section[1]/div/section/div/div/section/ul"
    CARD_X = LIST_X + "/li"
    BASE   = ".//div/div/div[2]/section/div/section[2]/div"

    WebDriverWait(driver, 15).until(EC.presence_of_element_located((By.XPATH, LIST_X)))
    time.sleep(1)

    # --- slow scroll phase (added) ---
    for _ in range(8):  # a few cycles: scroll, then try to click 'load more' if it appears
        slow_scroll_page(driver, max_loops=40, pause=0.8)
        # try expanding if a button exists, then scroll again next iteration
        clicked = try_click_load_more_by_text(("Load more","Show more"))
        if not clicked:
            # if nothing to click and page stopped growing, break early
            break

    for ev in driver.find_elements(By.XPATH, CARD_X):
        try: title = ev.find_element(By.XPATH, f"{BASE}/a/h3").text.strip()
        except: title = "No Title"
        try: date = ev.find_element(By.XPATH, f"{BASE}/p[1]").text.strip()
        except: date = "No Date"
        try: location = ev.find_element(By.XPATH, f"{BASE}/p[2]").text.strip()
        except: location = "No Location"
        try: link = ev.find_element(By.XPATH, f"{BASE}/a").get_attribute("href")
        except: link = "No Link"
        all_rows.append({"Title": title, "Date": date, "Location": location, "Link": link})

    print(f"üü† Eventbrite: grabbed {len(driver.find_elements(By.XPATH, CARD_X))} events")
except Exception as e:
    print(f"‚ö† Eventbrite error: {e}")

# -------------------- 2) TICKETMASTER (scroll) --------------------
try:
    url = 'https://ticketmaster.sg/categories/concerts'
    driver.get(url)

    EVENTS_X = "//*[@id='events']"
    ANCHOR_X = "//*[@id='events']//a[contains(@href,'/activity/detail/')]"
    TITLE_REL = ".//div[2]/div[2]"
    DATE_REL  = ".//div[2]/div[1]"

    WebDriverWait(driver, 15).until(EC.presence_of_element_located((By.XPATH, EVENTS_X)))
    time.sleep(1)

    # --- slow scroll phase (added) ---
    driver.execute_script("document.getElementById('events')?.scrollIntoView({block:'start'});")
    for _ in range(6):
        slow_scroll_page(driver, max_loops=40, pause=0.8)
        # some pages lazy-load in batches; small pause between cycles
        time.sleep(0.6)

    for a in driver.find_elements(By.XPATH, ANCHOR_X):
        try: title = a.find_element(By.XPATH, TITLE_REL).text.strip()
        except: title = "No Title"
        try: date = a.find_element(By.XPATH, DATE_REL).text.strip()
        except: date = "No Date"
        link = a.get_attribute("href") or "No Link"
        location = "Singapore"
        all_rows.append({"Title": title, "Date": date, "Location": location, "Link": link})

    print(f"üîµ Ticketmaster: grabbed {len(driver.find_elements(By.XPATH, ANCHOR_X))} events")
except Exception as e:
    print(f"‚ö† Ticketmaster error: {e}")

# -------------------- 3) MEGATIX (Featured scroll) --------------------
try:
    url = "https://megatix.com.sg/?page=2"
    driver.get(url)

    WebDriverWait(driver, 15).until(EC.presence_of_element_located((By.XPATH, "//*[@id='__nuxt']")))
    FEATURED_WRAP_X = "//*[@id='__nuxt']/div/div[3]/div/main/section/section[2]/div[2]/div"
    FEATURED_CARD_X = "//*[@id='__nuxt']/div/div[3]/div/main/section/section[2]/div[2]//article[.//h3]"

    wrap = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, FEATURED_WRAP_X)))
    driver.execute_script("arguments[0].scrollIntoView({block:'center'});", wrap)
    time.sleep(0.8)

    # --- slow scroll phase (added) ---
    for _ in range(6):
        slow_scroll_page(driver, max_loops=40, pause=0.9)
        # if site uses infinite list inside this section, ensure it's in view
        driver.execute_script("arguments[0].scrollIntoView({block:'center'});", wrap)
        time.sleep(0.4)

    for ev in driver.find_elements(By.XPATH, FEATURED_CARD_X):
        try: title = ev.find_element(By.XPATH, ".//h3/span").text.strip()
        except: title = "No Title"
        try: date = ev.find_element(By.XPATH, ".//div[1]/div[1]/span").text.strip()
        except: date = "No Date"
        try: location = ev.find_element(By.XPATH, ".//div[1]/div[3]/span").text.strip()
        except: location = "Singapore"
        try: link = ev.find_element(By.XPATH, ".//ancestor::a").get_attribute("href") or "No Link"
        except: link = "No Link"
        all_rows.append({"Title": title, "Date": date, "Location": location, "Link": link})

    print(f"üü¢ Megatix (Featured): grabbed {len(driver.find_elements(By.XPATH, FEATURED_CARD_X))} events")
except Exception as e:
    print(f"‚ö† Megatix error: {e}")

driver.quit()

# -------------------- FINAL OUTPUT (append-only, no duplicates across runs) --------------------
OUTPUT_CSV = "all_singapore.csv"

# Add scrape timestamp
scraped_at = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime())
for row in all_rows:
    row["scraped_at_utc"] = scraped_at

df_all = pd.DataFrame(all_rows, columns=["Title", "Date", "Location", "Link", "scraped_at_utc"]).reset_index(drop=True)

# Clean up text fields
df_all["Link"] = df_all["Link"].fillna("").str.strip()
df_all["Title"] = df_all["Title"].fillna("").str.strip()
df_all["Date"] = df_all["Date"].fillna("").str.strip()
df_all["Location"] = df_all["Location"].fillna("").str.strip()

# Load previous CSV if exists
try:
    df_existing = pd.read_csv(OUTPUT_CSV)
    if "Link" not in df_existing.columns:
        for col in ["Title", "Date", "Location", "Link", "scraped_at_utc"]:
            if col not in df_existing.columns:
                df_existing[col] = ""
        df_existing = df_existing[["Title", "Date", "Location", "Link", "scraped_at_utc"]]
except FileNotFoundError:
    df_existing = pd.DataFrame(columns=["Title", "Date", "Location", "Link", "scraped_at_utc"])

existing_links = set(df_existing["Link"].astype(str).tolist())
new_rows = df_all[~df_all["Link"].astype(str).isin(existing_links)]

df_out = pd.concat([df_existing, new_rows], ignore_index=True)

print(f"\n‚úÖ Newly scraped this run: {len(df_all)}")
print(f"‚ûï New (not seen before): {len(new_rows)}")
print(f"üì¶ Total in master after save: {len(df_out)}")

df_out.to_csv(OUTPUT_CSV, index=False, encoding="utf-8")
print(f"\nüíæ CSV saved: {OUTPUT_CSV}")

üü† Eventbrite: grabbed 6 events
üîµ Ticketmaster: grabbed 52 events
üü¢ Megatix (Featured): grabbed 5 events

‚úÖ Newly scraped this run: 63
‚ûï New (not seen before): 7
üì¶ Total in master after save: 103

üíæ CSV saved: all_singapore.csv


# Singapore (Venues)

In [3]:
import os
import time
from datetime import datetime, timezone

import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

URL = "https://www.singaporeexpo.com.sg/what-s-on/events-expo?utm_source=google_pmax_sitelink&utm_medium=social_ads&utm_campaign=eventpackages-2526&utm_content=cny2026_general&gad_source=1&gad_campaignid=23175866147&gbraid=0AAAABBnFXuJ1_a_rRdCD1OzIMMiCFAgK_&gclid=CjwKCAiA_dDIBhB6EiwAvzc1cI6Q8DekR41jwb9VBhRaHME2P3lrb1gre2Apdfbq5tl-9aU5F7dkFRoCW8AQAvD_BwE"
CSV_PATH = "all_singapore_venues.csv"


# ---------------------------------------------------------
# UTILITIES
# ---------------------------------------------------------

def get_utc_timestamp():
    return datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S%z")


def setup_driver():
    options = webdriver.ChromeOptions()
    options.add_argument("--start-maximized")
    # options.add_argument("--headless=new")  # enable if you want headless
    return webdriver.Chrome(options=options)


def wait_for_page_loaded(driver, timeout=30):
    WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located((By.ID, "eventListPage"))
    )


def lazy_scroll(driver, pause=2, max_loops=20):
    """Scroll until no more lazy-loading occurs."""
    last_height = driver.execute_script("return document.body.scrollHeight")

    for _ in range(max_loops):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(pause)

        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height


# ---------------------------------------------------------
# SCRAPER
# ---------------------------------------------------------

def scrape_events(driver):
    driver.get(URL)
    wait_for_page_loaded(driver)
    lazy_scroll(driver)

    # event section (outer container)
    try:
        section_el = driver.find_element(By.XPATH, "//*[@id='eventListPage']/div[2]")
        section_text = section_el.text.strip()
    except:
        section_text = ""

    # all event card LI items
    items = driver.find_elements(By.XPATH, "//*[@id='eventListPage']/div[2]/ul/li")

    scraped_at = get_utc_timestamp()
    rows = []

    for li in items:
        try:
            name_el = li.find_element(By.XPATH, ".//div/a/div[2]/div[3]/h2")
            event_name = name_el.text.strip()
        except:
            event_name = ""

        try:
            dl_el = li.find_element(By.XPATH, ".//div/a/div[2]/div[1]")
            dl_text = dl_el.text.strip()
            parts = [x.strip() for x in dl_text.split("\n") if x.strip()]
            event_date = parts[0] if len(parts) > 0 else ""
            event_location = parts[1] if len(parts) > 1 else ""
        except:
            event_date = ""
            event_location = ""

        try:
            link_el = li.find_element(By.XPATH, ".//div/a")
            event_link = link_el.get_attribute("href") or ""
        except:
            event_link = ""

        if not event_name:
            continue

        rows.append({
            "event_name": event_name,
            "event_date": event_date,
            "event_location": event_location,
            "event_section": section_text,
            "event_link": event_link,
            "scraped_at_utc": scraped_at
        })

    return rows


# ---------------------------------------------------------
# MERGING / SAVING (NO ROW REMOVAL)
# ---------------------------------------------------------

def build_uid(df: pd.DataFrame) -> pd.Series:
    """Stable unique key for dedupe."""
    return (
        df.get("event_name", "").fillna("") + " | " +
        df.get("event_date", "").fillna("") + " | " +
        df.get("event_location", "").fillna("") + " | " +
        df.get("event_link", "").fillna("")
    )


def save_merged_csv(new_rows, path=CSV_PATH):
    new_df = pd.DataFrame(new_rows)
    if new_df.empty:
        print("No events scraped.")
        return

    # Build new UID column
    new_df["uid"] = build_uid(new_df)

    # Case 1: CSV already exists ‚Üí merge without deleting anything
    if os.path.exists(path):
        existing_df = pd.read_csv(path, dtype=str)

        if "uid" not in existing_df.columns:
            existing_df["uid"] = build_uid(existing_df)

        # Combine
        combined = pd.concat([existing_df, new_df], ignore_index=True)

        # Drop duplicates (keep existing rows)
        combined = combined.drop_duplicates(subset=["uid"], keep="first")

    else:
        # First creation
        combined = new_df

    combined.to_csv(path, index=False, encoding="utf-8-sig")
    print(f"Saved {len(combined)} rows to {path}")


# ---------------------------------------------------------
# MAIN
# ---------------------------------------------------------

def main():
    driver = setup_driver()
    try:
        events = scrape_events(driver)
        print(f"Scraped {len(events)} events.")
        save_merged_csv(events)
    finally:
        driver.quit()


if __name__ == "__main__":
    main()


Scraped 2 events.
Saved 42 rows to all_singapore_venues.csv


In [4]:
import os
import time
from datetime import datetime, timezone

import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

URL = "https://www.thestar.sg/events"
CSV_PATH = "all_singapore_venues.csv"

# ---------------------------------------------------------
# UTILITIES
# ---------------------------------------------------------

def get_utc_timestamp():
    return datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S%z")


def setup_driver():
    options = webdriver.ChromeOptions()
    options.add_argument("--start-maximized")
    # options.add_argument("--headless=new")  # uncomment if you want headless
    driver = webdriver.Chrome(options=options)
    return driver


def wait_for_page_loaded(driver, timeout=30):
    # Wait for the main event section container to appear
    WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located(
            (By.XPATH, "/html/body/section/div/div/div[3]/div")
        )
    )


def lazy_scroll(driver, pause=2.0, max_loops=20):
    """
    Scroll down repeatedly to trigger any lazy-loading until
    page height stops changing or max_loops is reached.
    """
    last_height = driver.execute_script("return document.body.scrollHeight")

    for _ in range(max_loops):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(pause)

        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height


# ---------------------------------------------------------
# SCRAPER
# ---------------------------------------------------------

def scrape_events(driver):
    driver.get(URL)
    wait_for_page_loaded(driver)
    lazy_scroll(driver)

    # Event section container (where all the event cards live)
    try:
        section_el = driver.find_element(
            By.XPATH, "/html/body/section/div/div/div[3]/div"
        )
    except Exception:
        print("Could not find main events section.")
        return []

    # Each event card is under that div as a direct child div
    items = section_el.find_elements(By.XPATH, "./div")

    scraped_at = get_utc_timestamp()
    rows = []

    for item in items:
        # Using your reference XPaths but in relative form:
        # /html/body/section/div/div/div[3]/div/div[1]/a/h4 ‚Üí .//a/h4
        # /html/body/section/div/div/div[3]/div/div[1]/a/h5 ‚Üí .//a/h5

        try:
            name_el = item.find_element(By.XPATH, ".//a/h4")
            event_name = name_el.text.strip()
        except Exception:
            event_name = ""

        try:
            date_el = item.find_element(By.XPATH, ".//a/h5")
            event_date = date_el.text.strip()
        except Exception:
            event_date = ""

        try:
            link_el = item.find_element(By.XPATH, ".//a")
            event_link = link_el.get_attribute("href") or ""
        except Exception:
            event_link = ""

        if not event_name:
            # Skip blank cards
            continue

        row = {
            "event_name": event_name,
            "event_date": event_date,
            "event_location": "The Star",  # you can change or remove this
            "event_section": "The Star Events",  # label for source/section
            "event_link": event_link,
            "scraped_at_utc": scraped_at,
        }
        rows.append(row)

    return rows


# ---------------------------------------------------------
# MERGING / SAVING (NO ROW REMOVAL)
# ---------------------------------------------------------

def build_uid(df: pd.DataFrame) -> pd.Series:
    """
    Build a stable unique ID for each event based on key fields.
    Used for dedup when merging with existing CSV.
    """
    return (
        df.get("event_name", "").fillna("") + " | " +
        df.get("event_date", "").fillna("") + " | " +
        df.get("event_location", "").fillna("") + " | " +
        df.get("event_link", "").fillna("")
    )


def save_merged_csv(new_rows, path=CSV_PATH):
    new_df = pd.DataFrame(new_rows)
    if new_df.empty:
        print("No events scraped.")
        return

    # Add UID to new data
    new_df["uid"] = build_uid(new_df)

    # If file exists, load and merge; otherwise, just create it
    if os.path.exists(path):
        existing_df = pd.read_csv(path, dtype=str)

        if "uid" not in existing_df.columns:
            existing_df["uid"] = build_uid(existing_df)

        # Combine old + new
        combined = pd.concat([existing_df, new_df], ignore_index=True)

        # Drop duplicates based on uid (keep the original/oldest)
        combined = combined.drop_duplicates(subset=["uid"], keep="first")
    else:
        combined = new_df

    combined.to_csv(path, index=False, encoding="utf-8-sig")
    print(f"Saved {len(combined)} rows to {path}")


# ---------------------------------------------------------
# MAIN
# ---------------------------------------------------------

def main():
    driver = setup_driver()
    try:
        rows = scrape_events(driver)
        print(f"Scraped {len(rows)} events from The Star.")
        save_merged_csv(rows)
    finally:
        driver.quit()


if __name__ == "__main__":
    main()


Scraped 18 events from The Star.
Saved 46 rows to all_singapore_venues.csv


In [5]:
import os
import time
from datetime import datetime, timezone

import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

URL = "https://www.songkick.com/venues/4360044-pasir-panjang-power-station"
CSV_PATH = "all_singapore_venues.csv"

# ---------------------------------------------------------
# UTILITIES
# ---------------------------------------------------------

def get_utc_timestamp():
    """Return current UTC time as a string."""
    return datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S%z")


def setup_driver():
    options = webdriver.ChromeOptions()
    options.add_argument("--start-maximized")
    # options.add_argument("--headless=new")  # uncomment if you want headless mode
    driver = webdriver.Chrome(options=options)
    return driver


def wait_for_page_loaded(driver, timeout=30):
    """
    Wait until the calendar-summary section is present.
    This is the container you referenced: //*[@id="calendar-summary"]
    """
    WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located((By.ID, "calendar-summary"))
    )


def lazy_scroll(driver, pause=2.0, max_loops=20):
    """
    Scroll down repeatedly to trigger lazy-loading until the page
    height stops changing or max_loops is reached.
    """
    last_height = driver.execute_script("return document.body.scrollHeight")

    for _ in range(max_loops):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(pause)

        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height


# ---------------------------------------------------------
# SCRAPER
# ---------------------------------------------------------

def scrape_events(driver):
    driver.get(URL)
    wait_for_page_loaded(driver)
    lazy_scroll(driver)

    # Main calendar section
    section_el = driver.find_element(By.ID, "calendar-summary")

    # Each li under the UL is an event
    items = section_el.find_elements(By.XPATH, "./ul/li")

    scraped_at = get_utc_timestamp()
    rows = []

    for li in items:
        # event name: //*[@id="calendar-summary"]/ul/li[2]/p[1]/a/span/strong
        try:
            name_el = li.find_element(By.XPATH, ".//p[1]/a/span/strong")
            event_name = name_el.text.strip()
        except Exception:
            event_name = ""

        # event venue: //*[@id="calendar-summary"]/ul/li[2]/p[2]/span[1]/a
        try:
            venue_el = li.find_element(By.XPATH, ".//p[2]/span[1]/a")
            event_location = venue_el.text.strip()
        except Exception:
            # fallback: whole p[2] text if span/a not present
            try:
                venue_fallback = li.find_element(By.XPATH, ".//p[2]")
                event_location = venue_fallback.text.strip()
            except Exception:
                event_location = ""

        # event date:
        # structure on Songkick is usually "date" as first line of the LI text,
        # so we safely derive it from the LI's text content.
        try:
            lines = [x.strip() for x in li.text.split("\n") if x.strip()]
            event_date = lines[0] if lines else ""
        except Exception:
            event_date = ""

        # event link (artist/event link)
        try:
            link_el = li.find_element(By.XPATH, ".//p[1]/a")
            event_link = link_el.get_attribute("href") or ""
        except Exception:
            event_link = ""

        if not event_name:
            # skip weird/empty entries
            continue

        rows.append(
            {
                "event_name": event_name,
                "event_date": event_date,
                "event_location": event_location,
                "event_section": "Songkick Pasir Panjang Power Station",
                "event_link": event_link,
                "scraped_at_utc": scraped_at,
            }
        )

    return rows


# ---------------------------------------------------------
# MERGING / SAVING (NO ROW REMOVAL)
# ---------------------------------------------------------

def build_uid(df: pd.DataFrame) -> pd.Series:
    """
    Build a stable unique ID for each event for deduplication.
    Uses event_name + date + location + link.
    """
    return (
        df.get("event_name", "").fillna("") + " | " +
        df.get("event_date", "").fillna("") + " | " +
        df.get("event_location", "").fillna("") + " | " +
        df.get("event_link", "").fillna("")
    )


def save_merged_csv(new_rows, path=CSV_PATH):
    new_df = pd.DataFrame(new_rows)
    if new_df.empty:
        print("No events scraped.")
        return

    new_df["uid"] = build_uid(new_df)

    if os.path.exists(path):
        existing_df = pd.read_csv(path, dtype=str)

        if "uid" not in existing_df.columns:
            existing_df["uid"] = build_uid(existing_df)

        # combine old + new
        combined = pd.concat([existing_df, new_df], ignore_index=True)

        # drop duplicates by uid; keep the first (oldest) occurrence
        combined = combined.drop_duplicates(subset=["uid"], keep="first")
    else:
        combined = new_df

    combined.to_csv(path, index=False, encoding="utf-8-sig")
    print(f"Saved {len(combined)} rows to {path}")


# ---------------------------------------------------------
# MAIN
# ---------------------------------------------------------

def main():
    driver = setup_driver()
    try:
        rows = scrape_events(driver)
        print(f"Scraped {len(rows)} events from Songkick Pasir Panjang Power Station.")
        save_merged_csv(rows)
    finally:
        driver.quit()


if __name__ == "__main__":
    main()


Scraped 2 events from Songkick Pasir Panjang Power Station.
Saved 46 rows to all_singapore_venues.csv


# Kuala Lumpur (Completed)

In [6]:
# --- Live Nation Malaysia: All Events Scraper (deduplicated) ---
# URL: https://www.livenation.my/event/allevents
# Output CSV: livenation_my_all_events.csv
# Columns: Title, Date, Location, Link
# pip install selenium tabulate pandas

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, StaleElementReferenceException
from tabulate import tabulate
import pandas as pd
import time
import os
import re

# ---------------- CONFIG ----------------
OUTPUT_CSV = "all_events_KL.csv"
URL = "https://www.livenation.my/event/allevents"

# Load existing CSV (if any) to prevent duplicates
if os.path.exists(OUTPUT_CSV):
    try:
        existing_df = pd.read_csv(OUTPUT_CSV)
        seen_links = set(existing_df["Link"].dropna().astype(str))
    except Exception:
        existing_df = pd.DataFrame(columns=["Title", "Date", "Location", "Link"])
        seen_links = set()
else:
    existing_df = pd.DataFrame(columns=["Title", "Date", "Location", "Link"])
    seen_links = set()

# ---------------- DRIVER SETUP ----------------
opts = webdriver.ChromeOptions()
opts.add_argument("--start-maximized")
opts.add_argument("--disable-notifications")
opts.add_argument("--disable-popup-blocking")
driver = webdriver.Chrome(options=opts)
driver.set_window_size(1400, 1000)
all_rows = []

def accept_cookies(drv):
    for xp in [
        "//button[contains(., 'Accept All')]",
        "//button[contains(., 'Accept all')]",
        "//button[contains(., 'I Accept')]",
        "//button[contains(., 'Agree')]",
        "//button[contains(., 'OK')]",
        "//*[@id='onetrust-accept-btn-handler']",
    ]:
        try:
            btn = WebDriverWait(drv, 3).until(EC.element_to_be_clickable((By.XPATH, xp)))
            drv.execute_script("arguments[0].click();", btn)
            time.sleep(0.2)
            break
        except TimeoutException:
            continue
        except Exception:
            continue

def clean_text(s: str) -> str:
    # Collapse whitespace/newlines and trim
    return re.sub(r"\s+", " ", (s or "")).strip()

# ---------------- SCRAPER ----------------
try:
    driver.get(URL)
    WebDriverWait(driver, 15).until(EC.presence_of_element_located((By.XPATH, "//*[@id='main']")))
    time.sleep(1.0)
    accept_cookies(driver)

    # Each LI that has an event link and the date container
    ALL_CARDS_X = "//*[@id='main']//ul//li[.//a[contains(@href,'/event/')]]"

    # Scroll to load everything (and click Load/Show more if present)
    prev = -1
    stable = 0
    for _ in range(80):
        cards = driver.find_elements(By.XPATH, ALL_CARDS_X)
        count = len(cards)
        if count == prev:
            stable += 1
            # Try "Load/Show more" if present
            try:
                more = driver.find_element(By.XPATH, "//button[contains(.,'Load more') or contains(.,'Show more')]")
                if more.is_enabled() and more.is_displayed():
                    driver.execute_script("arguments[0].click();", more)
                    time.sleep(1.2)
                    cards = driver.find_elements(By.XPATH, ALL_CARDS_X)
                    count = len(cards)
                    if count > prev:
                        stable = 0
            except Exception:
                pass
            if stable >= 2:
                break
        else:
            stable = 0
        prev = count
        driver.execute_script("window.scrollBy(0, 1200);")
        time.sleep(1.1)

    cards = driver.find_elements(By.XPATH, ALL_CARDS_X)
    print(f"üü¢ Found {len(cards)} event cards")

    for li in cards:
        try:
            # Link (primary dedup key)
            try:
                link = li.find_element(By.XPATH, ".//a[contains(@href,'/event/')]").get_attribute("href") or ""
            except Exception:
                link = ""
            if not link or link in seen_links:
                continue

            # Title (your path first)
            title = ""
            for xp in [
                ".//span/div/div/p[1]",   # close to //*[@id='1616419']/span/div/div/p[1]
                ".//p[1]",
                ".//div//p[1]"
            ]:
                try:
                    title = clean_text(li.find_element(By.XPATH, xp).text)
                    if title:
                        break
                except Exception:
                    continue
            if not title:
                title = "No Title"

            # Location (your path first)
            location = ""
            for xp in [
                ".//span/div/div/p[3]/span",  # close to //*[@id='1616419']/span/div/div/p[3]/span
                ".//p[3]/span",
                ".//p[3]"
            ]:
                try:
                    location = clean_text(li.find_element(By.XPATH, xp).text)
                    if location:
                        break
                except Exception:
                    continue
            if not location:
                location = "Malaysia"

            # Date ‚Äî USE YOUR DIV CONTAINER XPATH RELATIVELY, then fall back to <time>
            # Provided absolute: //*[@id="main"]/div[2]/div[1]/div/div/div/div[2]/ul[1]/li[1]/div/div[1]
            # Relative to each li, this corresponds to: .//div/div[1]
            date_text = ""
            for xp in [
                ".//div/div[1]",          # your container (first choice)
                ".//time",                # fallback: visible text
            ]:
                try:
                    node = li.find_element(By.XPATH, xp)
                    # prefer node.text, otherwise datetime attribute if it's <time>
                    date_text = clean_text(node.text or node.get_attribute("datetime") or "")
                    if date_text:
                        break
                except Exception:
                    continue

            all_rows.append({
                "Title": title,
                "Date": date_text,
                "Location": location,
                "Link": link
            })
            seen_links.add(link)

        except StaleElementReferenceException:
            continue
        except Exception as e:
            print(f"‚ö† Skipped one card due to error: {e}")
            continue

    print(f"‚úÖ Live Nation MY (All Events): grabbed {len(all_rows)} new events this run")

except TimeoutException:
    print("‚ö† Timeout waiting for Live Nation MY page")
except Exception as e:
    print(f"‚ö† Error: {e}")
finally:
    driver.quit()

# ---------------- OUTPUT ----------------
if all_rows:
    new_df = pd.DataFrame(all_rows, columns=["Title", "Date", "Location", "Link"])
    final_df = pd.concat([existing_df, new_df], ignore_index=True)
else:
    final_df = existing_df.copy()

# Final safety dedup
if not final_df.empty:
    final_df["Link"] = final_df["Link"].astype(str).str.strip()
    final_df = final_df.drop_duplicates(subset=["Link"], keep="first")
    final_df = final_df.drop_duplicates(subset=["Title", "Date", "Location"], keep="first")

print(f"\n‚úÖ TOTAL Unique Events: {len(final_df)}")
print(tabulate(final_df.tail(20), headers="keys", tablefmt="github", showindex=False))

final_df.to_csv(OUTPUT_CSV, index=False, encoding="utf-8")
print(f"\nüíæ CSV saved: {OUTPUT_CSV}")






üü¢ Found 11 event cards
‚úÖ Live Nation MY (All Events): grabbed 0 new events this run

‚úÖ TOTAL Unique Events: 118
| Title                                                                                 | Date                  | Location                                                    | Link                                                                                                                                                                                                                                                                         | TimeScraped      |
|---------------------------------------------------------------------------------------|-----------------------|-------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [7]:
# """
# BookMyShow MY ‚Äî Kuala Lumpur scraper (visible tabs; NaN-safe merge; 4-column CSV)
# - Stable scrape with slower listing reading
# - Safe merge that handles NaN, floats, blanks gracefully
# - CSV schema: Title | Date | Location | Link
# """

# import os
# import time
# import sys
# import traceback
# import hashlib
# from typing import Optional, List, Dict

# import pandas as pd
# from selenium import webdriver
# from selenium.webdriver.common.by import By
# from selenium.webdriver.support.ui import WebDriverWait
# from selenium.webdriver.support import expected_conditions as EC
# from selenium.common.exceptions import (
#     TimeoutException,
#     NoSuchElementException,
#     StaleElementReferenceException,
#     WebDriverException,
# )

# # ===================== CONFIG =====================
# LIST_URL   = "https://my.bookmyshow.com/en/collections/e?cities=Kuala+Lumpur___Kuala+Lumpur"
# OUTPUT_CSV = "all_events_KL.csv"

# HEADLESS          = False
# OPEN_IN_NEW_TAB   = True
# PAUSE_ON_EACH_TAB = 0.8
# TAKE_ERROR_SHOTS  = True

# SLOW_MODE               = True
# LISTING_INITIAL_PAUSE   = 1.2
# PER_CARD_SCROLL_SETTLE  = 0.35
# PER_CARD_READ_RETRIES   = 3
# PER_CARD_READ_SLEEP     = 0.25
# BETWEEN_CARD_PAUSE      = 0.1

# WAIT_SEC          = 20
# PAGE_LOAD_TIMEOUT = 25

# X_LIST_CONTAINER = "/html/body/div[1]/main/div/div/div[2]"
# X_CARD_ANCHOR    = X_LIST_CONTAINER + "/a[{i}]"
# X_NAME           = "/html/body/div[1]/main/div/div/div[2]/a[{i}]/div/div[2]/div/div/div[1]"
# X_DATE           = "/html/body/div[1]/main/div/div/div[2]/a[{i}]/div/div[2]/div/div/div[2]/span"
# X_VENUE_DETAIL   = "/html/body/div[1]/main/div/div/div[2]/div[1]/div[3]/div[4]/div[1]"

# # ===================== DRIVER =====================
# def build_driver(headless: bool = HEADLESS) -> webdriver.Chrome:
#     ua = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
#           "(KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36")
#     opts = webdriver.ChromeOptions()
#     opts.add_argument(f"user-agent={ua}")
#     if headless:
#         opts.add_argument("--headless=new")
#     opts.add_argument("--window-size=1400,1000")
#     opts.add_argument("--disable-notifications")
#     opts.add_argument("--disable-blink-features=AutomationControlled")
#     opts.page_load_strategy = "eager"

#     driver = webdriver.Chrome(options=opts)
#     driver.set_page_load_timeout(PAGE_LOAD_TIMEOUT)
#     driver.implicitly_wait(2)
#     return driver

# # ===================== SAFE STRING HELPERS =====================
# def _s(x) -> str:
#     """Convert any NaN, None, or float to safe string."""
#     if x is None:
#         return ""
#     try:
#         if isinstance(x, float) and pd.isna(x):
#             return ""
#     except Exception:
#         pass
#     return str(x)

# def normalize_link(u) -> str:
#     u = _s(u).strip().lower()
#     return u.rstrip("/")

# def stable_event_id(title, date_str, location, link_norm) -> str:
#     link_norm = _s(link_norm).strip().lower()
#     if link_norm:
#         return link_norm
#     key = "|".join([
#         _s(title).strip().lower(),
#         _s(date_str).strip().lower(),
#         _s(location).strip().lower(),
#     ])
#     return hashlib.md5(key.encode("utf-8")).hexdigest()

# # ===================== HELPERS =====================
# def wait_present(driver, xpath, timeout=WAIT_SEC):
#     return WebDriverWait(driver, timeout).until(
#         EC.presence_of_element_located((By.XPATH, xpath))
#     )

# def scroll_lazy(driver, rounds=10, pause=0.5):
#     last_h = 0
#     for _ in range(rounds):
#         driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
#         time.sleep(pause)
#         h = driver.execute_script("return document.body.scrollHeight;")
#         if h == last_h:
#             break
#         last_h = h

# def get_text_abs(driver, xpath) -> str:
#     try:
#         return driver.find_element(By.XPATH, xpath).text.strip()
#     except NoSuchElementException:
#         return ""
#     except StaleElementReferenceException:
#         try:
#             return driver.find_element(By.XPATH, xpath).text.strip()
#         except Exception:
#             return ""

# def read_text_nonempty(driver, xpath: str, retries: int, sleep_s: float) -> str:
#     txt = get_text_abs(driver, xpath)
#     if txt:
#         return txt
#     for _ in range(retries):
#         time.sleep(sleep_s)
#         txt = get_text_abs(driver, xpath)
#         if txt:
#             break
#     return txt

# def ensure_dir(path: str):
#     os.makedirs(path, exist_ok=True)

# def safe_screenshot(driver, path: str):
#     try:
#         driver.save_screenshot(path)
#     except Exception:
#         pass

# def robust_get(driver, url: str, tries: int = 2, settle: float = 0.4):
#     last_err: Optional[Exception] = None
#     for _ in range(tries):
#         try:
#             driver.get(url)
#             time.sleep(settle)
#             return
#         except Exception as e:
#             last_err = e
#             time.sleep(0.7)
#     if last_err:
#         raise last_err

# def open_in_new_tab(driver, url: str):
#     current = driver.current_window_handle
#     driver.execute_script("window.open(arguments[0], '_blank');", url)
#     WebDriverWait(driver, 10).until(lambda d: len(d.window_handles) > 1)
#     new_handle = [h for h in driver.window_handles if h != current][-1]
#     driver.switch_to.window(new_handle)
#     return current, new_handle

# def close_tab_and_return(driver, original_handle: str):
#     try:
#         driver.close()
#     finally:
#         driver.switch_to.window(original_handle)

# # ===================== MERGE (NaN-SAFE) =====================
# def merge_with_existing(df_new: pd.DataFrame, path: str) -> pd.DataFrame:
#     """
#     Merge with old CSV (if exists):
#     - Keeps old rows (no removals)
#     - No duplicates
#     - Overwrites same events with latest scrape
#     - NaN-safe
#     """
#     df_new = df_new.copy()
#     for col in ["Title","Date","Location","Link"]:
#         if col not in df_new.columns:
#             df_new[col] = ""
#     df_new[["Title","Date","Location","Link"]] = df_new[["Title","Date","Location","Link"]].fillna("")
#     df_new["LinkNorm"] = df_new["Link"].apply(normalize_link)
#     df_new["EventID"]  = df_new.apply(
#         lambda r: stable_event_id(r.get("Title",""), r.get("Date",""),
#                                   r.get("Location",""), r.get("LinkNorm","")), axis=1
#     )

#     try:
#         df_old = pd.read_csv(path)
#     except FileNotFoundError:
#         return df_new[["Title","Date","Location","Link"]]

#     df_old = df_old.copy()
#     for col in ["Title","Date","Location","Link"]:
#         if col not in df_old.columns:
#             df_old[col] = ""
#     df_old[["Title","Date","Location","Link"]] = df_old[["Title","Date","Location","Link"]].fillna("")
#     df_old["LinkNorm"] = df_old["Link"].apply(normalize_link)
#     df_old["EventID"]  = df_old.apply(
#         lambda r: stable_event_id(r.get("Title",""), r.get("Date",""),
#                                   r.get("Location",""), r.get("LinkNorm","")), axis=1
#     )

#     merged = pd.concat([df_old, df_new], ignore_index=True)
#     merged = merged.drop_duplicates(subset=["EventID"], keep="last")
#     merged = merged.drop_duplicates(subset=["LinkNorm"], keep="last")
#     return merged[["Title","Date","Location","Link"]]

# # ===================== CORE SCRAPE =====================
# def collect_listing_cards(driver) -> List[Dict[str, str]]:
#     print(f"Opening listing: {LIST_URL}")
#     robust_get(driver, LIST_URL, tries=3, settle=0.6)
#     wait_present(driver, X_LIST_CONTAINER, timeout=WAIT_SEC)
#     scroll_lazy(driver, rounds=10, pause=0.5)
#     if SLOW_MODE:
#         time.sleep(LISTING_INITIAL_PAUSE)

#     anchors = driver.find_elements(By.XPATH, X_LIST_CONTAINER + "/a")
#     n = len(anchors)
#     print(f"Found {n} cards in listing.")
#     cards = []
#     for i in range(1, n + 1):
#         name_x = X_NAME.format(i=i)
#         date_x = X_DATE.format(i=i)
#         a_x    = X_CARD_ANCHOR.format(i=i)

#         try:
#             a_el = driver.find_element(By.XPATH, a_x)
#             driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", a_el)
#         except Exception:
#             pass
#         time.sleep(PER_CARD_SCROLL_SETTLE)

#         title = read_text_nonempty(driver, name_x, PER_CARD_READ_RETRIES, PER_CARD_READ_SLEEP)
#         date  = read_text_nonempty(driver, date_x, PER_CARD_READ_RETRIES, PER_CARD_READ_SLEEP)

#         href = ""
#         try:
#             a_el = driver.find_element(By.XPATH, a_x)
#             href = (a_el.get_attribute("href") or "").strip()
#         except Exception:
#             pass

#         cards.append({"i": i, "title": title, "date": date, "href": href})
#         time.sleep(BETWEEN_CARD_PAUSE)
#     return cards

# def scrape_detail_for_location(driver, url: str, idx: int) -> str:
#     if not url:
#         return ""
#     try:
#         if OPEN_IN_NEW_TAB:
#             print(f"  ‚Üí Opening tab #{idx}: {url}")
#             original = driver.current_window_handle
#             driver.execute_script("window.open(arguments[0], '_blank');", url)
#             WebDriverWait(driver, 10).until(lambda d: len(d.window_handles) > 1)
#             new_handle = [h for h in driver.window_handles if h != original][-1]
#             driver.switch_to.window(new_handle)
#         else:
#             original = driver.current_window_handle
#             driver.get(url)

#         time.sleep(PAUSE_ON_EACH_TAB)

#         try:
#             WebDriverWait(driver, WAIT_SEC).until(
#                 EC.presence_of_element_located((By.XPATH, X_VENUE_DETAIL))
#             )
#             location = driver.find_element(By.XPATH, X_VENUE_DETAIL).text.strip()
#             print(f"    ‚úì Location scraped: {location[:60]}")
#         except TimeoutException:
#             location = ""
#             print("    ‚ö† Location not found (leaving blank).")

#         if OPEN_IN_NEW_TAB:
#             driver.close()
#             driver.switch_to.window(original)
#         return location
#     except Exception as e:
#         print(f"    ‚úó Error on detail page: {type(e).__name__}: {e}")
#         if TAKE_ERROR_SHOTS:
#             ensure_dir("debug_screens")
#             safe_screenshot(driver, f"debug_screens/error_event_{idx}.png")
#         try:
#             if OPEN_IN_NEW_TAB:
#                 driver.close()
#                 driver.switch_to.window(original)
#         except Exception:
#             pass
#         return ""

# def scrape_bms_kl(driver: webdriver.Chrome) -> pd.DataFrame:
#     cards = collect_listing_cards(driver)
#     rows = []
#     for idx, c in enumerate(cards, start=1):
#         title = c["title"]
#         date  = c["date"]
#         href  = c["href"]
#         print(f"\n[{idx}/{len(cards)}] {title} | {date}")
#         location = scrape_detail_for_location(driver, href, idx) if href else ""
#         rows.append({"Title": title, "Date": date, "Location": location, "Link": href})
#     return pd.DataFrame(rows, columns=["Title","Date","Location","Link"])

# # ===================== MAIN =====================
# def main(headless=HEADLESS):
#     driver = build_driver(headless=headless)
#     try:
#         df_new = scrape_bms_kl(driver)
#     finally:
#         try:
#             driver.quit()
#         except Exception:
#             pass
#     if df_new.empty:
#         print("No events found.")
#         return
#     out = merge_with_existing(df_new, OUTPUT_CSV)
#     out.to_csv(OUTPUT_CSV, index=False, encoding="utf-8")
#     print(f"\nSaved {len(out)} total rows -> {OUTPUT_CSV}")

# if __name__ == "__main__":
#     try:
#         main(headless=HEADLESS)
#     except Exception as e:
#         print(f"[fatal] {type(e).__name__}: {e}")
#         traceback.print_exc()
#         sys.exit(1)


## has issues scraping due to limits



In [8]:
# scrape_myticket_asia.py
# pip install selenium pandas tabulate

from __future__ import annotations
import os, time, re
from datetime import datetime
import pandas as pd

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import (
    TimeoutException, StaleElementReferenceException, ElementClickInterceptedException
)

HOME = "https://myticket.asia/"
OUTPUT_CSV = "all_events_KL.csv"
WAIT = 25

# ---------- driver ----------
def make_driver() -> webdriver.Chrome:
    opts = webdriver.ChromeOptions()
    opts.add_argument("--start-maximized")
    opts.add_argument("--disable-notifications")
    opts.add_argument("--disable-blink-features=AutomationControlled")
    opts.add_experimental_option("excludeSwitches", ["enable-automation"])
    opts.add_experimental_option("useAutomationExtension", False)
    # Uncomment for headless:
    # opts.add_argument("--headless=new")
    return webdriver.Chrome(options=opts)

# ---------- utils ----------
def slow_scroll_full_page(driver: webdriver.Chrome, step_px=500, nap=0.25, max_loops=200):
    last = -1
    for _ in range(max_loops):
        driver.execute_script("window.scrollBy(0, arguments[0]);", step_px)
        time.sleep(nap)
        y = driver.execute_script("return window.scrollY;")
        if y == last:
            break
        last = y

def click_all_load_more(driver: webdriver.Chrome, max_clicks=50):
    """
    Click all visible 'Load More' variants used by Modern Events Calendar.
    Non-exact: CSS + text-normalization fallback.
    """
    def visible(e): 
        try: return e.is_displayed() and e.is_enabled()
        except: return False

    clicks = 0
    while clicks < max_clicks:
        # Try common MEC 'load more' controls
        buttons = []
        buttons += driver.find_elements(By.CSS_SELECTOR, "a.mec-load-more, button.mec-load-more")
        buttons += driver.find_elements(By.CSS_SELECTOR, ".mec-load-more a, .mec-load-more button")
        # Text fallback (case-insensitive contains 'load more')
        buttons += [e for e in driver.find_elements(By.TAG_NAME, "a") if re.search(r'load\s*more', (e.text or '').lower())]

        # Dedup DOM elements
        seen_ids = set()
        uniq = []
        for b in buttons:
            try:
                key = b.id
                if key not in seen_ids:
                    seen_ids.add(key)
                    uniq.append(b)
            except Exception:
                continue

        clicked_any = False
        for b in uniq:
            if not visible(b): 
                continue
            try:
                driver.execute_script("arguments[0].scrollIntoView({block:'center'});", b)
                time.sleep(0.25)
                b.click()
                clicked_any = True
                clicks += 1
                time.sleep(1.2)  # allow new cards to render
            except (ElementClickInterceptedException, StaleElementReferenceException):
                continue
            except Exception:
                continue
        if not clicked_any:
            break

def load_existing(path: str) -> pd.DataFrame:
    if os.path.exists(path):
        try:
            return pd.read_csv(path)
        except Exception:
            pass
    return pd.DataFrame(columns=["Title", "Date", "Location", "Link", "TimeScraped"])

def save_append_dedup(df_new: pd.DataFrame, path: str):
    old = load_existing(path)
    all_df = pd.concat([old, df_new], ignore_index=True)
    # primary dedup by Link; secondary by (Title, Date, Location)
    all_df = all_df.drop_duplicates(subset=["Link"], keep="first")
    all_df = all_df.drop_duplicates(subset=["Title", "Date", "Location"], keep="first")
    all_df.to_csv(path, index=False)

def clean_text(s: str) -> str:
    return " ".join((s or "").split())

# ---------- flexible field extraction on detail pages (no exact XPaths) ----------
def extract_title(driver: webdriver.Chrome) -> str:
    # Prefer page H1 within main area, fallback to any h1
    for css in ["main h1", "#main-content h1", "h1"]:
        els = driver.find_elements(By.CSS_SELECTOR, css)
        for el in els:
            t = clean_text(el.text)
            if t: return t
    return ""

def extract_date(driver: webdriver.Chrome) -> str:
    # Try <time>, then common date classes, then definition list dd near date-like labels
    # 1) time tags
    for el in driver.find_elements(By.TAG_NAME, "time"):
        t = clean_text(el.text)
        if t: return t
    # 2) common date-ish classes/labels
    for css in ["[class*='date']", "[class*='time']"]:
        for el in driver.find_elements(By.CSS_SELECTOR, css):
            t = clean_text(el.text)
            if t and len(t) > 2: return t
    # 3) simple dl/dd extraction: pick the dd whose preceding label mentions date/time
    dds = driver.find_elements(By.CSS_SELECTOR, "dl dd")
    dts = driver.find_elements(By.CSS_SELECTOR, "dl dt")
    for i, dt in enumerate(dts):
        label = (dt.text or "").lower()
        if any(k in label for k in ["date", "time", "when"]):
            try:
                t = clean_text(dds[i].text)
                if t: return t
            except Exception:
                pass
    # fallback: first dd with non-empty text
    for el in dds:
        t = clean_text(el.text)
        if t: return t
    return ""

def extract_location(driver: webdriver.Chrome) -> str:
    # Look for 'venue' or 'location' labels first
    dds = driver.find_elements(By.CSS_SELECTOR, "dl dd")
    dts = driver.find_elements(By.CSS_SELECTOR, "dl dt")
    for i, dt in enumerate(dts):
        label = (dt.text or "").lower()
        if any(k in label for k in ["venue", "location", "where"]):
            try:
                t = clean_text(dds[i].text)
                if t: return t
            except Exception:
                pass
    # Generic class-based fallbacks
    for css in ["[class*='venue']", "[class*='location']"]:
        for el in driver.find_elements(By.CSS_SELECTOR, css):
            t = clean_text(el.text)
            if t and len(t) > 2: return t
    # last resort: look for address-like blocks
    for tag in ["address", "p", "span", "div"]:
        for el in driver.find_elements(By.TAG_NAME, tag):
            txt = clean_text(el.text)
            if any(k in txt.lower() for k in ["hall", "theatre", "theater", "ballroom", "arena", "convention", "stadium", "klcc"]):
                if len(txt) <= 120:
                    return txt
    return ""

# ---------- main ----------
def main():
    driver = make_driver()
    wait = WebDriverWait(driver, WAIT)
    rows = []

    try:
        driver.get(HOME)
        wait.until(EC.presence_of_element_located((By.TAG_NAME, "body")))
        time.sleep(1.0)

        # Load everything we can see: scroll + load more + scroll again
        slow_scroll_full_page(driver)
        click_all_load_more(driver)
        slow_scroll_full_page(driver)

        # Collect ALL event links (no exact XPath):
        # MEC usually wraps titles in h4 > a inside containers with id starting mec_skin_events_.
        card_links = driver.find_elements(By.CSS_SELECTOR, "div[id^='mec_skin_events_'] h4 a, section[id^='mec_skin_events_'] h4 a")
        # If homepage changes, also try generic h4 > a with myticket links
        if not card_links:
            card_links = [a for a in driver.find_elements(By.CSS_SELECTOR, "h4 a") if "myticket.asia" in (a.get_attribute("href") or "")]

        # Dedup by href
        links = []
        seen = set()
        for el in card_links:
            try:
                href = (el.get_attribute("href") or "").strip()
                title_from_list = clean_text(el.text)
                if href and href not in seen:
                    seen.add(href)
                    links.append((title_from_list, href))
            except StaleElementReferenceException:
                continue

        if not links:
            print("No event links found. Site structure may have changed.")
            return

        # Visit each link in a new tab and extract fields
        for title_from_list, href in links:
            driver.execute_script("window.open(arguments[0], '_blank');", href)
            driver.switch_to.window(driver.window_handles[-1])

            try:
                wait.until(EC.presence_of_element_located((By.TAG_NAME, "body")))
            except TimeoutException:
                pass

            name = extract_title(driver) or title_from_list
            date = extract_date(driver)
            location = extract_location(driver)

            row = {
                "Title": clean_text(name),
                "Date": clean_text(date),
                "Location": clean_text(location),
                "Link": href,
                "TimeScraped": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            }
            rows.append(row)

            # close tab and return
            driver.close()
            driver.switch_to.window(driver.window_handles[0])
            time.sleep(0.4)

        df = pd.DataFrame(rows)

        # -------- NEW: show how many new events were added --------
        old_df = load_existing(OUTPUT_CSV)
        old_count = len(old_df)

        save_append_dedup(df, OUTPUT_CSV)

        updated_df = load_existing(OUTPUT_CSV)
        new_total = len(updated_df)
        added_count = new_total - old_count
        print(f"\nüÜï {added_count} new event(s) added ({new_total} total now in file).")
        # -------- END NEW --------

        from tabulate import tabulate
        print(tabulate(df, headers="keys", tablefmt="github", showindex=False))
        print(f"\nSaved {len(df)} unique rows ‚Üí {OUTPUT_CSV}")

    finally:
        driver.quit()

if __name__ == "__main__":
    main()







üÜï 2 new event(s) added (120 total now in file).
| Title                                                                 | Date                  | Location                                        | Link                                                                                                                    | TimeScraped         |
|-----------------------------------------------------------------------|-----------------------|-------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------|---------------------|
| A TRIBUTE TO SHARIFAH AINI                                            | DATE 06 Jun 2026      | LOCATION Dewan Filharmonik Petronas KLCC        | https://myticket.asia/events/a-tribute-to-sharifah-aini/?occurrence=2026-06-06                                          | 2025-12-15 16:23:24 |
| JAZZ, CLASSICAL AND BEYOND WITH THE MPO                               

# Jarkata (Completed)

In [9]:
# --- Megatix Indonesia: Featured Events Scraper (deduplicated) ---
# Output CSV: all_jakarta.csv
# Works for https://megatix.co.id/
# pip install selenium tabulate pandas

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from tabulate import tabulate
import pandas as pd
import time
import os
from datetime import datetime   # <-- added for timestamp

# ---------------- CONFIG ----------------
OUTPUT_CSV = "all_jakarta.csv"
URL = "https://megatix.co.id/"

# Load existing CSV (if any) to prevent duplicates
if os.path.exists(OUTPUT_CSV):
    existing_df = pd.read_csv(OUTPUT_CSV)
    seen_links = set(existing_df["Link"].dropna())
else:
    existing_df = pd.DataFrame()
    seen_links = set()

# ---------------- DRIVER SETUP ----------------
opts = webdriver.ChromeOptions()
opts.add_argument("--start-maximized")
opts.add_argument("--disable-notifications")
driver = webdriver.Chrome(options=opts)
driver.set_window_size(1400, 1000)
all_rows = []

# ---------------- SCRAPER ----------------
try:
    driver.get(URL)
    WebDriverWait(driver, 15).until(EC.presence_of_element_located((By.XPATH, "//*[@id='__nuxt']")))
    time.sleep(1.0)

    FEATURED_CARD_X = "//*[@id='__nuxt']/div/div[3]/div/main/section/section[2]/div[2]//article[.//h3]"

    # Scroll to load all cards
    prev = -1
    stable = 0
    for _ in range(60):
        cards = driver.find_elements(By.XPATH, FEATURED_CARD_X)
        count = len(cards)
        if count == prev:
            stable += 1
            if stable >= 2:
                break
        else:
            stable = 0
        prev = count
        driver.execute_script("window.scrollBy(0, 900);")
        time.sleep(1.2)

    cards = driver.find_elements(By.XPATH, FEATURED_CARD_X)
    print(f"üü¢ Found {len(cards)} featured cards")

    for ev in cards:
        try:
            title = ev.find_element(By.XPATH, ".//h3/span").text.strip()
        except:
            title = "No Title"
        try:
            date = ev.find_element(By.XPATH, ".//div[1]/div[1]/span").text.strip()
        except:
            date = "No Date"
        try:
            location = ev.find_element(By.XPATH, ".//div[1]/div[3]/span").text.strip()
        except:
            location = "Indonesia"
        try:
            link = ev.find_element(By.XPATH, ".//ancestor::a").get_attribute("href") or "No Link"
        except:
            link = "No Link"

        # Skip duplicates already in CSV
        if link in seen_links:
            continue

        all_rows.append({
            "Title": title,
            "Date": date,
            "Location": location,
            "Link": link,
            "TimeScraped": datetime.now().strftime("%Y-%m-%d %H:%M:%S")  # <-- added line
        })
        seen_links.add(link)

    print(f"‚úÖ Megatix Indonesia (Featured): grabbed {len(all_rows)} new events")

except TimeoutException:
    print("‚ö† Timeout waiting for Megatix Indonesia page")
except Exception as e:
    print(f"‚ö† Error: {e}")
finally:
    driver.quit()

# ---------------- OUTPUT ----------------
if all_rows:
    new_df = pd.DataFrame(all_rows)
    final_df = pd.concat([existing_df, new_df], ignore_index=True)
else:
    final_df = existing_df

print(f"\n‚úÖ TOTAL Unique Events: {len(final_df)}")
print(tabulate(final_df.tail(20), headers="keys", tablefmt="github", showindex=False))

final_df.to_csv(OUTPUT_CSV, index=False, encoding="utf-8")
print(f"\nüíæ CSV saved: {OUTPUT_CSV}")




üü¢ Found 7 featured cards
‚úÖ Megatix Indonesia (Featured): grabbed 2 new events

‚úÖ TOTAL Unique Events: 9
| Title                                       | Date             | Location          | Link                                                  | TimeScraped         |
|---------------------------------------------|------------------|-------------------|-------------------------------------------------------|---------------------|
| NORA EN PURE                                | Sat, 13 Dec 2025 | Savaya Bali       | https://megatix.co.id/events/nora-en-pure-1312        | 8/12/2025 10:21     |
| NORA EN PURE                                | Sat, 13 Dec 2025 | Savaya Bali       | https://megatix.co.id/events/nora-en-pure-1312        | 8/12/2025 10:21     |
| X-Clusive Presents: Bali NYE 2025 with Tyga | Wed, 31 Dec 2025 | The Stage         | https://megatix.co.id/events/nye-2025-with-tyga       | 8/12/2025 10:21     |
| ANJUNADEEP                                  | Sun, 14 Dec 2025

# Hong Kong (done)

In [10]:
# # --- HONG KONG: Live Nation only ---
# # Output CSV: all_hong_kong.csv
# # pip install selenium tabulate pandas

# from selenium import webdriver
# from selenium.webdriver.common.by import By
# from selenium.webdriver.support.ui import WebDriverWait
# from selenium.webdriver.support import expected_conditions as EC
# from selenium.common.exceptions import TimeoutException, NoSuchElementException
# from tabulate import tabulate
# import pandas as pd
# import time, os, unicodedata, shutil
# from urllib.parse import urlparse
# from datetime import datetime  # <-- added for timestamp

# # =================== CONFIG ===================
# OUTPUT_CSV = "all_hong_kong.csv"
# BACKUP_CSV = OUTPUT_CSV + ".bak"
# LIVE_HOME = "https://www.livenation.hk/en"
# WAIT = 25

# # =================== DRIVER ===================
# ua = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
#       "(KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36")
# opts = webdriver.ChromeOptions()
# opts.add_argument(f"user-agent={ua}")
# opts.add_argument("--start-maximized")
# opts.add_argument("--disable-notifications")
# opts.add_argument("--disable-blink-features=AutomationControlled")
# opts.add_experimental_option("excludeSwitches", ["enable-automation"])
# opts.add_experimental_option("useAutomationExtension", False)
# opts.page_load_strategy = "eager"

# driver = webdriver.Chrome(options=opts)
# driver.set_window_size(1400, 1000)
# driver.set_page_load_timeout(60)
# driver.set_script_timeout(60)

# # =================== UTILS ===================
# def accept_cookies(driver):
#     for by, sel in [
#         (By.ID, "onetrust-accept-btn-handler"),
#         (By.XPATH, "//*[@id='onetrust-accept-btn-handler']"),
#         (By.XPATH, "//*[self::button or self::a][contains(.,'Accept All') or contains(.,'Accept')]"),
#     ]:
#         try:
#             el = WebDriverWait(driver, 2).until(EC.element_to_be_clickable((by, sel)))
#             driver.execute_script("arguments[0].click();", el)
#             time.sleep(0.2)
#             break
#         except Exception:
#             continue

# def safe_text(el):
#     try:
#         return unicodedata.normalize("NFKC", " ".join(el.text.split()))
#     except Exception:
#         return ""

# def norm(s):
#     return " ".join(unicodedata.normalize("NFKC", (s or "").strip().lower()).split())

# def canonicalize_link(link):
#     if not link:
#         return ""
#     p = urlparse(link)
#     return f"{(p.netloc or '').lower()}{p.path or ''}"

# def make_event_id(platform, link, title, date):
#     canon = canonicalize_link(link)
#     if canon:
#         return f"{platform}|{canon}"
#     return f"{norm(platform)}|{norm(title)}|{norm(date)}"

# # =================== LOAD OLD CSV ===================
# seen_ids_persisted = set()
# old_df = None
# column_order = ["Platform", "Title", "Date", "Location", "Link", "TimeScraped"]  # <-- added column here

# if os.path.exists(OUTPUT_CSV):
#     try:
#         old_df = pd.read_csv(OUTPUT_CSV, dtype=str, keep_default_na=False)
#         if list(old_df.columns):
#             column_order = list(old_df.columns)
#         for _, r in old_df.iterrows():
#             seen_ids_persisted.add(
#                 make_event_id(
#                     str(r.get("Platform","")),
#                     str(r.get("Link","")),
#                     str(r.get("Title","")),
#                     str(r.get("Date","")),
#                 )
#             )
#         print(f"‚Ü∫ Loaded {len(seen_ids_persisted)} previously-scraped rows")
#     except Exception as e:
#         print(f"‚ö† Could not read existing CSV: {e}")

# # --- ensure TimeScraped is part of the schema even if old CSV didn't have it ---
# try:
#     if old_df is not None and "TimeScraped" not in old_df.columns:
#         old_df["TimeScraped"] = ""
#         column_order = list(old_df.columns)
#     if "TimeScraped" not in column_order:
#         column_order = list(column_order) + ["TimeScraped"]
# except Exception:
#     pass

# all_rows = []
# seen_links_this_run = set()

# def append_row(platform, title, date_text, location, link):
#     row = {
#         "Platform": platform,
#         "Title": title or "No Title",
#         "Date": date_text or "No Date",
#         "Location": location or "Hong Kong",
#         "Link": link or "No Link",
#         "TimeScraped": datetime.now().strftime("%Y-%m-%d %H:%M:%S")  # <-- added timestamp
#     }
#     eid = make_event_id(row["Platform"], row["Link"], row["Title"], row["Date"])
#     if eid in seen_ids_persisted:
#         return
#     canon = canonicalize_link(row["Link"])
#     if canon and canon in seen_links_this_run:
#         return
#     if canon:
#         seen_links_this_run.add(canon)
#     all_rows.append(row)

# # =================== LIVE NATION HK ===================
# def find_all_events_section():
#     try:
#         return WebDriverWait(driver, 12).until(
#             EC.presence_of_element_located((By.XPATH, "//*[self::h2 or self::h3][contains(.,'All Events')]/ancestor::section[1]"))
#         )
#     except TimeoutException:
#         return WebDriverWait(driver, 12).until(
#             EC.presence_of_element_located((By.XPATH, "//*[@id='main']/div/div[6]/section/div/div/div[2]"))
#         )

# def scroll_deep(times=8, pause=0.45):
#     for _ in range(times):
#         driver.execute_script("window.scrollBy(0, 1000);")
#         time.sleep(pause)

# def scrape_livenation_page(label="LiveNationHK"):
#     section = find_all_events_section()
#     cards = section.find_elements(By.XPATH, ".//li[.//a[@href]]")
#     if not cards:
#         cards = section.find_elements(By.XPATH, ".//ul[1]/li | .//div[contains(@class,'Card')]")
#     for li in cards:
#         link, title, location, date_text = "", "", "Hong Kong", ""
#         try:
#             a = li.find_element(By.XPATH, ".//a[@href]")
#             link = a.get_attribute("href") or ""
#         except NoSuchElementException:
#             pass
#         for xp in [".//p[contains(@class,'title')][1]", ".//p[1]", ".//*[self::h3 or self::h2 or self::p][1]"]:
#             try:
#                 title = safe_text(li.find_element(By.XPATH, xp))
#                 if title: break
#             except NoSuchElementException:
#                 continue
#         for xp in [".//p[contains(@class,'location')]", ".//p[2]"]:
#             try:
#                 l = safe_text(li.find_element(By.XPATH, xp))
#                 if l: location = l; break
#             except NoSuchElementException:
#                 continue
#         for xp in [".//time", ".//p[contains(@class,'date')][1]", ".//p[2]"]:
#             try:
#                 d = safe_text(li.find_element(By.XPATH, xp))
#                 if d and d.lower() != location.lower():
#                     date_text = d; break
#             except NoSuchElementException:
#                 continue
#         append_row(label, title, date_text, location, link)

# def run_livenation():
#     driver.get(LIVE_HOME)
#     accept_cookies(driver)
#     scroll_deep(8, 0.45)
#     scrape_livenation_page("LiveNationHK")
#     # Try "Page 2"
#     try:
#         nxt = WebDriverWait(driver, 6).until(
#             EC.element_to_be_clickable((By.XPATH, "//*[@id='main']/div/div[6]//nav//li[3]/button"))
#         )
#         driver.execute_script("arguments[0].click();", nxt)
#         time.sleep(1.0)
#         scroll_deep(8, 0.45)
#         scrape_livenation_page("LiveNationHK")
#     except TimeoutException:
#         pass

# # =================== RUN ===================
# try:
#     run_livenation()
# finally:
#     try:
#         driver.quit()
#     except Exception:
#         pass

# # =================== SAVE (append-only) ===================
# new_df = pd.DataFrame(all_rows, dtype=str).reset_index(drop=True)

# if os.path.exists(OUTPUT_CSV) and old_df is not None:
#     if not new_df.empty:
#         new_df["__id"] = new_df.apply(
#             lambda r: make_event_id(r.get("Platform",""), r.get("Link",""), r.get("Title",""), r.get("Date","")),
#             axis=1
#         )
#         new_df = new_df[~new_df["__id"].isin(seen_ids_persisted)].drop(columns="__id")
#     for col in column_order:
#         if col not in new_df.columns:
#             new_df[col] = ""
#     new_df = new_df[column_order]
#     final_df = pd.concat([old_df, new_df], ignore_index=True)
# else:
#     for col in column_order:
#         if col not in new_df.columns:
#             new_df[col] = ""
#     new_df = new_df[column_order]
#     final_df = new_df

# # Fill missing TimeScraped for older rows
# if "TimeScraped" in final_df.columns:
#     final_df["TimeScraped"] = final_df["TimeScraped"].replace("", pd.NA).fillna(datetime.now().strftime("%Y-%m-%d %H:%M:%S"))

# added = len(new_df)
# print(f"\n‚úÖ New rows appended this run: {added}")
# print(f"üì¶ Total rows in file: {len(final_df)}")
# print(tabulate(final_df.tail(max(1, min(added, 20))), headers="keys", tablefmt="github", showindex=False))

# tmp = OUTPUT_CSV + ".tmp"
# final_df.to_csv(tmp, index=False, encoding="utf-8-sig")
# if os.path.exists(OUTPUT_CSV):
#     try:
#         shutil.copy2(OUTPUT_CSV, BACKUP_CSV)
#     except Exception:
#         pass
# os.replace(tmp, OUTPUT_CSV)
# print(f"\nüíæ CSV updated: {OUTPUT_CSV}")
# if os.path.exists(BACKUP_CSV):
#     print(f"üßØ Backup saved: {BACKUP_CSV}")






In [11]:
# --- HKTicketing Scraper (XPath-only; no fallbacks) ---
# Site: https://premier.hkticketing.com/
# Output CSV: all_hong_kong.csv (Platform, Title, Date, Location, Link)
# pip install selenium tabulate pandas

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from selenium.webdriver.common.action_chains import ActionChains
from tabulate import tabulate
import pandas as pd
import time, os, math, sys
from datetime import datetime  # <-- added import for timestamp

HOME = "https://premier.hkticketing.com/"
OUTPUT_CSV = "all_hong_kong.csv"

# Tunables
WAIT = 25
PAGELOAD_TIMEOUT = 60
SCRIPT_TIMEOUT = 60
NAV_RETRIES = 4

# Load previous results (for de-dup by Link)
if os.path.exists(OUTPUT_CSV):
    try:
        existing_df = pd.read_csv(OUTPUT_CSV)
        if "Link" in existing_df.columns:
            seen_links = set(existing_df["Link"].dropna().astype(str))
        else:
            seen_links = set()
    except Exception:
        existing_df = pd.DataFrame(columns=["Platform", "Title", "Date", "Location", "Link"])
        seen_links = set()
else:
    existing_df = pd.DataFrame(columns=["Platform", "Title", "Date", "Location", "Link"])
    seen_links = set()

# ------------- Driver -------------
opts = webdriver.ChromeOptions()
opts.add_argument("--start-maximized")
opts.add_argument("--disable-notifications")
opts.add_argument("--disable-popup-blocking")
opts.add_argument("--disable-blink-features=AutomationControlled")
opts.add_experimental_option("excludeSwitches", ["enable-automation"])
opts.add_experimental_option("useAutomationExtension", False)
driver = webdriver.Chrome(options=opts)
driver.set_window_size(1400, 1000)
driver.set_page_load_timeout(PAGELOAD_TIMEOUT)
driver.set_script_timeout(SCRIPT_TIMEOUT)
actions = ActionChains(driver)

def save_debug(prefix: str):
    try:
        png = f"{prefix}_screenshot.png"
        html = f"{prefix}_source.html"
        driver.save_screenshot(png)
        with open(html, "w", encoding="utf-8") as f:
            f.write(driver.page_source)
        print(f"‚ö† Saved debug artifacts: {png}, {html}")
    except Exception:
        pass

def js_ready() -> bool:
    try:
        return driver.execute_script("return document.readyState") == "complete"
    except Exception:
        return False

def wait_for_home_ready(timeout=WAIT):
    end = time.time() + timeout
    while time.time() < end:
        if js_ready():
            try:
                if driver.find_elements(By.XPATH, '//*[@id="moreEventsSection"]') or \
                   driver.find_elements(By.XPATH, '//*[@id="heroNavPaging"]'):
                    return True
            except Exception:
                pass
        time.sleep(0.3)
    return False

def get_with_retries(url: str, attempts=NAV_RETRIES) -> None:
    last_err = None
    for i in range(1, attempts + 1):
        try:
            driver.get(url)
            driver.execute_script("window.scrollTo(0, 50);")
            time.sleep(0.3)
            if wait_for_home_ready(timeout=WAIT + 5):
                return
            else:
                raise TimeoutException("Home not ready within wait window.")
        except Exception as e:
            last_err = e
            save_debug(f"load_attempt_{i}")
            sleep_s = min(2 * i, 8)
            print(f"‚ö† Load attempt {i}/{attempts} failed: {e}. Retrying in {sleep_s}s...")
            time.sleep(sleep_s)
    raise TimeoutException(f"Failed to load {url} after {attempts} attempts: {last_err}")

def safe_get_text(by, xp, timeout=WAIT):
    el = WebDriverWait(driver, timeout).until(EC.presence_of_element_located((by, xp)))
    return el.text.strip()

def safe_click(by, xp, timeout=WAIT):
    el = WebDriverWait(driver, timeout).until(EC.element_to_be_clickable((by, xp)))
    driver.execute_script("arguments[0].scrollIntoView({block:'center'});", el)
    time.sleep(0.15)
    driver.execute_script("arguments[0].click();", el)

rows = []

try:
    get_with_retries(HOME, attempts=NAV_RETRIES)

    idx = 1
    while True:
        tile_xp = f'//*[@id="moreEventsSection"]/div[{idx}]/div[2]/a'
        title_click_xp = f'//*[@id="moreEventsSection"]/div[{idx}]/div[2]/a/div[2]/strong'

        try:
            tile = WebDriverWait(driver, 4).until(EC.presence_of_element_located((By.XPATH, tile_xp)))
        except TimeoutException:
            break

        link = tile.get_attribute("href") or ""
        safe_click(By.XPATH, title_click_xp)
        time.sleep(0.6)

        venue_xp = '//*[@id="ctl00_ctl00_uiBodyMain_uiBodyRight_uiPerfSelector_uiPerfSelectorUpdatePanel"]/div[2]/div[3]/p[1]'
        date_xp  = '//*[@id="ctl00_ctl00_uiBodyMain_uiBodyRight_uiPerfSelector_uiPerfSelectorUpdatePanel"]/div[2]/div[3]/p[2]'

        try:
            venue = safe_get_text(By.XPATH, venue_xp, timeout=WAIT)
        except Exception:
            venue = ""

        try:
            date_time = safe_get_text(By.XPATH, date_xp, timeout=WAIT)
        except Exception:
            date_time = ""

        try:
            page_title = driver.title.strip()
        except Exception:
            page_title = ""
        title_val = page_title if page_title else f"Event {idx}"

        if not link:
            try:
                link = driver.current_url
            except Exception:
                link = ""

        if link and link not in seen_links:
            rows.append({
                "Platform": "HKTicketing",
                "Title": title_val,
                "Date": date_time,
                "Location": venue,
                "Link": link,
                "TimeScraped": datetime.now().strftime("%Y-%m-%d %H:%M:%S")  # <-- added line
            })
            seen_links.add(link)

        driver.back()
        if not wait_for_home_ready(timeout=WAIT + 5):
            print("‚Ñπ Re-loading home after back navigation...")
            get_with_retries(HOME, attempts=2)
        time.sleep(0.4)
        idx += 1

    safe_click(By.XPATH, '//*[@id="heroNavPaging"]/a[1]')
    time.sleep(0.5)

    hero_name = safe_get_text(By.XPATH, '//*[@id="heroModuleInner"]/div[2]/div[2]/div[1]/h2/a')

    safe_click(By.XPATH, '//*[@id="heroModuleInner"]/div[3]/div[2]/div[2]/a')
    WebDriverWait(driver, WAIT).until(
        EC.presence_of_element_located((By.XPATH, '//*[@id="app-scroll-container"]'))
    )
    time.sleep(0.6)

    hero_venue = safe_get_text(By.XPATH, '//*[@id="app-scroll-container"]/div[1]/div[2]/div/div[2]/div/div/div[2]/div[2]/p[5]')
    hero_datetime = safe_get_text(By.XPATH, '//*[@id="app-scroll-container"]/div[1]/div[2]/div/div[2]/div/div/div[2]/div[2]/p[6]')

    try:
        hero_link = driver.current_url
    except Exception:
        hero_link = ""

    if hero_link and hero_link not in seen_links:
        rows.append({
            "Platform": "HKTicketing",
            "Title": hero_name,
            "Date": hero_datetime,
            "Location": hero_venue,
            "Link": hero_link,
            "TimeScraped": datetime.now().strftime("%Y-%m-%d %H:%M:%S")  # <-- added line
        })
        seen_links.add(hero_link)

except TimeoutException as e:
    print(f"‚ö† Timeout while loading or scraping: {e}")
    save_debug("fatal_timeout")
except NoSuchElementException as e:
    print(f"‚ö† Missing element via provided XPath: {e}")
    save_debug("fatal_no_such_element")
except Exception as e:
    print(f"‚ö† Unexpected error: {e}")
    save_debug("fatal_error")
finally:
    try:
        driver.quit()
    except Exception:
        pass

# ---------------- OUTPUT ----------------
if rows:
    new_df = pd.DataFrame(rows)
    cols = ["Platform", "Title", "Date", "Location", "Link", "TimeScraped"]  # <-- ensure column order includes TimeScraped
    for c in cols:
        if c not in new_df.columns:
            new_df[c] = ""
    new_df = new_df[cols]

    if set(cols) - set(existing_df.columns):
        for c in cols:
            if c not in existing_df.columns:
                existing_df[c] = ""
    existing_df = existing_df[cols]

    final_df = pd.concat([existing_df, new_df], ignore_index=True)
else:
    final_df = existing_df.copy()

if not final_df.empty:
    final_df["Link"] = final_df["Link"].astype(str).str.strip()
    final_df = final_df.drop_duplicates(subset=["Link"], keep="first")

print(f"\n‚úÖ TOTAL Unique Events: {len(final_df)}")
print(tabulate(final_df.tail(20), headers="keys", tablefmt="github", showindex=False))

final_df.to_csv(OUTPUT_CSV, index=False, encoding="utf-8-sig")
print(f"\nüíæ CSV saved: {OUTPUT_CSV}")





‚ö† Timeout while loading or scraping: Message: 

‚ö† Saved debug artifacts: fatal_timeout_screenshot.png, fatal_timeout_source.html

‚úÖ TOTAL Unique Events: 28
| Platform     | Title                                                                | Date                                                       | Location                                                                            | Link                                                                                                                   | TimeScraped      |
|--------------|----------------------------------------------------------------------|------------------------------------------------------------|-------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------|------------------|
| LiveNationHK | TWICE <THIS IS FOR> WORLD TOUR IN HONG KONG                          | 

In [12]:
# import os
# import csv
# import time
# from datetime import datetime, timezone

# from selenium import webdriver
# from selenium.webdriver.common.by import By
# from selenium.webdriver.support.ui import WebDriverWait
# from selenium.webdriver.support import expected_conditions as EC
# from selenium.webdriver.chrome.options import Options


# URL = "https://www.macstadiumhkpa.com/en/%E6%9C%80%E6%96%B0%E6%B4%BB%E5%8B%95"
# CSV_FILE = "all_events_hk.csv"
# EVENT_LOCATION = "MacPherson Stadium Hong Kong"


# def build_driver(headless: bool = True):
#     chrome_options = Options()
#     if headless:
#         chrome_options.add_argument("--headless=new")
#     chrome_options.add_argument("--disable-gpu")
#     chrome_options.add_argument("--no-sandbox")
#     chrome_options.add_argument("--disable-dev-shm-usage")
#     chrome_options.add_argument("--window-size=1920,1080")

#     driver = webdriver.Chrome(options=chrome_options)
#     return driver


# def wait_for_page_and_lazy_load(driver):
#     wait = WebDriverWait(driver, 20)

#     # Wait until some event name element is present (using the pattern of the ID you gave)
#     wait.until(
#         EC.presence_of_element_located(
#             (
#                 By.XPATH,
#                 '//*[starts-with(@id, "comp-kycrouab1__item-")]/p/span'
#             )
#         )
#     )

#     # Lazy-load: scroll until the bottom height stops changing
#     last_height = driver.execute_script("return document.body.scrollHeight")
#     while True:
#         driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
#         time.sleep(2)  # give time for Wix/lazy content to load
#         new_height = driver.execute_script("return document.body.scrollHeight")
#         if new_height == last_height:
#             break
#         last_height = new_height


# def load_existing_name_url_keys(csv_path):
#     """
#     Returns a set of (event_name, event_url) pairs that already exist in the CSV.
#     Works even if older CSV rows don't have an event_url column.
#     """
#     existing_keys = set()
#     if not os.path.exists(csv_path):
#         return existing_keys

#     with open(csv_path, "r", encoding="utf-8", newline="") as f:
#         reader = csv.DictReader(f)
#         for row in reader:
#             name = (row.get("event_name") or "").strip()
#             url = (row.get("event_url") or "").strip()
#             if name or url:
#                 existing_keys.add((name, url))
#     return existing_keys


# def save_events_to_csv(csv_path, events):
#     """
#     Append new events to the CSV, ensuring:
#     - Header is written if file does not exist.
#     - Existing rows are kept.
#     """
#     file_exists = os.path.exists(csv_path)
#     fieldnames = [
#         "event_name",
#         "event_date",
#         "event_location",
#         "event_url",
#         "time_scraped_utc",
#     ]

#     with open(csv_path, "a", encoding="utf-8", newline="") as f:
#         writer = csv.DictWriter(f, fieldnames=fieldnames)
#         if not file_exists:
#             writer.writeheader()
#         for ev in events:
#             writer.writerow(ev)


# def scrape_events(driver):
#     driver.get(URL)
#     wait_for_page_and_lazy_load(driver)

#     # Event names
#     name_elements = driver.find_elements(
#         By.XPATH,
#         '//*[starts-with(@id, "comp-kycrouab1__item-")]/p/span'
#     )

#     # Event dates
#     date_elements = driver.find_elements(
#         By.XPATH,
#         '//*[starts-with(@id, "comp-kycrouav__item-")]/p/span/span/span/span/span'
#     )

#     # Event URLs: "Details" links
#     url_elements = driver.find_elements(
#         By.XPATH,
#         '//a[contains(normalize-space(text()), "Details")]'
#     )

#     event_names = [el.text.strip() for el in name_elements if el.text.strip()]
#     event_dates = [el.text.strip() for el in date_elements if el.text.strip()]
#     event_urls = [el.get_attribute("href").strip() for el in url_elements if el.get_attribute("href")]

#     # Make sure we only pair up to the shortest list length
#     n = min(len(event_names), len(event_dates), len(event_urls))
#     event_names = event_names[:n]
#     event_dates = event_dates[:n]
#     event_urls = event_urls[:n]

#     # Existing (name, url) combos from CSV
#     existing_keys = load_existing_name_url_keys(CSV_FILE)

#     new_events = []
#     current_utc = datetime.now(timezone.utc).isoformat()

#     # Also de-dupe within this run
#     seen_this_run = set()

#     for name, date_str, url in zip(event_names, event_dates, event_urls):
#         norm_name = name.strip()
#         norm_url = url.strip()

#         key = (norm_name, norm_url)

#         # Skip if already in CSV or already added this run
#         if key in existing_keys or key in seen_this_run:
#             continue

#         seen_this_run.add(key)

#         new_events.append(
#             {
#                 "event_name": norm_name,
#                 "event_date": date_str.strip(),
#                 "event_location": EVENT_LOCATION,
#                 "event_url": norm_url,
#                 "time_scraped_utc": current_utc,
#             }
#         )

#     return new_events


# def main():
#     driver = build_driver(headless=True)
#     try:
#         new_events = scrape_events(driver)
#         if new_events:
#             save_events_to_csv(CSV_FILE, new_events)
#             print(f"Added {len(new_events)} new events to {CSV_FILE}.")
#         else:
#             print("No new events found. CSV unchanged.")
#     finally:
#         driver.quit()


# if __name__ == "__main__":
#     main()




# Jarkata (done)

In [13]:
import os
import csv
import time
from datetime import datetime, timezone

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException


URL = "https://megatix.co.id/madaboutcomedy"
CSV_FILE = "all_jakarta.csv"


def build_driver(headless: bool = False):  # show browser window so site ‚Äúpops up‚Äù
    chrome_options = Options()

    if headless:
        chrome_options.add_argument("--headless=new")

    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--window-size=1400,1000")

    driver = webdriver.Chrome(options=chrome_options)
    return driver


def wait_for_page_and_lazy_load(driver):
    """
    Wait for at least one event name to appear, then scroll
    to allow lazy-loaded content to load.
    """
    wait = WebDriverWait(driver, 25)

    try:
        # Wait for any event name element, not a super-deep static path
        wait.until(
            EC.presence_of_element_located(
                (
                    By.XPATH,
                    '//*[@id="megatix"]/div/main/div/div[2]/div[1]'
                    '//div/div[2]/div[2]/div'
                )
            )
        )
    except TimeoutException:
        print("Warning: event name element did not load in time.")
        return

    # Lazy-load scroll loop
    last_height = 0
    for _ in range(5):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(1.5)
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height


def load_existing_event_names(csv_path):
    """
    Load existing event names from CSV to avoid duplicates on re-runs.
    """
    existing_names = set()
    if not os.path.exists(csv_path):
        return existing_names

    with open(csv_path, "r", encoding="utf-8", newline="") as f:
        reader = csv.DictReader(f)
        for row in reader:
            name = (row.get("event_name") or "").strip()
            if name:
                existing_names.add(name)
    return existing_names


def save_events_to_csv(csv_path, events):
    """
    Append new events to CSV, preserving existing rows.
    """
    file_exists = os.path.exists(csv_path)
    fieldnames = ["event_name", "event_date", "event_location", "time_scraped_utc"]

    with open(csv_path, "a", encoding="utf-8", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        if not file_exists:
            writer.writeheader()
        for ev in events:
            writer.writerow(ev)


def scrape_events(driver):
    driver.get(URL)
    wait_for_page_and_lazy_load(driver)

    # Find ALL event name elements on the page (not just one card)
    name_elements = driver.find_elements(
        By.XPATH,
        '//*[@id="megatix"]/div/main/div/div[2]/div[1]'
        '//div/div[2]/div[2]/div'
    )

    existing_names = load_existing_event_names(CSV_FILE)
    seen_this_run = set()
    new_events = []
    current_utc = datetime.now(timezone.utc).isoformat()

    for name_elem in name_elements:
        try:
            event_name = name_elem.text.strip()
        except Exception:
            event_name = ""

        if not event_name:
            continue

        # Skip duplicates based on event_name
        if event_name in existing_names or event_name in seen_this_run:
            continue

        # From the name element, get the info container (div[2] that holds date/name/location)
        # name is at: card/div[2]/div[2]/div
        # so container is: ../..  (div[2])
        try:
            info_container = name_elem.find_element(By.XPATH, './../..')
        except Exception:
            continue

        # Date: ./div[1]/span inside info_container
        try:
            date_elem = info_container.find_element(By.XPATH, './div[1]/span')
            event_date = date_elem.text.strip()
        except Exception:
            event_date = ""

        # Location: ./div[3]/span inside info_container
        try:
            loc_elem = info_container.find_element(By.XPATH, './div[3]/span')
            event_location = loc_elem.text.strip()
        except Exception:
            event_location = ""

        seen_this_run.add(event_name)

        new_events.append(
            {
                "event_name": event_name,
                "event_date": event_date,
                "event_location": event_location,
                "time_scraped_utc": current_utc,
            }
        )

    return new_events


def main():
    driver = build_driver(headless=False)  # visible window (popup)
    try:
        new_events = scrape_events(driver)
        if new_events:
            save_events_to_csv(CSV_FILE, new_events)
            print(f"Added {len(new_events)} new events to {CSV_FILE}.")
        else:
            print("No new events found. CSV unchanged.")

        # Optional: keep browser open briefly so you can see it
        time.sleep(5)

    finally:
        driver.quit()


if __name__ == "__main__":
    main()




Added 7 new events to all_jakarta.csv.


In [14]:
# --- Megatix Indonesia: Featured Events Scraper ---
# Output CSV: all_jakarta.csv
# Works for https://megatix.co.id/
# pip install selenium tabulate pandas

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from tabulate import tabulate
import pandas as pd
import time
import os  # <-- added to read existing CSV for "new events" comparison
from datetime import datetime  # <-- added for timestamp

# ---------------- CONFIG ----------------
OUTPUT_CSV = "all_jakarta.csv"
URL = "https://megatix.co.id/"

driver = webdriver.Chrome()
driver.set_window_size(1400, 1000)
all_rows = []

# ---------------- SCRAPER ----------------
try:
    driver.get(URL)

    # Wait for base element
    WebDriverWait(driver, 15).until(EC.presence_of_element_located((By.XPATH, "//*[@id='__nuxt']")))
    time.sleep(1.0)

    FEATURED_WRAP_X = "//*[@id='__nuxt']/div/div[3]/div/main/section/section[2]/div[2]/div"
    FEATURED_CARD_X = "//*[@id='__nuxt']/div/div[3]/div/main/section/section[2]/div[2]//article[.//h3]"

    # Scroll and load all featured cards
    prev = -1
    stable = 0
    for _ in range(60):
        cards = driver.find_elements(By.XPATH, FEATURED_CARD_X)
        count = len(cards)
        if count == prev:
            stable += 1
            if stable >= 2:
                break
        else:
            stable = 0
        prev = count
        driver.execute_script("window.scrollBy(0, 900);")
        time.sleep(1.2)

    print(f"üü¢ Found {len(driver.find_elements(By.XPATH, FEATURED_CARD_X))} featured cards")

    for ev in driver.find_elements(By.XPATH, FEATURED_CARD_X):
        try:
            title = ev.find_element(By.XPATH, ".//h3/span").text.strip()
        except:
            title = "No Title"
        try:
            date = ev.find_element(By.XPATH, ".//div[1]/div[1]/span").text.strip()
        except:
            date = "No Date"
        try:
            location = ev.find_element(By.XPATH, ".//div[1]/div[3]/span").text.strip()
        except:
            location = "Indonesia"
        try:
            link = ev.find_element(By.XPATH, ".//ancestor::a").get_attribute("href") or "No Link"
        except:
            link = "No Link"

        all_rows.append({
            "Title": title,
            "Date": date,
            "Location": location,
            "Link": link,
            "TimeScraped": datetime.now().strftime("%Y-%m-%d %H:%M:%S")  # <-- added line
        })

    print(f"‚úÖ Megatix Indonesia (Featured): grabbed {len(all_rows)} events")

except TimeoutException:
    print("‚ö† Timeout waiting for Megatix Indonesia page")
except Exception as e:
    print(f"‚ö† Error: {e}")

finally:
    driver.quit()

# ---------------- OUTPUT ----------------
df = pd.DataFrame(all_rows).reset_index(drop=True)

# --- NEW: Compare with existing CSV and print any newly added events ---
if os.path.exists(OUTPUT_CSV):
    try:
        old_df = pd.read_csv(OUTPUT_CSV)
        old_links = set(old_df.get("Link", pd.Series(dtype=str)).astype(str).str.strip())
    except Exception:
        old_links = set()
else:
    old_links = set()

# Normalize links for comparison and detect new ones
if not df.empty:
    df["Link"] = df["Link"].astype(str).str.strip()
    new_mask = ~df["Link"].isin(old_links)
    new_events = df[new_mask].copy()
    if len(new_events) > 0:
        print(f"\nüÜï {len(new_events)} new event(s) since last run:")
        for _, r in new_events.iterrows():
            # TimeScraped will also show here automatically since it's in the row
            print(f"  ‚Ä¢ {r['Title']} | {r['Date']} | {r['Location']} | {r['Link']} | {r['TimeScraped']}")
    else:
        print("\n‚Ñπ No new events added since last run.")
else:
    print("\n‚Ñπ No events scraped this run (empty listing).")
# --- END NEW ---

print(f"\n‚úÖ TOTAL Raw Events: {len(df)}")
print(tabulate(df, headers="keys", tablefmt="github", showindex=False))

df.to_csv(OUTPUT_CSV, index=False, encoding="utf-8")
print(f"\nüíæ CSV saved: {OUTPUT_CSV}")


üü¢ Found 7 featured cards
‚úÖ Megatix Indonesia (Featured): grabbed 7 events

‚Ñπ No new events added since last run.

‚úÖ TOTAL Raw Events: 7
| Title                                       | Date             | Location          | Link                                                  | TimeScraped         |
|---------------------------------------------|------------------|-------------------|-------------------------------------------------------|---------------------|
| BEN B√ñHMER (LIVE)                           | Sat, 27 Dec 2025 | Savaya Bali       | https://megatix.co.id/events/ben-bohmer-2712          | 2025-12-15 16:29:04 |
| BEN B√ñHMER (LIVE)                           | Sat, 27 Dec 2025 | Savaya Bali       | https://megatix.co.id/events/ben-bohmer-2712          | 2025-12-15 16:29:04 |
| X-Clusive Presents: Bali NYE 2025 with Tyga | Wed, 31 Dec 2025 | The Stage         | https://megatix.co.id/events/nye-2025-with-tyga       | 2025-12-15 16:29:04 |
| ARTBAT & MISS MONIQUE     

# Bangkok (done)

In [15]:
# # thai ticket major
# # pip install selenium pandas
# from selenium import webdriver
# from selenium.webdriver.common.by import By
# from selenium.webdriver.support.ui import WebDriverWait
# from selenium.webdriver.support import expected_conditions as EC
# from selenium.common.exceptions import TimeoutException, StaleElementReferenceException
# from datetime import datetime
# import pandas as pd
# import time

# HOME = "https://www.thaiticketmajor.com/concert/"

# # ---- Your detail-page XPaths (kept as-is) ----
# NAME_X  = "/html/body/div[1]/main/div[1]/div[2]/div/div[2]/div/div[1]/div/h1/font/font"
# VENUE_X = "/html/body/div[1]/main/div[1]/div[2]/div/div[2]/div/div[2]/div[1]/ul/li[2]/p/span/font/font"
# PRICE_X = "/html/body/div[1]/main/div[1]/div[2]/div/div[2]/div/div[2]/div[2]/ul/li[2]/div"

# # Listing: we‚Äôll collect all anchors in section[2] (your click path lives under here)
# # This is more reliable than clicking the nested /font/font node.
# LIST_ANCHORS_X = "//body/div[1]/main/section[2]//a[@href]"

# # ================== APPEND+DEDUP ADD-ON (no changes to your code below) ==================
# import atexit, os, re

# _OUTPUT_CSV_PATH = "all_bangkok.csv"

# def _normalize_link(u: str) -> str:
#     u = (u or "").strip().lower()
#     u = re.sub(r"#.*$", "", u)           # drop fragments
#     u = re.sub(r"\?.*$", "", u)          # drop query params
#     return u.rstrip("/")

# try:
#     _old_df_ttm = pd.read_csv(_OUTPUT_CSV_PATH)
# except Exception:
#     _old_df_ttm = pd.DataFrame()

# def _merge_back_ttm():
#     """Runs AFTER your script's own to_csv.
#        - Restores previous rows
#        - Removes duplicates
#        - Prints which events are newly added this run (or none)
#     """
#     try:
#         new_df = pd.read_csv(_OUTPUT_CSV_PATH)
#     except Exception:
#         # If the fresh write failed, nothing to merge
#         return

#     # Ensure consistent columns even if earlier files differ
#     need_cols = ["Event Name", "Venue", "Ticket Prices", "Link", "Scraped At (UTC)"]
#     for df in (_old_df_ttm, new_df):
#         for c in need_cols:
#             if c not in df.columns:
#                 df[c] = ""

#     # Normalize links for strong dedup
#     _old_df_ttm["LinkNorm"] = _old_df_ttm["Link"].map(_normalize_link)
#     new_df["LinkNorm"]      = new_df["Link"].map(_normalize_link)

#     # --- Identify which rows are NEW vs the old file (before merging) ---
#     old_keys = set(_old_df_ttm["LinkNorm"].astype(str))
#     # For rows with empty LinkNorm, fallback to (Event Name, Venue)
#     if "" in old_keys:
#         old_fallback = set(zip(
#             _old_df_ttm["Event Name"].astype(str).str.strip().str.lower(),
#             _old_df_ttm["Venue"].astype(str).str.strip().str.lower()
#         ))
#     else:
#         old_fallback = set()

#     newly_added_rows = []
#     for _, r in new_df.iterrows():
#         ln = str(r.get("LinkNorm", "") or "")
#         if ln:
#             if ln not in old_keys:
#                 newly_added_rows.append(r)
#         else:
#             key = (
#                 str(r.get("Event Name","")).strip().lower(),
#                 str(r.get("Venue","")).strip().lower()
#             )
#             if key not in old_fallback:
#                 newly_added_rows.append(r)

#     # Merge and dedup
#     merged = pd.concat([_old_df_ttm, new_df], ignore_index=True)

#     # Primary dedup by normalized link (keep first = preserve earliest row)
#     merged = merged.drop_duplicates(subset=["LinkNorm"], keep="first")

#     # Secondary guard when links are missing/unstable
#     merged = merged.drop_duplicates(subset=["Event Name", "Venue"], keep="first")

#     # Persist without helper column
#     merged = merged.drop(columns=["LinkNorm"], errors="ignore")
#     merged.to_csv(_OUTPUT_CSV_PATH, index=False, encoding="utf-8-sig")

#     # --- Print summary of new events ---
#     if newly_added_rows:
#         print(f"\nüÜï {len(newly_added_rows)} new event(s) added this run:")
#         for r in newly_added_rows:
#             title = str(r.get("Event Name","")).strip()
#             venue = str(r.get("Venue","")).strip()
#             link  = str(r.get("Link","")).strip()
#             date  = str(r.get("Scraped At (UTC)","")).strip()
#             print(f"  ‚Ä¢ {title} | {venue} | {link} | scraped {date}")
#     else:
#         print("\n‚Ñπ No new events added this run.")

# # Ensure merge runs after your script finishes writing the CSV
# atexit.register(_merge_back_ttm)
# # ================== END ADD-ON ==================

# def build_driver():
#     opts = webdriver.ChromeOptions()
#     opts.add_argument("--start-maximized")
#     opts.add_argument("--disable-notifications")
#     opts.add_argument("--disable-blink-features=AutomationControlled")
#     opts.page_load_strategy = "eager"
#     d = webdriver.Chrome(options=opts)
#     d.set_page_load_timeout(90)
#     return d

# def wait(drv, xp, sec=25, clickable=False):
#     cond = EC.element_to_be_clickable if clickable else EC.presence_of_element_located
#     return WebDriverWait(drv, sec).until(cond((By.XPATH, xp)))

# def t(drv, xp):
#     try:
#         el = drv.find_element(By.XPATH, xp)
#         # textContent is safer with nested <font> etc.
#         return " ".join((el.get_attribute("textContent") or "").split())
#     except:
#         return ""

# def main():
#     driver = build_driver()
#     rows = []
#     try:
#         driver.get(HOME)
#         # Wait for listing anchors to exist
#         wait(driver, LIST_ANCHORS_X)
#         time.sleep(0.5)

#         anchors = driver.find_elements(By.XPATH, LIST_ANCHORS_X)
#         hrefs = []
#         for a in anchors:
#             href = a.get_attribute("href")
#             # Keep only event-detail-like links (most are /event/... or /concert/...)
#             if href and "thaiticketmajor.com" in href and href not in hrefs:
#                 hrefs.append(href)

#         if not hrefs:
#             print("No events found on the listing.")
#             return

#         for i, href in enumerate(hrefs, 1):
#             try:
#                 driver.get(href)
#                 # Wait for name node to appear
#                 wait(driver, NAME_X, sec=20)

#                 name  = t(driver, NAME_X)
#                 venue = t(driver, VENUE_X)
#                 price = t(driver, PRICE_X)

#                 rows.append({
#                     "Event Name": name,
#                     "Venue": venue,
#                     "Ticket Prices": price,
#                     "Link": href,
#                     "Scraped At (UTC)": datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S"),
#                 })

#                 # If you strictly want to ‚Äúgo back and click the rest‚Äù, uncomment these two lines:
#                 # driver.back()
#                 # wait(driver, LIST_ANCHORS_X)

#             except (TimeoutException, StaleElementReferenceException):
#                 rows.append({
#                     "Event Name": "",
#                     "Venue": "",
#                     "Ticket Prices": "",
#                     "Link": href,
#                     "Scraped At (UTC)": datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S"),
#                 })
#                 # Try to return to list if something failed
#                 try:
#                     driver.get(HOME)
#                     wait(driver, LIST_ANCHORS_X)
#                 except TimeoutException:
#                     pass

#         pd.DataFrame(rows).to_csv("all_bangkok.csv", index=False, encoding="utf-8-sig")
#         print(f"Saved {len(rows)} rows to all_bangkok.csv")

#     finally:
#         try: driver.quit()
#         except: pass

# if __name__ == "__main__":
#     main()



# issue with scraping




In [16]:
# ticketmelon (all asia)



In [17]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, StaleElementReferenceException
from datetime import datetime
import pandas as pd
import time

URL = "https://www.livenationtero.co.th/en?utm_source=chatgpt.com"

LIST_X   = "//*[@id='upcoming-shows']/div/ul"
ITEM_X   = LIST_X + "/li[{i}]"
ANCHOR_X = ITEM_X + "/a"
NAME_X   = ITEM_X + "/a/p[1]"      # event name
DATE_X   = ITEM_X + "/a/small"     # event date  ‚úÖ

VENUE_X_PRIMARY = "//*[@id='main']/div/div[1]/div/div[2]/div/p[4]"
VENUE_FALLBACKS = [
    "//*[@id='main']//p[contains(., 'Venue')]/following-sibling::p[1]",
    "//*[@id='main']//div[contains(@class,'event') or contains(@class,'details')]//p[contains(@class,'venue')]",
    "//*[contains(@class,'EventDetails')]//p[contains(@class,'venue')]",
]

# ================== APPEND + DEDUP ADD-ON (ONLY ADDITIONS BELOW) ==================
import os, re, atexit

_OUTPUT_CSV_PATH = "all_bangkok.csv"

def _normalize_link(u: str) -> str:
    u = (u or "").strip().lower()
    u = re.sub(r"#.*$", "", u)          # drop fragments
    u = re.sub(r"\?.*$", "", u)         # drop query params
    return u.rstrip("/")

# Preload old CSV (if any) so we can merge it back after your own to_csv runs
try:
    _old_df_lnt = pd.read_csv(_OUTPUT_CSV_PATH)
except Exception:
    _old_df_lnt = pd.DataFrame()

def _merge_back_lnt():
    """Runs AFTER your script writes all_bangkok.csv.
       Merges old + new, removes duplicates, saves back to file,
       and prints which events are newly added (or none).
    """
    try:
        new_df = pd.read_csv(_OUTPUT_CSV_PATH)
    except Exception:
        return  # nothing written, nothing to merge

    # Ensure the expected schema exists on both sides
    need_cols = ["Event Name", "Event Date", "Venue", "Link", "Scraped At (UTC)"]
    for df in (_old_df_lnt, new_df):
        for c in need_cols:
            if c not in df.columns:
                df[c] = ""

    # Normalize links for robust dedup (handles tracking params, trailing slashes)
    _old_df_lnt["LinkNorm"] = _old_df_lnt["Link"].map(_normalize_link)
    new_df["LinkNorm"]      = new_df["Link"].map(_normalize_link)

    # ---- Determine which rows are NEW compared to old file ----
    old_linknorms = set(_old_df_lnt["LinkNorm"].astype(str))
    # Fallback identity if Link is blank/unstable: (Event Name, Venue)
    old_fallback_keys = set(zip(
        _old_df_lnt["Event Name"].astype(str).str.strip().str.lower(),
        _old_df_lnt["Venue"].astype(str).str.strip().str.lower()
    ))

    newly_added_rows = []
    for _, r in new_df.iterrows():
        ln = str(r.get("LinkNorm", "") or "")
        if ln:
            if ln not in old_linknorms:
                newly_added_rows.append(r)
        else:
            key = (
                str(r.get("Event Name","")).strip().lower(),
                str(r.get("Venue","")).strip().lower()
            )
            if key not in old_fallback_keys:
                newly_added_rows.append(r)

    # ---- Merge & deduplicate globally ----
    merged = pd.concat([_old_df_lnt, new_df], ignore_index=True)
    merged = merged.drop_duplicates(subset=["LinkNorm"], keep="first")
    merged = merged.drop_duplicates(subset=["Event Name", "Venue"], keep="first")

    # ---- Summary & guarantees ----
    old_count = len(_old_df_lnt)
    new_raw_count = len(new_df)
    merged_count = len(merged)
    dups_dropped = (old_count + new_raw_count) - merged_count

    # Backfill missing timestamps for older rows that didn't have it
    if "Scraped At (UTC)" in merged.columns:
        merged["Scraped At (UTC)"] = (
            merged["Scraped At (UTC)"]
            .astype(str)
            .str.strip()
            .replace({"nan": "", "None": ""})
        )
        merged["Scraped At (UTC)"] = merged["Scraped At (UTC)"].mask(
            merged["Scraped At (UTC)"] == "",
            datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S")
        )

    # Persist without helper column
    merged = merged.drop(columns=["LinkNorm"], errors="ignore")
    merged.to_csv(_OUTPUT_CSV_PATH, index=False, encoding="utf-8-sig")

    # ---- Print summary of new events (or none) ----
    if newly_added_rows:
        print(f"\nüÜï {len(newly_added_rows)} new event(s) added this run:")
        for r in newly_added_rows:
            title = str(r.get("Event Name","")).strip()
            date  = str(r.get("Event Date","")).strip()
            venue = str(r.get("Venue","")).strip()
            link  = str(r.get("Link","")).strip()
            when  = str(r.get("Scraped At (UTC)","")).strip()
            print(f"  ‚Ä¢ {title} | {date} | {venue} | {link} | scraped {when}")
    else:
        print("\n‚Ñπ No new events added this run.")

    print(f"\nüìä Summary: old={old_count}, scraped_this_run={new_raw_count}, "
          f"deduped_total={merged_count}, duplicates_removed={dups_dropped}")

    if merged_count < old_count:
        print("‚ö† Warning: merged row count is less than previous file. (This should not happen.)")
    else:
        print("‚úÖ No previously saved events were removed. Duplicates were removed correctly.")

# Register the merge so it runs automatically at program exit
atexit.register(_merge_back_lnt)
# ================== END ADD-ON ==================

def build_driver():
    opts = webdriver.ChromeOptions()
    opts.add_argument("--start-maximized")
    opts.add_argument("--disable-notifications")
    return webdriver.Chrome(options=opts)

def wait_xpath(drv, xpath, sec=25):
    return WebDriverWait(drv, sec).until(EC.presence_of_element_located((By.XPATH, xpath)))

def wait_clickable(drv, xpath, sec=25):
    return WebDriverWait(drv, sec).until(EC.element_to_be_clickable((By.XPATH, xpath)))

def node_text(drv, xpath):
    try:
        el = drv.find_element(By.XPATH, xpath)
        # textContent catches nested spans / line breaks better than .text sometimes
        txt = el.get_attribute("textContent") or ""
        return " ".join(txt.split())
    except:
        return ""

def get_venue_with_fallbacks(drv, timeout=8):
    end = time.time() + timeout
    try:
        el = WebDriverWait(drv, min(4, timeout)).until(
            EC.presence_of_element_located((By.XPATH, VENUE_X_PRIMARY))
        )
        txt = el.text.strip()
        if txt: return txt
    except TimeoutException:
        pass
    for xp in VENUE_FALLBACKS:
        if time.time() > end: break
        try:
            el = WebDriverWait(drv, max(1, int(end - time.time()))).until(
                EC.presence_of_element_located((By.XPATH, xp))
            )
            txt = el.text.strip()
            if txt: return txt
        except TimeoutException:
            continue
    return ""

def main():
    driver = build_driver()
    rows = []
    try:
        driver.get(URL)
        wait_xpath(driver, LIST_X)
        items = driver.find_elements(By.XPATH, LIST_X + "/li")
        n = len(items)
        if n == 0:
            print("No events found under #upcoming-shows.")
            return

        for i in range(1, n + 1):
            try:
                li = driver.find_element(By.XPATH, ITEM_X.format(i=i))
                driver.execute_script("arguments[0].scrollIntoView({block:'center'});", li)
                time.sleep(0.2)

                event_name = node_text(driver, NAME_X.format(i=i))
                event_date = node_text(driver, DATE_X.format(i=i))   # ‚úÖ now using <small>

                a = wait_clickable(driver, ANCHOR_X.format(i=i))
                href = a.get_attribute("href")
                a.click()

                venue = get_venue_with_fallbacks(driver, timeout=8)

                rows.append({
                    "Event Name": event_name,
                    "Event Date": event_date,
                    "Venue": venue,
                    "Link": href or driver.current_url,
                    "Scraped At (UTC)": datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S"),
                })

            except (TimeoutException, StaleElementReferenceException):
                rows.append({
                    "Event Name": event_name if 'event_name' in locals() else "",
                    "Event Date": event_date if 'event_date' in locals() else "",
                    "Venue": "",
                    "Link": href if 'href' in locals() else "",
                    "Scraped At (UTC)": datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S"),
                })
            finally:
                try:
                    driver.back()
                    wait_xpath(driver, LIST_X)
                except TimeoutException:
                    driver.get(URL)
                    wait_xpath(driver, LIST_X)

        if rows:
            pd.DataFrame(rows).to_csv("all_bangkok.csv", index=False, encoding="utf-8-sig")
            print(f"Saved {len(rows)} rows to all_bangkok.csv")
            _merge_back_lnt()  # <-- ensure immediate merge so previous data is preserved
        else:
            print("No rows collected.")
    finally:
        driver.quit()

if __name__ == "__main__":
    main()




Saved 4 rows to all_bangkok.csv

üÜï 3 new event(s) added this run:
  ‚Ä¢ nan | nan | nan | https://www.livenationtero.co.th/en/event/babymonster-love-monsters-asia-fan-concert-2025-in-bangkok-bangkok-tickets-edp1628752 | scraped 2025-12-15 08:29:31
  ‚Ä¢ nan | nan | nan | https://www.livenationtero.co.th/en/event/giv%C4%93on-dear-beloved-the-tour-bangkok-tickets-edp1636493 | scraped 2025-12-15 08:29:42
  ‚Ä¢ nan | nan | üèüÔ∏è: Lido Connect 3, Thailand | https://www.livenationtero.co.th/en/event/pryvt-back-to-reality-world-tour-in-bangkok-bangkok-tickets-edp1637655 | scraped 2025-12-15 08:29:46

üìä Summary: old=307, scraped_this_run=4, deduped_total=305, duplicates_removed=6


In [18]:
"""
Megatix Thailand (category=31) scraper ‚Äî INCREMENTAL & NON-DELETING
- Appends only new events to OUT_CSV; existing rows are preserved.
- Dedup key: uid = norm(Event Name) + norm(Link)
"""

import os
import time
from datetime import datetime, timezone

import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import (
    TimeoutException, NoSuchElementException,
    ElementClickInterceptedException, StaleElementReferenceException
)

URL = "https://megatix.in.th/events?category=31"
OUT_CSV = "all_bangkok.csv"

# ---- Your XPaths ----
X_SECTION = '//*[@id="browse-events-heading"]/div[2]'
X_LOAD_MORE = '//*[@id="browse-events-heading"]/div[2]/div/div[3]/button/span'
X_CARD_BASE = '//*[@id="browse-events-heading"]/div[2]/div/div[2]/div/div'

def x_name(i):  return f'//*[@id="browse-events-heading"]/div[2]/div/div[2]/div/div[{i}]/a/div/div/div[1]/p[2]'
def x_venue(i): return f'//*[@id="browse-events-heading"]/div[2]/div/div[2]/div/div[{i}]/a/div/div/div[1]/p[3]'
def x_link(i):  return f'//*[@id="browse-events-heading"]/div[2]/div/div[2]/div/div[{i}]/a'

def utc_now_iso():
    return datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

def norm(s: str) -> str:
    """Lowercase, strip, collapse whitespace for stable matching."""
    if not isinstance(s, str):
        return ""
    return " ".join(s.strip().lower().split())

def build_uid(name: str, link: str) -> str:
    return f"{norm(name)}|{norm(link)}"

def slow_scroll_to_bottom(driver, step=600, pause=0.4, max_passes=3):
    for _ in range(max_passes):
        last_h = driver.execute_script("return document.body.scrollHeight;")
        y = 0
        while y < last_h:
            y += step
            driver.execute_script(f"window.scrollTo(0, {y});")
            time.sleep(pause)
        time.sleep(0.8)
        new_h = driver.execute_script("return document.body.scrollHeight;")
        if new_h == last_h:
            break

def click_load_more_until_done(driver, wait, max_clicks=200):
    clicks = 0
    while clicks < max_clicks:
        try:
            btn_span = wait.until(EC.presence_of_element_located((By.XPATH, X_LOAD_MORE)))
            button = driver.find_element(By.XPATH, X_LOAD_MORE + "/..")
            if not button.is_enabled() or not button.is_displayed():
                break
            driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", button)
            time.sleep(0.4)
            button.click()
            clicks += 1
            time.sleep(1.2)
            slow_scroll_to_bottom(driver, step=700, pause=0.35, max_passes=1)
        except (TimeoutException, NoSuchElementException):
            break
        except (ElementClickInterceptedException, StaleElementReferenceException):
            time.sleep(0.8)
            try:
                button = driver.find_element(By.XPATH, X_LOAD_MORE + "/..")
                driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", button)
                time.sleep(0.5)
                button.click()
                clicks += 1
                time.sleep(1.2)
            except Exception:
                break

def get_card_count(driver):
    return len(driver.find_elements(By.XPATH, X_CARD_BASE))

def scrape_events():
    opts = webdriver.ChromeOptions()
    opts.add_argument("--start-maximized")
    opts.add_argument("--disable-blink-features=AutomationControlled")
    opts.add_argument("--disable-gpu")
    opts.add_argument("--window-size=1400,1000")
    opts.add_argument("--no-sandbox")
    opts.add_argument("--disable-dev-shm-usage")
    opts.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome Safari")

    driver = webdriver.Chrome(options=opts)
    wait = WebDriverWait(driver, 20)
    rows = []

    try:
        driver.get(URL)
        wait.until(EC.presence_of_element_located((By.XPATH, X_SECTION)))
        slow_scroll_to_bottom(driver, step=700, pause=0.35, max_passes=2)
        click_load_more_until_done(driver, wait)
        slow_scroll_to_bottom(driver, step=800, pause=0.4, max_passes=2)

        total = get_card_count(driver)
        print(f"Found {total} cards.")

        for i in range(1, total + 1):
            try:
                name = driver.find_element(By.XPATH, x_name(i)).text.strip()
                venue = driver.find_element(By.XPATH, x_venue(i)).text.strip()
                link = driver.find_element(By.XPATH, x_link(i)).get_attribute("href") or ""
                rows.append({
                    "Event Name": name,
                    "Event Venue": venue,
                    "Link": link,
                    "Source": "Megatix TH",
                    "ScrapedAtUTC": utc_now_iso(),  # ‚Üê time scraped included
                    "uid": build_uid(name, link)
                })
            except NoSuchElementException:
                continue
            except StaleElementReferenceException:
                time.sleep(0.2)
                try:
                    name = driver.find_element(By.XPATH, x_name(i)).text.strip()
                    venue = driver.find_element(By.XPATH, x_venue(i)).text.strip()
                    link = driver.find_element(By.XPATH, x_link(i)).get_attribute("href") or ""
                    rows.append({
                        "Event Name": name,
                        "Event Venue": venue,
                        "Link": link,
                        "Source": "Megatix TH",
                        "ScrapedAtUTC": utc_now_iso(),  # ‚Üê time scraped included
                        "uid": build_uid(name, link)
                    })
                except Exception:
                    continue
    finally:
        driver.quit()

    return pd.DataFrame(rows) if rows else pd.DataFrame(columns=[
        "Event Name","Event Venue","Link","Source","ScrapedAtUTC","uid"
    ])

def append_only_new(df_new: pd.DataFrame, out_csv: str):
    """
    Append only rows whose uid is NOT already in out_csv.
    Never deletes existing rows. Keeps their original ScrapedAtUTC.
    """
    if os.path.exists(out_csv):
        df_old = pd.read_csv(out_csv)
        if "uid" not in df_old.columns:
            # Backfill uids for legacy files
            df_old["uid"] = (df_old["Event Name"].fillna("")
                             .apply(norm) + "|" + df_old["Link"].fillna("").apply(norm))
    else:
        df_old = pd.DataFrame(columns=df_new.columns)

    # --- BEGIN ADDITIONS: summary + no-duplicate/no-removal guarantees ---
    old_uids_before = set(df_old.get("uid", pd.Series(dtype=str)).astype(str))
    old_count_before = len(df_old)

    # Backfill missing ScrapedAtUTC for old rows (keeps original timestamps if present)
    if "ScrapedAtUTC" in df_old.columns:
        df_old["ScrapedAtUTC"] = (
            df_old["ScrapedAtUTC"].astype(str).str.strip().replace({"nan": "", "None": ""})
        )
        df_old.loc[df_old["ScrapedAtUTC"] == "", "ScrapedAtUTC"] = utc_now_iso()
    # --- END ADDITIONS ---

    old_uids = set(df_old.get("uid", pd.Series(dtype=str)).astype(str))
    df_new = df_new.copy()
    df_new["uid"] = df_new["uid"].astype(str)

    # Keep only truly new rows
    df_to_add = df_new[~df_new["uid"].isin(old_uids)]

    if not df_to_add.empty:
        # Concatenate without dropping any existing rows
        df_out_pre_dedup = pd.concat([df_old, df_to_add], ignore_index=True)

        # --- BEGIN ADDITIONS: enforce no duplicates by uid (keep first), and summarize ---
        df_out = df_out_pre_dedup.drop_duplicates(subset=["uid"], keep="first").reset_index(drop=True)
        duplicates_removed = len(df_out_pre_dedup) - len(df_out)

        # Sanity: ensure all previously saved uids are still present (no removals)
        previously_saved_still_present = old_uids_before.issubset(set(df_out["uid"].astype(str)))

        # Optional: ensure column order
        cols = ["Event Name","Event Venue","Link","Source","ScrapedAtUTC","uid"]
        df_out = df_out[[c for c in cols if c in df_out.columns] + [c for c in df_out.columns if c not in cols]]

        # Save
        df_out.to_csv(out_csv, index=False, encoding="utf-8-sig")

        # Print detailed summary
        actually_added_count = len(df_to_add)
        print(f"üÜï Added {actually_added_count} new event(s). Duplicates removed this run: {duplicates_removed}.")
        print(f"üì¶ Total rows now: {len(df_out)} (previously had {old_count_before}).")
        if previously_saved_still_present:
            print("‚úÖ No previously saved events were removed.")
        else:
            print("‚ö† Warning: some previously saved events are missing (unexpected).")

        # List the newly added events (with time scraped)
        if actually_added_count > 0:
            print("\nNewly added:")
            for _, r in df_to_add.iterrows():
                print(f"  ‚Ä¢ {r['Event Name']} | {r['Event Venue']} | {r['Link']} | scraped {r['ScrapedAtUTC']}")
        # --- END ADDITIONS ---

    else:
        print("No new events to append. Existing file left unchanged.")

def main():
    df_new = scrape_events()
    if df_new.empty:
        print("No rows scraped ‚Äî page structure may have changed or content blocked.")
        return
    append_only_new(df_new, OUT_CSV)

if __name__ == "__main__":
    main()





Found 256 cards.
üÜï Added 102 new event(s). Duplicates removed this run: 2.
üì¶ Total rows now: 405 (previously had 305).
‚úÖ No previously saved events were removed.

Newly added:
  ‚Ä¢ matr√´shka x CIZHKA | –ë–∞—Ä –ö—É—Ä–∞–∂ | Courage | https://megatix.in.th/events/matrshka-x-cizhka?source=home | scraped 2025-12-15T08:31:07Z
  ‚Ä¢ New Year's Eve Dinner Buffet | DoubleTree by Hilton Bangkok Ploenchit | DoubleTree by Hilton Bangkok Ploenchit | https://megatix.in.th/events/new-years-eve-dinner-buffet-doubletree-by-hilton-bangkok-ploenchit?source=home | scraped 2025-12-15T08:31:08Z
  ‚Ä¢ New Year‚Äôs Eve Special Dinner on 31 December 2025 | Cape Dara Resort | https://megatix.in.th/events/new-year-special-dinner-on-31-december-2025?source=home | scraped 2025-12-15T08:31:08Z
  ‚Ä¢ 2026 NEW YEAR PARTY by the Beach | Baba Beach Club Natai Luxury Pool Villa Hotel | https://megatix.in.th/events/new-year-party-by-the-beach?source=home | scraped 2025-12-15T08:31:08Z
  ‚Ä¢ SEEN & SIN New Year'

# Australia

In [19]:
# moshtix_sections_scraper.py
# Scrapes events ONLY from:
#   - //*[@id="col_main"]/section[4]
#   - //*[@id="col_main"]/section[5]
#
# Extracted fields per ticket row:
#   Event Name | Date | Location | Ticket Name | Ticket Price | Event Link
#
# Behavior:
#   - Keeps previous data in all_australia.csv
#   - Appends only new rows (no duplicates)
#   - Prints how many new records were added
#
# Usage:
#   pip install selenium pandas
#   python moshtix_sections_scraper.py

import os
import time
import pandas as pd
from datetime import datetime
from urllib.parse import urljoin
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import (
    TimeoutException,
    NoSuchElementException,
    StaleElementReferenceException,
    ElementClickInterceptedException,
)

HOME = "https://www.moshtix.com.au/v2/"
OUTPUT_CSV = "all_australia.csv"

WAIT_SEC = 25

def make_driver():
    opts = webdriver.ChromeOptions()
    opts.add_argument("--start-maximized")
    opts.add_argument("--disable-notifications")
    opts.add_argument("--disable-blink-features=AutomationControlled")
    opts.add_experimental_option("excludeSwitches", ["enable-automation"])
    return webdriver.Chrome(options=opts)

def wait_for(drv, by, selector, sec=WAIT_SEC):
    return WebDriverWait(drv, sec).until(EC.presence_of_element_located((by, selector)))

def smooth_scroll_to(drv, element):
    try:
        drv.execute_script("arguments[0].scrollIntoView({block:'start', inline:'nearest'});", element)
        time.sleep(0.5)
        drv.execute_script("window.scrollBy(0, -120);")
        time.sleep(0.4)
        drv.execute_script("window.scrollBy(0, 240);")
        time.sleep(0.6)
    except Exception:
        pass

def dismiss_banners(drv):
    candidates = [
        "//button[contains(translate(., 'ACEPTI', 'acepti'), 'accept')]",
        "//button[contains(., 'Accept')]",
        "//button[contains(., 'I agree')]",
        "//button[contains(., 'Got it')]",
        "//button[contains(., 'Close')]",
    ]
    for xp in candidates:
        try:
            btn = WebDriverWait(drv, 3).until(EC.element_to_be_clickable((By.XPATH, xp)))
            btn.click()
            time.sleep(0.3)
        except Exception:
            pass

def collect_event_links_in_section(drv, section_xpath):
    links = []
    try:
        sec_el = wait_for(drv, By.XPATH, section_xpath)
        smooth_scroll_to(drv, sec_el)
        anchors = sec_el.find_elements(By.XPATH, ".//a[contains(@href, '/v2/event/')]")
        seen = set()
        for a in anchors:
            try:
                href = a.get_attribute("href")
                if href and "/v2/event/" in href and href not in seen:
                    seen.add(href)
                    links.append((a, href))
            except StaleElementReferenceException:
                continue
    except TimeoutException:
        pass
    return links

def extract_event_core(drv):
    def safe_text(by, sel):
        try:
            el = wait_for(drv, by, sel, sec=8)
            return el.text.strip()
        except TimeoutException:
            return ""
    name = safe_text(By.CSS_SELECTOR, "#event-summary-title > span")
    date = safe_text(By.CSS_SELECTOR, "#event-summary-date")
    venue = safe_text(By.CSS_SELECTOR, "#event-summary-venue > span")
    return name, date, venue

def extract_tickets(drv):
    tickets = []
    try:
        form = wait_for(drv, By.CSS_SELECTOR, "#event-tickets-form", sec=10)
        name_els = form.find_elements(By.CSS_SELECTOR, "[id^='ticket-type-name-']")
        for n in name_els:
            try:
                n_id = n.get_attribute("id")
                suffix = n_id.split("-")[-1]
                tname = n.text.strip()
                price_sel = f"#ticket-type-total-{suffix}"
                try:
                    price_el = form.find_element(By.CSS_SELECTOR, price_sel)
                    tprice = price_el.text.strip()
                except NoSuchElementException:
                    tprice = ""
                if tname:
                    tickets.append((tname, tprice))
            except Exception:
                continue
    except TimeoutException:
        pass
    return tickets


def load_existing_data(path):
    """### NEW: Load existing CSV and return as DataFrame"""
    if os.path.exists(path):
        try:
            df = pd.read_csv(path)
            print(f"Loaded {len(df)} existing rows from {path}")
            return df
        except Exception:
            print("‚ö†Ô∏è Could not read existing CSV; starting fresh.")
    return pd.DataFrame(columns=["Event Name", "Date", "Location", "Ticket Name", "Ticket Price", "Link"])


def merge_and_save(existing_df, new_rows, output_csv):
    """### NEW: Merge new data, drop duplicates, save CSV, and show stats"""
    new_df = pd.DataFrame(new_rows)
    combined = pd.concat([existing_df, new_df], ignore_index=True)
    combined_before = len(combined)
    combined = combined.drop_duplicates(
        subset=["Event Name", "Ticket Name", "Ticket Price"], keep="first"
    )
    added = len(combined) - len(existing_df)
    combined.to_csv(output_csv, index=False, encoding="utf-8-sig")
    print(f"‚úÖ Saved {len(combined)} total rows to {output_csv}")
    print(f"üÜï Added {added} new rows.\n")
    return added


def main():
    driver = make_driver()
    rows = []
    existing = load_existing_data(OUTPUT_CSV)

    try:
        driver.get(HOME)
        wait_for(driver, By.ID, "col_main")
        time.sleep(1.2)
        dismiss_banners(driver)

        for _ in range(4):
            driver.execute_script("window.scrollBy(0, 800);")
            time.sleep(0.6)

        section_xpaths = [
            "//*[@id='col_main']/section[4]",
            "//*[@id='col_main']/section[5]",
        ]

        # Scrape timestamp for this run
        scraped_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

        for sec_xp in section_xpaths:
            links = collect_event_links_in_section(driver, sec_xp)
            for idx, (elem, href) in enumerate(links, start=1):
                try:
                    try:
                        smooth_scroll_to(driver, elem)
                        elem.click()
                    except (ElementClickInterceptedException, StaleElementReferenceException):
                        driver.get(href)

                    wait_for(driver, By.CSS_SELECTOR, "#event-summary-title > span", sec=WAIT_SEC)
                    ev_name, ev_date, ev_venue = extract_event_core(driver)
                    ev_link = driver.current_url

                    tickets = extract_tickets(driver)
                    if tickets:
                        for tname, tprice in tickets:
                            rows.append({
                                "Event Name": ev_name,
                                "Date": ev_date,
                                "Location": ev_venue,
                                "Ticket Name": tname,
                                "Ticket Price": tprice,
                                "Link": ev_link,
                                "Scraped Time": scraped_time
                            })
                    else:
                        rows.append({
                            "Event Name": ev_name,
                            "Date": ev_date,
                            "Location": ev_venue,
                            "Ticket Name": "",
                            "Ticket Price": "",
                            "Link": ev_link,
                            "Scraped Time": scraped_time
                        })

                    driver.back()
                    wait_for(driver, By.ID, "col_main")
                    time.sleep(0.8)
                    dismiss_banners(driver)

                except TimeoutException:
                    try:
                        driver.back()
                        wait_for(driver, By.ID, "col_main")
                    except Exception:
                        pass
                    continue

        # --- Merge and Save ---
        merge_and_save(existing, rows, OUTPUT_CSV)

    finally:
        try:
            driver.quit()
        except Exception:
            pass


if __name__ == "__main__":
    main()


Loaded 140 existing rows from all_australia.csv
‚úÖ Saved 145 total rows to all_australia.csv
üÜï Added 5 new rows.



In [20]:
# """
# Ticketmaster AU ‚Äî Dance/Electronic (date-filtered) scraper
# - Source URL (example): https://www.ticketmaster.com.au/browse/dance-electronic-catid-201/music-rid-10001#pageInfo?startDate=2025-10-27&endDate=2025-12-31
# - Columns: Title, Date, Location, Link, Source, ScrapedAtUTC

# Usage (VS Code / terminal):
#     pip install selenium pandas
#     python scrape_ticketmaster_au_elec.py
# """

# import time
# import sys
# from datetime import datetime, timezone

# import pandas as pd
# from selenium import webdriver
# from selenium.webdriver.common.by import By
# from selenium.webdriver.support.ui import WebDriverWait
# from selenium.webdriver.support import expected_conditions as EC
# from selenium.common.exceptions import (
#     TimeoutException,
#     NoSuchElementException,
#     StaleElementReferenceException,
# )

# # ===================== CONFIG =====================
# START_DATE = "2025-10-27"
# END_DATE   = "2025-12-31"

# # Your provided (category) page + date filters.
# # NOTE: Put the "#pageInfo" anchor before the query string as per your link.
# BASE_URL = (
#     "https://www.ticketmaster.com.au/browse/dance-electronic-catid-201/music-rid-10001"
#     "#pageInfo"
#     f"?startDate={START_DATE}&endDate={END_DATE}"
# )

# OUTPUT_CSV = "all_australia.csv"
# WAIT_SEC   = 25

# # Your provided XPaths
# X_SECTION  = "//*[@id='pageInfo']/div[1]/div/div[2]/div[2]/div[1]"
# X_LIST     = X_SECTION + "/div[2]/div[2]/ul"
# X_ITEMS    = X_LIST + "/li"

# # Relative XPaths from each <li> item (based on your example for li[1])
# X_NAME_REL      = ".//div[1]/div/div[2]/div[2]/span[1]/span"
# X_LOCATION_REL  = ".//div[1]/div/div[2]/div[2]/span[2]/span[2]"
# X_DATE_REL      = ".//div[1]/div/div[2]/div[1]/span/span[2]"
# # Link: try to grab an <a> within the card
# X_LINK_REL      = ".//a[@href]"

# # ===================== DRIVER =====================
# def build_driver(headless: bool = True) -> webdriver.Chrome:
#     ua = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
#           "(KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36")
#     opts = webdriver.ChromeOptions()
#     opts.add_argument(f"user-agent={ua}")
#     if headless:
#         opts.add_argument("--headless=new")
#     opts.add_argument("--disable-notifications")
#     opts.add_argument("--disable-blink-features=AutomationControlled")
#     opts.add_experimental_option("excludeSwitches", ["enable-automation"])
#     opts.add_experimental_option("useAutomationExtension", False)
#     # Selenium Manager will fetch the correct driver automatically (Selenium 4.6+).
#     return webdriver.Chrome(options=opts)

# # ===================== HELPERS =====================
# def wait_for(xpath: str, driver: webdriver.Chrome, timeout: int = WAIT_SEC):
#     return WebDriverWait(driver, timeout).until(
#         EC.presence_of_element_located((By.XPATH, xpath))
#     )

# def scroll_lazy(driver: webdriver.Chrome, max_rounds: int = 18, pause: float = 0.75):
#     """
#     Scroll down repeatedly to trigger lazy loading.
#     Stops early if page height no longer grows.
#     """
#     last_h = 0
#     for i in range(max_rounds):
#         driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
#         time.sleep(pause)
#         h = driver.execute_script("return document.body.scrollHeight;")
#         if h == last_h:
#             break
#         last_h = h

# def get_text_safe(node, rel_xpath: str) -> str:
#     try:
#         return node.find_element(By.XPATH, rel_xpath).text.strip()
#     except NoSuchElementException:
#         return ""
#     except StaleElementReferenceException:
#         try:
#             return node.find_element(By.XPATH, rel_xpath).text.strip()
#         except Exception:
#             return ""

# def get_link_safe(node, rel_xpath: str) -> str:
#     try:
#         a = node.find_element(By.XPATH, rel_xpath)
#         href = a.get_attribute("href") or ""
#         return href.strip()
#     except NoSuchElementException:
#         return ""
#     except StaleElementReferenceException:
#         try:
#             a = node.find_element(By.XPATH, rel_xpath)
#             href = a.get_attribute("href") or ""
#             return href.strip()
#         except Exception:
#             return ""

# # ===================== MAIN SCRAPER =====================
# def scrape_ticketmaster_au(driver: webdriver.Chrome) -> pd.DataFrame:
#     print(f"Opening: {BASE_URL}")
#     driver.get(BASE_URL)

#     # Wait until the section & list exist
#     try:
#         wait_for(X_SECTION, driver)
#         wait_for(X_LIST, driver)
#     except TimeoutException:
#         raise TimeoutException("Could not find the events section on the page (XPaths may have changed).")

#     # Scroll to load all items
#     scroll_lazy(driver, max_rounds=20, pause=0.8)

#     # Grab items
#     try:
#         items = driver.find_elements(By.XPATH, X_ITEMS)
#     except NoSuchElementException:
#         items = []

#     print(f"Found {len(items)} event cards (raw).")

#     rows = []
#     scraped_at = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S UTC")
#     source = "Ticketmaster AU"

#     for idx, li in enumerate(items, start=1):
#         try:
#             title    = get_text_safe(li, X_NAME_REL)
#             location = get_text_safe(li, X_LOCATION_REL)
#             date_str = get_text_safe(li, X_DATE_REL)
#             link     = get_link_safe(li, X_LINK_REL)

#             # Skip empty rows with no title
#             if not title:
#                 continue

#             rows.append({
#                 "Title": title,
#                 "Date": date_str,
#                 "Location": location,
#                 "Link": link,
#                 "Source": source,
#                 "ScrapedAtUTC": scraped_at,
#             })
#         except Exception as e:
#             # Keep going if one card explodes
#             print(f"[warn] Skipped li #{idx} due to error: {e}")
#             continue

#     df = pd.DataFrame(rows, columns=["Title", "Date", "Location", "Link", "Source", "ScrapedAtUTC"])
#     # Deduplicate by Link, then by (Title, Date, Location)
#     if not df.empty:
#         before = len(df)
#         df = df.drop_duplicates(subset=["Link"]).copy()
#         df = df.drop_duplicates(subset=["Title", "Date", "Location"]).copy()
#         after = len(df)
#         print(f"De-duplicated: {before} -> {after}")

#     return df

# def merge_with_existing(df_new: pd.DataFrame, path: str) -> pd.DataFrame:
#     try:
#         df_old = pd.read_csv(path)
#         merged = pd.concat([df_old, df_new], ignore_index=True)
#         # Global dedupe again
#         merged = merged.drop_duplicates(subset=["Link"]).drop_duplicates(subset=["Title", "Date", "Location"])
#         return merged
#     except FileNotFoundError:
#         return df_new

# def main():
#     driver = build_driver(headless=True)
#     try:
#         df = scrape_ticketmaster_au(driver)
#     finally:
#         try:
#             driver.quit()
#         except Exception:
#             pass

#     if df.empty:
#         print("No events found (empty DataFrame).")
#         return

#     out = merge_with_existing(df, OUTPUT_CSV)
#     out.to_csv(OUTPUT_CSV, index=False, encoding="utf-8")
#     print(f"\nSaved {len(out)} total rows -> {OUTPUT_CSV}")

# if __name__ == "__main__":
#     try:
#         main()
#     except Exception as e:
#         print(f"[fatal] {e}")
#         sys.exit(1)


# # need to add code to show how many new events were added



In [21]:
# megatix_section_scraper_hardened.py
# has to wait because redirect too many times

# import os
# import re
# import time
# import pandas as pd
# from selenium import webdriver
# from selenium.webdriver.common.by import By
# from selenium.webdriver.support.ui import WebDriverWait
# from selenium.webdriver.support import expected_conditions as EC
# from selenium.common.exceptions import (
#     TimeoutException, NoSuchElementException, StaleElementReferenceException,
#     ElementClickInterceptedException
# )

# # ----------------- CONFIG -----------------
# START_URL = "https://megatix.com/"   # Megatix root (may redirect regionally)
# TARGET_LIST_XP = "//*[@id='__nuxt']/div/div[3]/div/main/section/section[2]/div[2]/div"
# FALLBACK_ALL_EVENTS_LINK_CSS = "a[href*='/events']"  # fallback to an All Events view
# OUTPUT_CSV = "all_austraila.csv"
# WAIT = 40

# # Event page fields (your exact XPaths)
# EV_NAME_XP  = "//*[@id='megatix']/div/main/div[1]/div[3]/div[2]/div[2]/div[1]/div[1]/div[1]/h1"
# EV_DATE_XP  = "//*[@id='megatix']/div/main/div[1]/div[3]/div[2]/div[2]/div[1]/div[2]/div[1]"
# EV_VENUE_XP = "//*[@id='megatix']/div/main/div[1]/div[3]/div[2]/div[2]/div[1]/div[2]/div[2]/a"
# TICKETS_SECTION_XP = "//*[@id='megatix']/div/main/div[1]/div[3]/div[2]/div[1]/div[2]/div"

# # ----------------- DRIVER -----------------
# def driver_make():
#     opts = webdriver.ChromeOptions()
#     opts.add_argument("--start-maximized")
#     opts.add_argument("--disable-notifications")
#     opts.add_argument("--disable-blink-features=AutomationControlled")
#     opts.add_argument("--lang=en-US,en")
#     # Optional: reduce automation detectability
#     opts.add_experimental_option("excludeSwitches", ["enable-automation"])
#     opts.add_experimental_option("useAutomationExtension", False)
#     # Faster load behavior
#     opts.page_load_strategy = "eager"
#     drv = webdriver.Chrome(options=opts)

#     drv.delete_all_cookies()   # ‚úÖ Clear cookies/cache to avoid redirect loops

#     try:
#         drv.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
#             "source": "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"
#         })
#     except Exception:
#         pass
#     return drv

# # ----------------- UTILS -----------------
# def wait_for(drv, by, sel, sec=WAIT):
#     return WebDriverWait(drv, sec).until(EC.presence_of_element_located((by, sel)))

# def wait_any(drv, candidates, sec=WAIT):
#     """Wait for the first present among multiple (by, sel) tuples."""
#     end = time.time() + sec
#     last_err = None
#     while time.time() < end:
#         for by, sel in candidates:
#             try:
#                 el = drv.find_element(by, sel)
#                 if el:
#                     return el
#             except Exception as e:
#                 last_err = e
#         time.sleep(0.25)
#     raise TimeoutException(f"None of the candidates appeared within {sec}s: {candidates}") from last_err

# def click_when_clickable(drv, by, sel, sec=WAIT):
#     el = WebDriverWait(drv, sec).until(EC.element_to_be_clickable((by, sel)))
#     el.click()
#     return el

# def smooth_scroll(drv, px=700, steps=12, pause=0.5):
#     for _ in range(steps):
#         drv.execute_script(f"window.scrollBy(0,{px});")
#         time.sleep(pause)

# def gentle_to(drv, el):
#     try:
#         drv.execute_script("arguments[0].scrollIntoView({block:'center'});", el)
#         time.sleep(0.4)
#     except Exception:
#         pass

# def dismiss_overlays(drv):
#     # Cookies / consent / promo modals best-effort
#     xps = [
#         "//button[contains(translate(.,'ACCEPT','accept'),'accept')]",
#         "//button[contains(.,'Accept')]",
#         "//button[contains(.,'Got it')]",
#         "//button[contains(.,'Close')]",
#         "//div[@role='dialog']//button",
#     ]
#     for xp in xps:
#         try:
#             btn = WebDriverWait(drv, 2).until(EC.element_to_be_clickable((By.XPATH, xp)))
#             btn.click()
#             time.sleep(0.2)
#         except Exception:
#             pass

# def wait_cloudflare(drv, max_sec=20):
#     """If Cloudflare 'Checking your browser...' appears, wait it out."""
#     start = time.time()
#     while time.time() - start < max_sec:
#         html = drv.page_source.lower()
#         if "checking your browser" in html or "just a moment" in html:
#             time.sleep(1.0)
#         else:
#             return
#     # continue anyway; not fatal

# def text_or_empty(drv, by, sel, sec=10):
#     try:
#         el = wait_for(drv, by, sel, sec=sec)
#         return el.text.strip()
#     except TimeoutException:
#         return ""

# CURRENCY_RE = re.compile(r"([$‡∏ø‚Ç´‚Ç±‚Ç¨¬£‚Çπ]|AUD|SGD|MYR|THB|IDR|PHP|USD|EUR|GBP)", re.I)

# def extract_tickets_from_container(container_el):
#     """Collect ticket (name, price) by scanning anchor blocks inside tickets container."""
#     pairs = []
#     links = container_el.find_elements(By.XPATH, ".//a")
#     for a in links:
#         try:
#             txt = a.text.strip()
#             if not txt:
#                 continue
#             lines = [s.strip() for s in txt.splitlines() if s.strip()]
#             name_guess, price_guess = "", ""
#             for s in lines:
#                 if not price_guess and (CURRENCY_RE.search(s) or re.search(r"\d", s)):
#                     price_guess = s
#                 elif not name_guess:
#                     name_guess = s
#             if not name_guess and lines:
#                 name_guess = lines[0]
#             if not price_guess and len(lines) > 1:
#                 price_guess = lines[-1]
#             if name_guess or price_guess:
#                 pairs.append((name_guess, price_guess))
#         except StaleElementReferenceException:
#             continue
#     # De-dup (name, price)
#     seen, out = set(), []
#     for n, p in pairs:
#         k = (n, p)
#         if k not in seen:
#             seen.add(k)
#             out.append((n, p))
#     return out

# # ----------------- MAIN -----------------
# def main():
#     drv = driver_make()

#     # Load existing CSV so we append but don't remove previous data
#     existing = pd.DataFrame(columns=["Event Name", "Date", "Location", "Ticket Name", "Ticket Price", "Link"])
#     if os.path.exists(OUTPUT_CSV):
#         try:
#             existing = pd.read_csv(OUTPUT_CSV)
#         except Exception:
#             pass

#     rows = []

#     try:
#         drv.get(START_URL)
#         # Wait for a plausible app root: either Nuxt or Megatix root node or at least <main>
#         try:
#             wait_cloudflare(drv, max_sec=25)
#             wait_any(
#                 drv,
#                 [
#                     (By.CSS_SELECTOR, "#__nuxt"),
#                     (By.CSS_SELECTOR, "#megatix"),
#                     (By.TAG_NAME, "main"),
#                     (By.XPATH, "//*[@id='__nuxt']"),
#                 ],
#                 sec=WAIT,
#             )
#         except TimeoutException:
#             # As a last resort, continue; some builds render later
#             pass

#         dismiss_overlays(drv)
#         smooth_scroll(drv, px=700, steps=8, pause=0.6)

#         # Try to find the target list; if missing, try a fallback ‚ÄúAll Events‚Äù link then wait again
#         def get_list_element(retries=6):
#             for i in range(retries):
#                 try:
#                     lst = drv.find_element(By.XPATH, TARGET_LIST_XP)
#                     if lst.is_displayed():
#                         return lst
#                 except Exception:
#                     pass
#                 # try clicking an /events link to land on a full listing
#                 try:
#                     a = drv.find_element(By.CSS_SELECTOR, FALLBACK_ALL_EVENTS_LINK_CSS)
#                     a.click()
#                     wait_cloudflare(drv, max_sec=20)
#                     time.sleep(1.2)
#                 except Exception:
#                     pass
#                 smooth_scroll(drv, px=900, steps=2, pause=0.7)
#             return None

#         lst = get_list_element()
#         if not lst:
#             # Still not found: search any section that contains many <article> cards under main
#             try:
#                 lst = WebDriverWait(drv, 15).until(
#                     EC.presence_of_element_located(
#                         (By.XPATH, "//main//section[.//article]/section/div[2]/div | //main//section//div[.//article]")
#                     )
#                 )
#             except TimeoutException:
#                 raise TimeoutException("Could not locate the listing container; site layout likely different/region-gated.")

#         # Ensure as many cards as possible are loaded
#         prev_count = -1
#         for _ in range(12):
#             cards = lst.find_elements(By.XPATH, ".//article")
#             if len(cards) == prev_count:
#                 break
#             prev_count = len(cards)
#             try:
#                 drv.execute_script("arguments[0].scrollTop = arguments[0].scrollHeight;", lst)
#             except Exception:
#                 pass
#             drv.execute_script("window.scrollBy(0, 1000);")
#             time.sleep(0.8)

#         # Collect event hrefs (don‚Äôt click yet‚Äîavoid stale issues on back())
#         cards = lst.find_elements(By.XPATH, ".//article")
#         hrefs = []
#         for c in cards:
#             try:
#                 a = c.find_element(By.XPATH, ".//a[.//div or .//h3 or .//h2]")
#                 href = a.get_attribute("href")
#                 if href and href.startswith("http"):
#                     hrefs.append(href)
#             except NoSuchElementException:
#                 continue

#         # Visit each event page and extract info
#         for href in hrefs:
#             try:
#                 drv.get(href)
#                 wait_cloudflare(drv, max_sec=20)
#                 # Wait for event summary (some pages lazy-load)
#                 try:
#                     WebDriverWait(drv, 20).until(
#                         EC.presence_of_element_located((By.XPATH, EV_NAME_XP))
#                     )
#                 except TimeoutException:
#                     # try a small scroll to trigger hydration
#                     smooth_scroll(drv, px=600, steps=2, pause=0.5)
#                     wait_for(drv, By.XPATH, EV_NAME_XP, sec=15)

#                 ev_name  = text_or_empty(drv, By.XPATH, EV_NAME_XP, 12)
#                 ev_date  = text_or_empty(drv, By.XPATH, EV_DATE_XP, 12)
#                 ev_venue = text_or_empty(drv, By.XPATH, EV_VENUE_XP, 12)
#                 ev_link  = drv.current_url

#                 tickets = []
#                 try:
#                     tsec = wait_for(drv, By.XPATH, TICKETS_SECTION_XP, sec=12)
#                     tickets = extract_tickets_from_container(tsec)
#                 except TimeoutException:
#                     tickets = []

#                 if tickets:
#                     for tname, tprice in tickets:
#                         rows.append({
#                             "Event Name": ev_name,
#                             "Date": ev_date,
#                             "Location": ev_venue,
#                             "Ticket Name": tname,
#                             "Ticket Price": tprice,
#                             "Link": ev_link
#                         })
#                 else:
#                     rows.append({
#                         "Event Name": ev_name,
#                         "Date": ev_date,
#                         "Location": ev_venue,
#                         "Ticket Name": "",
#                         "Ticket Price": "",
#                         "Link": ev_link
#                     })

#             except TimeoutException:
#                 # Skip and continue
#                 continue
#             except ElementClickInterceptedException:
#                 continue

#         # Save (append w/ de-dup, don‚Äôt remove older rows)
#         new_df = pd.DataFrame(rows, columns=["Event Name", "Date", "Location", "Ticket Name", "Ticket Price", "Link"])
#         combined = pd.concat([existing, new_df], ignore_index=True)
#         combined["__k"] = (
#             combined["Link"].astype(str).fillna("")
#             + " | " + combined["Ticket Name"].astype(str).fillna("")
#             + " | " + combined["Ticket Price"].astype(str).fillna("")
#         )
#         combined = combined.drop_duplicates(subset="__k").drop(columns="__k")
#         combined.to_csv(OUTPUT_CSV, index=False, encoding="utf-8-sig")
#         print(f"Saved {len(combined)} total rows to {OUTPUT_CSV} "
#               f"(added {len(combined) - len(existing)} new rows).")

#     finally:
#         try:
#             drv.quit()
#         except Exception:
#             pass

# if __name__ == "__main__":
#     main()



In [None]:
# # code has issues
# import os
# import csv
# import time
# from datetime import datetime, timezone

# from selenium import webdriver
# from selenium.webdriver.common.by import By
# from selenium.webdriver.support.ui import WebDriverWait
# from selenium.webdriver.support import expected_conditions as EC
# from selenium.webdriver.chrome.options import Options


# URL = "https://wfac.org.au/whats-on/?type=event&pg=1"
# CSV_FILE = "all_australia.csv"
# EVENT_LOCATION = "walyalup arts centre"


# def build_driver(headless: bool = True):
#     chrome_options = Options()
#     if headless:
#         chrome_options.add_argument("--headless=new")
#     chrome_options.add_argument("--disable-gpu")
#     chrome_options.add_argument("--no-sandbox")
#     chrome_options.add_argument("--disable-dev-shm-usage")
#     chrome_options.add_argument("--window-size=1920,1080")

#     driver = webdriver.Chrome(options=chrome_options)
#     return driver


# def wait_for_page_and_lazy_load(driver):
#     """
#     Wait for the events list to appear, then scroll a bit to ensure
#     everything lazy-loads. Does NOT touch any popup.
#     """
#     wait = WebDriverWait(driver, 20)

#     # Wait until at least one event article is present
#     wait.until(
#         EC.presence_of_element_located(
#             (
#                 By.XPATH,
#                 '//*[@id="whats-on__ajax-wrapper"]/section/div[2]/div/article[1]'
#             )
#         )
#     )

#     # Simple scroll loop in case of lazy loading
#     last_height = driver.execute_script("return document.body.scrollHeight")
#     for _ in range(3):
#         driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
#         time.sleep(1.5)
#         new_height = driver.execute_script("return document.body.scrollHeight")
#         if new_height == last_height:
#             break
#         last_height = new_height


# def load_existing_event_names(csv_path):
#     """
#     Returns a set of event_name values that already exist in the CSV.
#     Used to avoid adding duplicates across runs.
#     """
#     existing_names = set()
#     if not os.path.exists(csv_path):
#         return existing_names

#     with open(csv_path, "r", encoding="utf-8", newline="") as f:
#         reader = csv.DictReader(f)
#         for row in reader:
#             name = (row.get("event_name") or "").strip()
#             if name:
#                 existing_names.add(name)
#     return existing_names


# def save_events_to_csv(csv_path, events):
#     """
#     Append new events to the CSV, ensuring:
#     - Header is written if file does not exist.
#     - Existing rows are kept.
#     """
#     file_exists = os.path.exists(csv_path)
#     fieldnames = ["event_name", "event_date", "event_location", "time_scraped_utc"]

#     with open(csv_path, "a", encoding="utf-8", newline="") as f:
#         writer = csv.DictWriter(f, fieldnames=fieldnames)
#         if not file_exists:
#             writer.writeheader()
#         for ev in events:
#             writer.writerow(ev)


# def scrape_events(driver):
#     driver.get(URL)
#     wait_for_page_and_lazy_load(driver)

#     # Select all event <article> elements in the list
#     articles = driver.find_elements(
#         By.XPATH,
#         '//*[@id="whats-on__ajax-wrapper"]/section/div[2]/div/article'
#     )

#     existing_names = load_existing_event_names(CSV_FILE)
#     seen_this_run = set()
#     new_events = []
#     current_utc = datetime.now(timezone.utc).isoformat()

#     for article in articles:
#         try:
#             # Event name: //*[@id="whats-on__ajax-wrapper"]/section/div[2]/div/article[1]/h3
#             name_elem = article.find_element(By.XPATH, "./h3")
#             event_name = name_elem.text.strip()
#         except Exception:
#             event_name = ""

#         try:
#             # Event date: //*[@id="whats-on__ajax-wrapper"]/section/div[2]/div/article[1]/div[1]/span
#             date_elem = article.find_element(By.XPATH, "./div[1]/span")
#             event_date = date_elem.text.strip()
#         except Exception:
#             event_date = ""

#         if not event_name:
#             continue

#         # De-dupe by event name (existing CSV + within this run)
#         if event_name in existing_names or event_name in seen_this_run:
#             continue

#         seen_this_run.add(event_name)

#         new_events.append(
#             {
#                 "event_name": event_name,
#                 "event_date": event_date,
#                 "event_location": EVENT_LOCATION,
#                 "time_scraped_utc": current_utc,
#             }
#         )

#     return new_events


# def main():
#     driver = build_driver(headless=True)
#     try:
#         new_events = scrape_events(driver)
#         if new_events:
#             save_events_to_csv(CSV_FILE, new_events)
#             print(f"Added {len(new_events)} new events to {CSV_FILE}.")
#         else:
#             print("No new events found. CSV unchanged.")
#     finally:
#         driver.quit()


# if __name__ == "__main__":
#     main()


KeyboardInterrupt: 

# Bali (done)
- as of 13/11/25

In [24]:
"""
Savaya Bali ‚Äî Events scraper (append-only, no duplicates)
Site: https://www.savaya.com/

What it does
- Opens the landing page
- Clicks the element at /html/body/main/section[1]/div/div[1]/a if present
- Slow-scrolls to trigger lazy loading of all event cards
- Scrapes Event Name + Event Date using your given XPaths as anchors
- Also attempts to capture the event link (anchor href) when available
- Appends only new rows to savaya_events.csv (never deletes existing rows)
- Adds per-row ScrapedAtUTC (UTC ISO8601)

Requirements:
    pip install selenium pandas
"""

import os
import time
from datetime import datetime, timezone

import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import (
    TimeoutException, NoSuchElementException, ElementClickInterceptedException,
    StaleElementReferenceException
)

URL = "https://www.savaya.com/"
OUT_CSV = "all_bali.csv"

# ---------------- Your provided XPaths ----------------
X_CLICK = "/html/body/main/section[1]/div/div[1]/a"  # click this if present (e.g., banner/CTA)
# Example event card fields for ONE card (we'll generalize from this):
X_NAME_SAMPLE = '//*[@id="w-node-_9de959fa-b5bd-ae81-b76c-b5857691ce18-62724cfd"]/div/a/div/div[2]/h3'
X_DATE_SAMPLE = '//*[@id="w-node-_9de959fa-b5bd-ae81-b76c-b5857691ce18-62724cfd"]/div/a/div/div[1]/div'

# ---------------- Helpers ----------------
def utc_now_iso():
    return datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

def norm(s: str) -> str:
    if not isinstance(s, str):
        return ""
    return " ".join(s.strip().lower().split())

def build_uid(name: str, date_str: str, link: str) -> str:
    return f"{norm(name)}|{norm(date_str)}|{norm(link)}"

def slow_scroll_to_bottom(driver, step=700, pause=0.35, settle=0.8, max_passes=5):
    """
    Gentle, repeated step-scrolling to trigger lazy loading.
    """
    for _ in range(max_passes):
        last_h = driver.execute_script("return document.body.scrollHeight;")
        y = 0
        while y < last_h:
            y += step
            driver.execute_script(f"window.scrollTo(0, {y});")
            time.sleep(pause)
        time.sleep(settle)
        new_h = driver.execute_script("return document.body.scrollHeight;")
        if new_h == last_h:
            break

def try_click(driver, xpath):
    try:
        el = WebDriverWait(driver, 6).until(EC.element_to_be_clickable((By.XPATH, xpath)))
        driver.execute_script("arguments[0].scrollIntoView({block:'center'});", el)
        time.sleep(0.3)
        el.click()
        time.sleep(0.8)
        return True
    except Exception:
        return False

def find_event_container(driver):
    """
    Strategy:
    1) Find your sample NAME element (X_NAME_SAMPLE).
    2) Walk up to a common container that holds ALL event cards.
    3) From that container, collect all anchor cards and read their name/date via RELATIVE XPaths.
    """
    try:
        sample_h3 = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, X_NAME_SAMPLE)))
    except TimeoutException:
        return None

    # Climb up a few levels to get the repeating list container (Webflow grids are usually a few divs up).
    # We‚Äôll try progressively broader ancestors until we see multiple <a> children with h3s.
    ancestors = [
        "./../../..",           # div/div[2]/h3 -> div/div[2] -> div/div -> div (card)
        "./../../../..",        # up one more
        "./../../../../..",     # up two more
        "./../../../../../..",  # up three more
        "./../../../../../../.." # up four more
    ]

    for anc in ancestors:
        try:
            container = sample_h3.find_element(By.XPATH, anc)
            # Heuristic: a container with multiple anchors that have an h3 child
            anchors = container.find_elements(By.XPATH, ".//a[.//h3]")
            if len(anchors) >= 2:
                return container
        except Exception:
            continue
    return None

def collect_cards(container):
    """
    From the container, get all anchor 'cards' that contain an h3.
    Return list of dicts with name, date, link.
    """
    rows = []
    anchors = container.find_elements(By.XPATH, ".//a[.//h3]")
    for a in anchors:
        try:
            # Name: relative to each anchor (mirroring your provided path endings)
            name_el = a.find_element(By.XPATH, ".//div/div[2]/h3")
        except NoSuchElementException:
            # Fallback: any h3 within the anchor
            try:
                name_el = a.find_element(By.XPATH, ".//h3")
            except NoSuchElementException:
                continue

        # Date block: relative to each anchor; primary and fallback
        date_text = ""
        try:
            date_el = a.find_element(By.XPATH, ".//div/div[1]/div")
            date_text = date_el.text.strip()
        except NoSuchElementException:
            # Fallbacks: sometimes the date might be in a sibling text node or alternative class
            try:
                # Any div/span near the top portion that looks like a date
                date_el = a.find_element(By.XPATH, ".//*[self::div or self::span][1]")
                date_text = date_el.text.strip()
            except NoSuchElementException:
                date_text = ""

        name = name_el.text.strip()
        link = a.get_attribute("href") or ""

        if name:  # only keep cards with a name
            rows.append({
                "Event Name": name,
                "Event Date": date_text,
                "Link": link
            })
    return rows

def append_only_new(df_new: pd.DataFrame, out_csv: str):
    """
    Append only rows whose uid is NOT already present in out_csv.
    Keep all existing rows; do not modify their ScrapedAtUTC.
    """
    if os.path.exists(out_csv):
        df_old = pd.read_csv(out_csv)
        if "uid" not in df_old.columns:
            # Backfill for legacy files (best-effort)
            df_old["uid"] = (
                df_old.get("Event Name", "").fillna("").apply(norm) + "|" +
                df_old.get("Event Date", "").fillna("").apply(norm) + "|" +
                df_old.get("Link", "").fillna("").apply(norm)
            )
    else:
        df_old = pd.DataFrame(columns=["Event Name","Event Date","Link","ScrapedAtUTC","uid"])

    old_uids = set(df_old.get("uid", pd.Series(dtype=str)).astype(str))
    df_new = df_new.copy()
    df_new["uid"] = df_new["uid"].astype(str)

    df_to_add = df_new[~df_new["uid"].isin(old_uids)]
    if df_to_add.empty:
        print("No new events to append. Existing file left unchanged.")
        return

    df_out = pd.concat([df_old, df_to_add], ignore_index=True)
    cols = ["Event Name","Event Date","Link","ScrapedAtUTC","uid"]
    df_out = df_out[[c for c in cols if c in df_out.columns] + [c for c in df_out.columns if c not in cols]]
    df_out.to_csv(out_csv, index=False, encoding="utf-8-sig")
    print(f"Appended {len(df_to_add)} new rows. Total now: {len(df_out)}")

def main(headless=False):
    # --- Browser setup ---
    opts = webdriver.ChromeOptions()
    if headless:
        opts.add_argument("--headless=new")
    opts.add_argument("--start-maximized")
    opts.add_argument("--disable-blink-features=AutomationControlled")
    opts.add_argument("--disable-gpu")
    opts.add_argument("--window-size=1400,1000")
    opts.add_argument("--no-sandbox")
    opts.add_argument("--disable-dev-shm-usage")
    opts.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome Safari")

    driver = webdriver.Chrome(options=opts)
    wait = WebDriverWait(driver, 15)

    try:
        driver.get(URL)

        # Optional click (your provided XPath). If it isn't there, we just carry on.
        try_click(driver, X_CLICK)

        # Initial slow scroll to let cards render
        slow_scroll_to_bottom(driver, step=700, pause=0.35, settle=0.8, max_passes=4)

        # Find the repeating container from your sample XPaths
        container = find_event_container(driver)
        if not container:
            # One more scroll attempt (some sites load later)
            slow_scroll_to_bottom(driver, step=900, pause=0.4, settle=1.0, max_passes=3)
            container = find_event_container(driver)

        if not container:
            print("Could not locate the events container. Site structure may have changed.")
            return

        # Final scroll (safety) and collect
        slow_scroll_to_bottom(driver, step=900, pause=0.35, settle=0.8, max_passes=2)
        rows = collect_cards(container)

        if not rows:
            print("No events found. Check XPaths or if content is gated by a modal.")
            return

        # Build DataFrame with per-row timestamp & stable uid
        now_iso = utc_now_iso()
        for r in rows:
            r["ScrapedAtUTC"] = now_iso
            r["uid"] = build_uid(r.get("Event Name",""), r.get("Event Date",""), r.get("Link",""))

        df_new = pd.DataFrame(rows, columns=["Event Name","Event Date","Link","ScrapedAtUTC","uid"])

        # Append-only write
        append_only_new(df_new, OUT_CSV)

    finally:
        driver.quit()

if __name__ == "__main__":
    main(headless=False)


Appended 13 new rows. Total now: 151


In [25]:
# # ra_bali_list_scraper.py
# # Scrape RA Bali listing page (no detail clicks) with the user's XPaths.
# # - Prevents lazy-loading misses (incremental scroll until stable)
# # - Appends to CSV without removing previous rows
# # - De-dupes by event link (fallback: Title+Date)
# # - Includes TimeScraped

# import os
# import time
# import pandas as pd
# from datetime import datetime
# from typing import List, Dict, Set

# from selenium import webdriver
# from selenium.webdriver.common.by import By
# from selenium.webdriver.chrome.service import Service
# from selenium.webdriver.chrome.options import Options
# from selenium.webdriver.support.ui import WebDriverWait
# from selenium.webdriver.support import expected_conditions as EC
# from webdriver_manager.chrome import ChromeDriverManager

# # ---------------- CONFIG ----------------
# BASE_URL = "https://ra.co/events/id/bali"
# OUTPUT_CSV = "all_bali_list.csv"
# CITY = "Bali"

# # Section UL (your XPath)
# UL_XP = "//*[@id='__next']/div[3]/div[3]/div[2]/section/div/div/div[2]/div[2]/div/div/ul"
# LI_REL_XP = "./li"

# # Per-card fields (generalized from your li[1] examples)
# # event name (you gave the li[1] path; we adapt to "this li")
# NAME_REL_XP = ".//div/h3"
# # event venue ‚Äî you provided the same XPath as name, so we‚Äôll take the same text.
# VENUE_REL_XP = ".//div/h3"       # if RA shows venue elsewhere, update this
# # event date
# DATE_REL_XP = ".//div/div[2]/span"
# # link (often on <a> inside h3)
# LINK_REL_CSS = "h3 a, a[href*='/events/']"

# # ---------------- DRIVER ----------------
# def make_driver(headless: bool = False) -> webdriver.Chrome:
#     opts = Options()
#     if headless:
#         opts.add_argument("--headless=new")
#     opts.add_argument("--window-size=1400,1000")
#     opts.add_argument("--disable-gpu")
#     opts.add_argument("--no-sandbox")
#     opts.add_argument("--disable-dev-shm-usage")
#     # slightly less detectable
#     opts.add_experimental_option("excludeSwitches", ["enable-automation"])
#     opts.add_experimental_option("useAutomationExtension", False)

#     service = Service(ChromeDriverManager().install())
#     drv = webdriver.Chrome(service=service, options=opts)
#     return drv

# # ---------------- HELPERS ----------------
# def now_stamp() -> str:
#     return datetime.now().strftime("%Y-%m-%d %H:%M:%S")

# def slow_load_list(driver: webdriver.Chrome, list_xpath: str, item_rel_xpath: str = "./li",
#                    pause: float = 1.0, max_passes: int = 50) -> None:
#     """
#     Incremental scrolling until the number of items stabilizes twice or max_passes reached.
#     Prevents missing items due to lazy loading.
#     """
#     wait = WebDriverWait(driver, 25)
#     wait.until(EC.presence_of_element_located((By.XPATH, list_xpath)))

#     last = 0
#     stable = 0
#     for _ in range(max_passes):
#         driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
#         time.sleep(pause)
#         ul = wait.until(EC.presence_of_element_located((By.XPATH, list_xpath)))
#         count = len(ul.find_elements(By.XPATH, item_rel_xpath))
#         if count == last:
#             stable += 1
#         else:
#             stable = 0
#         last = count
#         if stable >= 2:
#             break

#     driver.execute_script("window.scrollTo(0, 0);")
#     time.sleep(0.5)

# def get_text_or_blank(el, by: By, sel: str) -> str:
#     try:
#         t = el.find_element(by, sel).text.strip()
#         return t
#     except Exception:
#         return ""

# def get_link(el) -> str:
#     try:
#         a = el.find_element(By.CSS_SELECTOR, LINK_REL_CSS)
#         href = a.get_attribute("href")
#         return (href or "").strip()
#     except Exception:
#         return ""

# def append_dedup_csv(new_rows: List[Dict], csv_path: str,
#                      key_cols=("Link", "Title", "Date")) -> int:
#     new_df = pd.DataFrame(new_rows)

#     if os.path.exists(csv_path):
#         old = pd.read_csv(csv_path)
#         # Align columns across old and new
#         all_cols = list(dict.fromkeys(list(old.columns) + list(new_df.columns)))
#         old = old.reindex(columns=all_cols)
#         new_df = new_df.reindex(columns=all_cols)
#         out = pd.concat([old, new_df], ignore_index=True)
#     else:
#         out = new_df

#     # Ensure TimeScraped is str (prevents NaN)
#     if "TimeScraped" in out.columns:
#         out["TimeScraped"] = out["TimeScraped"].astype(str)

#     # Prefer Link as unique key; fallback to Title+Date if link missing
#     # Build a helper key column
#     def build_key(row):
#         link = str(row.get("Link") or "").strip()
#         if link:
#             return f"URL::{link}"
#         title = str(row.get("Title") or "").strip()
#         date = str(row.get("Date") or "").strip()
#         return f"TD::{title}|{date}"

#     out["_dedup_key"] = out.apply(build_key, axis=1)
#     out = out.drop_duplicates(subset=["_dedup_key"], keep="first").drop(columns=["_dedup_key"])
#     out.to_csv(csv_path, index=False)
#     return len(out)

# # ---------------- MAIN ----------------
# def main():
#     driver = make_driver(headless=False)  # set True on server/CI
#     try:
#         driver.get(BASE_URL)

#         # Ensure the list fully loads
#         slow_load_list(driver, UL_XP, LI_REL_XP, pause=1.0, max_passes=60)

#         wait = WebDriverWait(driver, 25)
#         ul = wait.until(EC.presence_of_element_located((By.XPATH, UL_XP)))
#         cards = ul.find_elements(By.XPATH, LI_REL_XP)

#         rows: List[Dict] = []

#         for i in range(len(cards)):
#             # re-fetch after DOM changes (robustness)
#             ul = wait.until(EC.presence_of_element_located((By.XPATH, UL_XP)))
#             cards = ul.find_elements(By.XPATH, LI_REL_XP)
#             if i >= len(cards):
#                 break

#             li = cards[i]

#             title = get_text_or_blank(li, By.XPATH, NAME_REL_XP)
#             venue = get_text_or_blank(li, By.XPATH, VENUE_REL_XP)  # as per your spec (same as name)
#             date_txt = get_text_or_blank(li, By.XPATH, DATE_REL_XP)
#             link = get_link(li)

#             if not any([title, link, date_txt]):
#                 continue

#             rows.append({
#                 "City": CITY,
#                 "Title": title,
#                 "Venue": venue,
#                 "Date": date_txt,
#                 "Link": link,
#                 "TimeScraped": now_stamp()
#             })

#             # tiny pause ‚Äî friendlier + reduces anti-bot triggers
#             time.sleep(0.2)

#         total = append_dedup_csv(rows, OUTPUT_CSV, key_cols=("Link", "Title", "Date"))
#         print(f"Scraped {len(rows)} rows this run. Total unique saved: {total}")
#         print(f"CSV saved: {OUTPUT_CSV}")

#     finally:
#         driver.quit()

# if __name__ == "__main__":
#     main()





# Manila (done)
- as of 13/11/25

In [26]:
# edit code to show how it scrapes and ensure it scrapes all not just some
import time
import csv
import sys
from pathlib import Path
from datetime import datetime
from dateutil import tz

import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import (
    TimeoutException, NoSuchElementException, StaleElementReferenceException, ElementClickInterceptedException
)

SMTICKETS_URL = "https://smtickets.com/"
OUT_CSV = "all_manila.csv"

# Strictly the two sections you asked for:
SECTION_1_XPATH = "/html/body/div[30]/div/div[2]"
SECTION_1_NEXT_BTN_XPATH = "/html/body/div[30]/div/div[2]/div/div[2]/button[2]"
SECTION_2_XPATH = "/html/body/div[31]/div/div[2]"

# Within each section, the list often sits under a UL with id 'events_moa_slide'
# (Your sample XPaths target li[21] under #events_moa_slide).
# We'll try that first; if missing, we fallback to ‚Äúall li descendants that look like event cards‚Äù.
PREFERRED_LIST_XPATH = ".//*[@id='events_moa_slide']/li"

# Relative-to-li fallback selectors
REL_NAME = ".//a/h1"
REL_VENUE = ".//p"
REL_DATE  = ".//div"
REL_LINK  = ".//a"


def sgt_now_iso():
    return datetime.now(tz.gettz("Asia/Singapore")).isoformat(timespec="seconds")


def init_driver(headless=True):
    opts = Options()
    if headless:
        opts.add_argument("--headless=new")
    opts.add_argument("--disable-gpu")
    opts.add_argument("--no-sandbox")
    opts.add_argument("--disable-dev-shm-usage")
    opts.add_argument("--window-size=1400,900")
    driver = webdriver.Chrome(options=opts)
    driver.implicitly_wait(2)
    return driver


def slow_page_scroll(driver, steps=6, pause=0.5):
    """Slow vertical scroll to encourage lazy-loaded assets to render."""
    for i in range(steps):
        driver.execute_script("window.scrollBy(0, Math.floor(window.innerHeight*0.6));")
        time.sleep(pause)
    # small bounce up to trigger observers
    driver.execute_script("window.scrollBy(0, -200);")
    time.sleep(0.3)


def slow_horizontal_scroll(driver, container, max_cycles=12, pause=0.6):
    """Slow horizontal scroll for carousels (if any use scrollLeft)."""
    for _ in range(max_cycles):
        try:
            driver.execute_script("arguments[0].scrollLeft += Math.floor(arguments[0].clientWidth*0.8);", container)
            time.sleep(pause)
        except Exception:
            break


def click_next_button_safely(driver, xpath, max_clicks=25, pause=0.9):
    """Click the 'next' button up to max_clicks times, stopping when it disappears or no progress is seen."""
    last_count = -1
    stable_hits = 0
    for i in range(max_clicks):
        # If the button isn't present or clickable, we stop.
        try:
            btn = WebDriverWait(driver, 3).until(EC.element_to_be_clickable((By.XPATH, xpath)))
        except TimeoutException:
            break

        try:
            btn.click()
        except (ElementClickInterceptedException, StaleElementReferenceException):
            # try a tiny scroll-nudge and retry once
            driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", btn)
            time.sleep(0.3)
            try:
                btn.click()
            except Exception:
                break

        # let new slides load
        time.sleep(pause)

        # crude progress heuristic: count <li> in the nearest list if available
        try:
            parent = driver.find_element(By.XPATH, xpath + "/..")
            lis = parent.find_elements(By.XPATH, ".//li")
            curr_count = len(lis)
            if curr_count == last_count:
                stable_hits += 1
            else:
                stable_hits = 0
                last_count = curr_count
            if stable_hits >= 3:  # clicked but no more new items arriving
                break
        except Exception:
            # if we can‚Äôt estimate progress, continue a few clicks then bail
            continue


def find_event_items_in_section(section_el):
    """Return list of <li> elements that look like event cards inside a section."""
    items = section_el.find_elements(By.XPATH, PREFERRED_LIST_XPATH)
    if items:
        return items
    # Fallback: any li under the section that has an <a> and an <h1>
    guess = section_el.find_elements(By.XPATH, ".//li[.//a and .//h1]")
    return guess


def text_or_blank(el):
    try:
        return el.text.strip()
    except Exception:
        return ""


def href_or_blank(el):
    try:
        return el.get_attribute("href") or ""
    except Exception:
        return ""


def extract_event_from_li(li):
    """Extract name, venue, date, link from one <li> card, using your provided relative structure."""
    # Name
    try:
        name_el = li.find_element(By.XPATH, REL_NAME)
    except NoSuchElementException:
        name_el = None

    # Venue (first <p> under the li‚Äîper your XPath sample)
    try:
        venue_el = li.find_element(By.XPATH, REL_VENUE)
    except NoSuchElementException:
        venue_el = None

    # Date (first <div> under the li‚Äîper your XPath sample)
    try:
        date_el = li.find_element(By.XPATH, REL_DATE)
    except NoSuchElementException:
        date_el = None

    # Link (anchor wrapping the name)
    try:
        link_el = li.find_element(By.XPATH, REL_LINK)
    except NoSuchElementException:
        link_el = None

    name = text_or_blank(name_el)
    venue = text_or_blank(venue_el)
    date_text = text_or_blank(date_el)
    link = href_or_blank(link_el)

    return name, venue, date_text, link


def load_existing_df(csv_path: Path) -> pd.DataFrame:
    if csv_path.exists():
        try:
            return pd.read_csv(csv_path)
        except Exception:
            # fallback: try latin-1 weirdness
            return pd.read_csv(csv_path, encoding="latin-1")
    return pd.DataFrame(columns=["section", "name", "venue", "date_text", "link", "scraped_at_sgt", "event_id"])


def build_event_id(section, name, venue, date_text, link):
    key = "|".join([section, name, venue, date_text, link]).strip().lower()
    return key


def scrape_section(driver, section_xpath, next_btn_xpath=None, label="section"):
    """Scrape one section: slow scroll, (optional) click next repeatedly, horizontal scroll inside, then extract items."""
    results = []

    # Wait for the section to render
    section = WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.XPATH, section_xpath)))

    # Bring into view and slow scroll the page a bit
    driver.execute_script("arguments[0].scrollIntoView({block: 'start'});", section)
    time.sleep(0.6)
    slow_page_scroll(driver, steps=5, pause=0.5)

    # Try a gentle horizontal scroll on the section container (some carousels need this)
    try:
        slow_horizontal_scroll(driver, section, max_cycles=8, pause=0.5)
    except Exception:
        pass

    # Click the "next" button multiple times to reveal more slides (if provided)
    if next_btn_xpath:
        click_next_button_safely(driver, next_btn_xpath, max_clicks=30, pause=0.9)

    # After navigation, pause and scroll again to let images/text fully render
    time.sleep(0.8)
    slow_page_scroll(driver, steps=3, pause=0.4)

    # Collect list items
    lis = find_event_items_in_section(section)
    for li in lis:
        try:
            name, venue, date_text, link = extract_event_from_li(li)
        except StaleElementReferenceException:
            # retry once
            try:
                name, venue, date_text, link = extract_event_from_li(li)
            except Exception:
                continue

        if not name:
            continue

        results.append({
            "section": label,
            "name": name,
            "venue": venue,
            "date_text": date_text,
            "link": link,
            "scraped_at_sgt": sgt_now_iso()
        })
    return results


def main():
    headless = True
    if len(sys.argv) > 1 and sys.argv[1].lower() in {"--headed", "--show"}:
        headless = False

    driver = init_driver(headless=headless)
    csv_path = Path(OUT_CSV)

    try:
        driver.get(SMTICKETS_URL)

        # Initial gentle load mitigation
        WebDriverWait(driver, 25).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
        time.sleep(1.2)
        slow_page_scroll(driver, steps=6, pause=0.5)

        # Scrape Section 1 (with right/next button)
        sec1_results = scrape_section(
            driver,
            section_xpath=SECTION_1_XPATH,
            next_btn_xpath=SECTION_1_NEXT_BTN_XPATH,
            label="div[30]/div/div[2]"
        )

        # Scrape Section 2 (no explicit next button provided‚Äîjust scroll it)
        sec2_results = scrape_section(
            driver,
            section_xpath=SECTION_2_XPATH,
            next_btn_xpath=None,
            label="div[31]/div/div[2]"
        )

        all_results = sec1_results + sec2_results

        # Build event IDs for dedup
        for r in all_results:
            r["event_id"] = build_event_id(r["section"], r["name"], r["venue"], r["date_text"], r["link"])

        # Load existing and append only new event_ids
        existing = load_existing_df(csv_path)
        existing_ids = set(existing["event_id"].astype(str)) if not existing.empty else set()

        new_rows = [r for r in all_results if r["event_id"] not in existing_ids]

        if new_rows:
            df_new = pd.DataFrame(new_rows)
            df_out = pd.concat([existing, df_new], ignore_index=True)
        else:
            df_out = existing  # nothing new this run

        # Persist without removing older rows
        df_out.to_csv(csv_path, index=False, quoting=csv.QUOTE_MINIMAL)
        print(f"Done. Wrote {len(df_out)} total rows to {csv_path}. Added {len(new_rows)} new rows this run.")

    finally:
        driver.quit()


if __name__ == "__main__":
    main()


Done. Wrote 68 total rows to all_manila.csv. Added 2 new rows this run.


In [27]:
import os
import csv
import time
from datetime import datetime, timezone

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException


URL = "https://comedymanila.helixpay.ph/"
CSV_FILE = "all_manila.csv"


def build_driver(headless: bool = False):
    """
    Build a Chrome WebDriver.
    headless=False so the browser window pops up and you can see it.
    """
    chrome_options = Options()

    if headless:
        chrome_options.add_argument("--headless=new")

    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--window-size=1400,1000")

    driver = webdriver.Chrome(options=chrome_options)
    return driver


def wait_for_page_and_lazy_load(driver):
    """
    Wait for the events section to appear, then scroll down to trigger
    lazy loading of all events. Does not close any pop-ups.
    """
    wait = WebDriverWait(driver, 25)

    try:
        # Wait for the groupSection container that holds the events
        wait.until(
            EC.presence_of_element_located(
                (By.ID, "groupSection")
            )
        )
    except TimeoutException:
        print("Warning: #groupSection did not load in time.")
        return

    # Lazy-load scroll loop ‚Äì scroll down multiple times
    last_height = 0
    for _ in range(6):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(1.5)
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height


def load_existing_event_names(csv_path: str):
    """
    Load existing event_name values from CSV to avoid duplicates on re-runs.
    """
    existing_names = set()
    if not os.path.exists(csv_path):
        return existing_names

    with open(csv_path, "r", encoding="utf-8", newline="") as f:
        reader = csv.DictReader(f)
        for row in reader:
            name = (row.get("event_name") or "").strip()
            if name:
                existing_names.add(name)
    return existing_names


def save_events_to_csv(csv_path: str, events: list[dict]):
    """
    Append new events to the CSV. Do not remove or overwrite existing rows.
    """
    file_exists = os.path.exists(csv_path)
    fieldnames = ["event_name", "event_price", "time_scraped_utc"]

    with open(csv_path, "a", encoding="utf-8", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        if not file_exists:
            writer.writeheader()
        for ev in events:
            writer.writerow(ev)


def scrape_events(driver):
    driver.get(URL)
    wait_for_page_and_lazy_load(driver)

    # Find ALL event name elements under groupSection.
    # Your sample XPaths:
    #   event name:
    #     //*[@id="groupSection"]/div[1]/div[1]/a/section[2]/div[1]/div
    #   event price:
    #     //*[@id="groupSection"]/div[1]/div[1]/a/section[2]/div[2]/div/span
    #
    # Generalized: under #groupSection/div[1] find all a/section[2]/div[1]/div (names)
    name_elements = driver.find_elements(
        By.XPATH,
        '//*[@id="groupSection"]/div[1]//a/section[2]/div[1]/div'
    )

    existing_names = load_existing_event_names(CSV_FILE)
    seen_this_run = set()
    new_events = []
    current_utc = datetime.now(timezone.utc).isoformat()

    for name_elem in name_elements:
        try:
            event_name = name_elem.text.strip()
        except Exception:
            event_name = ""

        if not event_name:
            continue

        # Skip if event_name already exists in CSV or in this run
        if event_name in existing_names or event_name in seen_this_run:
            continue

        # Move from name element to its container to get the price.
        # name is at: a/section[2]/div[1]/div
        # section[2] is the parent of div[1], so price is at:
        #   section[2]/div[2]/div/span  (relative to section[2])
        try:
            section_two = name_elem.find_element(By.XPATH, './../..')  # section[2]
        except Exception:
            section_two = None

        event_price = ""
        if section_two is not None:
            try:
                price_elem = section_two.find_element(By.XPATH, './div[2]/div/span')
                event_price = price_elem.text.strip()
            except Exception:
                event_price = ""

        seen_this_run.add(event_name)

        new_events.append(
            {
                "event_name": event_name,
                "event_price": event_price,
                "time_scraped_utc": current_utc,
            }
        )

    return new_events


def main():
    # headless=False ‚Üí website pops up in a visible window
    driver = build_driver(headless=False)
    try:
        new_events = scrape_events(driver)
        if new_events:
            save_events_to_csv(CSV_FILE, new_events)
            print(f"Added {len(new_events)} new events to {CSV_FILE}.")
        else:
            print("No new events found. CSV unchanged.")

        # Optional: keep the browser open a bit so you can see it
        time.sleep(5)

    finally:
        driver.quit()


if __name__ == "__main__":
    main()


Added 7 new events to all_manila.csv.


In [28]:
import os
import csv
import time
from datetime import datetime, timezone

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException


URL = "https://smtickets.com/events/listing/arena"
CSV_FILE = "all_manila.csv"


def build_driver(headless: bool = False):
    """
    Build a Chrome WebDriver.
    headless=False so the browser window pops up and you can see it.
    """
    chrome_options = Options()

    if headless:
        chrome_options.add_argument("--headless=new")

    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--window-size=1400,1000")

    driver = webdriver.Chrome(options=chrome_options)
    return driver


def wait_for_page_and_lazy_load(driver):
    """
    Wait for the first example event block to appear, then scroll down
    repeatedly to lazy-load more events.
    Uses your sample XPath under id="December".
    """
    wait = WebDriverWait(driver, 30)

    try:
        # Wait for at least one event block to appear
        wait.until(
            EC.presence_of_element_located(
                (By.XPATH, '//*[@id="December"]/div[1]')
            )
        )
    except TimeoutException:
        print("Warning: December block did not load in time.")
        return

    # Scroll several times to trigger lazy loading of additional events
    last_height = 0
    for _ in range(10):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(1.5)
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height


def load_existing_name_and_url_sets(csv_path):
    """
    Load existing event_name and event_url sets from CSV so we can
    avoid duplicates if EITHER name OR URL matches.
    """
    existing_names = set()
    existing_urls = set()

    if not os.path.exists(csv_path):
        return existing_names, existing_urls

    with open(csv_path, "r", encoding="utf-8", newline="") as f:
        reader = csv.DictReader(f)
        for row in reader:
            name = (row.get("event_name") or "").strip()
            url = (row.get("event_url") or "").strip()
            if name:
                existing_names.add(name)
            if url:
                existing_urls.add(url)

    return existing_names, existing_urls


def save_events_to_csv(csv_path, events):
    """
    Append new events to the CSV, preserving existing rows.
    """
    file_exists = os.path.exists(csv_path)
    fieldnames = ["event_name", "event_date", "event_venue",
                  "event_url", "time_scraped_utc"]

    with open(csv_path, "a", encoding="utf-8", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        if not file_exists:
            writer.writeheader()
        for ev in events:
            writer.writerow(ev)


def scrape_events(driver):
    driver.get(URL)
    wait_for_page_and_lazy_load(driver)

    # Your example XPaths:
    #   event name:  //*[@id="December"]/div[1]
    #   event date:  //*[@id="December"]/div[2]/span
    #   event venue: //*[@id="December"]/div[3]
    #
    # Treat each element with id="December" as an event block.
    event_blocks = driver.find_elements(By.ID, "December")

    existing_names, existing_urls = load_existing_name_and_url_sets(CSV_FILE)

    # For this run, also keep track of names & urls to avoid duplicates
    seen_names = set()
    seen_urls = set()

    new_events = []
    current_utc = datetime.now(timezone.utc).isoformat()

    for block in event_blocks:
        # ---- Event Name ----
        try:
            name_elem = block.find_element(By.XPATH, "./div[1]")
            event_name = name_elem.text.strip()
        except Exception:
            event_name = ""

        if not event_name:
            continue

        # ---- Event Date ----
        try:
            date_elem = block.find_element(By.XPATH, "./div[2]/span")
            event_date = date_elem.text.strip()
        except Exception:
            event_date = ""

        # ---- Event Venue ----
        try:
            venue_elem = block.find_element(By.XPATH, "./div[3]")
            event_venue = venue_elem.text.strip()
        except Exception:
            event_venue = ""

        # ---- Event URL (from link, if present) ----
        event_url = ""
        try:
            link_elem = block.find_element(By.XPATH, ".//a")
            href = link_elem.get_attribute("href")
            if href:
                event_url = href.strip()
        except Exception:
            event_url = ""

        # ---------- DEDUP LOGIC (NAME OR URL) ----------
        # Skip this event if:
        #   - its name already exists in CSV OR
        #   - its URL (non-empty) already exists in CSV OR
        #   - its name already seen in this run OR
        #   - its URL (non-empty) already seen in this run
        name_exists = event_name in existing_names or event_name in seen_names
        url_exists = bool(event_url) and (event_url in existing_urls or event_url in seen_urls)

        if name_exists or url_exists:
            continue

        # Mark as seen for this run
        seen_names.add(event_name)
        if event_url:
            seen_urls.add(event_url)

        new_events.append(
            {
                "event_name": event_name,
                "event_date": event_date,
                "event_venue": event_venue,
                "event_url": event_url,
                "time_scraped_utc": current_utc,
            }
        )

    return new_events


def main():
    # headless=False ‚Üí website pops up in a visible window
    driver = build_driver(headless=False)
    try:
        new_events = scrape_events(driver)
        if new_events:
            save_events_to_csv(CSV_FILE, new_events)
            print(f"Added {len(new_events)} new events to {CSV_FILE}.")
        else:
            print("No new events found. CSV unchanged.")

        # Optional: keep browser open briefly so you can see the page
        time.sleep(5)

    finally:
        driver.quit()


if __name__ == "__main__":
    main()



Added 1 new events to all_manila.csv.


# Taipei

In [29]:
# tixcraft_scrape.py
# pip install selenium webdriver-manager

import csv
import os
import time
from datetime import datetime, timezone

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

URL = "https://tixcraft.com/activity"
OUTPUT_CSV = "all_taipei.csv"

# XPaths you provided (generalized to select ALL items, not just the first one)
# name: //*[@id="all"]/div[2]/div[1]/div/a/div[2]/div[2]
# date: //*[@id="all"]/div[2]/div[1]/div/a/div[2]/div[1]
# We target all cards by finding all occurrences of the "name" div, then walk to siblings/ancestors.
NAME_XPATH_ALL = '//*[@id="all"]/div[2]//a/div[2]/div[2]'

def start_driver(headless=True):
    opts = webdriver.ChromeOptions()
    if headless:
        opts.add_argument("--headless=new")
    opts.add_argument("--disable-gpu")
    opts.add_argument("--no-sandbox")
    opts.add_argument("--window-size=1440,900")
    opts.add_argument("--disable-blink-features=AutomationControlled")
    opts.add_argument("--disable-dev-shm-usage")
    opts.add_argument("--user-agent=Mozilla/5.0")
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=opts)
    driver.set_page_load_timeout(60)
    driver.implicitly_wait(2)
    return driver

def slow_scroll_to_bottom(driver, min_pause=1.2, max_idle_rounds=3):
    """
    Smoothly scrolls until no new items load for max_idle_rounds rounds.
    """
    last_count = 0
    idle_rounds = 0
    while True:
        driver.execute_script("window.scrollBy(0, document.body.scrollHeight * 0.6);")
        time.sleep(min_pause)
        # Try a "nudge" to trigger lazy loading
        driver.execute_script("window.scrollBy(0, 300);")
        time.sleep(0.5)

        # Count currently visible name nodes
        try:
            names_now = driver.find_elements(By.XPATH, NAME_XPATH_ALL)
            count_now = len(names_now)
        except Exception:
            count_now = 0

        if count_now > last_count:
            last_count = count_now
            idle_rounds = 0
        else:
            idle_rounds += 1
            if idle_rounds >= max_idle_rounds:
                break

    # Ensure at true bottom
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(1)

def load_existing(filepath):
    rows = []
    if os.path.exists(filepath):
        with open(filepath, "r", newline="", encoding="utf-8") as f:
            reader = csv.DictReader(f)
            for r in reader:
                rows.append(r)
    return rows

def save_merged(filepath, rows):
    fieldnames = ["event_name", "event_date", "event_link", "time_scraped"]
    exists = os.path.exists(filepath)
    if not exists:
        with open(filepath, "w", newline="", encoding="utf-8") as f:
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(rows)
    else:
        # Merge with existing, dedupe by (event_name, event_date, event_link)
        existing = load_existing(filepath)
        seen = set(
            (e["event_name"], e["event_date"], e.get("event_link", ""))
            for e in existing
        )
        new_unique = []
        for r in rows:
            key = (r["event_name"], r["event_date"], r.get("event_link", ""))
            if key not in seen:
                seen.add(key)
                new_unique.append(r)

        if not new_unique:
            return  # nothing to add

        with open(filepath, "a", newline="", encoding="utf-8") as f:
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            writer.writerows(new_unique)

def scrape():
    driver = start_driver(headless=True)
    try:
        driver.get(URL)

        # Wait until the main container is present
        WebDriverWait(driver, 20).until(
            EC.presence_of_element_located((By.ID, "all"))
        )

        # Scroll (handles lazy loading)
        slow_scroll_to_bottom(driver)

        # Collect all name nodes
        name_nodes = driver.find_elements(By.XPATH, NAME_XPATH_ALL)
        ts = datetime.now(timezone.utc).isoformat()

        rows = []
        for name_el in name_nodes:
            try:
                event_name = name_el.text.strip()
            except Exception:
                event_name = ""

            # date is sibling: ../div[1]
            try:
                date_el = name_el.find_element(By.XPATH, "../div[1]")
                event_date = date_el.text.strip()
            except Exception:
                event_date = ""

            # link is the ancestor <a>: name is a/div[2]/div[2] -> go up 3 levels to reach <a>
            try:
                anchor = name_el.find_element(By.XPATH, "../../..")
                event_link = anchor.get_attribute("href") or ""
            except Exception:
                event_link = ""

            # Only add if at least a name or date is present
            if event_name or event_date:
                rows.append(
                    {
                        "event_name": event_name,
                        "event_date": event_date,
                        "event_link": event_link,
                        "time_scraped": ts,
                    }
                )

        # Persist without removing older rows; dedupe across runs
        save_merged(OUTPUT_CSV, rows)

        print(f"Scraped {len(rows)} rows this run. Saved/merged into {OUTPUT_CSV}.")

    finally:
        driver.quit()

if __name__ == "__main__":
    scrape()




Scraped 45 rows this run. Saved/merged into all_taipei.csv.


In [30]:
import os
import csv
import time
from datetime import datetime, timezone

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException


URL = "https://trendy.taipei/en/Event?type=0"
CSV_FILE = "all_taipei.csv"


def build_driver(headless: bool = False):
    """
    Build a Chrome WebDriver.
    headless=False so the browser window pops up (visible).
    """
    chrome_options = Options()

    if headless:
        chrome_options.add_argument("--headless=new")

    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--window-size=1400,1000")

    driver = webdriver.Chrome(options=chrome_options)
    return driver


def wait_for_page_and_lazy_load(driver):
    """
    Wait for the first event card to appear, then scroll down slowly
    multiple times to lazy-load more events.
    """
    wait = WebDriverWait(driver, 30)

    try:
        # Wait until the first event name element is present
        # Example provided:
        # //*[@id="content"]/section/div/div/div/a[1]/div[2]/div[2]
        wait.until(
            EC.presence_of_element_located(
                (
                    By.XPATH,
                    '//*[@id="content"]/section/div/div/div/a[1]/div[2]/div[2]'
                )
            )
        )
    except TimeoutException:
        print("Warning: First event name element did not load in time.")
        return

    # Scroll DOWN slowly to trigger lazy loading for more events
    last_height = 0
    for _ in range(12):  # more iterations = deeper scrolling
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2.0)  # slow scrolling as requested
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height


def load_existing_name_and_url_sets(csv_path: str):
    """
    Load existing event_name and event_url sets from CSV.
    Used for dedupe: if name OR url already exists, skip.
    """
    existing_names = set()
    existing_urls = set()

    if not os.path.exists(csv_path):
        return existing_names, existing_urls

    with open(csv_path, "r", encoding="utf-8", newline="") as f:
        reader = csv.DictReader(f)
        for row in reader:
            name = (row.get("event_name") or "").strip()
            url = (row.get("event_url") or "").strip()
            if name:
                existing_names.add(name)
            if url:
                existing_urls.add(url)

    return existing_names, existing_urls


def save_events_to_csv(csv_path: str, events: list[dict]):
    """
    Append new events to the CSV.
    Never removes or overwrites existing rows.
    """
    file_exists = os.path.exists(csv_path)
    fieldnames = [
        "event_name",
        "event_date",
        "event_location",
        "event_url",
        "time_scraped_utc",
    ]

    with open(csv_path, "a", encoding="utf-8", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        if not file_exists:
            writer.writeheader()
        for ev in events:
            writer.writerow(ev)


def scrape_events(driver):
    driver.get(URL)
    wait_for_page_and_lazy_load(driver)

    # Example XPaths you gave (for the FIRST card):
    # event name:    //*[@id="content"]/section/div/div/div/a[1]/div[2]/div[2]
    # event date:    //*[@id="content"]/section/div/div/div/a[1]/div[2]/div[1]
    # event location://*[@id="content"]/section/div/div/div/a[1]/div[2]/div[3]/span
    #
    # Generalized: under #content/section/div/div/div, find all <a> cards:
    cards = driver.find_elements(
        By.XPATH,
        '//*[@id="content"]/section/div/div/div/a'
    )

    existing_names, existing_urls = load_existing_name_and_url_sets(CSV_FILE)

    # Track seen within this run
    seen_names = set()
    seen_urls = set()

    new_events = []
    current_utc = datetime.now(timezone.utc).isoformat()

    for card in cards:
        # Event URL (href on <a>)
        event_url = (card.get_attribute("href") or "").strip()

        # Inside each <a>, layout is:
        # <a>
        #   ...
        #   <div[2]>  # info panel
        #       <div[1]> date
        #       <div[2]> name
        #       <div[3]><span> location
        #   ...
        # </a>
        try:
            info_div = card.find_element(By.XPATH, "./div[2]")
        except Exception:
            info_div = None

        if info_div is None:
            continue

        # Name
        try:
            name_elem = info_div.find_element(By.XPATH, "./div[2]")
            event_name = name_elem.text.strip()
        except Exception:
            event_name = ""

        if not event_name:
            continue

        # Date
        try:
            date_elem = info_div.find_element(By.XPATH, "./div[1]")
            event_date = date_elem.text.strip()
        except Exception:
            event_date = ""

        # Location
        try:
            loc_elem = info_div.find_element(By.XPATH, "./div[3]/span")
            event_location = loc_elem.text.strip()
        except Exception:
            event_location = ""

        # ---------- DEDUPE: name OR url ----------
        name_exists = (
            event_name in existing_names or
            event_name in seen_names
        )
        url_exists = (
            bool(event_url) and
            (event_url in existing_urls or event_url in seen_urls)
        )

        if name_exists or url_exists:
            # either name or url already present ‚Üí skip this event
            continue

        # Mark as seen for this run
        seen_names.add(event_name)
        if event_url:
            seen_urls.add(event_url)

        new_events.append(
            {
                "event_name": event_name,
                "event_date": event_date,
                "event_location": event_location,
                "event_url": event_url,
                "time_scraped_utc": current_utc,
            }
        )

    return new_events


def main():
    # headless=False ‚Üí show popup browser window while scraping
    driver = build_driver(headless=False)
    try:
        new_events = scrape_events(driver)
        if new_events:
            save_events_to_csv(CSV_FILE, new_events)
            print(f"Added {len(new_events)} new events to {CSV_FILE}.")
        else:
            print("No new events found. CSV unchanged.")

        # Optional: let you see the page for a bit before closing
        time.sleep(5)

    finally:
        driver.quit()


if __name__ == "__main__":
    main()


No new events found. CSV unchanged.


# Phuket

In [31]:
import os
import time
from datetime import datetime, timezone

import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

URL = "https://phuket.cafedelmar.com/events"
CSV_PATH = "all_phucket.csv"

# ---------------------------------------------------------
# UTILITIES
# ---------------------------------------------------------

def get_utc_timestamp():
    """Return current UTC time as a string."""
    return datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S%z")


def setup_driver():
    options = webdriver.ChromeOptions()
    options.add_argument("--start-maximized")
    # options.add_argument("--headless=new")  # uncomment if you want headless
    driver = webdriver.Chrome(options=options)
    return driver


def wait_for_page_loaded(driver, timeout=30):
    """
    Wait until the main event section is present.
    Reference: //*[@id="grid-only"]/section
    """
    WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located((By.XPATH, "//*[@id='grid-only']/section"))
    )


def lazy_scroll(driver, pause=2.0, max_loops=20):
    """
    Scroll to the bottom repeatedly to trigger lazy-loading until
    the page height stops changing or max_loops is reached.
    """
    last_height = driver.execute_script("return document.body.scrollHeight")

    for _ in range(max_loops):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(pause)

        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height


# ---------------------------------------------------------
# SCRAPER
# ---------------------------------------------------------

def scrape_events(driver):
    driver.get(URL)
    wait_for_page_loaded(driver)
    lazy_scroll(driver)

    # Main event section
    section_el = driver.find_element(By.XPATH, "//*[@id='grid-only']/section")

    # Get ALL cards under section (no [1] indices)
    cards = section_el.find_elements(By.XPATH, "./div/div/div/div")

    scraped_at = get_utc_timestamp()
    rows = []

    for card in cards:
        # Event name:
        try:
            name_el = card.find_element(By.XPATH, ".//div[2]/div[2]")
            event_name = name_el.text.strip()
        except Exception:
            event_name = ""

        # Event date:
        try:
            date_el = card.find_element(By.XPATH, ".//div[2]/div[1]/div[2]")
            event_date = date_el.text.strip()
        except Exception:
            event_date = ""

        # Event link (if any anchor in the card)
        try:
            link_el = card.find_element(By.XPATH, ".//a")
            event_link = link_el.get_attribute("href") or ""
        except Exception:
            event_link = ""

        if not event_name:
            continue

        row = {
            "event_name": event_name,
            "event_date": event_date,
            "event_location": "Cafe Del Mar Phuket",
            "event_section": "Cafe Del Mar Phuket Events",
            "event_link": event_link,
            "scraped_at_utc": scraped_at,
        }
        rows.append(row)

    return rows


# ---------------------------------------------------------
# DEDUPE HELPERS
# ---------------------------------------------------------

def _norm_col(df: pd.DataFrame, col: str) -> pd.Series:
    """Return a normalised text column (strip, collapse spaces, lowercase)."""
    if col not in df.columns:
        return pd.Series([""] * len(df), index=df.index)
    s = df[col].fillna("").astype(str)
    s = s.str.strip()
    s = s.str.replace(r"\s+", " ", regex=True)
    s = s.str.lower()
    return s


def build_uid(df: pd.DataFrame) -> pd.Series:
    """
    Build a stable unique ID for each event for deduplication.
    Uses normalised event_name + date + location + link.
    """
    name = _norm_col(df, "event_name")
    date = _norm_col(df, "event_date")
    loc  = _norm_col(df, "event_location")
    link = _norm_col(df, "event_link")

    return name + " | " + date + " | " + loc + " | " + link


# ---------------------------------------------------------
# MERGING / SAVING (NO ROW REMOVAL, NO DUPES)
# ---------------------------------------------------------

def save_merged_csv(new_rows, path=CSV_PATH):
    new_df = pd.DataFrame(new_rows)
    if new_df.empty:
        print("No events scraped.")
        return

    # Build UID for new data
    new_df["uid"] = build_uid(new_df)

    if os.path.exists(path):
        existing_df = pd.read_csv(path, dtype=str)

        # Rebuild uid for existing data (normalised, same logic)
        existing_df["uid"] = build_uid(existing_df)

        before_existing = len(existing_df)
        before_new = len(new_df)

        combined = pd.concat([existing_df, new_df], ignore_index=True)

        # Drop duplicates by uid
        combined = combined.drop_duplicates(subset=["uid"], keep="first")

        print(
            f"Existing rows: {before_existing}, "
            f"New rows: {before_new}, "
            f"Combined after dedupe: {len(combined)}"
        )
    else:
        combined = new_df
        print(f"Creating new file with {len(combined)} rows")

    combined.to_csv(path, index=False, encoding="utf-8-sig")
    print(f"Saved {len(combined)} rows to {path}")


# ---------------------------------------------------------
# -MAIN
# --------------------------------------------------------

def main():
    driver = setup_driver()
    try:
        events = scrape_events(driver)
        print(f"Scraped {len(events)} events from Cafe Del Mar Phuket.")
        save_merged_csv(events)
    finally:
        driver.quit()


if __name__ == "__main__":
    main()

Scraped 23 events from Cafe Del Mar Phuket.
Existing rows: 119, New rows: 23, Combined after dedupe: 130
Saved 130 rows to all_phucket.csv


In [32]:
import os
import csv
import time
from datetime import datetime, timezone

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException


URL = "https://lotusarenatickets.com/"
CSV_FILE = "all_phucket.csv"


def build_driver(headless: bool = False):
    """
    Build a Chrome WebDriver.
    headless=False so the website pops up in a visible window.
    """
    chrome_options = Options()

    if headless:
        chrome_options.add_argument("--headless=new")

    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--window-size=1400,1000")

    driver = webdriver.Chrome(options=chrome_options)
    return driver


def wait_for_page_and_lazy_load(driver):
    """
    Wait for the main content section to appear, then give extra time
    for JS rendering, and scroll down slowly several times to load
    all events.
    """
    wait = WebDriverWait(driver, 40)

    try:
        # Wait until the main section under root/div[1]/div is present
        wait.until(
            EC.presence_of_element_located(
                (By.XPATH, '//*[@id="root"]/div[1]/div')
            )
        )
    except TimeoutException:
        print("Warning: main content section did not load in time.")
        return

    # Extra pause so React/JS has time to populate the list
    time.sleep(4)

    # Scroll down slowly several times to lazy-load everything
    last_height = 0
    for _ in range(15):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2.5)  # slow scrolling
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

    # Final small wait before scraping
    time.sleep(2)


def load_existing_name_and_url_sets(csv_path: str):
    """
    Load existing event_name and event_url sets from CSV.
    Used for dedupe: skip if name OR url already exists.
    """
    existing_names = set()
    existing_urls = set()

    if not os.path.exists(csv_path):
        return existing_names, existing_urls

    with open(csv_path, "r", encoding="utf-8", newline="") as f:
        reader = csv.DictReader(f)
        for row in reader:
            name = (row.get("event_name") or "").strip()
            url = (row.get("event_url") or "").strip()
            if name:
                existing_names.add(name)
            if url:
                existing_urls.add(url)

    return existing_names, existing_urls


def save_events_to_csv(csv_path: str, events: list[dict]):
    """
    Append new events to the CSV.
    Never removes or overwrites existing rows.
    """
    file_exists = os.path.exists(csv_path)
    fieldnames = [
        "event_name",
        "event_description",
        "event_date",
        "event_url",
        "time_scraped_utc",
    ]

    with open(csv_path, "a", encoding="utf-8", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        if not file_exists:
            writer.writeheader()
        for ev in events:
            writer.writerow(ev)


def scrape_events(driver):
    driver.get(URL)
    wait_for_page_and_lazy_load(driver)

    # MAIN SECTION containing all events
    try:
        container = driver.find_element(By.XPATH, '//*[@id="root"]/div[1]/div')
    except Exception:
        print("Could not find main event container.")
        return []

    # We now look THROUGH this section and treat any div that has BOTH
    # a <span> and a <p> (name + description) as an event card.
    # This is more general and should pick up all event cards inside.
    event_cards = container.find_elements(
        By.XPATH,
        './/div[span and p]'
    )

    existing_names, existing_urls = load_existing_name_and_url_sets(CSV_FILE)

    # Track what we see in this run as well
    seen_names = set()
    seen_urls = set()

    new_events = []
    current_utc = datetime.now(timezone.utc).isoformat()

    for card in event_cards:
        # ---- Event Name ----
        try:
            name_elem = card.find_element(By.XPATH, "./span")
            event_name = name_elem.text.strip()
        except Exception:
            event_name = ""

        if not event_name:
            continue

        # ---- Event Description ----
        try:
            desc_elem = card.find_element(By.XPATH, "./p")
            event_description = desc_elem.text.strip()
        except Exception:
            event_description = ""

        # ---- Event Date ----
        # Often date is in a nested div with a span; we use a flexible search:
        event_date = ""
        try:
            # First try the direct pattern ./div[1]/span
            date_elem = card.find_element(By.XPATH, "./div[1]/span")
            event_date = date_elem.text.strip()
        except Exception:
            # Fallback: any descendant div/span that looks like a date
            try:
                date_elem = card.find_element(By.XPATH, ".//div/span")
                event_date = date_elem.text.strip()
            except Exception:
                event_date = ""

        # ---- Event URL ----
        event_url = ""
        try:
            link_elem = card.find_element(By.XPATH, ".//a")
            href = link_elem.get_attribute("href")
            if href:
                event_url = href.strip()
        except Exception:
            event_url = ""

        # ---------- DEDUPE: name OR url ----------
        name_exists = (
            event_name in existing_names or
            event_name in seen_names
        )
        url_exists = (
            bool(event_url) and
            (event_url in existing_urls or event_url in seen_urls)
        )

        if name_exists or url_exists:
            # Skip if either event name or event url is already present
            continue

        seen_names.add(event_name)
        if event_url:
            seen_urls.add(event_url)

        new_events.append(
            {
                "event_name": event_name,
                "event_description": event_description,
                "event_date": event_date,
                "event_url": event_url,
                "time_scraped_utc": current_utc,
            }
        )

    return new_events


def main():
    # headless=False ‚Üí show popup browser window while scraping
    driver = build_driver(headless=False)
    try:
        new_events = scrape_events(driver)
        if new_events:
            save_events_to_csv(CSV_FILE, new_events)
            print(f"Added {len(new_events)} new events to {CSV_FILE}.")
        else:
            print("No new events found. CSV unchanged.")

        # Optional: keep browser open briefly so you can see the page
        time.sleep(5)

    finally:
        driver.quit()


if __name__ == "__main__":
    main()



Added 15 new events to all_phucket.csv.


In [33]:
import os
import csv
import time
from datetime import datetime, timezone

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException


URLS = [
    "https://www.impact.co.th/en/visitors/event-calendar?search=&category_id=1&location_id=0&month_select=11&year_select=2025&filter_duration=month_11_year_2025",
    "https://www.impact.co.th/en/visitors/event-calendar?category_id=1&location_id=0&filter_duration=month_12_year_2025",
]

CSV_FILE = "all_phucket.csv"


def build_driver(headless: bool = False):
    """
    Build a Chrome WebDriver.
    headless=False so the browser pops up and you can see it.
    """
    chrome_options = Options()
    if headless:
        chrome_options.add_argument("--headless=new")

    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--window-size=1400,1000")

    driver = webdriver.Chrome(options=chrome_options)
    return driver


def wait_for_page_and_lazy_load(driver):
    """
    Wait for the event grid to appear, then scroll down several times
    to allow lazy-loaded events to load.
    """
    wait = WebDriverWait(driver, 30)

    try:
        # Wait for at least one event card to appear (based on your XPaths)
        wait.until(
            EC.presence_of_element_located(
                (
                    By.XPATH,
                    '//*[@id="eb-category-grid"]/div/div[1]/div/div[2]/a'
                )
            )
        )
    except TimeoutException:
        print("Warning: event grid did not load in time.")
        return

    # Extra pause so everything has time to render
    time.sleep(3)

    # Scroll down slowly to lazy-load more events
    last_height = 0
    for _ in range(10):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2.0)
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

    # Small pause before scraping
    time.sleep(1.5)


def load_existing_name_and_url_sets(csv_path: str):
    """
    Load existing event_name and event_url sets from CSV.
    Used for dedupe: skip if name OR url already exists.
    """
    existing_names = set()
    existing_urls = set()

    if not os.path.exists(csv_path):
        return existing_names, existing_urls

    with open(csv_path, "r", encoding="utf-8", newline="") as f:
        reader = csv.DictReader(f)
        for row in reader:
            name = (row.get("event_name") or "").strip()
            url = (row.get("event_url") or "").strip()
            if name:
                existing_names.add(name)
            if url:
                existing_urls.add(url)

    return existing_names, existing_urls


def save_events_to_csv(csv_path: str, events: list[dict]):
    """
    Append new events to the CSV.
    Never removes or overwrites existing rows.
    """
    file_exists = os.path.exists(csv_path)
    fieldnames = [
        "event_name",
        "event_date",
        "event_location",
        "event_url",
        "time_scraped_utc",
    ]

    with open(csv_path, "a", encoding="utf-8", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        if not file_exists:
            writer.writeheader()
        for ev in events:
            writer.writerow(ev)


def scrape_page(driver, url: str, existing_names: set, existing_urls: set):
    """
    Scrape a single URL and return a list of new event dicts.
    Uses dedupe sets passed in to avoid duplicates across pages.
    """
    print(f"Scraping: {url}")
    driver.get(url)
    wait_for_page_and_lazy_load(driver)

    # Your example XPaths for first event:
    # event name:     //*[@id="eb-category-grid"]/div/div[1]/div/div[2]/a
    # event date:     //*[@id="eb-category-grid"]/div/div[1]/div/div[3]
    # event location: //*[@id="eb-category-grid"]/div/div[1]/div/div[4]/span
    #
    # Generalized: each event card is under: //*[@id="eb-category-grid"]/div/div
    cards = driver.find_elements(
        By.XPATH,
        '//*[@id="eb-category-grid"]/div/div'
    )

    # Track names/urls we see in this run (so we don't duplicate within this run)
    seen_names = set()
    seen_urls = set()

    new_events = []
    current_utc = datetime.now(timezone.utc).isoformat()

    for card in cards:
        # Event name & URL from div[2]/a
        try:
            name_elem = card.find_element(By.XPATH, "./div/div[2]/a")
            event_name = name_elem.text.strip()
            event_url = (name_elem.get_attribute("href") or "").strip()
        except Exception:
            event_name = ""
            event_url = ""

        if not event_name:
            continue

        # Event date from div[3]
        try:
            date_elem = card.find_element(By.XPATH, "./div/div[3]")
            event_date = date_elem.text.strip()
        except Exception:
            event_date = ""

        # Event location from div[4]/span
        try:
            loc_elem = card.find_element(By.XPATH, "./div/div[4]/span")
            event_location = loc_elem.text.strip()
        except Exception:
            event_location = ""

        # ---------- DEDUPE: name OR url ----------
        name_exists = (
            event_name in existing_names or
            event_name in seen_names
        )
        url_exists = (
            bool(event_url) and
            (event_url in existing_urls or event_url in seen_urls)
        )

        if name_exists or url_exists:
            continue

        # Mark as seen
        seen_names.add(event_name)
        if event_url:
            seen_urls.add(event_url)

        new_events.append(
            {
                "event_name": event_name,
                "event_date": event_date,
                "event_location": event_location,
                "event_url": event_url,
                "time_scraped_utc": current_utc,
            }
        )

    # Update global sets for cross-page dedupe
    existing_names.update(seen_names)
    existing_urls.update(seen_urls)

    return new_events


def main():
    # headless=False ‚Üí show popup browser window while scraping
    driver = build_driver(headless=False)
    try:
        existing_names, existing_urls = load_existing_name_and_url_sets(CSV_FILE)
        all_new_events = []

        for url in URLS:
            events = scrape_page(driver, url, existing_names, existing_urls)
            all_new_events.extend(events)

        if all_new_events:
            save_events_to_csv(CSV_FILE, all_new_events)
            print(f"Added {len(all_new_events)} new events to {CSV_FILE}.")
        else:
            print("No new events found. CSV unchanged.")

        # Optional: keep browser open briefly so you can see the page
        time.sleep(5)

    finally:
        driver.quit()


if __name__ == "__main__":
    main()


Scraping: https://www.impact.co.th/en/visitors/event-calendar?search=&category_id=1&location_id=0&month_select=11&year_select=2025&filter_duration=month_11_year_2025
Scraping: https://www.impact.co.th/en/visitors/event-calendar?category_id=1&location_id=0&filter_duration=month_12_year_2025
Added 10 new events to all_phucket.csv.


# Seoul

In [34]:
# https://www.wanderlochhall.com/15
# https://m.ticketlink.co.kr/global/en/local/seoul?sorting=LAST_REGISTRATION_ORDER
# https://tkglobal.melon.com/main/index.htm?langCd=EN
# https://ticket.yes24.com/New/Genre/GenreMain.aspx?genre=15456&Gcode=009_202_001

In [35]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import csv
import os
from datetime import datetime, timezone

URL = "https://www.livenation.kr/en/event/allevents"
CSV_FILE = "all_seoul.csv"


def init_driver():
    options = webdriver.ChromeOptions()
    # Force English if possible
    options.add_argument("--lang=en-US")
    # options.add_argument("--headless=new")  # uncomment if you want headless
    driver = webdriver.Chrome(options=options)
    return driver


def scroll_to_load_all(driver, pause=2):
    """Scroll until no new content is loaded (for lazy loading)."""
    last_height = driver.execute_script("return document.body.scrollHeight")

    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(pause)
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height


def load_existing_csv(csv_path):
    """Load existing rows + keys to avoid duplicates."""
    existing_rows = []
    existing_keys = set()  # (name, venue, date)

    if os.path.exists(csv_path):
        with open(csv_path, newline="", encoding="utf-8") as f:
            reader = csv.DictReader(f)
            for row in reader:
                existing_rows.append(row)
                key = (row.get("event_name", "").strip(),
                       row.get("event_venue", "").strip(),
                       row.get("event_date", "").strip())
                existing_keys.add(key)

    return existing_rows, existing_keys


def save_csv(csv_path, rows):
    fieldnames = ["event_name", "event_venue", "event_date", "scraped_utc"]
    with open(csv_path, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)


def scrape_livenation_seoul():
    driver = init_driver()
    driver.get(URL)

    wait = WebDriverWait(driver, 20)

    # Wait for the main event section to appear
    event_section = wait.until(
        EC.presence_of_element_located(
            (By.XPATH, '//*[@id="main"]/div[2]/div[1]/div/div/div/div[2]')
        )
    )

    # Scroll to load all events
    scroll_to_load_all(driver, pause=2)

    # After scrolling, re-grab the section (in case DOM updated)
    event_section = driver.find_element(
        By.XPATH, '//*[@id="main"]/div[2]/div[1]/div/div/div/div[2]'
    )

    # Heuristic: each event is usually inside a <li> that contains a <time>
    event_cards = event_section.find_elements(By.XPATH, './/li[.//time]')

    print(f"Found {len(event_cards)} event cards")

    existing_rows, existing_keys = load_existing_csv(CSV_FILE)
    new_rows = []

    utc_now = datetime.now(timezone.utc).isoformat()

    for card in event_cards:
        try:
            # Event name: based on your XPath sample: span/div/div/p[1]
            name_el = card.find_element(By.XPATH, './/span/div/div/p[1]')
            event_name = name_el.text.strip()

            # Event venue: span/div/div/p[3]
            venue_el = card.find_element(By.XPATH, './/span/div/div/p[3]')
            event_venue = venue_el.text.strip()

            # Event date: <time> tag
            time_el = card.find_element(By.XPATH, './/time')
            # Use the datetime attribute if available, otherwise text
            event_date = time_el.get_attribute("datetime") or time_el.text
            event_date = event_date.strip()

            key = (event_name, event_venue, event_date)

            if key in existing_keys:
                # Already in CSV, skip
                continue

            row = {
                "event_name": event_name,
                "event_venue": event_venue,
                "event_date": event_date,
                "scraped_utc": utc_now,
            }
            new_rows.append(row)
            existing_keys.add(key)

        except Exception as e:
            # If any element is missing for a card, just skip that card
            print("Error parsing an event card:", e)
            continue

    driver.quit()

    combined_rows = existing_rows + new_rows
    save_csv(CSV_FILE, combined_rows)

    print(f"Existing rows kept: {len(existing_rows)}")
    print(f"New unique rows added: {len(new_rows)}")
    print(f"Total rows now in CSV: {len(combined_rows)}")


if __name__ == "__main__":
    scrape_livenation_seoul()



Found 7 event cards
Existing rows kept: 204
New unique rows added: 0
Total rows now in CSV: 204


In [36]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import csv
import os
from datetime import datetime, timezone

URL = "https://tkglobal.melon.com/main/index.htm?langCd=EN"
CSV_FILE = "all_seoul.csv"


def init_driver():
    options = webdriver.ChromeOptions()
    options.add_argument("--lang=en-US")
    # options.add_argument("--headless=new")  # uncomment for headless
    driver = webdriver.Chrome(options=options)
    return driver


def scroll_to_load_all(driver, pause=2):
    """Scroll down until page height stops changing (for lazy loading)."""
    last_height = driver.execute_script("return document.body.scrollHeight")

    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(pause)
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height


def load_existing_csv(csv_path):
    """Load existing rows & build a key set to avoid duplicates."""
    existing_rows = []
    existing_keys = set()  # (name, venue, date)

    if os.path.exists(csv_path):
        with open(csv_path, newline="", encoding="utf-8") as f:
            reader = csv.DictReader(f)
            for row in reader:
                existing_rows.append(row)
                key = (
                    row.get("event_name", "").strip(),
                    row.get("event_venue", "").strip(),
                    row.get("event_date", "").strip(),
                )
                existing_keys.add(key)

    return existing_rows, existing_keys


def save_csv(csv_path, rows):
    fieldnames = ["event_name", "event_venue", "event_date", "scraped_utc"]
    with open(csv_path, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)


def scrape_melon_all_seoul():
    driver = init_driver()
    driver.get(URL)

    wait = WebDriverWait(driver, 20)

    # Wait for the main contents section
    conts_div = wait.until(
        EC.presence_of_element_located((By.XPATH, '//*[@id="conts"]/div'))
    )

    # Scroll to load all events (lazy load)
    scroll_to_load_all(driver, pause=2)

    # Re-locate the section after scrolling (in case DOM updated)
    conts_div = driver.find_element(By.XPATH, '//*[@id="conts"]/div')

    # Assuming events are in ul/li under div[1]: //*[@id="conts"]/div/div[1]/ul/li[*]
    event_list_items = conts_div.find_elements(
        By.XPATH, './div[1]/ul/li'
    )

    print(f"Found {len(event_list_items)} event items")

    existing_rows, existing_keys = load_existing_csv(CSV_FILE)
    new_rows = []

    # Single scrape timestamp for this run
    utc_now = datetime.now(timezone.utc).isoformat()

    for li in event_list_items:
        try:
            # Event name: //*[@id="conts"]/div/div[1]/ul/li[1]/div[2]/h2
            name_el = li.find_element(By.XPATH, './div[2]/h2')
            event_name = name_el.text.strip()

            # Event venue: //*[@id="conts"]/div/div[1]/ul/li[1]/div[2]/dl/dd[2]
            venue_el = li.find_element(By.XPATH, './div[2]/dl/dd[2]')
            event_venue = venue_el.text.strip()

            # Event date: //*[@id="conts"]/div/div[1]/ul/li[1]/div[2]/dl/dd[1]
            date_el = li.find_element(By.XPATH, './div[2]/dl/dd[1]')
            event_date = date_el.text.strip()

            key = (event_name, event_venue, event_date)

            # Skip if already in CSV
            if key in existing_keys:
                continue

            row = {
                "event_name": event_name,
                "event_venue": event_venue,
                "event_date": event_date,
                "scraped_utc": utc_now,
            }
            new_rows.append(row)
            existing_keys.add(key)

        except Exception as e:
            print("Error parsing an event item:", e)
            continue

    driver.quit()

    combined_rows = existing_rows + new_rows
    save_csv(CSV_FILE, combined_rows)

    print(f"Existing rows kept: {len(existing_rows)}")
    print(f"New unique rows added: {len(new_rows)}")
    print(f"Total rows now in CSV: {len(combined_rows)}")


if __name__ == "__main__":
    scrape_melon_all_seoul()


Found 126 event items
Existing rows kept: 204
New unique rows added: 25
Total rows now in CSV: 229
