In [None]:
import requests
from bs4 import BeautifulSoup
import json
import time
import random

In [None]:
def get_event_links(page_number):
    """
    Fetches event links from a specific page number on Eventbrite.
    Returns a set of URLs.
    """
    url = f"https://www.eventbrite.com/d/ny--new-york/all-events/?page={page_number}"
    
    # Rotate user agents or use a standard one to look like a browser
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36',
        'Accept-Language': 'en-US,en;q=0.9',
    }
    
    try:
        response = requests.get(url, headers=headers, timeout=10)
        
        # Check if we've been redirected to page 1 (common behavior when pages run out)
        # or if we hit a 404
        if response.status_code == 404:
            return set()
        if response.url != url and "page=1" not in response.url and page_number != 1:
            print(f"  [!] Redirected to {response.url} - assuming end of pages.")
            return set()

        soup = BeautifulSoup(response.text, 'html.parser')
        event_links = set()

        # Method: JSON-LD (Structured Data) extraction
        scripts = soup.find_all('script', type='application/ld+json')
        
        for script in scripts:
            try:
                data = json.loads(script.string)
                # Helper to extract url from a single item dict
                def extract_url(item):
                    if 'url' in item:
                        return item['url']
                    elif 'item' in item and 'url' in item['item']: # Nested Schema
                        return item['item']['url']
                    return None

                if isinstance(data, list):
                    for item in data:
                        url = extract_url(item)
                        if url: event_links.add(url)
                
                elif isinstance(data, dict):
                    if 'itemListElement' in data:
                        for item in data['itemListElement']:
                            url = extract_url(item)
                            if url: event_links.add(url)
            except (json.JSONDecodeError, TypeError):
                continue
        
        # Fallback: If JSON fails, look for 'a' tags with specific patterns
        # Note: Eventbrite changes classes often, so checking href for '/e/' is safer
        if not event_links:
            links = soup.find_all('a', href=True)
            for link in links:
                href = link['href']
                if '/e/' in href and 'eventbrite.com' in href:
                    clean_link = href.split('?')[0] # Remove tracking params
                    event_links.add(clean_link)
                    
        return event_links

    except requests.exceptions.RequestException as e:
        print(f"  [!] Error fetching page {page_number}: {e}")
        return set()

In [None]:
all_events = set()
page = 1
max_safety_limit = 500  # Hard stop to prevent infinite loops if logic fails

print("Starting scraper...")

while page <= max_safety_limit:
    print(f"Scraping Page {page}...", end=" ")
    
    new_links = get_event_links(page)
    
    # Stop condition: If no links are returned, we have likely reached the end
    if not new_links:
        print("No events found. Reached end of results.")
        break
    
    # Update master list
    initial_count = len(all_events)
    all_events.update(new_links)
    new_count = len(all_events)
    
    print(f"Found {len(new_links)} links. (Total unique: {new_count})")
    
    # If we didn't add any new unique links, we might be seeing a 'no results' page 
    # that still has promoted links we've already seen.
    if new_count == initial_count and page > 1:
        print("  [!] No new unique links found. Stopping to avoid duplicate loops.")
        break

    page += 1
    
    # PAUSE: Random sleep to be polite and avoid IP bans
    sleep_time = random.uniform(2, 5) 
    time.sleep(sleep_time)

print("-" * 30)
print(f"Scraping complete. Found {len(all_events)} unique event links.")

In [17]:
import requests
from bs4 import BeautifulSoup
import json
import time
import random
import concurrent.futures
from datetime import datetime, timedelta

In [20]:
# --- Configuration ---
START_DATE = "2025-12-12"  # YYYY-MM-DD
DAYS_TO_SCRAPE = 30        # How many days from start date

# Concurrency Controls
MAX_DAY_WORKERS = 5        # How many days to process at once (Outer Loop)
MAX_PAGE_WORKERS = 10       # How many pages to scrape at once per day (Inner Loop)
# NOTE: Total concurrent requests = DAY_WORKERS * PAGE_WORKERS (approx 50 here)

BASE_URL = "https://www.eventbrite.com/d/ny--new-york/all-events/"

USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.64 Safari/537.36',
]

In [21]:
def get_dates(start_date_str, num_days):
    """Generates a list of date strings (YYYY-MM-DD)"""
    start = datetime.strptime(start_date_str, "%Y-%m-%d")
    return [(start + timedelta(days=i)).strftime("%Y-%m-%d") for i in range(num_days)]

In [22]:
def scrape_single_page(date, page_number):
    """
    Inner Worker: Scrapes a specific page for a specific date.
    """
    # Construct URL with date filters
    params = {
        'page': page_number,
        'start_date': date,
        'end_date': date
    }
    
    # Random sleep to prevent "thundering herd" on the server
    time.sleep(random.uniform(1, 3))
    
    headers = {
        'User-Agent': random.choice(USER_AGENTS),
        'Accept-Language': 'en-US,en;q=0.9',
    }

    try:
        response = requests.get(BASE_URL, params=params, headers=headers, timeout=10)
        
        # 404 or redirect usually means end of results
        if response.status_code == 404:
            return set()
        if "page=1" in response.url and page_number > 1:
            return set()

        soup = BeautifulSoup(response.text, 'html.parser')
        links = set()

        # 1. JSON-LD Extraction
        scripts = soup.find_all('script', type='application/ld+json')
        for script in scripts:
            try:
                data = json.loads(script.string)
                if isinstance(data, list):
                    for item in data:
                        if 'url' in item: links.add(item['url'])
                elif isinstance(data, dict):
                    if 'itemListElement' in data:
                        for item in data['itemListElement']:
                            if 'url' in item: links.add(item['url'])
                            elif 'item' in item and 'url' in item['item']:
                                links.add(item['item']['url'])
            except: continue

        # 2. Fallback Extraction
        if not links:
            for a in soup.find_all('a', href=True):
                if '/e/' in a['href'] and 'eventbrite.com' in a['href']:
                    links.add(a['href'].split('?')[0])

        return links

    except Exception as e:
        # print(f"  [!] Error {date} pg {page_number}: {e}")
        return set()

In [23]:
def manage_day_scrape(target_date):
    """
    Outer Worker: Manages the scraping for a single day.
    Spawns inner workers to handle pages in batches.
    """
    print(f"üìÖ Starting Day: {target_date}")
    day_links = set()
    current_page = 1
    keep_scraping = True
    
    # We use a ThreadPool for the pages within this day
    with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_PAGE_WORKERS) as page_executor:
        
        while keep_scraping:
            # Create a batch of pages (e.g., try pages 1, 2, 3 concurrently)
            # We batch to avoid queuing 50 pages for a day that has 0 events.
            batch_size = MAX_PAGE_WORKERS
            futures = {}
            
            for i in range(batch_size):
                pg = current_page + i
                futures[page_executor.submit(scrape_single_page, target_date, pg)] = pg
            
            batch_has_results = False
            
            for future in concurrent.futures.as_completed(futures):
                page_num = futures[future]
                try:
                    links = future.result()
                    if links:
                        day_links.update(links)
                        batch_has_results = True
                        # print(f"   -> {target_date} Page {page_num}: Found {len(links)} links")
                    else:
                        # If a page returns empty, we might have hit the end.
                        pass
                except Exception:
                    continue
            
            # Decision Logic:
            # If the entire batch returned 0 links, we assume the day is done.
            # (Or if we reached a safety limit like 50 pages)
            if not batch_has_results or current_page > 50:
                keep_scraping = False
            else:
                current_page += batch_size
                time.sleep(1) # Breath between batches

    print(f"‚úÖ Finished {target_date}: {len(day_links)} total events.")
    return day_links

In [24]:
all_unique_links = set()
dates_to_scrape = get_dates(START_DATE, DAYS_TO_SCRAPE)

print(f"Starting Scrape for {len(dates_to_scrape)} days...")
print(f"Configuration: {MAX_DAY_WORKERS} Day-Workers x {MAX_PAGE_WORKERS} Page-Workers")

# Outer Pool: Manages different days
with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_DAY_WORKERS) as day_executor:
    
    # Submit all date tasks
    future_to_date = {day_executor.submit(manage_day_scrape, date): date for date in dates_to_scrape}
    
    for future in concurrent.futures.as_completed(future_to_date):
        date = future_to_date[future]
        try:
            links = future.result()
            all_unique_links.update(links)
        except Exception as e:
            print(f"‚ùå Critical failure on {date}: {e}")

print("-" * 30)
print(f"SCRAPE COMPLETE.")
print(f"Total Unique Event Links: {len(all_unique_links)}")

# Save to file
with open('nyc_events_dated.txt', 'w') as f:
    for link in all_unique_links:
        f.write(f"{link}\n")

Starting Scrape for 30 days...
Configuration: 5 Day-Workers x 10 Page-Workers
üìÖ Starting Day: 2025-12-12
üìÖ Starting Day: 2025-12-13
üìÖ Starting Day: 2025-12-14
üìÖ Starting Day: 2025-12-15
üìÖ Starting Day: 2025-12-16
‚úÖ Finished 2025-12-15: 304 total events.
üìÖ Starting Day: 2025-12-17
‚úÖ Finished 2025-12-16: 455 total events.
üìÖ Starting Day: 2025-12-18
‚úÖ Finished 2025-12-13: 700 total events.
üìÖ Starting Day: 2025-12-19
‚úÖ Finished 2025-12-14: 708 total events.
üìÖ Starting Day: 2025-12-20
‚úÖ Finished 2025-12-12: 700 total events.
üìÖ Starting Day: 2025-12-21
‚úÖ Finished 2025-12-17: 525 total events.
üìÖ Starting Day: 2025-12-22
‚úÖ Finished 2025-12-18: 570 total events.
üìÖ Starting Day: 2025-12-23
‚úÖ Finished 2025-12-21: 693 total events.
üìÖ Starting Day: 2025-12-24
‚úÖ Finished 2025-12-19: 705 total events.
üìÖ Starting Day: 2025-12-25
‚úÖ Finished 2025-12-20: 706 total events.
üìÖ Starting Day: 2025-12-26
‚úÖ Finished 2025-12-22: 188 total events.

In [28]:
import requests
from bs4 import BeautifulSoup
import json
import re
import time
import random
import concurrent.futures
from datetime import datetime

# --- Configuration ---
MAX_WORKERS = 50  # Higher than search scraping because these are distinct pages
INPUT_FILE = 'nyc_events_dated.txt' # The file created in the previous step
OUTPUT_FILE = 'nyc_events_details.json'

# --- Worker Function (The Extraction Logic) ---
def extract_single_event(url):
    """
    Worker function to scrape a single event URL.
    Returns a dictionary or None.
    """
    # 1. Random Sleep for Safety (Desynchronize threads)
    time.sleep(random.uniform(0.5, 2.0))
    
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept-Language': 'en-US,en;q=0.9',
    }

    try:
        response = requests.get(url, headers=headers, timeout=10)
        
        if response.status_code != 200:
            return None

        soup = BeautifulSoup(response.text, 'html.parser')
        data = {}

        # A. Try JSON-LD
        scripts = soup.find_all('script', type='application/ld+json')
        for script in scripts:
            try:
                js_data = json.loads(script.string)
                if isinstance(js_data, list): js_data = js_data[0]
                if 'name' in js_data and 'startDate' in js_data:
                    data = js_data
                    break
            except: continue

        # B. Try Server Data (The specific fix for your "broken" links)
        if not data:
            pattern = re.compile(r'window\.__SERVER_DATA__\s*=\s*({.*?});', re.DOTALL)
            script_content = soup.find('script', string=pattern)
            if script_content:
                match = pattern.search(script_content.string)
                if match:
                    try:
                        server_data = json.loads(match.group(1))
                        # Attempt to locate event object inside server data
                        if 'event' in server_data: data = server_data['event']
                        elif 'dates' in server_data: data = server_data
                    except: pass

        # C. Fallback: Meta Tags
        if not data:
            data['name'] = soup.find('meta', property='og:title')['content'] if soup.find('meta', property='og:title') else "Unknown"
            start_meta = soup.find('meta', property='event:start_time')
            if start_meta: data['startDate'] = start_meta['content']
            
        # --- Normalization ---
        # Title
        title = data.get('name', 'Unknown')
        if isinstance(title, dict): title = title.get('text', 'Unknown')

        # Date
        timestamp = None
        start_str = data.get('startDate')
        if start_str:
            try:
                dt = datetime.fromisoformat(start_str.replace('Z', '+00:00'))
                timestamp = int(dt.timestamp() * 1000)
            except: pass

        # Location
        loc_name = "Online"
        if 'location' in data:
            loc = data['location']
            if isinstance(loc, dict):
                loc_name = loc.get('name') or loc.get('address', {}).get('addressLocality', 'See Details')
            elif isinstance(loc, str):
                loc_name = loc
        
        # Categories (Scraping Tags)
        categories = []
        if 'keywords' in data:
            k = data['keywords']
            categories = k if isinstance(k, list) else str(k).split(',')
        
        if not categories:
            for link in soup.find_all('a', href=True):
                if '/d/' in link['href'] and '-events/' in link['href']:
                    tag = link.get_text(strip=True)
                    if tag and len(tag) < 30 and "Events" not in tag and tag not in categories:
                        categories.append(tag)

        return {
            "Title": title,
            "DateTime": timestamp,
            "Location": loc_name,
            "Categories": categories,
            "Link": url
        }

    except Exception:
        return None

# --- Main Execution Manager ---
if __name__ == "__main__":
    
    # 1. Load Links
    # Assuming you have the file from the previous step. 
    # If not, replace this list with your manual list of strings.
    try:
        with open(INPUT_FILE, 'r') as f:
            links = [line.strip() for line in f if line.strip()]
    except FileNotFoundError:
        print(f"File {INPUT_FILE} not found. Using sample list.")
        links = [
            "https://www.eventbrite.com/e/new-year-eve-hottest-bollywood-desi-party-racket-nyc-tickets-1977187088807",
            "https://www.eventbrite.com/e/tech-and-business-networking-elevating-your-potential-hoboken-tickets-1640869628229",
            "https://www.eventbrite.com/e/african-dance-tickets-926482633497"
        ]

    print(f"üöÄ Starting extraction for {len(links)} events...")
    print(f"   Workers: {MAX_WORKERS}")

    results = []
    completed = 0
    
    # 2. Parallel Processing
    with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        # Create a dictionary to map {future: url} for error tracking
        future_to_url = {executor.submit(extract_single_event, url): url for url in links}
        
        for future in concurrent.futures.as_completed(future_to_url):
            url = future_to_url[future]
            try:
                data = future.result()
                completed += 1
                
                if data:
                    results.append(data)
                    # Optional: Print progress every 10 items
                    if completed % 10 == 0:
                        print(f"   Progress: {completed}/{len(links)} done...")
                else:
                    print(f"   [!] Failed/Empty: {url}")
                    
            except Exception as e:
                print(f"   [!] Exception for {url}: {e}")

    # 3. Save Results
    print(f"‚úÖ Extraction Complete. Successfully extracted {len(results)} events.")
    
    with open(OUTPUT_FILE, 'w') as f:
        json.dump(results, f, indent=4)
        
    print(f"Saved to {OUTPUT_FILE}")

üöÄ Starting extraction for 9250 events...
   Workers: 50
   Progress: 10/9250 done...
   Progress: 20/9250 done...
   Progress: 30/9250 done...
   Progress: 40/9250 done...
   [!] Failed/Empty: https://www.eventbrite.com/e/1212-treadwell-nyc-shake-it-off-dance-party-games-cocktails-tickets-1902704072099
   Progress: 60/9250 done...
   Progress: 70/9250 done...
   Progress: 80/9250 done...
   Progress: 90/9250 done...
   Progress: 100/9250 done...
   Progress: 110/9250 done...
   Progress: 120/9250 done...
   Progress: 130/9250 done...
   Progress: 140/9250 done...
   Progress: 150/9250 done...
   Progress: 160/9250 done...
   Progress: 170/9250 done...
   Progress: 180/9250 done...
   Progress: 190/9250 done...
   Progress: 200/9250 done...
   Progress: 210/9250 done...
   Progress: 220/9250 done...
   Progress: 230/9250 done...
   Progress: 240/9250 done...
   Progress: 250/9250 done...
   Progress: 260/9250 done...
   Progress: 270/9250 done...
   Progress: 280/9250 done...
   Prog