In [None]:
import requests
from bs4 import BeautifulSoup
import json
import time
import random

In [None]:
def _jsonld_item_url(item):
    """
    Pulls the event URL out of a single JSON-LD entry.

    Supports the flat form ``{"url": ...}`` and the nested ListItem form
    ``{"item": {"url": ...}}``. Returns None for anything else, including
    entries that are not dicts.
    """
    if not isinstance(item, dict):
        return None
    if 'url' in item:
        return item['url']
    nested = item.get('item')
    if isinstance(nested, dict) and 'url' in nested:  # Nested Schema
        return nested['url']
    return None


def get_event_links(page_number):
    """
    Fetches event links from a specific page number on Eventbrite.

    Args:
        page_number: 1-based index of the listing page to request.

    Returns:
        A set of event URLs. The set is empty when the page answers 404,
        when the request fails, or when the site redirects a request for
        page N > 1 to a URL whose ``page`` parameter is no longer N.
    """
    # Function-scope import so this cell does not depend on another cell's imports.
    from urllib.parse import parse_qs, urlparse

    page_url = f"https://www.eventbrite.com/d/ny--new-york/all-events/?page={page_number}"

    # Use a standard browser user agent so the request looks like a browser
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36',
        'Accept-Language': 'en-US,en;q=0.9',
    }

    try:
        response = requests.get(page_url, headers=headers, timeout=10)
    except requests.exceptions.RequestException as e:
        print(f"  [!] Error fetching page {page_number}: {e}")
        return set()

    if response.status_code == 404:
        return set()

    # End-of-results detection: a request for page N > 1 that ends up on a URL
    # whose `page` parameter is not N (page 1, or no page at all) was redirected
    # away. The parameter is parsed instead of substring-matched, because
    # "page=1" is also a prefix of "page=10", "page=11", ...
    if page_number != 1 and response.url != page_url:
        landed_on = parse_qs(urlparse(response.url).query).get('page', [None])[-1]
        if landed_on != str(page_number):
            print(f"  [!] Redirected to {response.url} - assuming end of pages.")
            return set()

    soup = BeautifulSoup(response.text, 'html.parser')
    event_links = set()

    # Method: JSON-LD (Structured Data) extraction
    for script in soup.find_all('script', type='application/ld+json'):
        try:
            data = json.loads(script.string)
        except (json.JSONDecodeError, TypeError):
            # Empty <script> (string is None) or malformed JSON: skip this block only
            continue

        if isinstance(data, list):
            items = data
        elif isinstance(data, dict):
            items = data.get('itemListElement')
        else:
            items = None
        if not isinstance(items, list):
            continue

        for item in items:
            link = _jsonld_item_url(item)
            # One odd entry must not discard the remaining entries of the block
            if isinstance(link, str) and link:
                event_links.add(link)

    # Fallback: If JSON fails, look for 'a' tags with specific patterns
    # Note: Eventbrite changes classes often, so checking href for '/e/' is safer
    if not event_links:
        for anchor in soup.find_all('a', href=True):
            href = anchor['href']
            if '/e/' in href and 'eventbrite.com' in href:
                event_links.add(href.split('?')[0])  # Remove tracking params

    return event_links

In [None]:
all_events = set()
max_safety_limit = 500  # Upper bound on pages, in case the stop conditions below never fire

print("Starting scraper...")

# Walk the listing pages one at a time until a stop condition is met.
page = 1
while True:
    if page > max_safety_limit:
        break

    print(f"Scraping Page {page}...", end=" ")
    new_links = get_event_links(page)

    # An empty page is treated as the end of the listing
    if not new_links:
        print("No events found. Reached end of results.")
        break

    # Merge into the master set and measure how much it grew
    initial_count = len(all_events)
    all_events |= new_links
    new_count = len(all_events)

    print(f"Found {len(new_links)} links. (Total unique: {new_count})")

    # A later page that contributes nothing new is probably a 'no results' page
    # that only repeats promoted links already collected.
    if page > 1 and new_count == initial_count:
        print("  [!] No new unique links found. Stopping to avoid duplicate loops.")
        break

    page += 1

    # Random pause between pages to be polite and avoid IP bans
    sleep_time = random.uniform(2, 5)
    time.sleep(sleep_time)

print("-" * 30)
print(f"Scraping complete. Found {len(all_events)} unique event links.")

In [17]:
import requests
from bs4 import BeautifulSoup
import json
import time
import random
import concurrent.futures
from datetime import datetime, timedelta

In [20]:
# --- Configuration ---
# First day to scrape; get_dates() parses it with the format "%Y-%m-%d".
START_DATE = "2025-12-12"  # YYYY-MM-DD
# Number of consecutive days generated by get_dates(), counting START_DATE itself.
DAYS_TO_SCRAPE = 30        # How many days from start date

# Concurrency Controls
MAX_DAY_WORKERS = 5        # How many days to process at once (Outer Loop)
MAX_PAGE_WORKERS = 10       # How many pages to scrape at once per day (Inner Loop)
# NOTE: Each day worker opens its own page-level thread pool, so the peak number of
# concurrent requests is about MAX_DAY_WORKERS * MAX_PAGE_WORKERS (approx 50 here)

# Listing URL; scrape_single_page() adds page/start_date/end_date as query parameters.
BASE_URL = "https://www.eventbrite.com/d/ny--new-york/all-events/"

# Browser User-Agent strings; scrape_single_page() picks one at random for each request.
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.64 Safari/537.36',
]

In [21]:
def get_dates(start_date_str, num_days):
    """
    Builds `num_days` consecutive calendar dates beginning at `start_date_str`.
    Input and output both use the YYYY-MM-DD format; the start date is included.
    """
    first_day = datetime.strptime(start_date_str, "%Y-%m-%d")
    dates = []
    for offset in range(num_days):
        current_day = first_day + timedelta(days=offset)
        dates.append(current_day.strftime("%Y-%m-%d"))
    return dates

In [22]:
def _collect_jsonld_links(data):
    """
    Returns the set of event URLs found in one decoded JSON-LD payload.

    Accepts either a top-level list of entries or a dict carrying an
    'itemListElement' list. Each entry may expose its URL directly ('url')
    or nested under 'item'. Entries that are not dicts, and URLs that are not
    non-empty strings, are skipped instead of aborting the whole payload.
    """
    if isinstance(data, list):
        items = data
    elif isinstance(data, dict):
        items = data.get('itemListElement')
    else:
        items = None
    if not isinstance(items, list):
        return set()

    found = set()
    for item in items:
        if not isinstance(item, dict):
            continue
        if 'url' in item:
            candidate = item['url']
        else:
            nested = item.get('item')
            candidate = nested.get('url') if isinstance(nested, dict) else None
        if isinstance(candidate, str) and candidate:
            found.add(candidate)
    return found


def scrape_single_page(date, page_number):
    """
    Inner Worker: Scrapes a specific page for a specific date.

    Args:
        date: Day to filter on (YYYY-MM-DD); sent as both start_date and end_date.
        page_number: 1-based listing page to request.

    Returns:
        A set of event URLs. Empty on 404, when a request for page > 1 lands
        on page 1, or when any error occurs (best-effort: this never raises).
    """
    # Function-scope import so this cell does not depend on another cell's imports.
    from urllib.parse import parse_qs, urlparse

    # Construct URL with date filters
    params = {
        'page': page_number,
        'start_date': date,
        'end_date': date
    }

    # Random sleep to prevent "thundering herd" on the server
    time.sleep(random.uniform(1, 3))

    headers = {
        'User-Agent': random.choice(USER_AGENTS),
        'Accept-Language': 'en-US,en;q=0.9',
    }

    try:
        response = requests.get(BASE_URL, params=params, headers=headers, timeout=10)

        # 404 or redirect usually means end of results
        if response.status_code == 404:
            return set()
        if page_number > 1:
            # Compare the parsed `page` value. A substring test for "page=1" also
            # matches page=10..19 and page=100+, which silently dropped those pages.
            landed_page = parse_qs(urlparse(response.url).query).get('page', [])
            if landed_page and landed_page[-1] == '1':
                return set()

        soup = BeautifulSoup(response.text, 'html.parser')
        links = set()

        # 1. JSON-LD Extraction
        for script in soup.find_all('script', type='application/ld+json'):
            try:
                data = json.loads(script.string)
            except (json.JSONDecodeError, TypeError):
                # Empty <script> (string is None) or malformed JSON: skip this block
                continue
            links.update(_collect_jsonld_links(data))

        # 2. Fallback Extraction
        if not links:
            for a in soup.find_all('a', href=True):
                if '/e/' in a['href'] and 'eventbrite.com' in a['href']:
                    links.add(a['href'].split('?')[0])

        return links

    except Exception:
        # Deliberate best-effort: a failed page counts as an empty page so one
        # bad request cannot abort the whole day.
        return set()

In [23]:
def manage_day_scrape(target_date, max_pages=50):
    """
    Outer Worker: Manages the scraping for a single day.
    Spawns inner workers to handle pages in batches.

    Args:
        target_date: Day to scrape (YYYY-MM-DD), forwarded to scrape_single_page.
        max_pages: Safety limit; no page number above this value is requested.

    Returns:
        The set of all event links collected for the day.
    """
    print(f"üìÖ Starting Day: {target_date}")
    day_links = set()
    current_page = 1
    batch_size = MAX_PAGE_WORKERS

    # We use a ThreadPool for the pages within this day
    with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_PAGE_WORKERS) as page_executor:

        # The limit is checked BEFORE a batch is submitted, so the last batch is
        # clipped at max_pages instead of running one full batch past it.
        while current_page <= max_pages:
            # We batch to avoid queuing every page for a day that has 0 events.
            last_page = min(current_page + batch_size - 1, max_pages)
            futures = {
                page_executor.submit(scrape_single_page, target_date, pg): pg
                for pg in range(current_page, last_page + 1)
            }

            batch_has_results = False

            for future in concurrent.futures.as_completed(futures):
                try:
                    links = future.result()
                except Exception as exc:
                    # Report the failure instead of dropping it silently
                    print(f"  [!] {target_date} page {futures[future]} failed: {exc}")
                    continue
                if links:
                    day_links.update(links)
                    batch_has_results = True

            # If the entire batch returned 0 links, we assume the day is done.
            if not batch_has_results:
                break

            current_page = last_page + 1
            if current_page <= max_pages:
                time.sleep(1)  # Breath between batches

    print(f"‚úÖ Finished {target_date}: {len(day_links)} total events.")
    return day_links

In [24]:
all_unique_links = set()
dates_to_scrape = get_dates(START_DATE, DAYS_TO_SCRAPE)

print(f"Starting Scrape for {len(dates_to_scrape)} days...")
print(f"Configuration: {MAX_DAY_WORKERS} Day-Workers x {MAX_PAGE_WORKERS} Page-Workers")

# Outer Pool: Manages different days
with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_DAY_WORKERS) as day_executor:

    # Submit all date tasks
    future_to_date = {day_executor.submit(manage_day_scrape, date): date for date in dates_to_scrape}

    # Merge each day's links as soon as that day finishes
    for future in concurrent.futures.as_completed(future_to_date):
        date = future_to_date[future]
        try:
            links = future.result()
            all_unique_links.update(links)
        except Exception as e:
            print(f"‚ùå Critical failure on {date}: {e}")

print("-" * 30)
print("SCRAPE COMPLETE.")
print(f"Total Unique Event Links: {len(all_unique_links)}")

# Save to file.
# - UTF-8 is explicit because the downstream cell reads this file with encoding='utf-8'.
# - Links are sorted so repeated runs produce a stable, diffable file; key=str keeps
#   the sort safe even if a non-string value ever ends up in the set.
with open('nyc_events_dated.txt', 'w', encoding='utf-8') as f:
    for link in sorted(all_unique_links, key=str):
        f.write(f"{link}\n")

Starting Scrape for 30 days...
Configuration: 5 Day-Workers x 10 Page-Workers
üìÖ Starting Day: 2025-12-12
üìÖ Starting Day: 2025-12-13
üìÖ Starting Day: 2025-12-14
üìÖ Starting Day: 2025-12-15
üìÖ Starting Day: 2025-12-16
‚úÖ Finished 2025-12-15: 304 total events.
üìÖ Starting Day: 2025-12-17
‚úÖ Finished 2025-12-16: 455 total events.
üìÖ Starting Day: 2025-12-18
‚úÖ Finished 2025-12-13: 700 total events.
üìÖ Starting Day: 2025-12-19
‚úÖ Finished 2025-12-14: 708 total events.
üìÖ Starting Day: 2025-12-20
‚úÖ Finished 2025-12-12: 700 total events.
üìÖ Starting Day: 2025-12-21
‚úÖ Finished 2025-12-17: 525 total events.
üìÖ Starting Day: 2025-12-22
‚úÖ Finished 2025-12-18: 570 total events.
üìÖ Starting Day: 2025-12-23
‚úÖ Finished 2025-12-21: 693 total events.
üìÖ Starting Day: 2025-12-24
‚úÖ Finished 2025-12-19: 705 total events.
üìÖ Starting Day: 2025-12-25
‚úÖ Finished 2025-12-20: 706 total events.
üìÖ Starting Day: 2025-12-26
‚úÖ Finished 2025-12-22: 188 total events.

In [None]:
from curl_cffi import requests as crequests 
import json
import re
import time
from datetime import datetime, timezone
from concurrent.futures import ThreadPoolExecutor, as_completed

# --- Configuration ---
# Event-detail endpoint. The `{}` placeholder is filled with the event ID via
# str.format() in get_event_data_via_api(). The `expand` list includes primary_venue,
# taxonomy, tags and upcoming_occurrences, which that function reads from the response.
API_ENDPOINT = "https://www.eventbrite.com/api/v3/destination/events/?event_ids={}&expand=event_sales_status,image,primary_venue,ticket_availability,taxonomy,tags,upcoming_occurrences"
# Size of the thread pool the main block uses to call get_event_data_via_api().
# Recommended: Keep this low (3-5) to avoid IP bans.
MAX_WORKERS = 5

def _parse_jsonld_start(start_str):
    """
    Converts a JSON-LD 'startDate' string into (epoch_ms, 'YYYY-MM-DD HH:MM:SS UTC').
    Returns (None, None) when the value is missing or cannot be parsed.
    Values without a UTC offset are interpreted as UTC.
    """
    if not isinstance(start_str, str) or not start_str:
        return None, None
    try:
        # datetime.fromisoformat() only accepts a trailing 'Z' from Python 3.11 on,
        # so normalise it to an explicit offset first.
        iso_text = start_str[:-1] + '+00:00' if start_str.endswith('Z') else start_str
        dt = datetime.fromisoformat(iso_text)
        if dt.tzinfo is None:
            dt = dt.replace(tzinfo=timezone.utc)
        stamp_ms = int(dt.timestamp() * 1000)
        label = dt.astimezone(timezone.utc).strftime('%Y-%m-%d %H:%M:%S UTC')
        return stamp_ms, label
    except (ValueError, OverflowError):
        return None, None


def get_fallback_data_from_html(url):
    """
    Fallback: Scrapes the schema.org JSON-LD from the HTML page
    to find Date AND Location if the API fails.

    Returns:
        (timestamp_ms, datetime_str, location). Each element is None when it
        could not be determined; (None, None, None) on any request failure
        or non-200 response. This function never raises.
    """
    try:
        response = crequests.get(url, impersonate="chrome110", timeout=15)
        if response.status_code != 200:
            return None, None, None

        matches = re.findall(r'<script type="application/ld\+json">(.*?)</script>', response.text, re.DOTALL)

        found_date = None
        found_date_str = None
        found_location = None

        for match in matches:
            try:
                data = json.loads(match)
            except ValueError:
                # Malformed JSON-LD block: ignore it and try the next one
                continue

            # A top-level list is represented by its first element
            if isinstance(data, list):
                data = data[0] if data else None
            if not isinstance(data, dict):
                continue
            if data.get('@type') not in ['Event', 'SocialEvent', 'MusicEvent']:
                continue

            # 1. Fallback Date. An unparseable date no longer prevents the
            #    location of the same block from being read.
            parsed_ms, parsed_str = _parse_jsonld_start(data.get('startDate'))
            if parsed_ms is not None:
                found_date, found_date_str = parsed_ms, parsed_str

            # 2. Fallback Location
            loc_data = data.get('location')
            if isinstance(loc_data, dict):
                parts = [loc_data.get('name', '')]
                address = loc_data.get('address', {})
                if isinstance(address, dict):
                    parts.extend([
                        address.get('streetAddress'),
                        address.get('addressLocality'),
                        address.get('addressRegion'),
                        address.get('postalCode')
                    ])
                elif isinstance(address, str):
                    parts.append(address)
                # Keep only non-empty strings so an unexpected value cannot break the join
                found_location = ", ".join([p for p in parts if isinstance(p, str) and p])

        return found_date, found_date_str, found_location
    except Exception:
        # Best-effort fallback: any failure simply means "no extra data"
        return None, None, None

def _parse_start_obj(s_obj):
    """
    Converts an API 'start' object into (epoch_ms, 'YYYY-MM-DD HH:MM:SS UTC').
    Reads the 'utc' field, falling back to 'local'; a value without an offset
    is interpreted as UTC. Returns (None, None) if missing or unparseable.
    """
    if not isinstance(s_obj, dict):
        return None, None
    d_str = s_obj.get('utc') or s_obj.get('local')
    if not isinstance(d_str, str) or not d_str:
        return None, None
    try:
        # Normalise a trailing 'Z', which fromisoformat() rejects before Python 3.11
        if d_str.endswith('Z'):
            d_str = d_str[:-1] + '+00:00'
        dt = datetime.fromisoformat(d_str)
        if dt.tzinfo is None:
            dt = dt.replace(tzinfo=timezone.utc)
        return int(dt.timestamp() * 1000), dt.astimezone(timezone.utc).strftime('%Y-%m-%d %H:%M:%S UTC')
    except (ValueError, OverflowError):
        return None, None


def _api_location(event):
    """
    Builds a display location from the event's 'primary_venue'.
    Prefers the address' 'localized_address_display'; otherwise joins the venue
    name with address_1/city/region/postal_code. Returns None if nothing usable.
    """
    venue = event.get('primary_venue')
    if not isinstance(venue, dict):
        return None

    location = None
    address_obj = venue.get('address')
    if isinstance(address_obj, dict):
        location = address_obj.get('localized_address_display')

    if not location:
        parts = [venue.get('name', '')]
        if isinstance(address_obj, dict):
            parts.extend([
                address_obj.get('address_1'),
                address_obj.get('city'),
                address_obj.get('region'),
                address_obj.get('postal_code')
            ])
        # Non-string values are ignored so they cannot raise on .strip()
        clean_parts = [p for p in parts if isinstance(p, str) and p.strip()]
        if clean_parts:
            location = ", ".join(clean_parts)
    return location


def _api_categories(event):
    """
    Collects category, sub-category and tag names from the event and returns
    the sorted, de-duplicated names whose first character is upper-case.
    """
    candidates = []
    tax = event.get('taxonomy')
    if isinstance(tax, dict):
        for key in ('category', 'sub_category'):
            node = tax.get(key)
            if isinstance(node, dict):
                candidates.append(node.get('name', ''))
    tags = event.get('tags')
    if isinstance(tags, list):
        for tag in tags:
            if isinstance(tag, dict):
                candidates.append(tag.get('display_name', ''))
            elif isinstance(tag, str):
                candidates.append(tag)
    # Only non-empty strings are considered, so odd values cannot raise here
    return sorted({c for c in candidates if isinstance(c, str) and c and c[0].isupper()})


def get_event_data_via_api(url):
    """
    Fetches one event's details from the Eventbrite destination API.

    The event ID is the first run of 10 or more digits found in `url`. When the
    API response lacks a date or a location, the event's HTML page is consulted
    through get_fallback_data_from_html().

    Returns:
        A dict with keys Title, DateTime (epoch ms or None), DateTimeStr,
        Location, Categories, EventID and Link; or None when the URL holds no
        event ID, the API call does not return 200, or any error occurs.
    """
    # 1. Extract Event ID
    event_id_match = re.search(r'(\d{10,})', url)
    if not event_id_match:
        return None

    event_id = event_id_match.group(1)
    api_url = API_ENDPOINT.format(event_id)

    # 2. Call API
    try:
        response = crequests.get(
            api_url,
            impersonate="chrome110",
            headers={"Accept": "application/json"},
            timeout=15
        )
        if response.status_code != 200:
            return None

        data = response.json()
        events = data.get('events') if isinstance(data, dict) else None
        if not events:
            return None
        event = events[0]
        if not isinstance(event, dict):
            return None

        # --- EXTRACTION ---

        # Title
        title = "No Title"
        name_obj = event.get('name')
        if isinstance(name_obj, dict):
            title = name_obj.get('text', "No Title")
        elif isinstance(name_obj, str):
            title = name_obj

        # Date: the event's own start, else the first upcoming occurrence
        timestamp, datetime_str = _parse_start_obj(event.get('start'))
        if not timestamp:
            occurrences = event.get('upcoming_occurrences')
            if occurrences and isinstance(occurrences, list):
                first_occurrence = occurrences[0]
                # A malformed occurrence must not discard the whole event
                if isinstance(first_occurrence, dict):
                    timestamp, datetime_str = _parse_start_obj(first_occurrence.get('start'))

        # Location
        location = _api_location(event)
        if not location and event.get('online_event'):
            location = "Online"

        # Trigger Fallback if needed
        if not timestamp or not location:
            fb_ts, fb_str, fb_loc = get_fallback_data_from_html(url)
            if not timestamp:
                timestamp, datetime_str = fb_ts, fb_str
            if not location:
                location = fb_loc

        return {
            "Title": title,
            "DateTime": timestamp,
            "DateTimeStr": datetime_str or "TBD",
            "Location": location or "Location TBD",
            "Categories": _api_categories(event),
            "EventID": event_id,
            "Link": url
        }
    except Exception:
        # Best-effort: the caller reports a None result as a failed URL
        return None

# --- Main Execution (Parallelized) ---
if __name__ == "__main__":
    input_file = "nyc_events_dated.txt"
    output_file = "eventbrite_events_corrected.json"
    limit = None  # Set to an int to process only the first N links; None processes all

    print(f"üöÄ Reading links from {input_file}...")

    # Collect every non-empty line that looks like a URL
    links = []
    try:
        with open(input_file, 'r', encoding='utf-8') as f:
            for line in f:
                clean_line = line.strip()
                if clean_line.startswith("http"):
                    links.append(clean_line)
    except FileNotFoundError:
        print(f"‚ùå Error: File '{input_file}' not found.")
        # exit() is an interactive helper and shuts the kernel down when called
        # inside a notebook; SystemExit stops this cell and keeps the session.
        raise SystemExit(1) from None

    target_links = links[:limit]  # slicing with None keeps the full list
    print(f"üìã Processing {len(target_links)} events with {MAX_WORKERS} concurrent workers...\n")

    results = []

    # --- PARALLEL EXECUTION START ---
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        # Submit all tasks to the pool
        # This creates a dictionary: {Future Object: URL}
        future_to_url = {executor.submit(get_event_data_via_api, url): url for url in target_links}

        # as_completed yields futures as they finish (not necessarily in order of submission)
        for future in as_completed(future_to_url):
            url = future_to_url[future]
            try:
                data = future.result()
                if data:
                    results.append(data)
                    print(f"‚úÖ Finished: {data['Title'][:40]}...")
                else:
                    print(f"‚ö†Ô∏è Failed: {url}")
            except Exception as exc:
                print(f"‚ùå Exception for {url}: {exc}")
    # --- PARALLEL EXECUTION END ---

    print("\n--- JSON OUTPUT ---")
    print(json.dumps(results, indent=4))

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=4, ensure_ascii=False)

    print(f"\n‚úÖ Done! Saved {len(results)} events to {output_file}")

üöÄ Reading links from nyc_events_dated.txt...
üìã Processing 20 events with 5 concurrent workers...

‚úÖ Finished: SUNDAY RESET BREATH WORK SERIES  ‚ú¶Ô∏é Soma...
‚úÖ Finished: Cold Beer & A Good Laugh at The Grisly P...
‚úÖ Finished: Tech and Business Networking | Elevating...
‚úÖ Finished: Earth & Art: Hand-Built Pottery with Jun...
‚úÖ Finished: HIGHLIFE SATURDAYS ROOFTOP PARTY - NYC 3...
‚úÖ Finished: The Everri Holiday Sample Sale ‚Äî Jewelry...
‚úÖ Finished: New Year Eve Hottest Bollywood  Desi Par...
‚úÖ Finished: Long Pose Figure Drawing & Painting Work...
‚úÖ Finished: New Year's Eve at Bar Sprezzatura...
‚úÖ Finished: Jersey City Conspiracy Game: The Outdoor...
‚úÖ Finished: Trauma-Informed Vinyasa Yoga...
‚úÖ Finished: Christians Over Coffee: Manhattan Meetup...
‚úÖ Finished: LIVE Music EVERY Monday Night at the Bro...
‚úÖ Finished: Boozy Sundae Decorating Class...
‚úÖ Finished: Traditional Turkish Mosaic Lamp Workshop...
‚úÖ Finished: Community Heart Space and Breathwork