In [3]:
import pandas as pd

# Read the file (replace with your actual file path)
df = pd.read_csv("highape_events_enriched_20250607_171657.csv")

# Fix "artists" column safely
for idx, row in df.iterrows():
    artists = row.get("artists", "")
    if pd.notna(artists):
        artist_list = [a.strip().title() for a in str(artists).split(",") if a.strip()]
        df.at[idx, "artists"] = ", ".join(artist_list)
    else:
        df.at[idx, "artists"] = ""

# Save to a new cleaned file
df.to_csv("highape_events_enriched_20250607_171657_cleaned.csv", index=False)
print("✅ Cleaned CSV saved as: highape_events_enriched_20250607_171657_cleaned.csv")


✅ Cleaned CSV saved as: highape_events_enriched_20250607_171657_cleaned.csv


In [7]:
# Post-processing script to clean and enhance HighApe event data
import pandas as pd
import json
import re
from datetime import datetime
from pathlib import Path

# === CONFIG ===
CSV_INPUT_PATH = "highape_events_enriched_20250607_171657_cleaned.csv"
OUTPUT_CSV_PATH = "highape_events_cleaned.csv"
HTML_JSON_FOLDER = Path("./")  # JSON files in current folder

# === HELPERS ===
def parse_datetime_and_duration(date_str):
    try:
        date_time_match = re.search(r'(\d{1,2} \w+)(?: - (\d{1,2} \w+))?(?: \| (\d{1,2}:\d{2} [APMapm]{2}))?', date_str)
        if not date_time_match:
            return date_str, ''

        start_date = date_time_match.group(1)
        end_date = date_time_match.group(2) or start_date
        time_str = date_time_match.group(3) or "6:00 PM"

        now_year = datetime.now().year
        start = datetime.strptime(f"{start_date} {now_year} {time_str}", "%d %B %Y %I:%M %p")
        end = datetime.strptime(f"{end_date} {now_year} {time_str}", "%d %B %Y %I:%M %p")
        duration_ms = int((end - start).total_seconds() * 1000)
        iso_str = start.isoformat() + "Z"
        return iso_str, duration_ms
    except:
        return date_str, ''

def normalize_text(text):
    return re.sub(r'[\n\r]+', ' ', str(text)).strip()

# === MAIN PROCESS ===
df = pd.read_csv(CSV_INPUT_PATH)

for idx, row in df.iterrows():
    # --- Fix Date & Duration ---
    fixed_date, duration = parse_datetime_and_duration(row.get("eventDateAndTime", ""))
    df.at[idx, "eventDateAndTime"] = fixed_date
    df.at[idx, "eventDuration"] = duration

    # --- Fix Artists ---
    artists = row.get("artists", "")
    df.at[idx, "artists"] = ', '.join(filter(None, [a.strip().title() for a in artists.split(",")]))

    # --- Validate lat/lon ---
    if row.get("lat", "").strip() in ["", "0"]:
        df.at[idx, "lat"] = ""
    if row.get("lon", "").strip() in ["", "0"]:
        df.at[idx, "lon"] = ""

    # --- Normalize Description ---
    df.at[idx, "eventDescription"] = normalize_text(row.get("eventDescription", ""))

    # --- Load JSON raw HTML for missing data recovery ---
    json_path = row.get("jsonFilePath")
    if json_path and Path(json_path).exists():
        with open(json_path, 'r') as f:
            html_data = json.load(f)
            # TODO: Add smart field recovery logic if needed

# === OUTPUT ===
df.to_csv(OUTPUT_CSV_PATH, index=False)
print(f"✅ Cleaned CSV saved to: {OUTPUT_CSV_PATH}")


  df.at[idx, "eventDuration"] = duration


AttributeError: 'float' object has no attribute 'split'

In [9]:
import pandas as pd
import json
import re
import numpy as np
from datetime import datetime
from pathlib import Path

# === CONFIG ===
CSV_INPUT_PATH = "highape_events_enriched_20250607_171657_cleaned.csv"
OUTPUT_CSV_PATH = "highape_events_cleaned.csv"
HTML_JSON_FOLDER = Path("./")  # All JSON files expected in current folder

# === HELPERS ===
def parse_datetime_and_duration(date_str):
    try:
        date_time_match = re.search(r'(\d{1,2} \w+)(?: - (\d{1,2} \w+))?(?: \| (\d{1,2}:\d{2} [APMapm]{2}))?', str(date_str))
        if not date_time_match:
            return date_str, np.nan

        start_date = date_time_match.group(1)
        end_date = date_time_match.group(2) or start_date
        time_str = date_time_match.group(3) or "6:00 PM"

        now_year = datetime.now().year
        start = datetime.strptime(f"{start_date} {now_year} {time_str}", "%d %B %Y %I:%M %p")
        end = datetime.strptime(f"{end_date} {now_year} {time_str}", "%d %B %Y %I:%M %p")
        duration_ms = int((end - start).total_seconds() * 1000)
        iso_str = start.isoformat() + "Z"
        return iso_str, duration_ms
    except:
        return date_str, np.nan

def normalize_text(text):
    return re.sub(r'[\n\r]+', ' ', str(text)).strip()

# === MAIN PROCESS ===
df = pd.read_csv(CSV_INPUT_PATH)

for idx, row in df.iterrows():
    # --- Fix Date & Duration ---
    fixed_date, duration = parse_datetime_and_duration(row.get("eventDateAndTime", ""))
    df.at[idx, "eventDateAndTime"] = fixed_date
    df.at[idx, "eventDuration"] = duration

    # --- Fix Artists ---
    artists = row.get("artists", "")
    if pd.notna(artists):
        df.at[idx, "artists"] = ', '.join(filter(None, [a.strip().title() for a in str(artists).split(",")]))
    else:
        df.at[idx, "artists"] = ""

    # --- Validate lat/lon ---
    if str(row.get("lat", "")).strip() in ["", "0"]:
        df.at[idx, "lat"] = ""
    if str(row.get("lon", "")).strip() in ["", "0"]:
        df.at[idx, "lon"] = ""

    # --- Normalize Description ---
    df.at[idx, "eventDescription"] = normalize_text(row.get("eventDescription", ""))

    # --- Load raw HTML JSON if needed (placeholder) ---
    json_path = row.get("jsonFilePath")
    if json_path and Path(json_path).exists():
        with open(json_path, 'r') as f:
            html_data = json.load(f)
            # TODO: Use HTML if specific fields are missing
            pass

# === OUTPUT ===
df.to_csv(OUTPUT_CSV_PATH, index=False)
print(f"✅ Cleaned CSV saved to: {OUTPUT_CSV_PATH}")


✅ Cleaned CSV saved to: highape_events_cleaned.csv


In [None]:
import pandas as pd
import json
import re
from pathlib import Path
from bs4 import BeautifulSoup

INPUT_CSV = "highape_events_cleaned.csv"
OUTPUT_CSV = "highape_events_final_enriched.csv"

# Helper: Normalize text
def normalize_text(text):
    return re.sub(r'[\n\r]+', ' ', str(text)).strip()

# Helper: Build Plur JSON
def build_plur_json(event):
    def to_int(val): return int(val) if str(val).isdigit() else 0
    def to_float(val): return float(val) if re.match(r'^-?\d+(?:\.\d+)?$', str(val)) else None

    return {
        "eventName": event.get("eventName", ""),
        "eventDescription": event.get("eventDescription", ""),
        "eventDateAndTime": event.get("eventDateAndTime", ""),
        "eventDuration": to_int(event.get("eventDuration", "")),
        "venue": {
            "venueName": event.get("venueName", ""),
            "locality": event.get("locality", ""),
            "address": event.get("address", ""),
            "city": event.get("city", ""),
            "state": event.get("state", ""),
            "zipcode": event.get("zipcode", ""),
            "geolocation": {
                "lat": to_float(event.get("lat", "")),
                "lon": to_float(event.get("lon", ""))
            },
            "layout": event.get("layout", "")
        },
        "highlightImageLinks": [event.get("highlightImages")] if event.get("highlightImages") else [],
        "galleryImageLinks": [event.get("highlightImages")] if event.get("highlightImages") else [],
        "ticketAmount": to_int(event.get("ticketAmount", "")),
        "ticketLink": event.get("ticketLink", ""),
        "supportedLanguages": ["English"],
        "category": event.get("category", ""),
        "subCategory": event.get("subCategory", ""),
        "eventType": event.get("eventType", ""),
        "eventFeatures": {
            "foodAvailable": event.get("foodAvailable", "") == "Yes",
            "smokingAllowed": event.get("smokingAllowed", "") == "Yes",
            "wheelchairAccess": event.get("wheelchairAccess", "") == "Yes",
            "parkingAvailable": event.get("parkingAvailable", "") == "Yes",
            "supportAvailable": event.get("supportAvailable", "") == "Yes",
            "petFriendly": event.get("petFriendly", False),
            "alcoholServed": event.get("alcoholServed", "") == "Yes",
            "minimumAge": to_int(event.get("minimumAge", "")),
            "ticketsAtVenue": event.get("ticketsAtVenue", "") == "Yes",
            "washroomAvailable": event.get("washroomAvailable", "") == "Yes",
            "danceFloorAvailable": event.get("danceFloorAvailable", "") == "Yes",
            "poolAvailable": event.get("poolAvailable", "") == "Yes"
        },
        "artists": [a.strip() for a in str(event.get("artists", "")).split(",") if a.strip()],
        "sharableEventOgImageLink": event.get("highlightImages", ""),
        "attendeesCount": to_int(event.get("attendeesCount", "")),
        "likesCount": 0,
        "joinChatDetails": {
            "joinChatLink": "",
            "provider": "",
            "isEnabled": False
        },
        "policyAndConditions": str(event.get("policyAndConditions", "")).split("\n") if event.get("policyAndConditions") else [],
        "frequentlyAskedQuestions": []
    }

# Load CSV
df = pd.read_csv(INPUT_CSV)

# Iterate over rows and enrich missing data
for idx, row in df.iterrows():
    json_path = row.get("jsonFilePath")
    if json_path and Path(json_path).exists():
        with open(json_path, "r", encoding="utf-8") as f:
            raw_data = json.load(f)
            html = raw_data.get("rawHtml", "")
            soup = BeautifulSoup(html, "html.parser")

            # Fill missing description
            if not row.get("eventDescription"):
                desc_node = soup.select_one("#desc .event-content-div")
                if desc_node:
                    df.at[idx, "eventDescription"] = normalize_text(desc_node.get_text())

            # Fill highlight image
            if not row.get("highlightImages"):
                img_node = soup.select_one('#image_carousel_web img.img-background-events')
                if img_node and img_node.get("src"):
                    df.at[idx, "highlightImages"] = img_node["src"]

    # Normalize and rebuild Plur_json_format
    event_dict = df.loc[idx].to_dict()
    plur_json = build_plur_json(event_dict)
    df.at[idx, "Plur_json_format"] = json.dumps(plur_json, ensure_ascii=False, indent=2)

# Save updated file
df.to_csv(OUTPUT_CSV, index=False)
print(f"✅ Final enriched CSV saved as: {OUTPUT_CSV}")


In [13]:
import pandas as pd
import json
import re
from bs4 import BeautifulSoup
from datetime import datetime
from pathlib import Path

# === CONFIG ===
CSV_INPUT_PATH = "highape_events_cleaned.csv"
CSV_OUTPUT_PATH = "highape_events_cleaned_fixed.csv"

# === FUNCTIONS ===

def build_plur_json(event):
    def to_int(val): return int(val) if str(val).isdigit() else 0
    def to_float(val): return float(val) if re.match(r'^-?\d+(?:\.\d+)?$', str(val)) else None

    return {
        "eventName": event["eventName"],
        "eventDescription": event["eventDescription"],
        "eventDateAndTime": event["eventDateAndTime"],
        "eventDuration": to_int(event["eventDuration"]),
        "venue": {
            "venueName": event["venueName"],
            "locality": event["locality"],
            "address": event["address"],
            "city": event["city"],
            "state": event["state"],
            "zipcode": event["zipcode"],
            "geolocation": {
                "lat": to_float(event["lat"]),
                "lon": to_float(event["lon"]),
            },
            "layout": event.get("layout", "")
        },
        "highlightImageLinks": [event["highlightImages"]] if event["highlightImages"] else [],
        "galleryImageLinks": [event["highlightImages"]] if event["highlightImages"] else [],
        "ticketAmount": to_int(event["ticketAmount"]),
        "ticketLink": event["ticketLink"],
        "supportedLanguages": ["en"] if not event["supportedLanguages"] else [event["supportedLanguages"]],
        "category": event["category"],
        "subCategory": event["subCategory"],
        "eventType": event["eventType"],
        "eventFeatures": {
            "foodAvailable": event["foodAvailable"] == "Yes",
            "smokingAllowed": event["smokingAllowed"] == "Yes",
            "wheelchairAccess": event["wheelchairAccess"] == "Yes",
            "parkingAvailable": event["parkingAvailable"] == "Yes",
            "supportAvailable": event["supportAvailable"] == "Yes",
            "petFriendly": bool(event["petFriendly"]),
            "alcoholServed": event["alcoholServed"] == "Yes",
            "minimumAge": to_int(event["minimumAge"]),
            "ticketsAtVenue": event["ticketsAtVenue"] == "Yes",
            "washroomAvailable": event["washroomAvailable"] == "Yes",
            "danceFloorAvailable": event["danceFloorAvailable"] == "Yes",
            "poolAvailable": event["poolAvailable"] == "Yes"
        },
        "artists": str(event["artists"]).split(", ") if event["artists"] else [],
        "sharableEventOgImageLink": event["highlightImages"],
        "attendeesCount": to_int(event["attendeesCount"]),
        "likesCount": 0,
        "joinChatDetails": {
            "joinChatLink": "",
            "provider": "",
            "isEnabled": False
        },
        "policyAndConditions": str(event["policyAndConditions"]).split("\n") if event["policyAndConditions"] else [],
        "frequentlyAskedQuestions": []
    }

def parse_event_duration(date_str):
    try:
        pattern = r'(\d{1,2} \w+)(?: - (\d{1,2} \w+))?(?: \| (\d{1,2}:\d{2} [APMapm]{2}))?'
        match = re.search(pattern, date_str)
        if not match:
            return date_str, ""

        start_date = match.group(1)
        end_date = match.group(2) or start_date
        time_str = match.group(3) or "6:00 PM"
        now_year = datetime.now().year

        start = datetime.strptime(f"{start_date} {now_year} {time_str}", "%d %B %Y %I:%M %p")
        end = datetime.strptime(f"{end_date} {now_year} {time_str}", "%d %B %Y %I:%M %p")
        duration_ms = int((end - start).total_seconds() * 1000)
        iso_str = start.isoformat() + "Z"
        return iso_str, str(duration_ms)
    except:
        return date_str, ""

# === MAIN PROCESS ===
df = pd.read_csv(CSV_INPUT_PATH)

for idx, row in df.iterrows():
    # === Clean venueName ===
    venue = str(row.get("venueName", ""))
    for field in ["locality", "city", "state"]:
        val = str(row.get(field, ""))
        if val and val in venue:
            venue = venue.replace(val, "").strip(", ").strip()
    df.at[idx, "venueName"] = venue

    # === Fix missing eventDateAndTime + duration ===
    date_raw = str(row.get("eventDateAndTime", ""))
    iso, duration = parse_event_duration(date_raw)
    df.at[idx, "eventDateAndTime"] = iso
    df.at[idx, "eventDuration"] = str(duration)

    # === Fix ticketAmount ===
    ticket_amt = row.get("ticketAmount", "")
    if isinstance(ticket_amt, str) and "free" in ticket_amt.lower():
        df.at[idx, "ticketAmount"] = "0"

    # === Fix supportedLanguages ===
    lang = str(row.get("supportedLanguages", "")).strip()
    df.at[idx, "supportedLanguages"] = lang if lang else "en"

    # === Fix artists formatting ===
    artists = row.get("artists", "")
    if isinstance(artists, str):
        df.at[idx, "artists"] = ', '.join([a.strip().title() for a in artists.split(",") if a.strip()])
    else:
        df.at[idx, "artists"] = ""

    # === Load HTML from jsonFilePath and extract if needed ===
    json_path = row.get("jsonFilePath")
    if json_path and Path(json_path).exists():
        try:
            with open(json_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
            soup = BeautifulSoup(data.get("rawHtml", ""), "html.parser")

            # Example enhancement: try recovering missing description
            if not row.get("eventDescription"):
                desc_node = soup.select_one('#desc .event-content-div')
                if desc_node:
                    df.at[idx, "eventDescription"] = desc_node.get_text(strip=True)

            # Example: parse highlight image
            if not row.get("highlightImages"):
                img_tag = soup.select_one('#image_carousel_web img.img-background-events')
                if img_tag:
                    df.at[idx, "highlightImages"] = img_tag.get("src", "")
        except Exception as e:
            print(f"⚠️ Could not process {json_path}: {e}")

    # === Rebuild Plur_json_format ===
    event_dict = df.loc[idx].to_dict()
    plur_json = build_plur_json(event_dict)
    df.at[idx, "Plur_json_format"] = json.dumps(plur_json, ensure_ascii=False, indent=2)

# === SAVE OUTPUT ===
df.to_csv(CSV_OUTPUT_PATH, index=False)
print(f"✅ Cleaned and enriched CSV saved to: {CSV_OUTPUT_PATH}")


  df.at[idx, "eventDuration"] = str(duration)
  df.at[idx, "supportedLanguages"] = lang if lang else "en"


✅ Cleaned and enriched CSV saved to: highape_events_cleaned_fixed.csv


In [17]:
import json
from bs4 import BeautifulSoup
from pathlib import Path

file_path = "/Users/shiv/Documents/Jupyter/HighApe/event_json_files/run_20250607_161335/event_1bdd4ba4-1d6b-4c21-beb8-19482f712e86.json"

# Load and parse
data = json.loads(Path(file_path).read_text(encoding="utf-8"))

# Prettify only the rawHtml field
if "rawHtml" in data:
    data["rawHtml"] = BeautifulSoup(data["rawHtml"], "html.parser").prettify()

# Overwrite the same file (no renaming)
Path(file_path).write_text(json.dumps(data, indent=2, ensure_ascii=False))

print("✅ 'rawHtml' field is now readable in Sublime.")


✅ 'rawHtml' field is now readable in Sublime.


In [21]:
import json
from pathlib import Path
from bs4 import BeautifulSoup

# Input file path
file_path = "/Users/shiv/Documents/Jupyter/HighApe/event_json_files/run_20250607_161335/event_1bdd4ba4-1d6b-4c21-beb8-19482f712e86.json"

# Load JSON
data = json.loads(Path(file_path).read_text(encoding="utf-8"))

# Format rawHtml if it exists
if "rawHtml" in data:
    # Prettify the HTML content
    soup = BeautifulSoup(data["rawHtml"], "html.parser")
    pretty_html = soup.prettify()
    data["rawHtml"] = pretty_html

    # Save back to the same file (optional: create a backup)
    Path(file_path).write_text(json.dumps(data, indent=2, ensure_ascii=False))
    print("✅ rawHtml has been formatted and saved inside the JSON.")
else:
    print("❌ 'rawHtml' field not found.")



✅ rawHtml has been formatted and saved inside the JSON.


In [23]:
from bs4 import BeautifulSoup
from pathlib import Path

# Load raw/minified HTML
input_path = Path("/Users/shiv/Documents/Jupyter/HighApe/event_json_files/run_20250607_161335/event_1bdd4ba4-1d6b-4c21-beb8-19482f712e86.html")
html_content = input_path.read_text(encoding="utf-8")

# Parse and prettify
soup = BeautifulSoup(html_content, "html.parser")
pretty_html = soup.prettify()

# Save as a new file (or overwrite if you want)
output_path = Path("event_1bdd4ba4-1d6b-4c21-beb8-19482f712e86_pretty.html")
output_path.write_text(pretty_html, encoding="utf-8")

print(f"✅ Prettified HTML saved to: {output_path}")


✅ Prettified HTML saved to: event_1bdd4ba4-1d6b-4c21-beb8-19482f712e86_pretty.html


In [25]:
# Script: Enhance HighApe Event CSV using JSON-LD in raw HTML

import pandas as pd
import json
import re
from datetime import datetime
from pathlib import Path
from bs4 import BeautifulSoup

CSV_INPUT = "highape_events_cleaned.csv"
CSV_OUTPUT = "highape_events_cleaned_with_jsonld.csv"

# Load CSV
df = pd.read_csv(CSV_INPUT)

# --- Helpers ---
def parse_iso_and_duration(start, end):
    try:
        start_dt = datetime.strptime(start, "%Y-%m-%d %H:%M:%S")
        end_dt = datetime.strptime(end, "%Y-%m-%d %H:%M:%S")
        return start_dt.isoformat() + "Z", int((end_dt - start_dt).total_seconds() * 1000)
    except:
        return start, ''

def extract_jsonld_from_html(html):
    try:
        soup = BeautifulSoup(html, 'html.parser')
        script_tags = soup.find_all('script', type='application/ld+json')
        for tag in script_tags:
            try:
                data = json.loads(tag.text.strip())
                if isinstance(data, list):
                    for item in data:
                        if item.get('@type') == 'Event':
                            return item
                elif data.get('@type') == 'Event':
                    return data
            except:
                continue
    except Exception as e:
        print(f"Error parsing HTML: {e}")
    return None

# --- Main enhancement loop ---
for idx, row in df.iterrows():
    json_path = row.get("jsonFilePath")
    if not json_path or not Path(json_path).exists():
        continue

    with open(json_path, 'r') as f:
        html_json = json.load(f)
        html = html_json.get("rawHtml")
        ld = extract_jsonld_from_html(html)

    if ld:
        if not row.get("eventName"):
            df.at[idx, "eventName"] = ld.get("name", "")

        if not row.get("highlightImages"):
            df.at[idx, "highlightImages"] = ld.get("image", "")

        loc = ld.get("location", {})
        if isinstance(loc, dict):
            df.at[idx, "venueName"] = loc.get("name", "")
            df.at[idx, "address"] = loc.get("address", "")

            addr = loc.get("address", "")
            zip_match = re.search(r'\b(\d{6})\b', addr)
            if zip_match:
                df.at[idx, "zipcode"] = zip_match.group(1)

            parts = re.findall(r'[A-Za-z]+', addr)
            if len(parts) >= 3:
                df.at[idx, "locality"] = parts[-3]
                df.at[idx, "city"] = parts[-2]
                df.at[idx, "state"] = parts[-1]

        start, end = ld.get("startDate", ""), ld.get("endDate", "")
        iso, duration = parse_iso_and_duration(start, end)
        df.at[idx, "eventDateAndTime"] = iso
        df.at[idx, "eventDuration"] = duration

        offers = ld.get("offers", [])
        if isinstance(offers, list) and offers:
            try:
                prices = [float(o.get("price", 0)) for o in offers if o.get("price")]
                if prices:
                    df.at[idx, "ticketAmount"] = int(min(prices))
                df.at[idx, "ticketLink"] = offers[0].get("url", "")
            except:
                pass

        performers = ld.get("performer", [])
        if isinstance(performers, list):
            names = [p.get("name") for p in performers if p.get("name")]
            df.at[idx, "artists"] = ", ".join(names)

        if not row.get("supportedLanguages"):
            df.at[idx, "supportedLanguages"] = "English"

        # Rebuild Plur_json_format
        from ast import literal_eval
        def to_int(val): return int(val) if str(val).isdigit() else 0
        def to_float(val): return float(val) if re.match(r'^-?\d+(\.\d+)?$', str(val)) else None

        plur_json = {
            "eventName": df.at[idx, "eventName"],
            "eventDescription": df.at[idx, "eventDescription"],
            "eventDateAndTime": df.at[idx, "eventDateAndTime"],
            "eventDuration": to_int(df.at[idx, "eventDuration"]),
            "venue": {
                "venueName": df.at[idx, "venueName"],
                "locality": df.at[idx, "locality"],
                "address": df.at[idx, "address"],
                "city": df.at[idx, "city"],
                "state": df.at[idx, "state"],
                "zipcode": df.at[idx, "zipcode"],
                "geolocation": {
                    "lat": to_float(df.at[idx, "lat"]),
                    "lon": to_float(df.at[idx, "lon"])
                },
                "layout": df.at[idx, "layout"]
            },
            "highlightImageLinks": [df.at[idx, "highlightImages"]] if df.at[idx, "highlightImages"] else [],
            "galleryImageLinks": [df.at[idx, "highlightImages"]] if df.at[idx, "highlightImages"] else [],
            "ticketAmount": to_int(df.at[idx, "ticketAmount"]),
            "ticketLink": df.at[idx, "ticketLink"],
            "supportedLanguages": [df.at[idx, "supportedLanguages"]],
            "category": df.at[idx, "category"],
            "subCategory": df.at[idx, "subCategory"],
            "eventType": df.at[idx, "eventType"],
            "eventFeatures": {
                "foodAvailable": df.at[idx, "foodAvailable"] == "Yes",
                "smokingAllowed": df.at[idx, "smokingAllowed"] == "Yes",
                "wheelchairAccess": df.at[idx, "wheelchairAccess"] == "Yes",
                "parkingAvailable": df.at[idx, "parkingAvailable"] == "Yes",
                "supportAvailable": df.at[idx, "supportAvailable"] == "Yes",
                "petFriendly": df.at[idx, "petFriendly"] in ["Yes", True],
                "alcoholServed": df.at[idx, "alcoholServed"] == "Yes",
                "minimumAge": to_int(df.at[idx, "minimumAge"]),
                "ticketsAtVenue": df.at[idx, "ticketsAtVenue"] == "Yes",
                "washroomAvailable": df.at[idx, "washroomAvailable"] == "Yes",
                "danceFloorAvailable": df.at[idx, "danceFloorAvailable"] == "Yes",
                "poolAvailable": df.at[idx, "poolAvailable"] == "Yes"
            },
            "artists": df.at[idx, "artists"].split(", ") if isinstance(df.at[idx, "artists"], str) else [],
            "sharableEventOgImageLink": df.at[idx, "highlightImages"],
            "attendeesCount": to_int(df.at[idx, "attendeesCount"]),
            "likesCount": 0,
            "joinChatDetails": {
                "joinChatLink": "",
                "provider": "",
                "isEnabled": False
            },
            "policyAndConditions": df.at[idx, "policyAndConditions"].split("\n") if isinstance(df.at[idx, "policyAndConditions"], str) else [],
            "frequentlyAskedQuestions": []
        }

        df.at[idx, "Plur_json_format"] = json.dumps(plur_json, ensure_ascii=False, indent=2)

# Save final CSV
print(f"✅ Saving updated file with JSON-LD enhancements → {CSV_OUTPUT}")
df.to_csv(CSV_OUTPUT, index=False)


  df.at[idx, "zipcode"] = zip_match.group(1)


✅ Saving updated file with JSON-LD enhancements → highape_events_cleaned_with_jsonld.csv


In [27]:
# Script: Enhance HighApe Event CSV using JSON-LD in raw HTML

import pandas as pd
import json
import re
from datetime import datetime
from pathlib import Path
from bs4 import BeautifulSoup

CSV_INPUT = "highape_events_cleaned.csv"
CSV_OUTPUT = "highape_events_cleaned_with_jsonld.csv"

# Load CSV
df = pd.read_csv(CSV_INPUT)

# --- Helpers ---
def parse_iso_and_duration(start, end):
    try:
        start_dt = datetime.strptime(start, "%Y-%m-%d %H:%M:%S")
        end_dt = datetime.strptime(end, "%Y-%m-%d %H:%M:%S")
        return start_dt.isoformat() + "Z", int((end_dt - start_dt).total_seconds() * 1000)
    except:
        return start, ''

def extract_jsonld_from_html(html):
    try:
        soup = BeautifulSoup(html, 'html.parser')
        script_tags = soup.find_all('script', type='application/ld+json')
        for tag in script_tags:
            try:
                data = json.loads(tag.text.strip())
                if isinstance(data, list):
                    for item in data:
                        if item.get('@type') == 'Event':
                            return item
                elif data.get('@type') == 'Event':
                    return data
            except:
                continue
    except Exception as e:
        print(f"Error parsing HTML: {e}")
    return None

# --- Main enhancement loop ---
for idx, row in df.iterrows():
    json_path = row.get("jsonFilePath")
    if not json_path or not Path(json_path).exists():
        continue

    with open(json_path, 'r') as f:
        html_json = json.load(f)
        html = html_json.get("rawHtml")
        ld = extract_jsonld_from_html(html)

    if ld:
        if not row.get("eventName"):
            df.at[idx, "eventName"] = ld.get("name", "")

        if not row.get("highlightImages"):
            df.at[idx, "highlightImages"] = ld.get("image", "")

        loc = ld.get("location", {})
        if isinstance(loc, dict):
            df.at[idx, "venueName"] = loc.get("name", "")
            df.at[idx, "address"] = loc.get("address", "")

            addr = loc.get("address", "")
            zip_match = re.search(r'\b(\d{6})\b', addr)
            if zip_match:
                df.at[idx, "zipcode"] = zip_match.group(1)

            parts = re.findall(r'[A-Za-z]+', addr)
            if len(parts) >= 3:
                df.at[idx, "locality"] = parts[-3]
                df.at[idx, "city"] = parts[-2]
                df.at[idx, "state"] = parts[-1]

        start, end = ld.get("startDate", ""), ld.get("endDate", "")
        iso, duration = parse_iso_and_duration(start, end)
        df.at[idx, "eventDateAndTime"] = iso
        df.at[idx, "eventDuration"] = duration

        offers = ld.get("offers", [])
        if isinstance(offers, list) and offers:
            try:
                prices = [float(o.get("price", 0)) for o in offers if o.get("price")]
                if prices:
                    df.at[idx, "ticketAmount"] = int(min(prices))
                df.at[idx, "ticketLink"] = offers[0].get("url", "")
            except:
                pass

        performers = ld.get("performer", [])
        if isinstance(performers, list):
            names = [p.get("name") for p in performers if p.get("name")]
            df.at[idx, "artists"] = ", ".join(names)

        if not row.get("supportedLanguages"):
            df.at[idx, "supportedLanguages"] = "English"

        df.at[idx, "layout"] = "Indoor"

        # Rebuild Plur_json_format
        from ast import literal_eval
        def to_int(val): return int(val) if str(val).isdigit() else 0
        def to_float(val): return float(val) if re.match(r'^-?\d+(\.\d+)?$', str(val)) else None

        plur_json = {
            "eventName": df.at[idx, "eventName"],
            "eventDescription": df.at[idx, "eventDescription"],
            "eventDateAndTime": df.at[idx, "eventDateAndTime"],
            "eventDuration": to_int(df.at[idx, "eventDuration"]),
            "venue": {
                "venueName": df.at[idx, "venueName"],
                "locality": df.at[idx, "locality"],
                "address": df.at[idx, "address"],
                "city": df.at[idx, "city"],
                "state": df.at[idx, "state"],
                "zipcode": df.at[idx, "zipcode"],
                "geolocation": {
                    "lat": to_float(df.at[idx, "lat"]),
                    "lon": to_float(df.at[idx, "lon"])
                },
                "layout": "Indoor"
            },
            "highlightImageLinks": [df.at[idx, "highlightImages"]] if df.at[idx, "highlightImages"] else [],
            "galleryImageLinks": [df.at[idx, "highlightImages"]] if df.at[idx, "highlightImages"] else [],
            "ticketAmount": to_int(df.at[idx, "ticketAmount"]),
            "ticketLink": df.at[idx, "ticketLink"],
            "supportedLanguages": ["en"],
            "category": df.at[idx, "category"],
            "subCategory": df.at[idx, "subCategory"],
            "eventType": df.at[idx, "eventType"],
            "eventFeatures": {
                "foodAvailable": df.at[idx, "foodAvailable"] == "Yes",
                "smokingAllowed": df.at[idx, "smokingAllowed"] == "Yes",
                "wheelchairAccess": df.at[idx, "wheelchairAccess"] == "Yes",
                "parkingAvailable": df.at[idx, "parkingAvailable"] == "Yes",
                "supportAvailable": df.at[idx, "supportAvailable"] == "Yes",
                "petFriendly": df.at[idx, "petFriendly"] in ["Yes", True],
                "alcoholServed": df.at[idx, "alcoholServed"] == "Yes",
                "minimumAge": to_int(df.at[idx, "minimumAge"]),
                "ticketsAtVenue": df.at[idx, "ticketsAtVenue"] == "Yes",
                "washroomAvailable": df.at[idx, "washroomAvailable"] == "Yes",
                "danceFloorAvailable": df.at[idx, "danceFloorAvailable"] == "Yes",
                "poolAvailable": df.at[idx, "poolAvailable"] == "Yes"
            },
            "artists": df.at[idx, "artists"].split(", ") if isinstance(df.at[idx, "artists"], str) else [],
            "sharableEventOgImageLink": df.at[idx, "highlightImages"],
            "attendeesCount": to_int(df.at[idx, "attendeesCount"]),
            "likesCount": 0,
            "joinChatDetails": {
                "joinChatLink": "",
                "provider": "",
                "isEnabled": False
            },
            "policyAndConditions": df.at[idx, "policyAndConditions"].split("\n") if isinstance(df.at[idx, "policyAndConditions"], str) else [],
            "frequentlyAskedQuestions": []
        }

        df.at[idx, "Plur_json_format"] = json.dumps(plur_json, ensure_ascii=False, indent=2)

# Save final CSV
print(f"✅ Saving updated file with JSON-LD enhancements → {CSV_OUTPUT}")
df.to_csv(CSV_OUTPUT, index=False)


  df.at[idx, "zipcode"] = zip_match.group(1)
  df.at[idx, "layout"] = "Indoor"


✅ Saving updated file with JSON-LD enhancements → highape_events_cleaned_with_jsonld.csv


In [29]:
# Script: Enhance HighApe Event CSV using JSON-LD in raw HTML

import pandas as pd
import json
import re
from datetime import datetime
from pathlib import Path
from bs4 import BeautifulSoup

CSV_INPUT = "highape_events_cleaned_with_jsonld.csv"
CSV_OUTPUT = "highape_events_cleaned_with_jsonld_updated.csv"

# Load CSV
df = pd.read_csv(CSV_INPUT)

# --- Helpers ---
def parse_iso_and_duration(start, end):
    try:
        start_dt = datetime.strptime(start, "%Y-%m-%d %H:%M:%S")
        end_dt = datetime.strptime(end, "%Y-%m-%d %H:%M:%S")
        return start_dt.isoformat() + "Z", int((end_dt - start_dt).total_seconds() * 1000)
    except:
        return start, ''

def extract_jsonld_from_html(html):
    try:
        soup = BeautifulSoup(html, 'html.parser')
        script_tags = soup.find_all('script', type='application/ld+json')
        for tag in script_tags:
            try:
                data = json.loads(tag.text.strip())
                if isinstance(data, list):
                    for item in data:
                        if item.get('@type') == 'Event':
                            return item
                elif data.get('@type') == 'Event':
                    return data
            except:
                continue
    except Exception as e:
        print(f"Error parsing HTML: {e}")
    return None

# --- Main enhancement loop ---
for idx, row in df.iterrows():
    json_path = row.get("jsonFilePath")
    if not json_path or not Path(json_path).exists():
        continue

    with open(json_path, 'r') as f:
        html_json = json.load(f)
        html = html_json.get("rawHtml")
        ld = extract_jsonld_from_html(html)

    if ld:
        if not row.get("eventName"):
            df.at[idx, "eventName"] = ld.get("name", "")

        if not row.get("highlightImages"):
            df.at[idx, "highlightImages"] = ld.get("image", "")

        loc = ld.get("location", {})
        if isinstance(loc, dict):
            df.at[idx, "venueName"] = loc.get("name", "")
            df.at[idx, "address"] = loc.get("address", "")

            addr = loc.get("address", "")
            zip_match = re.search(r'\b(\d{6})\b', addr)
            if zip_match:
                df.at[idx, "zipcode"] = zip_match.group(1)

            parts = re.findall(r'[A-Za-z]+', addr)
            if len(parts) >= 3:
                df.at[idx, "locality"] = parts[-3]
                df.at[idx, "city"] = parts[-2]
                df.at[idx, "state"] = parts[-1]

        start, end = ld.get("startDate", ""), ld.get("endDate", "")
        iso, duration = parse_iso_and_duration(start, end)
        df.at[idx, "eventDateAndTime"] = iso
        df.at[idx, "eventDuration"] = duration

        offers = ld.get("offers", [])
        if isinstance(offers, list) and offers:
            all_prices = []
            for offer in offers:
                try:
                    price = float(offer.get("price", 0))
                    all_prices.append(price)
                except:
                    continue
            if all_prices:
                df.at[idx, "ticketAmount"] = int(max(all_prices))
            if offers[0].get("url"):
                df.at[idx, "ticketLink"] = offers[0].get("url")

        performers = ld.get("performer", [])
        if isinstance(performers, list):
            names = [p.get("name") for p in performers if p.get("name")]
            df.at[idx, "artists"] = ", ".join(names)

        if not row.get("supportedLanguages"):
            df.at[idx, "supportedLanguages"] = "English"

        df.at[idx, "layout"] = "Indoor"

        # Rebuild Plur_json_format
        from ast import literal_eval
        def to_int(val): return int(val) if str(val).isdigit() else 0
        def to_float(val): return float(val) if re.match(r'^-?\d+(\.\d+)?$', str(val)) else None

        plur_json = {
            "eventName": df.at[idx, "eventName"],
            "eventDescription": df.at[idx, "eventDescription"],
            "eventDateAndTime": df.at[idx, "eventDateAndTime"],
            "eventDuration": to_int(df.at[idx, "eventDuration"]),
            "venue": {
                "venueName": df.at[idx, "venueName"],
                "locality": df.at[idx, "locality"],
                "address": df.at[idx, "address"],
                "city": df.at[idx, "city"],
                "state": df.at[idx, "state"],
                "zipcode": df.at[idx, "zipcode"],
                "geolocation": {
                    "lat": to_float(df.at[idx, "lat"]),
                    "lon": to_float(df.at[idx, "lon"])
                },
                "layout": "Indoor"
            },
            "highlightImageLinks": [df.at[idx, "highlightImages"]] if df.at[idx, "highlightImages"] else [],
            "galleryImageLinks": [df.at[idx, "highlightImages"]] if df.at[idx, "highlightImages"] else [],
            "ticketAmount": to_int(df.at[idx, "ticketAmount"]),
            "ticketLink": df.at[idx, "ticketLink"],
            "supportedLanguages": ["en"],
            "category": df.at[idx, "category"],
            "subCategory": df.at[idx, "subCategory"],
            "eventType": df.at[idx, "eventType"],
            "eventFeatures": {
                "foodAvailable": df.at[idx, "foodAvailable"] == "Yes",
                "smokingAllowed": df.at[idx, "smokingAllowed"] == "Yes",
                "wheelchairAccess": df.at[idx, "wheelchairAccess"] == "Yes",
                "parkingAvailable": df.at[idx, "parkingAvailable"] == "Yes",
                "supportAvailable": df.at[idx, "supportAvailable"] == "Yes",
                "petFriendly": df.at[idx, "petFriendly"] in ["Yes", True],
                "alcoholServed": df.at[idx, "alcoholServed"] == "Yes",
                "minimumAge": to_int(df.at[idx, "minimumAge"]),
                "ticketsAtVenue": df.at[idx, "ticketsAtVenue"] == "Yes",
                "washroomAvailable": df.at[idx, "washroomAvailable"] == "Yes",
                "danceFloorAvailable": df.at[idx, "danceFloorAvailable"] == "Yes",
                "poolAvailable": df.at[idx, "poolAvailable"] == "Yes"
            },
            "artists": df.at[idx, "artists"].split(", ") if isinstance(df.at[idx, "artists"], str) else [],
            "sharableEventOgImageLink": df.at[idx, "highlightImages"],
            "attendeesCount": to_int(df.at[idx, "attendeesCount"]),
            "likesCount": 0,
            "joinChatDetails": {
                "joinChatLink": "",
                "provider": "",
                "isEnabled": False
            },
            "policyAndConditions": df.at[idx, "policyAndConditions"].split("\n") if isinstance(df.at[idx, "policyAndConditions"], str) else [],
            "frequentlyAskedQuestions": []
        }

        df.at[idx, "Plur_json_format"] = json.dumps(plur_json, ensure_ascii=False, indent=2)

# Save final CSV
print(f"✅ Saving updated file with JSON-LD enhancements → {CSV_OUTPUT}")
df.to_csv(CSV_OUTPUT, index=False)


  df.at[idx, "zipcode"] = zip_match.group(1)


✅ Saving updated file with JSON-LD enhancements → highape_events_cleaned_with_jsonld_updated.csv


In [39]:
import json
import re
from bs4 import BeautifulSoup
from datetime import datetime

# === Load file ===
with open("/Users/shiv/Documents/Jupyter/HighApe/event_json_files/run_20250607_161335/event_1bdd4ba4-1d6b-4c21-beb8-19482f712e86.json", "r", encoding="utf-8") as f:
    raw = json.load(f)

soup = BeautifulSoup(raw["rawHtml"], "html.parser")

# === Try to extract JSON-LD ===
ld_json_script = soup.find("script", {"type": "application/ld+json"})
schema = {}
try:
    ld_data = json.loads(ld_json_script.string)
    schema = ld_data[0] if isinstance(ld_data, list) else ld_data
except:
    schema = {}

# === Extract image links ===
image_links = [a['href'] for a in soup.select('a[data-fancybox="event_images"]') if a.get("href")]

# === Extract meta info as fallback ===
meta = lambda prop: soup.find("meta", attrs={"property": prop}) or soup.find("meta", attrs={"name": prop})
meta_title = meta("og:title")["content"] if meta("og:title") else ""
meta_desc = meta("og:description")["content"] if meta("og:description") else ""
meta_keywords = meta("keywords")["content"] if meta("keywords") else ""
meta_image = meta("og:image")["content"] if meta("og:image") else ""

# === Dates ===
start = schema.get("startDate", "")
end = schema.get("endDate", "")
duration = 0
try:
    duration = int((datetime.fromisoformat(end) - datetime.fromisoformat(start)).total_seconds() * 1000)
except:
    pass

# === Location parsing ===
address = schema.get("location", {}).get("address", "")
venue_name = schema.get("location", {}).get("name", "")
zipcode = re.search(r"\b\d{6}\b", address)
zipcode = zipcode.group(0) if zipcode else ""
parts = address.split(",")
city = parts[-3].strip() if len(parts) >= 3 else ""
state = parts[-2].strip() if len(parts) >= 2 else ""
locality = parts[-4].strip() if len(parts) >= 4 else ""

# === Ticket info ===
offers = schema.get("offers", [])
ticket_options = []
prices = []
for o in offers:
    try:
        p = float(o.get("price", "0"))
        prices.append(p)
        ticket_options.append({
            "type": o.get("name", ""),
            "price": p,
            "currency": o.get("priceCurrency", "INR"),
            "url": o.get("url", "")
        })
    except:
        pass
ticket_amount = max(prices) if prices else 0

# === Policy parsing ===
policies = [p.get_text(strip=True) for p in soup.select("div#tnc_text p, .tnc_text p")]

# === Final Output ===
event = {
    "eventName": schema.get("name", meta_title),
    "eventDescription": schema.get("description", meta_desc),
    "eventDateAndTime": datetime.fromisoformat(start).isoformat() if start else "",
    "eventDuration": duration,
    "venue": {
        "venueName": venue_name,
        "locality": locality,
        "address": address,
        "city": city,
        "state": state,
        "zipcode": zipcode,
        "geolocation": {"lat": None, "lon": None},
        "layout": "Indoor"
    },
    "highlightImageLinks": [image_links[0]] if image_links else [],
    "galleryImageLinks": image_links,
    "ticketAmount": ticket_amount,
    "ticketOptions": ticket_options,
    "ticketLink": ticket_options[0]["url"] if ticket_options else "",
    "supportedLanguages": ["en"],
    "category": meta_keywords,
    "subCategory": float("nan"),
    "eventType": float("nan"),
    "eventFeatures": {
        "foodAvailable": bool(re.search(r"food|appetizer", meta_desc, re.I)),
        "smokingAllowed": False,
        "wheelchairAccess": False,
        "parkingAvailable": bool(re.search(r"parking", meta_desc, re.I)),
        "supportAvailable": False,
        "petFriendly": False,
        "alcoholServed": bool(re.search(r"alcohol|cocktail", meta_desc, re.I)),
        "minimumAge": 0,
        "ticketsAtVenue": False,
        "washroomAvailable": False,
        "danceFloorAvailable": bool(re.search(r"dance", meta_desc, re.I)),
        "poolAvailable": False
    },
    "artists": [p.get("name") for p in schema.get("performer", [])] if "performer" in schema else [],
    "sharableEventOgImageLink": schema.get("image", meta_image),
    "attendeesCount": 0,
    "likesCount": 0,
    "joinChatDetails": {
        "joinChatLink": "",
        "provider": "",
        "isEnabled": False
    },
    "policyAndConditions": policies,
    "frequentlyAskedQuestions": []
}

import pprint
pprint.pprint(event)


{'artists': [],
 'attendeesCount': 0,
 'category': 'Dj Night,offers,ladies night,Club Margarita '
             ',Goa,Clubmargarita,friday,Free Entry',
 'eventDateAndTime': '2025-06-13T18:00:00',
 'eventDescription': 'Highlights DJ Night Nonstop Music Great Ambience Ladies '
                     'Night Mouthwatering Appetizers Cocktails and Mocktails '
                     'Friday Fever at Club Margarita is the ultimate ignition '
                     'to your weekend. Step into a world pulsing with energy, '
                     'where the lights burn brighter and the beats hit harder. '
                     'The air is electric with anticipation, as every corner '
                     'of the club invites you to dance, mingle, and indulge. '
                     'This is where glamour meets rhythm, where bold fashion '
                     'and fiery spirits set the tone for an unforgettable nig',
 'eventDuration': 23400000,
 'eventFeatures': {'alcoholServed': True,
                  

SyntaxError: invalid syntax (118699560.py, line 1)