# In [None]:
import os
import json
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_openai import ChatOpenAI
from langchain.chains import create_extraction_chain_pydantic
from playwright.sync_api import sync_playwright

# Load variables from a .env file into the process environment so that the
# OPENAI_API_KEY lookup further down can pick up the key.
load_dotenv()

# Maps each material category name to the specific material names that belong
# to it. scrape_facilities() looks a facility's accepted materials up in these
# lists to derive the categories reported for that facility.
MATERIALS_MAPPING: dict[str, list[str]] = {
    "Electronics": [
        "Computers, Laptops, Tablets",
        "Monitors, TVs (CRT & Flat Screen)",
        "Cell Phones, Smartphones",
        "Printers, Copiers, Fax Machines",
        "Audio/Video Equipment",
        "Gaming Consoles",
        "Small Appliances (Microwaves, Toasters, etc.)",
        "Computer Peripherals (Keyboards, Mice, Cables, etc.)"
    ],
    "Batteries": [
        "Household Batteries (AA, AAA, 9V, etc.)",
        "Rechargeable Batteries",
        "Lithium-ion Batteries",
        "Button/Watch Batteries",
        "Power Tool Batteries",
        "E-bike/Scooter Batteries",
        "Car/Automotive Batteries"
    ],
    "Paint & Chemicals": [
        "Latex/Water-based Paint",
        "Oil-based Paint and Stains",
        "Spray Paint",
        "Paint Thinners and Solvents",
        "Household Cleaners",
        "Pool Chemicals",
        "Pesticides and Herbicides",
        "Automotive Fluids (Oil, Antifreeze)"
    ],
    "Medical Sharps": [
        "Needles and Syringes",
        "Lancets",
        "Auto-injectors (EpiPens)",
        "Insulin Pens",
        "Home Dialysis Equipment"
    ],
    "Textiles/Clothing": [
        "Clothing and Shoes",
        "Household Textiles (Towels, Bedding)",
        "Fabric Scraps",
        "Accessories (Belts, Bags, etc.)"
    ],
    "Other Important Materials": [
        "Fluorescent Bulbs and CFLs",
        "Mercury Thermometers",
        "Smoke Detectors",
        "Fire Extinguishers",
        "Propane Tanks",
        "Mattresses and Box Springs",
        "Large Appliances (Fridges, Washers, etc.)",
        "Construction Debris (Residential Quantities)"
    ]
}

# Schema the LLM is asked to fill in for one facility card. Every field is
# declared with `...` as its default, i.e. it is required.
# NOTE(review): documented with comments rather than a class docstring because
# pydantic presumably copies a docstring into the generated schema, which would
# change what is sent to the model -- confirm before adding one.
class Facility(BaseModel):
    # Name of the facility as shown on the card.
    business_name: str = Field(
        ...,
        description="Official facility name",
    )
    # Kept as free text; no date parsing is applied to it in this file.
    last_update_date: str = Field(
        ...,
        description="Last update date of information",
    )
    # Physical location of the facility.
    street_address: str = Field(
        ...,
        description="Full physical street address",
    )
    # Material names; later compared against MATERIALS_MAPPING entries.
    materials_accepted: list[str] = Field(
        ...,
        description="List of accepted materials from predefined categories",
    )

# Module-wide chat model used by the extraction chain. Temperature is pinned
# to 0 and the API key is taken from the OPENAI_API_KEY environment variable
# (None is passed through when the variable is unset).
llm = ChatOpenAI(api_key=os.environ.get("OPENAI_API_KEY"), model="gpt-4-turbo", temperature=0)

# Search page scraped when the caller does not supply another URL.
_DEFAULT_SEARCH_URL = (
    "https://search.earth911.com/?what=Electronics&where=10001&max_distance=100"
)


def _categorize_materials(materials):
    """Return the MATERIALS_MAPPING categories that contain any of *materials*.

    Names are compared ignoring case and surrounding whitespace, because the
    strings come from LLM output. Categories are returned in the order they
    are declared in MATERIALS_MAPPING, so the result is the same on every run.
    """
    wanted = {name.strip().casefold() for name in materials}
    return [
        category
        for category, items in MATERIALS_MAPPING.items()
        if any(item.casefold() in wanted for item in items)
    ]


def scrape_facilities(
    search_url: str = _DEFAULT_SEARCH_URL,
    *,
    max_candidates: int = 5,
    max_results: int = 3,
) -> list[dict]:
    """Scrape facility data using Playwright and extract structured info with LLM.

    The search page is rendered in headless Chromium, the ``.result-item``
    cards are parsed with BeautifulSoup, and each card's HTML is passed to the
    extraction chain built from ``Facility`` and the module-level ``llm``.

    Args:
        search_url: Results page to load. Defaults to the URL this function
            originally had hard-coded.
        max_candidates: How many result cards, from the top of the page, may
            be sent to the LLM.
        max_results: Maximum number of facilities to return. Extraction stops
            as soon as this many have been collected.

    Returns:
        A list of JSON-serialisable dicts with the keys ``business_name``,
        ``last_update_date``, ``street_address``, ``materials_category`` and
        ``materials_accepted``. Cards for which the chain yields nothing are
        skipped.

    Raises:
        Whatever Playwright raises when navigation or the selector wait fails
        (e.g. on timeout), and whatever the extraction chain raises; the
        browser is closed before a Playwright error propagates.
    """
    # Only the rendered HTML is needed from the browser, so grab it and close
    # the browser right away -- even if navigation or the wait raises -- and
    # keep it from sitting open during the LLM calls.
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        try:
            page = browser.new_page()
            page.goto(search_url, timeout=60000)
            page.wait_for_selector(".result-item", timeout=30000)
            html = page.content()
        finally:
            browser.close()

    soup = BeautifulSoup(html, "html.parser")
    facility_cards = soup.select(".result-item")[:max_candidates]

    # The chain does not depend on the card, so build it once.
    extract_chain = create_extraction_chain_pydantic(pydantic_schema=Facility, llm=llm)

    facilities = []
    for card in facility_cards:
        # Stop early: every further card would cost an LLM call whose result
        # would be discarded anyway.
        if len(facilities) >= max_results:
            break

        # Strip non-content tags so they are not sent to the LLM.
        for tag in card.find_all(["script", "style"]):
            tag.decompose()

        result = extract_chain.invoke({"input": str(card)})
        extracted = result.get("text") if result else None
        if not extracted:
            continue

        # Only the first extracted record of each card is used.
        facility_data = extracted[0]
        facilities.append(
            {
                "business_name": facility_data.business_name,
                "last_update_date": facility_data.last_update_date,
                "street_address": facility_data.street_address,
                "materials_category": _categorize_materials(
                    facility_data.materials_accepted
                ),
                "materials_accepted": facility_data.materials_accepted,
            }
        )

    return facilities

# Script entry point: run the scrape and dump the result as indented JSON.
if __name__ == "__main__":
    print(json.dumps(scrape_facilities(), indent=2))