In [22]:
!pip install scrapfly-sdk
!pip install httpx parsel loguru
!pip install httpx[http2]
!pip install nest_asyncio



In [27]:
import asyncio
import httpx
import json
import nest_asyncio
nest_asyncio.apply()
from parsel import Selector

# Shared async HTTP client that talks HTTP/2 and sends browser-like headers to
# reduce the chance of being blocked.
# NOTE(review): scrape_property_info opens its own client and never uses this
# one, and nothing visible here closes it -- confirm whether it is still needed.
client = httpx.AsyncClient(
    # enable http2
    http2=True,
    headers={
        # A dict literal keeps only the last value of a repeated key, so the
        # header is declared once, using "," (not ";") between language ranges.
        "accept-language": "en-US,en;q=0.9",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36",
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "accept-encoding": "gzip, deflate, br",
    },
)


def _first_property(entries):
    """Return the first ``"property"`` dict found among cache entries, else None."""
    for entry in entries:
        if isinstance(entry, dict) and isinstance(entry.get("property"), dict):
            return entry["property"]
    return None


def _find_property_record(html):
    """Locate the property record inside the JSON caches embedded in the page."""
    selector = Selector(html)
    next_data = selector.css("script#__NEXT_DATA__::text").get()
    if next_data:
        # Option 1: some properties are located in NEXT DATA cache
        page_props = json.loads(next_data)["props"]["pageProps"]
        cache = json.loads(page_props["gdpClientCache"])
        record = _first_property(cache.values())
    else:
        # Option 2: other times it's in Apollo cache
        apollo_data = selector.css("script#hdpApolloPreloadedData::text").get()
        if apollo_data is None:
            raise RuntimeError("No embedded property data found in the page")
        cache = json.loads(json.loads(apollo_data)["apiCache"])
        record = _first_property(
            value for key, value in cache.items() if "ForSale" in key
        )
    if record is None:
        # A bare next() here would surface as the opaque
        # "coroutine raised StopIteration" RuntimeError; keep the type, but
        # say what actually went wrong.
        raise RuntimeError("No property record found in the embedded page data")
    return record


async def scrape_property_info(url: str):
    """Scrape a single Zillow property page for property information.

    Fetches ``url`` with a dedicated HTTP/2 client that sends browser-like
    headers, reads the JSON cache embedded in the page (the ``__NEXT_DATA__``
    script, or the ``hdpApolloPreloadedData`` script when the former is
    absent) and returns a handful of fields from the property record.

    Args:
        url: Address of the property page to download.

    Returns:
        A dict with the keys "Zip Code", "Bedrooms", "Bathrooms",
        "Year Built" and "Property Value". A field missing from the record
        is returned as None.

    Raises:
        AssertionError: If the response status is not 200.
        RuntimeError: If no property record can be found in the page.
    """
    async with httpx.AsyncClient(
        # enable http2
        http2=True,
        # add basic browser-like headers to prevent being blocked
        headers={
            # Declared once: a repeated key would silently keep only the last
            # value. Language ranges are separated by ",", not ";".
            "accept-language": "en-US,en;q=0.9",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36",
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
            "accept-encoding": "gzip, deflate, br",
        },
    ) as session:  # named `session` so the module-level `client` is not shadowed
        response = await session.get(url)
        # Raised explicitly so the check survives `python -O`; the exception
        # type stays AssertionError for existing callers.
        if response.status_code != 200:
            raise AssertionError("Request has been blocked")
        html = response.text

    property_data = _find_property_record(html)

    # "address" may be present but null in the JSON, so fall back to an empty
    # dict instead of calling .get() on None.
    address = property_data.get("address") or {}

    return {
        "Zip Code": address.get("zipcode"),
        "Bedrooms": property_data.get("bedrooms"),
        "Bathrooms": property_data.get("bathrooms"),
        "Year Built": property_data.get("yearBuilt"),
        "Property Value": property_data.get("price"),
    }

# Demo: scrape one sample listing and print every extracted field.
if __name__ == "__main__":
    async def run():
        """Fetch the sample listing and print its fields, one per line."""
        # Put the URL of the property page to scrape here.
        listing_url = "https://www.zillow.com/homedetails/162-Bentley-St-Staten-Island-NY-10307/32377844_zpid/"
        listing_fields = await scrape_property_info(listing_url)

        print("Property Information:")
        for label in listing_fields:
            print(f"{label}: {listing_fields[label]}")

    asyncio.run(run())

Property Information:
Zip Code: 10307
Bedrooms: None
Bathrooms: None
Year Built: 1945
Property Value: 693800
