In [6]:
!pip install playwright nest_asyncio --quiet
!playwright install firefox

import asyncio
import nest_asyncio
nest_asyncio.apply()

from playwright.async_api import async_playwright
from IPython.display import Image, display
import tempfile

async def load_page():
    print("Launching Firefox browser (headless=False)...")
    async with async_playwright() as p:
        browser = await p.firefox.launch(headless=False)
        page = await browser.new_page()

        print("Navigating to page...")
        await page.goto("https://www.applesfromny.com/varieties/", wait_until="networkidle")

        print("Page loaded! Taking screenshot...")

        tmp = tempfile.NamedTemporaryFile(suffix=".png", delete=False)
        await page.screenshot(path=tmp.name)

        display(Image(filename=tmp.name))

        html = await page.content()

        print("HTML captured! Closing browser...")
        await browser.close()

        return html

# ✅ This works inside Jupyter:
html_text = await load_page()

print("\nHTML successfully saved to variable: html_text")



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.2[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Launching Firefox browser (headless=False)...
Navigating to page...


TimeoutError: Page.goto: Timeout 30000ms exceeded.
Call log:
  - navigating to "https://www.applesfromny.com/varieties/", waiting until "networkidle"


In [7]:
!pip install playwright nest_asyncio --quiet
!playwright install firefox

import asyncio
import nest_asyncio
nest_asyncio.apply()

from playwright.async_api import async_playwright

async def load_page():
    print("Launching Firefox browser (headless=False)...")
    async with async_playwright() as p:
        browser = await p.firefox.launch(headless=False)
        page = await browser.new_page()

        print("Navigating to page...")
        await page.goto("https://www.applesfromny.com/varieties/", wait_until="networkidle")

        print("Page loaded. Capturing HTML...")
        html = await page.content()

        print("Closing browser...")
        await browser.close()

        return html

html_text = await load_page()

print("HTML saved to variable: html_text")
print("Length of HTML:", len(html_text))



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.2[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Launching Firefox browser (headless=False)...
Navigating to page...


TimeoutError: Page.goto: Timeout 30000ms exceeded.
Call log:
  - navigating to "https://www.applesfromny.com/varieties/", waiting until "networkidle"


In [8]:
if "html_text" in globals():
    print("✅ html_text exists!")
    
    mid = len(html_text) // 2
    preview = html_text[mid-500 : mid+500]
    
    print("\n--- MIDDLE 1000 CHARACTERS OF PAGE ---\n")
    print(preview)
else:
    print("❌ html_text does NOT exist. The page was not captured correctly.")


❌ html_text does NOT exist. The page was not captured correctly.


In [9]:
from playwright.async_api import async_playwright
import nest_asyncio
nest_asyncio.apply()

print("Starting Playwright...")

async def load_page_and_store():
    global html_text  # ✅ force it to stay in memory

    async with async_playwright() as p:
        print("Launching Firefox...")
        browser = await p.firefox.launch(headless=False)
        page = await browser.new_page()

        print("Going to page...")
        await page.goto("https://www.applesfromny.com/varieties/", wait_until="networkidle")

        print("Capturing HTML now...")
        html_text = await page.content()

        print("HTML captured! Length:", len(html_text))

        print("Closing browser...")
        await browser.close()

await load_page_and_store()

print("✅ DONE — html_text is now saved globally")


Starting Playwright...
Launching Firefox...
Going to page...


TimeoutError: Page.goto: Timeout 30000ms exceeded.
Call log:
  - navigating to "https://www.applesfromny.com/varieties/", waiting until "networkidle"


In [10]:
from playwright.async_api import async_playwright
import nest_asyncio
nest_asyncio.apply()

async def load_page_and_store():
    global html_text

    async with async_playwright() as p:
        print("Launching Firefox...")
        browser = await p.firefox.launch(headless=False)
        page = await browser.new_page()

        print("Going to page (domcontentloaded)...")
        await page.goto("https://www.applesfromny.com/varieties/", wait_until="domcontentloaded")

        print("Waiting 2 seconds for page to settle...")
        await page.wait_for_timeout(2000)

        print("Capturing HTML...")
        html_text = await page.content()
        print("HTML captured! Length:", len(html_text))

        print("Closing browser...")
        await browser.close()

await load_page_and_store()

print("✅ DONE — html_text is now stored globally")


Launching Firefox...
Going to page (domcontentloaded)...
Waiting 2 seconds for page to settle...
Capturing HTML...
HTML captured! Length: 755589
Closing browser...
✅ DONE — html_text is now stored globally


In [11]:
from bs4 import BeautifulSoup

soup = BeautifulSoup(html_text, "html.parser")

# Find just the first apple card
first_apple = soup.find("div", class_="apple-variety")

# Extract fields safely
name = first_apple.find("h2").get_text(strip=True)

description = first_apple.find("p").get_text(strip=True)

detail_url = first_apple.find("a")["href"]

image_url = first_apple.find("img")["src"]

# Extract the sweetness scale number from class like: scale-7
scale_div = first_apple.find("div", class_="scale-icon")
scale_class = scale_div["class"]
sweetness = [c.replace("scale-", "") for c in scale_class if c.startswith("scale-")][0]

print("✅ TEST SCRAPE RESULT:\n")
print("Name:", name)
print("Description:", description)
print("Detail URL:", detail_url)
print("Image URL:", image_url)
print("Sweet–Tart Scale:", sweetness)


✅ TEST SCRAPE RESULT:

Name: 20 Ounce
Description: This heirloom variety is famous not only for its size, but excellent cooking qualities as well.
Detail URL: https://www.applesfromny.com/varieties/20-ounce/
Image URL: https://www.applesfromny.com/wp-content/uploads/2020/05/20Ounce_NYAS-Apples2.png
Sweet–Tart Scale: icon


  params = OrderedDict((param.name, param) for param in parameters)


In [12]:
# Re-extract just the sweetness correctly

scale_div = first_apple.find("div", class_="scale-icon")

# Get the class that looks like "scale-7"
scale_class = scale_div["class"]
sweetness = [c for c in scale_class if c.startswith("scale-")][0]
sweetness_number = sweetness.replace("scale-", "")

print("✅ Corrected Sweet–Tart Scale:", sweetness_number)


✅ Corrected Sweet–Tart Scale: icon


In [13]:
# ✅ FINAL, robust sweetness extraction

scale_container = first_apple.find("div", class_="sweet-tart-scale")

all_divs = scale_container.find_all("div")

sweetness_number = None

for d in all_divs:
    classes = d.get("class", [])
    for c in classes:
        if c.startswith("scale-") and c != "scale-icon":
            sweetness_number = c.replace("scale-", "")

print("✅ Correct Sweet–Tart Scale:", sweetness_number)


✅ Correct Sweet–Tart Scale: 7


In [14]:
from bs4 import BeautifulSoup
import pandas as pd

soup = BeautifulSoup(html_text, "html.parser")

apple_cards = soup.find_all("div", class_="apple-variety")

print("Total apples found:", len(apple_cards))

rows = []

for apple in apple_cards:
    name = apple.find("h2").get_text(strip=True)
    description = apple.find("p").get_text(strip=True)
    detail_url = apple.find("a")["href"]
    image_url = apple.find("img")["src"]

    # Sweet–Tart scale (robust version)
    scale_container = apple.find("div", class_="sweet-tart-scale")
    all_divs = scale_container.find_all("div")

    sweetness_number = None
    for d in all_divs:
        classes = d.get("class", [])
        for c in classes:
            if c.startswith("scale-") and c != "scale-icon":
                sweetness_number = c.replace("scale-", "")

    rows.append({
        "name": name,
        "description": description,
        "detail_url": detail_url,
        "image_url": image_url,
        "sweet_tart_scale": sweetness_number
    })

df = pd.DataFrame(rows)

print("\n✅ Preview of first 5 rows:\n")
df.head()


Total apples found: 30

✅ Preview of first 5 rows:



Unnamed: 0,name,description,detail_url,image_url,sweet_tart_scale
0,20 Ounce,This heirloom variety is famous not only for i...,https://www.applesfromny.com/varieties/20-ounce/,https://www.applesfromny.com/wp-content/upload...,7
1,Acey Mac,"Sweet, tart and juicy.",https://www.applesfromny.com/varieties/acey-mac/,https://www.applesfromny.com/wp-content/upload...,6
2,Autumn Crisp,Cross between a Golden Delicious and a Monroe ...,https://www.applesfromny.com/varieties/autumn-...,https://www.applesfromny.com/wp-content/upload...,4
3,Braeburn,"Sweet, tangy and juicy.",https://www.applesfromny.com/varieties/braeburn/,https://www.applesfromny.com/wp-content/upload...,6
4,Cameo,"A bright, red-striped apple with a creamy oran...",https://www.applesfromny.com/varieties/cameo/,https://www.applesfromny.com/wp-content/upload...,2


In [15]:
output_path = "apple_varieties.csv"

df.to_csv(output_path, index=False)

print(f"✅ CSV file saved as: {output_path}")

# Show file size as a quick sanity check
import os
print("File size (bytes):", os.path.getsize(output_path))


✅ CSV file saved as: apple_varieties.csv
File size (bytes): 10194
