In [0]:
from firecrawl import Firecrawl

# 1. Initialize the Firecrawl client.
# The API key is read from a Databricks secret scope instead of being written
# into the notebook, so it never ends up in source control or notebook exports.
# NOTE(review): the scope/key names below are placeholders -- point them at the
# secret that actually stores the Firecrawl key in this workspace. The key that
# was previously hardcoded here has been exposed and should be revoked.
app = Firecrawl(api_key=dbutils.secrets.get(scope="firecrawl", key="api-key"))


# 2. Scrape a single page as a smoke test, requesting both markdown and HTML.
doc = app.scrape("https://firecrawl.dev", formats=["markdown", "html"])
print(doc)


In [0]:
%sql
-- 1. Create the bronze, silver and gold schemas.
--    IF NOT EXISTS makes each statement a no-op when the schema is already
--    present, so this cell can be re-run safely.
--    The names are unqualified, so they are created wherever the session's
--    current catalog points -- NOTE(review): confirm that is the intended one.
CREATE SCHEMA IF NOT EXISTS bronze;
CREATE SCHEMA IF NOT EXISTS silver;
CREATE SCHEMA IF NOT EXISTS gold;

-- 2. List the schemas so the three names above can be verified by eye.
SHOW SCHEMAS;

In [0]:
from firecrawl import Firecrawl
import pandas as pd
from pyspark.sql.functions import current_timestamp

# 1. Initialize Firecrawl v2.
# The API key comes from a Databricks secret scope rather than a string literal,
# so it is not stored in the notebook source.
# NOTE(review): the scope/key names are placeholders -- point them at the secret
# that actually stores the Firecrawl key in this workspace. The key that was
# previously hardcoded here has been exposed and should be revoked.
app = Firecrawl(api_key=dbutils.secrets.get(scope="firecrawl", key="api-key"))

# 2. Define your target list
startup_list = [
    "https://en.wikipedia.org/wiki/OpenAI",
    "https://en.wikipedia.org/wiki/Anthropic",
    "https://en.wikipedia.org/wiki/Perplexity_AI",
    "https://en.wikipedia.org/wiki/Mistral_AI"
]

# Explicit schema for the scraped rows. Declaring it up front (instead of
# letting Spark infer it from the dicts) keeps the column types stable even if
# `raw_markdown` is null for every row.
BRONZE_SCHEMA = "raw_markdown STRING, startup_name STRING, status STRING, source_url STRING"

# 3. Scrape every page, collecting one record per URL (success or failure).
results = []

for url in startup_list:
    page_slug = url.split('/')[-1]
    # Human-readable name derived from the last URL segment,
    # e.g. "Perplexity_AI" -> "Perplexity AI".
    startup_name = page_slug.replace('_', ' ')
    print(f"Scraping Wikipedia: {page_slug}...")
    try:
        # only_main_content=True asks Firecrawl to drop sidebars/menus;
        # exclude_tags additionally removes Wikipedia's donation banners,
        # navigation boxes and indicator elements.
        response = app.scrape(
            url=url,
            formats=['markdown'],
            only_main_content=True,
            exclude_tags=[
                '.siteNotice', '#siteNotice', '.cm-box',
                '.navbox', '.fundraising', '.mw-indicator'
            ]
        )

        results.append({
            "startup_name": startup_name,
            "raw_markdown": response.markdown,
            "status": "Success",
            "source_url": url,
        })
    except Exception as e:
        # Deliberately best-effort: one failing page must not abort the batch.
        # The failure is recorded as a row with status "Error" and the exception
        # text in raw_markdown. startup_name uses the same derived name as the
        # success rows so the column stays consistent; the URL is kept in
        # source_url.
        results.append({
            "startup_name": startup_name,
            "raw_markdown": str(e),
            "status": "Error",
            "source_url": url,
        })

# 4. Save to Bronze. The table is fully replaced on every run
# (mode "overwrite" + overwriteSchema), with an ingestion timestamp added.
bronze_df = (
    spark.createDataFrame(results, schema=BRONZE_SCHEMA)
    .withColumn("ingested_at", current_timestamp())
)
bronze_df.write.format("delta").mode("overwrite").option("overwriteSchema", "true").saveAsTable("bronze.raw_web_data")

display(spark.table("bronze.raw_web_data").select("startup_name", "raw_markdown"))

# 5. Report the outcome, surfacing failed scrapes instead of hiding them
# behind an unconditional success message.
failed_names = [row["startup_name"] for row in results if row["status"] == "Error"]
if failed_names:
    print(f"WARNING: {len(failed_names)} of {len(results)} pages failed to scrape: {', '.join(failed_names)}")
print("✅ Bronze Layer updated using v2 Document objects.")

In [0]:

# Render the full Bronze table (every column) to inspect what was ingested.
display(spark.read.table("bronze.raw_web_data"))