In [0]:
from pyspark.sql.functions import col, udf
from pyspark.sql.types import StringType
import json

# 1. Access the Databricks Foundation Model (Llama 3 or similar)
# Note: This syntax works on most modern Databricks Workspaces
def extract_startup_info(raw_text):
    """Ask a Databricks-served chat model to extract startup facts from text.

    Builds an extraction prompt around the first 6000 characters of
    ``raw_text``, sends it as a single user message to the
    ``databricks-meta-llama-3-3-70b-instruct`` endpoint through the MLflow
    deployments client, and returns the reply with Markdown code fences
    stripped.

    Args:
        raw_text: Article text to analyse. May be ``None`` or blank, which
            is what a NULL or empty source column delivers to a Spark UDF.

    Returns:
        ``None`` when ``raw_text`` is ``None`` or only whitespace (no model
        call is made). Otherwise the cleaned reply text; the prompt requests
        JSON, but the reply is not validated here. If the import, the
        endpoint call, or the response handling fails, a string of the form
        ``"Error: <message>"`` is returned instead of raising, so one bad
        row does not abort the whole job.
    """
    # Null in, null out: slicing None would raise before the try block is
    # reached, and prompting the model with no article text only invites
    # made-up answers.
    if raw_text is None or not raw_text.strip():
        return None

    # We tell the AI exactly where to look and what to ignore
    prompt = f"""
    SYSTEM: You are a data extractor. You will be provided with a Wikipedia article.
    IMPORTANT: Ignore any text about 'donations', 'fundraising', or 'Wikipedia is free'.
    
    Look specifically for the 'Infobox' section (usually at the start of the article) 
    to find company details.
    
    Return a JSON object for:
    - company_name
    - founding_year
    - top_products (list the main software or AI models mentioned)

    RAW TEXT:
    {raw_text[:6000]}
    """

    try:
        # Imported here rather than at module level so the import runs
        # wherever this function is executed and a missing package is
        # reported through the same "Error: ..." path.
        import mlflow.deployments
        client = mlflow.deployments.get_deploy_client("databricks")

        response = client.predict(
            endpoint="databricks-meta-llama-3-3-70b-instruct",
            inputs={"messages": [{"role": "user", "content": prompt}]}
        )

        # Access the dictionary response
        content = response['choices'][0]['message']['content']

        # Simple cleanup: if the AI includes markdown backticks like ```json, strip them
        clean_json = content.replace('```json', '').replace('```', '').strip()
        return clean_json
    except Exception as e:
        # Deliberate best-effort boundary: surface the failure in the
        # output column instead of failing the caller.
        return f"Error: {str(e)}"

# 2. Wrap the extractor as a string-returning Spark UDF and run it over
#    the Bronze table's `raw_markdown` column
ai_udf = udf(extract_startup_info, StringType())

silver_df = (
    spark.table("bronze.raw_web_data")
    .withColumn("structured_data", ai_udf(col("raw_markdown")))
)

# 3. Write the result as the Silver Delta table, replacing what was there
(
    silver_df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("silver.startup_fundamentals")
)

# Preview the saved rows.
# NOTE(review): assumes the Bronze table already carries a `startup_name`
# column, since nothing in this cell creates one - confirm.
display(
    spark.table("silver.startup_fundamentals")
    .select("startup_name", "structured_data")
)