In [1]:
# 03_Gold_AI_Enrichment

StatementMeta(, 7ff9462c-da1b-4771-8b0c-3c5e30a8ea44, 3, Finished, Available, Finished)

In [2]:
# Install the Hugging Face `transformers` library and its `torch` backend
# into the notebook kernel's environment.
# NOTE(review): no versions are pinned, so a re-run may resolve to different
# releases than the ones this notebook was developed against — consider
# pinning once the working versions are confirmed.
%pip install transformers torch

StatementMeta(, 7ff9462c-da1b-4771-8b0c-3c5e30a8ea44, 9, Finished, Available, Finished)


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.



In [7]:
from typing import Iterator

import pandas as pd
from pyspark.sql.functions import col, pandas_udf, struct
from pyspark.sql.types import StringType, StructType, StructField, FloatType
from transformers import pipeline

# --- CONFIGURATION ---
# Source (Silver) and destination (Gold) table names used by this notebook.
SILVER_TABLE_NAME = "silver_news"
GOLD_TABLE_NAME = "gold_market_pulse"

# 1. READ SILVER DATA
# Load the Silver table as a Spark DataFrame; the sentiment model is applied to it below.
print("Reading clean data from Silver Layer...")
df_silver = spark.table(SILVER_TABLE_NAME)

# 2. DEFINE THE AI MODEL (iterator-style Pandas UDF)
# Spark ships this function to the executors and feeds it the input column as
# a stream of pandas batches. The iterator form lets us build the Hugging Face
# pipeline once per task and reuse it for every batch, instead of reloading
# the model for each batch.

def _score_batch(pipe, text_series: pd.Series) -> pd.DataFrame:
    """Score one batch of text, returning one output row per input row.

    Args:
        pipe: A callable sentiment pipeline that accepts a list of texts and
            returns a list of ``{"label": ..., "score": ...}`` dicts.
        text_series: The batch of texts to classify; may contain nulls and
            may be empty.

    Returns:
        A DataFrame with columns ``label`` and ``score`` (float32), in the
        same row order as ``text_series``. Rows whose input was null get a
        null label and a NaN score instead of being sent to the model.
    """
    has_text = text_series.notna().tolist()
    texts = [text for text, ok in zip(text_series.tolist(), has_text) if ok]

    # Only call the model when there is something to score.
    # Truncate to 512 tokens to prevent errors with long text.
    scored = iter(pipe(texts, truncation=True, max_length=512)) if texts else iter(())

    # Re-interleave model results with placeholders so row order is preserved.
    rows = [
        next(scored) if ok else {"label": None, "score": float("nan")}
        for ok in has_text
    ]

    # Explicit columns keep the frame shape stable even for an empty batch;
    # float32 matches the "score float" field declared on the UDF.
    return pd.DataFrame(rows, columns=["label", "score"]).astype({"score": "float32"})


@pandas_udf("label string, score float")
def sentiment_analysis_udf(batches: Iterator[pd.Series]) -> Iterator[pd.DataFrame]:
    """Classify the sentiment of a text column.

    Args:
        batches: Iterator over pandas Series, each holding one batch of the
            input text column.

    Yields:
        One DataFrame per input batch with columns ``label`` and ``score``,
        which Spark exposes as a struct column.
    """
    # Load the model inside the function (so it works on parallel nodes),
    # but only once for the whole stream of batches.
    pipe = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")

    for text_series in batches:
        yield _score_batch(pipe, text_series)

# 3. APPLY THE MODEL
print("Running AI Sentiment Analysis (this may take a minute)...")
# Score the 'snippet' column; the UDF yields a struct with 'label' and 'score'.
snippet_sentiment = sentiment_analysis_udf(col("snippet"))
df_scored = df_silver.withColumn("sentiment_result", snippet_sentiment)

# 4. FLATTEN RESULTS
# Keep the Silver columns as-is and promote the struct fields to top-level columns.
passthrough_columns = ["date", "competitor_tag", "source", "title", "url", "snippet"]
df_gold = df_scored.select(
    *passthrough_columns,
    col("sentiment_result.label").alias("sentiment_label"),   # POSITIVE / NEGATIVE
    col("sentiment_result.score").alias("confidence_score"),  # model confidence
)

# 5. SAVE TO GOLD
# The Gold table is fully replaced on each run so the dashboard reflects the latest AI logic.
gold_writer = df_gold.write.format("delta").mode("overwrite")
gold_writer.saveAsTable(GOLD_TABLE_NAME)

print(f"SUCCESS: AI Analysis complete. Saved to '{GOLD_TABLE_NAME}'.")

StatementMeta(, 7ff9462c-da1b-4771-8b0c-3c5e30a8ea44, 15, Submitted, Running, Running)

Reading clean data from Silver Layer...
Running AI Sentiment Analysis (this may take a minute)...
SUCCESS: AI Analysis complete. Saved to 'gold_market_pulse'.


In [8]:
%%sql
-- Preview the 10 most recent scored rows from the Gold table.
SELECT date, competitor_tag, sentiment_label, confidence_score, snippet
FROM gold_market_pulse
ORDER BY date DESC
LIMIT 10

StatementMeta(, 7ff9462c-da1b-4771-8b0c-3c5e30a8ea44, 16, Finished, Available, Finished)

<Spark SQL result set with 10 rows and 5 fields>