In [0]:
%pip install torch
%pip install transformers
%pip install spacy

In [0]:
%restart_python

In [0]:
import logging
import os
from collections import defaultdict

import spacy
import torch
import torch.nn.functional as F
from pyspark.sql.functions import col, udf, explode, lower
from pyspark.sql.types import ArrayType, StructType, StructField, StringType
from transformers import AutoTokenizer, AutoModelForSequenceClassification


In [0]:
# -----------------------------------------------------------------
# 1. LOAD LOCAL MODELS
# -----------------------------------------------------------------
# Model locations are relative paths (in Databricks Repos they resolve
# relative to the notebook).
ner_model_path = "../models/ner_model"
sentiment_model_path = "../models/sentiment_model"
# Class names looked up by the sentiment model's predicted index.
# NOTE(review): order is assumed to match the model's label ids — confirm.
sentiment_labels = ["negative", "neutral", "positive"]

# spaCy pipeline used for aspect extraction. A sentencizer is inserted ahead
# of the 'ner' component so entities can be mapped to a sentence (ent.sent).
nlp_ner = spacy.load(ner_model_path)
nlp_ner.add_pipe("sentencizer", before="ner")
print("✅ Custom NER Model Loaded")

# Tokenizer and sequence-classification model used for sentiment scoring.
sentiment_tokenizer = AutoTokenizer.from_pretrained(sentiment_model_path)
sentiment_model = AutoModelForSequenceClassification.from_pretrained(sentiment_model_path)
print("✅ Sentiment Model Loaded")

# -----------------------------------------------------------------
# 2. DEFINE THE ABSA LOGIC AS A UDF
# -----------------------------------------------------------------

def run_sentiment_analysis(text: str) -> str:
    """Classify a single piece of text as one of ``sentiment_labels``.

    The text is tokenized with the module-level ``sentiment_tokenizer``
    (truncated to 512 tokens), scored by ``sentiment_model`` with gradient
    tracking disabled, and the highest-scoring class index is looked up in
    ``sentiment_labels``.

    Args:
        text: Text to classify. ``None``, non-string and blank values are
            accepted and treated as having no sentiment.

    Returns:
        The predicted label, or "neutral" when there is nothing to classify
        or when inference fails. Failures are logged with a traceback
        instead of being discarded silently.
    """
    # Nothing to score: avoid running the model on missing/blank input.
    if not isinstance(text, str) or not text.strip():
        return "neutral"

    try:
        inputs = sentiment_tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
        with torch.no_grad():
            logits = sentiment_model(**inputs).logits
        # Softmax is monotonic, so argmax over the raw logits picks the same
        # class without the extra normalization pass.
        predicted_index = int(torch.argmax(logits, dim=-1).item())
        return sentiment_labels[predicted_index]
    except Exception:
        # Best-effort fallback is kept, but the failure is made visible.
        # Only the length is logged so message contents do not end up in logs.
        logging.getLogger("absa.sentiment").exception(
            "Sentiment inference failed for text of length %d; defaulting to 'neutral'",
            len(text),
        )
        return "neutral"

def aggregate_sentiments(sentiments: list) -> str:
    """Collapse several sentiment labels into one by fixed priority.

    Any "negative" entry wins; otherwise any "positive" entry wins;
    otherwise (including an empty list) the result is "neutral".
    """
    # Checked in priority order: the first label present decides the result.
    for label in ("negative", "positive"):
        if label in sentiments:
            return label
    return "neutral"

def absa_spark_udf(text: str) -> list:
    """Run aspect-based sentiment analysis on one text for use as a Spark UDF.

    The text is parsed with the module-level ``nlp_ner`` pipeline. For every
    entity labelled "ASPECT", the sentence containing it (``ent.sent``) is
    scored with ``run_sentiment_analysis`` and the label is recorded under
    the lower-cased entity text. Labels collected for the same aspect are
    merged with ``aggregate_sentiments``.

    Args:
        text: The text to analyse.

    Returns:
        A list of ``{"aspect": str, "sentiment": str}`` dicts:
        - one entry per distinct aspect found, or
        - a single ``"overall"`` entry scored on the whole text when no
          "ASPECT" entity is found, or
        - a single ``"error"`` entry whose ``sentiment`` field carries the
          exception message if anything raises.
    """
    try:
        doc = nlp_ner(text)

        # Several aspects often share one sentence; score each distinct
        # sentence only once instead of re-running the model per entity.
        sentiment_by_sentence = {}
        aspect_sentiments = defaultdict(list)

        for ent in doc.ents:
            if ent.label_ != "ASPECT":
                continue
            sentence = ent.sent.text
            if sentence not in sentiment_by_sentence:
                sentiment_by_sentence[sentence] = run_sentiment_analysis(sentence)
            aspect_sentiments[ent.text.lower()].append(sentiment_by_sentence[sentence])

        # Covers both "no entities at all" and "entities, but none is an ASPECT".
        if not aspect_sentiments:
            return [{"aspect": "overall", "sentiment": run_sentiment_analysis(text)}]

        return [
            {"aspect": aspect, "sentiment": aggregate_sentiments(sentiments)}
            for aspect, sentiments in aspect_sentiments.items()
        ]
    except Exception as e:
        # Surface the failure as data so one bad row does not fail the job.
        return [{"aspect": "error", "sentiment": str(e)}]

# Return type of the UDF: an array of structs with two string fields,
# "aspect" and "sentiment" (matching the dict keys the function emits).
absa_schema = ArrayType(
    StructType()
    .add("aspect", StringType())
    .add("sentiment", StringType())
)

# Wrap the Python function as a Spark UDF with that return type
absa_udf = udf(absa_spark_udf, returnType=absa_schema)
print("✅ ABSA UDF Registered")

# -----------------------------------------------------------------
# 3. RUN THE JOB AND SAVE THE CSV
# -----------------------------------------------------------------
# Source table for the analysis
df_bronze = spark.table("voc_bronze_layer")

# Attach the UDF output computed from the 'body_clean' column.
# These are lazy transformations; the work happens when an action runs.
print("Running ABSA UDF over all bronze data...")
df_with_absa = df_bronze.withColumn("absa_results", absa_udf(col("body_clean")))

# Flatten to one row per (message_id, aspect) pair for the dashboard
df_flat = (
    df_with_absa
    .select(col("message_id"), explode(col("absa_results")).alias("absa"))
    .select(
        "message_id",
        col("absa")["aspect"].alias("aspect"),
        col("absa")["sentiment"].alias("sentiment"),
    )
)

# Keep only specific aspects: drop the "error" and "overall" rows
df_final = df_flat.filter((col("aspect") != "error") & (col("aspect") != "overall"))

print("Analysis complete. Converting to Pandas...")
# toPandas() brings every remaining row to the driver as a pandas DataFrame;
# the result must fit in driver memory.
pdf_final = df_final.toPandas()

# Destination for the dashboard CSV, as a relative path:
# up from 'notebooks/' to the repo root, then into 'app/data/'
output_dir = "../app/data"
output_path = f"{output_dir}/dashboard_data.csv"

# exist_ok=True creates the directory if needed without the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(output_dir, exist_ok=True)

pdf_final.to_csv(output_path, index=False)

print(f"✅ Successfully saved dashboard data to {output_path}")
print(f"Total rows created: {len(pdf_final)}")