In [1]:
# 1) Read the Delta table from your Lakehouse
from pyspark.sql import functions as F, types as T

# Load the customer data from its Delta location in the Lakehouse.
df = (
    spark.read
    .format("delta")
    .load("Tables/Gold/customerdata")
)

# 2) Keyword lists for sentiment
# All entries are lowercase; compute_sentiment_score lowercases the text
# before looking them up.
positive_keywords = [
    "positive",
    "good",
    "great",
    "excellent",
    "satisfied",
    "happy",
    "improvement",
    "suggestion",
    "request",
    "resolved",
    "paid",
    "discount",
]

negative_keywords = [
    "bad",
    "issue",
    "error",
    "complaint",
    "problem",
    "slow",
    "not working",  # the only multi-word phrase in either list
    "refund",
    "dispute",
    "concern",
    "unresolved",
    "failed",
    "incorrect",
    "overdue",
    "confusing",
    "missing",
    "pending",
]

# 3) Spark UDF for sentiment score
@F.udf(T.DoubleType())
def compute_sentiment_score(text):
    """Score free text from keyword hits, clamped to [-1.0, 1.0].

    Each entry of ``positive_keywords`` found in the text adds 0.2 and each
    entry of ``negative_keywords`` subtracts 0.2. A keyword counts at most
    once, however many times it occurs.

    A keyword only matches when it is not directly preceded by a letter, so
    ``resolved`` no longer fires inside ``unresolved`` (nor ``paid`` inside
    ``unpaid``, ``satisfied`` inside ``dissatisfied``, ``happy`` inside
    ``unhappy``). Suffixes are still allowed, so ``issue`` keeps matching
    ``issues``.

    Args:
        text: Value to score; it is passed through ``str()`` before matching.
            ``None`` is treated as neutral.

    Returns:
        float: Net score rounded to one decimal place; 0.0 for ``None``.
    """
    # Function-scope import so the block does not rely on a top-of-cell import.
    import re

    if text is None:
        return 0.0

    lowered = str(text).lower()

    def count_hits(keywords):
        # (?<![a-z]) rejects matches that are the tail of a longer word.
        return sum(
            1
            for keyword in keywords
            if re.search(r"(?<![a-z])" + re.escape(keyword), lowered)
        )

    net_hits = count_hits(positive_keywords) - count_hits(negative_keywords)

    # Round away binary float noise (e.g. 0.2 * 3 -> 0.6000000000000001).
    # Float bounds keep the result a Python float, matching DoubleType.
    return max(-1.0, min(1.0, round(0.2 * net_hits, 1)))

# 4) Apply UDF and (optionally) normalize Interaction_Date to timestamp
scored = (
    df
    .withColumn("SentimentScore", compute_sentiment_score(F.col("Complaint_Description")))
    .withColumn(
        "Interaction_Date",
        F.when(
            # Probe the *string* form of the column. A direct cast to date is
            # only valid for string/date/timestamp inputs; for an integer
            # YYYYMMDD column Spark rejects the cast during analysis, so the
            # fallback below could never be reached.
            F.col("Interaction_Date").cast("string").cast("date").isNotNull(),
            F.to_timestamp(F.col("Interaction_Date")),  # if already YYYY-MM-DD
        ).otherwise(
            # Values that are not date-like strings are parsed as compact dates.
            F.to_timestamp(F.col("Interaction_Date").cast("string"), "yyyyMMdd")  # if int YYYYMMDD
        ),
    )
)

# 5) Save as a new Delta table in your Lakehouse.
# "overwrite" + overwriteSchema replaces both the data and the schema of any
# table already stored at this path.
(
    scored.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .save("Tables/Sentiment/customerdata")
)

# Quick check: read the table back and print the first five rows untruncated.
(
    spark.read
    .format("delta")
    .load("Tables/Sentiment/customerdata")
    .show(5, truncate=False)
)

StatementMeta(, 4f0bd0ae-042a-42a5-a6b6-532e1f0f46f6, 3, Finished, Available, Finished)

+-----------+-------------+--------------------+----------------+--------------+-------------------+---------------------+----------------------------------------------------------------------------+--------------+
|Customer_ID|Name         |Email               |Interaction_Type|Issue_Category|Interaction_Date   |Customer_Name        |Complaint_Description                                                       |SentimentScore|
+-----------+-------------+--------------------+----------------+--------------+-------------------+---------------------+----------------------------------------------------------------------------+--------------+
|CUST134    |Chad Cook    |chadcook@example.org|Email           |Billing       |2025-09-22 00:00:00|Thunder Mountain Inc.|Overcharging / Incorrect Charges-Duplicate charges in the same billing cycle|-0.2          |
|CUST898    |Elle White   |Elle@example.org    |Email           |Billing       |2025-09-23 00:00:00|Bienville Medical    |Failed credit card

In [5]:
from pyspark.sql.functions import udf, col
from pyspark.sql.types import DoubleType

# Step 1: Read billing data from its Delta location in the Lakehouse
df = (
    spark.read
    .format("delta")
    .load("Tables/Gold/billingdata")
)

# Step 2: Define the UDF for sentiment scoring
def calculate_sentiment(payment_status, late_fee_applied, discount_applied):
    """Derive a billing sentiment score in [0.0, 1.0] from invoice attributes.

    Starts from a neutral 0.5 and then:
      * payment status: "Paid" +0.3, "Pending" -0.2, "Overdue" -0.4,
        anything else no change;
      * late fee: "Yes" -0.2;
      * discount: a string such as "10%" adds half of the fraction it
        represents (10% -> +0.05). Values that cannot be parsed that way
        (None, non-strings, non-numeric text) are ignored.

    Args:
        payment_status: Payment status label.
        late_fee_applied: "Yes" when a late fee was charged.
        discount_applied: Discount as percentage text, e.g. "15%".

    Returns:
        float: Score rounded to two decimals and clamped to [0.0, 1.0].
        The result is always a Python ``float``: when this function is
        wrapped as a Spark UDF declared with DoubleType, returning the int
        0 or 1 from the clamp would come back as NULL.
    """
    status_adjustment = {"Paid": 0.3, "Pending": -0.2, "Overdue": -0.4}

    score = 0.5  # Neutral base
    score += status_adjustment.get(payment_status, 0.0)

    # Adjust for late fee
    if late_fee_applied == "Yes":
        score -= 0.2

    # Adjust for discount; best-effort parse, unusable values contribute 0.
    try:
        discount_fraction = float(discount_applied.replace('%', '')) / 100
    except (AttributeError, TypeError, ValueError):
        # AttributeError: None / non-string; TypeError: bytes-like value;
        # ValueError: text that is not a number.
        discount_fraction = 0.0
    score += discount_fraction * 0.5

    # Normalize score between 0 and 1 (float bounds keep the return a float)
    return max(0.0, min(round(score, 2), 1.0))

# Step 3: Register UDF (Python function wrapped with a double return type)
sentiment_udf = udf(calculate_sentiment, returnType=DoubleType())

# Step 4: Apply UDF using correct column names
# The columns are passed in the same order as calculate_sentiment's parameters.
df = df.withColumn(
    "sentiment_score",
    sentiment_udf(
        *(
            col(column_name)
            for column_name in ("payment_status", "late_fee_applied", "discount_applied")
        )
    ),
)

# Step 5: Save as a new Delta table
# "overwrite" + overwriteSchema replaces both the data and the schema of any
# table already stored at this path.
(
    df.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .save("Tables/Sentiment/billingdata")
)

# Step 6: Quick check - read the table back and print five rows untruncated
(
    spark.read
    .format("delta")
    .load("Tables/Sentiment/billingdata")
    .show(5, truncate=False)
)

StatementMeta(, 4f0bd0ae-042a-42a5-a6b6-532e1f0f46f6, 7, Finished, Available, Finished)

+-------------+-----------+---------------+-------------------------+------------------------+------------+-----------------+--------+-------------------+-----------------------+--------------+--------------+----------+----------------+----------------+--------------------+---------------+
|invoice_id   |customer_id|name           |email                    |billing_period          |invoice_date|total_bill_amount|currency|high_cost_product  |high_cost_product_price|payment_status|payment_method|due_date  |late_fee_applied|discount_applied|customer_name       |sentiment_score|
+-------------+-----------+---------------+-------------------------+------------------------+------------+-----------------+--------+-------------------+-----------------------+--------------+--------------+----------+----------------+----------------+--------------------+---------------+
|INV-2025-0009|CUST009    |Andres Mcdonald|matthew08@example.net    |2025-08-01 to 2025-08-31|2025-09-03  |3200.00          |US