In [0]:
from pyspark.sql import functions as F

# Sample customer reviews used as the toy corpus for the cells below.
review_texts = [
    "This product is amazing and works perfectly",
    "Terrible quality and very disappointing",
    "Good value for the price",
    "Worst purchase ever",
]

# createDataFrame is given one 1-tuple per row, so wrap every review string.
data = [(text,) for text in review_texts]

# Single string column named "review"; one row per sample review.
df = spark.createDataFrame(data, ["review"])
df.show(truncate=False)


In [0]:
# Keyword lexicons for the rule-based classifier. Entries are lowercase because
# they are matched against a lowercased copy of the review text.
positive_words = ["amazing", "good", "perfect", "excellent", "value"]
negative_words = ["terrible", "worst", "bad", "disappointing"]

# A leading \b anchors every keyword to the start of a word. Inflected forms
# still match ("perfectly", "terribly"), but a keyword buried inside another
# word no longer does ("imperfect" is not positive, "forbade" is not negative).
positive_pattern = r"\b(?:" + "|".join(positive_words) + ")"
negative_pattern = r"\b(?:" + "|".join(negative_words) + ")"

# Build the lowercased column expression once and reuse it in both branches.
review_lower = F.lower(F.col("review"))

# Branch order matters: the positive check runs first, so a review containing
# both positive and negative keywords is labelled POSITIVE. Reviews matching
# neither pattern are labelled NEUTRAL.
sentiment_df = df.withColumn(
    "sentiment",
    F.when(review_lower.rlike(positive_pattern), "POSITIVE")
     .when(review_lower.rlike(negative_pattern), "NEGATIVE")
     .otherwise("NEUTRAL"),
)

sentiment_df.show(truncate=False)


In [0]:
# Tally how many reviews received each sentiment label; the result has one row
# per distinct label with the tally in a column named "count".
reviews_by_sentiment = sentiment_df.groupBy("sentiment")
sentiment_summary = reviews_by_sentiment.count()
sentiment_summary.show()


In [0]:
import mlflow

# Pull the summary to the driver once (it has at most one row per sentiment
# label) and index it by label. Looking counts up in a dict avoids the crash the
# per-label `.filter(...).first()["count"]` pattern hits when a label has no
# rows: `first()` returns None for an empty result, which is not subscriptable.
sentiment_counts = {
    row["sentiment"]: row["count"] for row in sentiment_summary.collect()
}

# Record the approach and the per-label review counts in a single MLflow run.
with mlflow.start_run(run_name="rule_based_sentiment_analysis"):
    mlflow.log_param("approach", "keyword_based_nlp")
    # A label that never occurred is logged as 0 instead of raising.
    mlflow.log_metric("positive_reviews", sentiment_counts.get("POSITIVE", 0))
    mlflow.log_metric("negative_reviews", sentiment_counts.get("NEGATIVE", 0))


In [0]:
# The keyword patterns are lowercase and rlike matching is case-sensitive, so
# match against a lowercased copy of the text. Without this, "Worst purchase
# ever" misses the "worst" keyword and is filed under General.
review_text_lower = F.lower(F.col("review"))

# Assign each review a coarse feedback category from keyword hits. The pricing
# check runs first, so a review mentioning both price and quality keywords is
# categorised as Pricing; reviews matching neither pattern fall back to General.
feedback_df = df.withColumn(
    "category",
    F.when(review_text_lower.rlike("price|value"), "Pricing")
     .when(review_text_lower.rlike("quality|worst|terrible"), "Quality")
     .otherwise("General")
)

feedback_df.show(truncate=False)
