### Load grocery reviews for NLP

This notebook focuses on building a simple sentiment model using the grocery reviews data loaded from the three CSV parts created in the data prep notebook.

In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

# Build (or reuse) the Spark session for this notebook. The same JVM flag is
# handed to both the driver and the executors.
# NOTE(review): "-Djava.security.manager=allow" is presumably needed by the
# local JDK/Hadoop combination - confirm before removing it.
jvm_flag_by_conf_key = {
    "spark.driver.extraJavaOptions": "-Djava.security.manager=allow",
    "spark.executor.extraJavaOptions": "-Djava.security.manager=allow",
}

session_builder = SparkSession.builder.appName("big-data-project-nlp")
for conf_key, jvm_flag in jvm_flag_by_conf_key.items():
    session_builder = session_builder.config(conf_key, jvm_flag)

spark = session_builder.getOrCreate()

base_path = "../data"

# Explicit schema (DDL string) for the review files. Supplying it up front
# replaces `inferSchema`, which costs an extra full pass over the CSV data just
# to guess column types. Column names, order and types mirror what schema
# inference reported for these files.
# `timestamp` is back-quoted because it is also the name of a SQL type.
review_schema = (
    "rating DOUBLE, title STRING, text STRING, asin STRING, "
    "parent_asin STRING, user_id STRING, `timestamp` BIGINT, "
    "helpful_vote INT, verified_purchase BOOLEAN"
)

# The data prep notebook wrote the reviews as three separate CSV parts.
review_paths = [f"{base_path}/grocery_reviews_part{part}" for part in (1, 2, 3)]

df = (
    spark.read
    .option("header", True)
    .schema(review_schema)
    .csv(review_paths)
)

# Quick sanity check: confirm the schema and eyeball a few reviews.
df.printSchema()
df.select("rating", "title", "text").show(5, truncate=False)


Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/12/12 19:32:49 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
                                                                                

root
 |-- rating: double (nullable = true)
 |-- title: string (nullable = true)
 |-- text: string (nullable = true)
 |-- asin: string (nullable = true)
 |-- parent_asin: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- timestamp: long (nullable = true)
 |-- helpful_vote: integer (nullable = true)
 |-- verified_purchase: boolean (nullable = true)

+------+--------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|rating|title                                 |text                                                                                

### Label definition and text filtering

We filter out reviews without text and define a binary label `is_positive` based on the star rating: ratings of 4 or 5 are treated as positive (1), while lower ratings are treated as non-positive (0).


In [2]:
# Build the modelling frame: keep only usable reviews and attach the binary
# label is_positive (1 if rating >= 4.0, else 0).
#
# Rows without a usable rating are dropped explicitly before labelling:
#   * a null rating makes the `>=` comparison null, so `F.when(...)` would fall
#     through to `.otherwise(0)` and silently label the row as non-positive;
#   * Spark orders NaN above every other number, so a NaN rating would pass
#     `>= 4.0` and be labelled positive.
has_text = F.col("text").isNotNull() & (F.col("text") != "")
has_rating = F.col("rating").isNotNull() & ~F.isnan(F.col("rating"))

df_nlp = (
    df
    .filter(has_text & has_rating)
    .withColumn("is_positive", F.when(F.col("rating") >= 4.0, 1).otherwise(0))
    .select("text", "rating", "is_positive")
)

# Preview a few labelled rows; the trailing expression displays the row count.
df_nlp.show(5, truncate=False)
df_nlp.count()


+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------+-----------+
|text                                                                                                                                                                                                                                                                                                                                                                                                                                                               |rating|is_positive|
+-----------------------------------------------------

                                                                                

14301719

In [3]:
# Work on a reproducible ~10% subsample (no replacement) of the labelled
# reviews. Both the training and the test split below come from this subsample.
df_nlp_small = df_nlp.sample(False, 0.1, 42)

# Report the size of the full frame and of the subsample.
for tag, frame in (("full:", df_nlp), ("sampled:", df_nlp_small)):
    print(tag, frame.count())

# Seeded 80/20 split into training and test sets.
splits = df_nlp_small.randomSplit(weights=[0.8, 0.2], seed=42)
train_df, test_df = splits


                                                                                

full: 14301719




sampled: 1430512


                                                                                

In [5]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline

# Text-to-features stages: tokenise the review text, drop stop words, hash the
# remaining tokens into term-frequency vectors, then re-weight them with IDF.
tokenizer = Tokenizer(outputCol="words", inputCol="text")
remover = StopWordsRemover(outputCol="filtered_words", inputCol="words")

# The hashed feature space is kept deliberately small to reduce memory use.
# NOTE(review): Spark's HashingTF docs advise a power of two for numFeatures -
# consider whether 3000 should become e.g. 4096.
hashing_tf = HashingTF(
    numFeatures=3000,
    inputCol="filtered_words",
    outputCol="tf_features",
)
idf = IDF(outputCol="features", inputCol="tf_features")

# Logistic regression on the TF-IDF vectors, predicting is_positive.
# elasticNetParam=0.0 means no L1 component in the penalty (see Spark ML docs).
lr_settings = dict(
    featuresCol="features",
    labelCol="is_positive",
    maxIter=8,
    regParam=0.1,
    elasticNetParam=0.0,
    aggregationDepth=2,
)
lr = LogisticRegression(**lr_settings)

# Chain the feature stages and the classifier into a single estimator.
feature_stages = [tokenizer, remover, hashing_tf, idf]
nlp_pipeline = Pipeline(stages=feature_stages + [lr])


In [6]:
# Fit every pipeline stage (feature transformers + logistic regression) on the
# training split; the result is the fitted pipeline model used for scoring.
nlp_model = nlp_pipeline.fit(train_df)


25/12/12 19:57:10 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
                                                                                

In [7]:
# Score the held-out test split with the fitted pipeline, then preview a few
# rows (label, class probabilities and predicted class next to the text).
predictions = nlp_model.transform(test_df)

preview_cols = ["text", "is_positive", "probability", "prediction"]
predictions.select(*preview_cols).show(n=5, truncate=80)


[Stage 37:>                                                         (0 + 1) / 1]

+--------------------------------------------------------------------------------+-----------+----------------------------------------+----------+
|                                                                            text|is_positive|                             probability|prediction|
+--------------------------------------------------------------------------------+-----------+----------------------------------------+----------+
|"HINT" water is my favorite drinking water!!! and>>>> Amazon is my favorite p...|          1| [0.0509707696039999,0.9490292303960001]|       1.0|
|                                                                #deliciousasaMUG|          1| [0.2649571192970312,0.7350428807029687]|       1.0|
|$6 for five bagels, price gouging.<br />This product tasted like it had no fl...|          0| [0.2418170513817265,0.7581829486182735]|       1.0|
|                                     &quot;Almost&quot; as good as coffee taste!|          1|[0.15739046829543366,0.8

                                                                                

In [8]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
import pyspark.sql.functions as F

# Area under the ROC curve on the test predictions, computed from the model's
# raw scores. The metric is named explicitly (it is also the evaluator default)
# so the printed number is unambiguous.
evaluator = BinaryClassificationEvaluator(
    labelCol="is_positive",
    rawPredictionCol="rawPrediction",
    metricName="areaUnderROC",
)

auc = evaluator.evaluate(predictions)
print("Test AUC:", auc)

# Accuracy = share of rows whose predicted class equals the label.
# It is computed as the mean of a 0/1 "correct" indicator in a single
# aggregation, so the predictions are scanned once instead of running two
# separate count() jobs.
is_correct = (F.col("prediction") == F.col("is_positive")).cast("double")
accuracy = predictions.agg(F.avg(is_correct).alias("accuracy")).first()["accuracy"]

# avg() over zero rows yields null; report NaN rather than failing with a
# division by zero when the test split is empty.
if accuracy is None:
    accuracy = float("nan")
print("Test accuracy:", accuracy)


                                                                                

Test AUC: 0.8950702555589279




Test accuracy: 0.8321967528932196


                                                                                

In [9]:
# Wider preview of the scored test rows, this time including the original star
# rating next to the derived label and the model output.
inspect_cols = ["text", "rating", "is_positive", "probability", "prediction"]
predictions.select(inspect_cols).show(n=10, truncate=120)


[Stage 55:>                                                         (0 + 1) / 1]

+------------------------------------------------------------------------------------------------------------------------+------+-----------+----------------------------------------+----------+
|                                                                                                                    text|rating|is_positive|                             probability|prediction|
+------------------------------------------------------------------------------------------------------------------------+------+-----------+----------------------------------------+----------+
|"HINT" water is my favorite drinking water!!! and>>>> Amazon is my favorite place to shop for all my favorite things!...|   5.0|          1| [0.0509707696039999,0.9490292303960001]|       1.0|
|                                                                                                        #deliciousasaMUG|   5.0|          1| [0.2649571192970312,0.7350428807029687]|       1.0|
|                           $6

                                                                                

In [10]:
# Misclassified test rows: the predicted class differs from the label.
is_misclassified = F.col("prediction") != F.col("is_positive")
wrong = predictions.where(is_misclassified)

# Show a handful of the errors together with the star rating and probabilities.
wrong.select(
    "text", "rating", "is_positive", "probability", "prediction"
).show(n=10, truncate=120)


[Stage 56:>                                                         (0 + 1) / 1]

+------------------------------------------------------------------------------------------------------------------------+------+-----------+-----------------------------------------+----------+
|                                                                                                                    text|rating|is_positive|                              probability|prediction|
+------------------------------------------------------------------------------------------------------------------------+------+-----------+-----------------------------------------+----------+
|                           $6 for five bagels, price gouging.<br />This product tasted like it had no flavor whatsoever.|   1.0|          0|  [0.2418170513817265,0.7581829486182735]|       1.0|
|..... because the wrapper is melted into EVERY sugar daddy and i have to chip that part off so part of it goes down t...|   3.0|          0| [0.14930209324501956,0.8506979067549805]|       1.0|
|...I got crumbs!  On a p

                                                                                

In [11]:
# Persist the fitted pipeline model to disk; overwrite() replaces any model
# previously saved at this path.
nlp_model.write().overwrite().save("../models/grocery_sentiment_lr")


                                                                                