Initialize PySpark

In [16]:
from pyspark.sql import SparkSession

# Create the notebook's Spark session (or reuse one that is already running),
# registered under a descriptive application name.
spark = (
    SparkSession.builder
    .appName("Retail Sales Sentiment Analysis")
    .getOrCreate()
)

# Print the Spark version as a quick confirmation that the session is available.
print(spark.version)

3.5.3


Loading Dataset

In [17]:
# Location of the reviews CSV file.
data_path = "/content/amazon_reviews.csv"

# Read the CSV, treating the first line as the header and letting Spark infer
# each column's type from the data.
# NOTE(review): if reviewText can contain embedded quotes or line breaks, the
# reader may also need quote/escape/multiLine options - confirm against the raw file.
df = spark.read.options(header=True, inferSchema=True).csv(data_path)

# Preview the first five rows.
df.show(n=5)

+---+------------+-------+--------------------+----------+--------+-----------+----------+----------+------------------+--------------------+------------------+
|_c0|reviewerName|overall|          reviewText|reviewTime|day_diff|helpful_yes|helpful_no|total_vote|score_pos_neg_diff|score_average_rating|wilson_lower_bound|
+---+------------+-------+--------------------+----------+--------+-----------+----------+----------+------------------+--------------------+------------------+
|  0|        NULL|    4.0|          No issues.|2014-07-23|     138|          0|         0|         0|                 0|                 0.0|               0.0|
|  1|        0mie|    5.0|Purchased this fo...|2013-10-25|     409|          0|         0|         0|                 0|                 0.0|               0.0|
|  2|         1K3|    4.0|it works as expec...|2012-12-23|     715|          0|         0|         0|                 0|                 0.0|               0.0|
|  3|         1m2|    5.0|This thi

Analyzing Missing Values

In [18]:
from pyspark.sql import functions as F
from pyspark.sql.functions import col

# Gather the total row count and every column's null count in ONE Spark job.
# Filtering and counting once per column launches a separate job (and a full
# scan of the data) for each column. The aggregated row is read back by
# position, so no column name can clash with the total-row entry.
null_count_exprs = [F.count(F.when(col(c).isNull(), 1)).alias(c) for c in df.columns]
summary = df.agg(F.count(F.lit(1)), *null_count_exprs).first()

# Count total rows
total_rows = summary[0]

# Count missing values in each column
missing_counts = dict(zip(df.columns, summary[1:]))

# Print missing values summary
for col_name, n_missing in missing_counts.items():
    # Guard against an empty dataset so the percentage never divides by zero.
    pct_missing = (n_missing / total_rows) * 100 if total_rows else 0.0
    print(f"Column: {col_name}, Missing Values: {n_missing}, Percentage: {pct_missing:.2f}%")

Column: _c0, Missing Values: 0, Percentage: 0.00%
Column: reviewerName, Missing Values: 1, Percentage: 0.02%
Column: overall, Missing Values: 0, Percentage: 0.00%
Column: reviewText, Missing Values: 1, Percentage: 0.02%
Column: reviewTime, Missing Values: 1, Percentage: 0.02%
Column: day_diff, Missing Values: 2, Percentage: 0.04%
Column: helpful_yes, Missing Values: 1, Percentage: 0.02%
Column: helpful_no, Missing Values: 0, Percentage: 0.00%
Column: total_vote, Missing Values: 1, Percentage: 0.02%
Column: score_pos_neg_diff, Missing Values: 1, Percentage: 0.02%
Column: score_average_rating, Missing Values: 0, Percentage: 0.00%
Column: wilson_lower_bound, Missing Values: 1, Percentage: 0.02%


Removing rows where reviewText is null and filling other missing values

In [19]:
# Keep only the rows whose reviewText is present.
df = df.where(df["reviewText"].isNotNull())

In [20]:
from pyspark.sql import functions as F

# Placeholder values for columns that are not directly used by the model.
# NOTE(review): the "Unknown" placeholder can only be stored if reviewTime was
# inferred as a string column - confirm with df.printSchema().
fill_defaults = {
    "reviewerName": "Anonymous",
    "reviewTime": "Unknown",
    "day_diff": -1,
    "helpful_yes": 0,
    "helpful_no": 0,
    "total_vote": 0,
    "score_pos_neg_diff": 0,
    "wilson_lower_bound": 0,
}
df = df.na.fill(fill_defaults)

# Verify the result: one row showing how many nulls remain in each column.
remaining_nulls = [
    F.count(F.when(F.col(name).isNull(), name)).alias(name)
    for name in df.columns
]
df.select(remaining_nulls).show()

+---+------------+-------+----------+----------+--------+-----------+----------+----------+------------------+--------------------+------------------+
|_c0|reviewerName|overall|reviewText|reviewTime|day_diff|helpful_yes|helpful_no|total_vote|score_pos_neg_diff|score_average_rating|wilson_lower_bound|
+---+------------+-------+----------+----------+--------+-----------+----------+----------+------------------+--------------------+------------------+
|  0|           0|      0|         0|         0|       0|          0|         0|         0|                 0|                   0|                 0|
+---+------------+-------+----------+----------+--------+-----------+----------+----------+------------------+--------------------+------------------+



In [21]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import Tokenizer, StopWordsRemover, NGram, HashingTF, IDF
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
# `col` is imported here as well so this cell does not depend on an earlier
# cell's import having been executed.
from pyspark.sql.functions import avg, col, when

# Step 1: Label Creation (Based on 'overall' Ratings)
# Positive Sentiment (1): Ratings 4 and 5
# Negative Sentiment (0): Ratings 1 and 2
# Ratings matching neither branch get a null label and are filtered out below.
df = df.withColumn("label", when(col("overall") >= 4, 1).when(col("overall") <= 2, 0))
df = df.filter(df["label"].isNotNull())  # Remove neutral ratings

# Step 2: Text Preprocessing
tokenizer = Tokenizer(inputCol="reviewText", outputCol="words")
remover = StopWordsRemover(inputCol="words", outputCol="filtered")
# An NGram(n=2) stage used to sit here, but its "ngrams" output was never read:
# HashingTF consumes "filtered". The stage only added work to every fit and
# transform, so it is no longer part of the pipeline; the features and
# predictions are unchanged. To use bigram features, re-add
# NGram(n=2, inputCol="filtered", outputCol="ngrams") and feed its output into
# HashingTF (NGram stays imported above for that purpose).

# Step 3: Feature Engineering
# Term frequencies hashed into 10,000 buckets, then re-weighted by IDF.
hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures", numFeatures=10000)
idf = IDF(inputCol="rawFeatures", outputCol="finalFeatures")

# Step 4: Logistic Regression Model
lr = LogisticRegression(featuresCol="finalFeatures", labelCol="label")

# Step 5: Build and Train Pipeline
pipeline = Pipeline(stages=[tokenizer, remover, hashingTF, idf, lr])

# Split data into training and testing (70/30, seeded)
(trainingData, testData) = df.randomSplit([0.7, 0.3], seed=1234)

# Train the model
model = pipeline.fit(trainingData)

# Step 6: Predictions
predictions = model.transform(testData)

# Step 7: Evaluate the Model
evaluator = BinaryClassificationEvaluator(labelCol="label", rawPredictionCol="rawPrediction", metricName="areaUnderROC")
auc = evaluator.evaluate(predictions)
print(f"Area Under ROC: {auc}")

# Step 8: Aggregate Sentiment Scores
# The grouping key is the star rating ("overall"), so each output row is the
# mean predicted label for one rating value.
product_sentiment = predictions.groupBy("overall").agg(avg("prediction").alias("avg_sentiment"))
product_sentiment.show()



Area Under ROC: 0.8782051282051285
+-------+------------------+
|overall|     avg_sentiment|
+-------+------------------+
|    1.0|0.3235294117647059|
|    5.0|0.9220665499124343|
|    4.0|0.8766233766233766|
|    2.0|0.5652173913043478|
+-------+------------------+



In [23]:
import os

output_path = "/content/output"

# Create the output directory explicitly. The plain-Python open() below needs
# it to exist and should not rely on the Spark writer having created it.
os.makedirs(output_path, exist_ok=True)

# Save the aggregated sentiment table as CSV with a header row, overwriting
# the results of any previous run.
product_sentiment.write.csv(f"{output_path}/product_sentiments.csv", header=True, mode="overwrite")

# Save Evaluation Results
with open(f"{output_path}/evaluation_results.txt", "w", encoding="utf-8") as results_file:
    results_file.write(f"Area Under ROC: {auc}\n")

# Stop Spark session
spark.stop()