In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col
from pyspark.sql.types import StructType, StringType, DoubleType
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline, PipelineModel
import time

In [3]:
# Initialize Spark Session with Kafka package
# NOTE: no session-wide "spark.sql.streaming.checkpointLocation" is configured.
# The console and memory sinks used in this notebook fall back to a temporary
# checkpoint directory on their own, whereas a fixed location makes the named
# append-mode memory query ("training_buffer") fail on a re-run: that sink
# cannot recover from the checkpoint a previous run left behind.
spark = SparkSession.builder \
    .appName("YelpKafkaSentiment") \
    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.4") \
    .config("spark.executor.memory", "8g") \
    .config("spark.driver.memory", "8g") \
    .config("spark.driver.maxResultSize", "2g") \
    .getOrCreate()

# Define schema of the JSON payload carried in the Kafka message value
schema = StructType() \
    .add("text", StringType()) \
    .add("stars", DoubleType())

# Ingest from Kafka, reading the topic from its earliest available offset
df_raw = spark.readStream.format("kafka") \
    .option("kafka.bootstrap.servers", "localhost:9092") \
    .option("subscribe", "yelp_reviews") \
    .option("startingOffsets", "earliest") \
    .load()

# Parse JSON: the Kafka value is binary, so cast it to a string first and then
# flatten the parsed struct into top-level "text" / "stars" columns
df = df_raw.selectExpr("CAST(value AS STRING)") \
    .select(from_json(col("value"), schema).alias("data")) \
    .select("data.*")

# Show the results
query = df.writeStream.format("console").outputMode("append").start()
try:
    # Run for up to 30 seconds. Unlike time.sleep(), awaitTermination() returns
    # early and raises if the streaming query fails, so errors are not hidden.
    query.awaitTermination(30)
finally:
    # Always stop the preview query, even if the cell is interrupted
    query.stop()


25/04/18 18:04:19 WARN Utils: Your hostname, sriganesh-Inspiron-14-Plus-7440 resolves to a loopback address: 127.0.1.1; using 172.31.82.137 instead (on interface wlp0s20f3)
25/04/18 18:04:19 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


:: loading settings :: url = jar:file:/home/sriganesh/conda_root/lib/python3.12/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/sriganesh/.ivy2/cache
The jars for the packages stored in: /home/sriganesh/.ivy2/jars
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-7740858f-e04c-4ba5-ab8c-79d7275228f3;1.0
	confs: [default]
	found org.apache.spark#spark-sql-kafka-0-10_2.12;3.5.4 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.5.4 in central
	found org.apache.kafka#kafka-clients;3.4.1 in central
	found org.lz4#lz4-java;1.8.0 in central
	found org.xerial.snappy#snappy-java;1.1.10.5 in central
	found org.slf4j#slf4j-api;2.0.7 in central
	found org.apache.hadoop#hadoop-client-runtime;3.3.4 in central
	found org.apache.hadoop#hadoop-client-api;3.3.4 in central
	found commons-logging#commons-logging;1.1.3 in local-m2-cache
	found com.google.code.findbugs#jsr305;3.0.0 in central
	found org.apache.commons#commons-pool2;2.11.1 in central
:: resolution report :: resolve 479ms :: artifact

-------------------------------------------
Batch: 0
-------------------------------------------


25/04/18 18:04:34 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


+--------------------+-----+
|                text|stars|
+--------------------+-----+
|If you decide to ...|  3.0|
|I've taken a lot ...|  5.0|
|Family diner. Had...|  3.0|
|Wow!  Yummy, diff...|  5.0|
|Cute interior and...|  4.0|
|I am a long term ...|  1.0|
|Loved this tour! ...|  5.0|
|Amazingly amazing...|  5.0|
|This easter inste...|  3.0|
|Had a party of 6 ...|  3.0|
|My experience wit...|  5.0|
|Locals recommende...|  4.0|
|Love going here f...|  4.0|
|Good food--loved ...|  4.0|
|The bun makes the...|  4.0|
|Great place for b...|  5.0|
|Tremendous servic...|  5.0|
|The hubby and I h...|  4.0|
|I go to blow bar ...|  5.0|
|My absolute favor...|  5.0|
+--------------------+-----+
only showing top 20 rows



In [4]:
# Labeling: 1 (positive), 0 (negative), remove neutral
# Rows whose text is null (e.g. a message that did not parse against the schema)
# are dropped too, since a null text would break the tokenizer stage downstream.
df_filtered = df.filter(col("text").isNotNull() & (col("stars") != 3.0))
df_labeled = df_filtered.withColumn("label", (col("stars") > 3.0).cast("int")).drop("stars")

# Write buffered stream to memory; keep the query handle so it can be stopped
buffer_query = df_labeled.writeStream \
    .format("memory") \
    .queryName("training_buffer") \
    .outputMode("append") \
    .start()

# Wait for buffer to fill
print("Buffering stream for 60 seconds before training...")
try:
    # Returns after 60 seconds, or earlier (raising) if the query fails
    buffer_query.awaitTermination(60)
finally:
    # Stop buffering so the in-memory table stops growing on the driver, the
    # training data is a fixed snapshot, and the query name is free if this
    # cell is run again in the same session.
    buffer_query.stop()

# Pull for training (the in-memory table stays queryable after the query stops)
training_df = spark.sql("SELECT * FROM training_buffer")
print(f"📊 Training on {training_df.count()} records...")

25/04/18 18:05:03 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
25/04/18 18:05:03 WARN AdminClientConfig: These configurations '[key.deserializer, value.deserializer, enable.auto.commit, max.poll.records, auto.offset.reset]' were supplied but are not used yet.


Buffering stream for 60 seconds before training...


25/04/18 18:06:04 WARN TaskSetManager: Stage 2 contains a task of very large size (4622 KiB). The maximum recommended task size is 1000 KiB.


📊 Training on 177352 records...


In [5]:
# Preprocessing pipeline:
#   text -> lower-cased whitespace tokens -> stop words removed
#        -> hashed term-frequency vector (10000 buckets) -> logistic regression
tokenizer = Tokenizer(inputCol="text", outputCol="words")
remover = StopWordsRemover(inputCol="words", outputCol="filtered")
vectorizer = HashingTF(inputCol="filtered", outputCol="features", numFeatures=10000)
lr = LogisticRegression(labelCol="label", featuresCol="features", maxIter=10)

# Fit every stage in order on the buffered training records
pipeline = Pipeline(stages=[tokenizer, remover, vectorizer, lr])
model = pipeline.fit(training_df)

# Save model (overwrites any model previously stored at this path)
model_path = "sentiment_model-3"
model.write().overwrite().save(model_path)
print("✅ Model saved to disk.")


25/04/18 18:06:34 WARN TaskSetManager: Stage 5 contains a task of very large size (4622 KiB). The maximum recommended task size is 1000 KiB.
25/04/18 18:06:38 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS
25/04/18 18:06:38 WARN TaskSetManager: Stage 7 contains a task of very large size (4622 KiB). The maximum recommended task size is 1000 KiB.
25/04/18 18:06:40 WARN TaskSetManager: Stage 9 contains a task of very large size (4622 KiB). The maximum recommended task size is 1000 KiB.
25/04/18 18:06:40 WARN TaskSetManager: Stage 11 contains a task of very large size (4622 KiB). The maximum recommended task size is 1000 KiB.
25/04/18 18:06:41 WARN TaskSetManager: Stage 13 contains a task of very large size (4622 KiB). The maximum recommended task size is 1000 KiB.
25/04/18 18:06:41 WARN TaskSetManager: Stage 15 contains a task of very large size (4622 KiB). The maximum recommended task size is 1000 KiB.
25/04/18 18:06:41 WARN TaskSetManager: St

✅ Model saved to disk.


In [6]:
# Sentence to classify
input_text = "This restaurant was absolutely worst tastes like shit! "

# Restore the fitted pipeline that was persisted under model_path
loaded_model = PipelineModel.load(model_path)

# Build a single-row DataFrame; the column is named "text" because that is the
# inputCol the pipeline's Tokenizer was configured with
test_df = spark.createDataFrame([(input_text,)], schema=["text"])

# Run the sentence through every pipeline stage
predictions = loaded_model.transform(test_df)

# Display the full (untruncated) text with its class probabilities and label
shown_columns = ["text", "probability", "prediction"]
predictions.select(*shown_columns).show(truncate=False)

                                                                                

+-------------------------------------------------------+----------------------------------------+----------+
|text                                                   |probability                             |prediction|
+-------------------------------------------------------+----------------------------------------+----------+
|This restaurant was absolutely worst tastes like shit! |[0.9399839349017753,0.06001606509822466]|0.0       |
+-------------------------------------------------------+----------------------------------------+----------+



25/04/18 21:33:34 WARN Executor: Issue communicating with driver in heartbeater
org.apache.spark.rpc.RpcTimeoutException: Futures timed out after [10000 milliseconds]. This timeout is controlled by spark.executor.heartbeatInterval
	at org.apache.spark.rpc.RpcTimeout.org$apache$spark$rpc$RpcTimeout$$createRpcTimeoutException(RpcTimeout.scala:47)
	at org.apache.spark.rpc.RpcTimeout$$anonfun$addMessageIfTimeout$1.applyOrElse(RpcTimeout.scala:62)
	at org.apache.spark.rpc.RpcTimeout$$anonfun$addMessageIfTimeout$1.applyOrElse(RpcTimeout.scala:58)
	at scala.runtime.AbstractPartialFunction.apply(AbstractPartialFunction.scala:38)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:76)
	at org.apache.spark.rpc.RpcEndpointRef.askSync(RpcEndpointRef.scala:101)
	at org.apache.spark.executor.Executor.reportHeartBeat(Executor.scala:1219)
	at org.apache.spark.executor.Executor.$anonfun$heartbeater$1(Executor.scala:295)
	at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java