In [16]:
#importing libraries
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql.functions import col, when




In [17]:
#Spark Session
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("Taxi Reviews Sentiment Analysis") \
    .getOrCreate()


In [18]:
#Reviews Data Load
path = "/content/drive/MyDrive/NYC_Taxi_Project/task4_input_data"

df = spark.read.csv(path, header=True, inferSchema=True)
df.show(5)


+--------+--------------------+---------------------+---------------+-------------+------------------+-----------------+----------+------------------+------------------+------------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+---------------------+--------------------+------+---------------+
|VendorID|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|  pickup_longitude|  pickup_latitude|RateCodeID|store_and_fwd_flag| dropoff_longitude|  dropoff_latitude|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|trip_duration_minutes|         review_text|rating|sentiment_label|
+--------+--------------------+---------------------+---------------+-------------+------------------+-----------------+----------+------------------+------------------+------------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------

In [19]:
#Label Creation
df = df.withColumn(
    "label",
    when(col("rating") >= 4, 1)  # Positive
    .when(col("rating") <= 2, 0) # Negative
    .otherwise(2)                # Neutral
)

df.select("review_text", "rating", "label").show(5)


+--------------------+------+-----+
|         review_text|rating|label|
+--------------------+------+-----+
|The trip was shor...|     4|    1|
|The ride was aver...|     4|    1|
|The trip was shor...|     4|    1|
|The trip was shor...|     4|    1|
|The trip was shor...|     2|    0|
+--------------------+------+-----+
only showing top 5 rows


In [21]:
#NLP Pre-Processing
tokenizer = Tokenizer(
    inputCol="review_text",
    outputCol="words"
)

stopword_remover = StopWordsRemover(
    inputCol="words",
    outputCol="filtered_words"
)

hashingTF = HashingTF(
    inputCol="filtered_words",
    outputCol="rawFeatures",
    numFeatures=10000
)

idf = IDF(
    inputCol="rawFeatures",
    outputCol="features"
)


In [22]:
#ML Model (Logistic Regression)
lr = LogisticRegression(
    featuresCol="features",
    labelCol="label",
    maxIter=10
)


In [23]:
#Pipeline Creation
pipeline = Pipeline(stages=[
    tokenizer,
    stopword_remover,
    hashingTF,
    idf,
    lr
])


In [24]:
#Train-Test Split
train_df, test_df = df.randomSplit([0.8, 0.2], seed=42)


In [25]:
#Model Training
model = pipeline.fit(train_df)



In [26]:
#Predictions
predictions = model.transform(test_df)

predictions.select(
    "review_text",
    "label",
    "prediction"
).show(10, truncate=False)


+-------------------------------------------------------------+-----+----------+
|review_text                                                  |label|prediction|
+-------------------------------------------------------------+-----+----------+
|The trip was short and completed quickly                     |1    |1.0       |
|The ride was average and reached the destination on time     |0    |1.0       |
|The trip was short and completed quickly                     |1    |1.0       |
|The trip was short and completed quickly                     |1    |1.0       |
|The ride was excellent, smooth and the driver was very polite|1    |1.0       |
|The ride was average and reached the destination on time     |1    |1.0       |
|The trip was short and completed quickly                     |1    |1.0       |
|The ride was average and reached the destination on time     |1    |1.0       |
|The ride was average and reached the destination on time     |1    |1.0       |
|The ride was average and re

In [27]:
#Model Evaluation
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="accuracy"
)

accuracy = evaluator.evaluate(predictions)
print("Model Accuracy:", accuracy)


Model Accuracy: 0.6045678219940094
