In [0]:
# Install Java, Spark, and Findspark
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://www-us.apache.org/dist/spark/spark-2.4.5/spark-2.4.5-bin-hadoop2.7.tgz
!tar xf spark-2.4.5-bin-hadoop2.7.tgz
!pip install -q findspark
# Set Environment Variables
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.5-bin-hadoop2.7"
# Start a SparkSession
import findspark
findspark.init()

In [0]:
# Start Spark session
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("Vine_Review_Analysis").getOrCreate()

In [143]:
# Read in data from S3 Buckets
from pyspark import SparkFiles
url ="https://shruthiramu-bucket.s3.us-east-2.amazonaws.com/vine_table.csv"
spark.sparkContext.addFile(url)
df = spark.read.csv(SparkFiles.get("vine_table.csv"), sep=",", header=True)

# Show DataFrame
df.show()

+--------------+-----------+-------------+-----------+----+
|     review_id|star_rating|helpful_votes|total_votes|vine|
+--------------+-----------+-------------+-----------+----+
|R3M9TEU08WAIQX|          5|            1|          1|   N|
|R2LTQL3SBKSYXW|          5|            3|          3|   N|
| R9XEAJEQ7X9TH|          5|            0|          0|   N|
|R36PUUGYGKC3GQ|          5|            0|          0|   N|
|R1ZDB2ZTQK6OO0|          1|            0|          1|   N|
|R1T64YX8QZQ97D|          5|            0|          0|   N|
|R2680XB05IT0WN|          3|            0|          0|   N|
|R3M54S76EJCQVR|          5|            0|          0|   N|
|R2RDS2RJET8H64|          5|            1|          1|   N|
| R9A1FY6DUKVLS|          4|            0|          0|   N|
|R193N6SEL5WKGE|          1|            0|          0|   N|
|R2YRKSXPT5PG35|          5|            0|          0|   N|
|R1GU2Z7C1V1NEG|          5|            0|          0|   N|
| R9993YL87NHTQ|          5|            

In [144]:
#Rule: Consider reviews which have more than 0 helpful_votes and total_votes
df_votes_df = df.filter((df.helpful_votes > 0)& (df.total_votes > 0))
df_votes_df.show(10)

+--------------+-----------+-------------+-----------+----+
|     review_id|star_rating|helpful_votes|total_votes|vine|
+--------------+-----------+-------------+-----------+----+
|R3M9TEU08WAIQX|          5|            1|          1|   N|
|R2LTQL3SBKSYXW|          5|            3|          3|   N|
|R2RDS2RJET8H64|          5|            1|          1|   N|
| R9993YL87NHTQ|          5|            1|          1|   N|
|R38OYHFA0P1PAF|          5|           14|         15|   N|
|R12REXBQ1TK0PG|          5|            1|          1|   N|
| R7SC4UY44OM7X|          5|            5|          8|   N|
|R326GY3LLG96B8|          4|            1|          6|   N|
| RAQGBP7QUSYLV|          1|            5|         12|   N|
|R14TPP4CJCASWX|          2|            1|          3|   N|
+--------------+-----------+-------------+-----------+----+
only showing top 10 rows



In [145]:
#Total reviews with more than 0 helpful_votes and total_votes in the dataset is..
df_votes_df.count()

84270

In [146]:
#Total 5 start reviews including both vine and non vine:
df_votes_df.filter(df_votes_df.star_rating == "5").count()

43027

In [147]:
#Number of Paid(vine) reviews are:
df_vine_df = df_votes_df.filter(df_votes_df.vine == "Y")
df_vine_df.count()

70

In [148]:
#Number of Non paid(non vine)reviews are:
df_non_vine_df = df_votes_df.filter(df.vine == "N")
df_non_vine_df.count()

84200

In [149]:
#Number of Paid(vine) 5 star reviews are:
df_vine_df.filter(df_vine_df.star_rating == "5").count()

32

In [150]:
#Number of non - Paid(non - vine) 5 star reviews are:
df_non_vine_df.filter(df_non_vine_df.star_rating == "5").count()

42995

In [151]:

#Average rating vine reviews
from pyspark.sql.functions import col, avg

df_vine_df.filter(df_vine_df['vine'] == "Y").agg(avg(col("star_rating"))).show()



+-----------------+
| avg(star_rating)|
+-----------------+
|4.128571428571429|
+-----------------+



In [152]:
#Average rating non vine reviews
df_non_vine_df.filter(df_non_vine_df['vine'] == "N").agg(avg(col("star_rating"))).show()

#Average rating for vine reviews is higher than average rating for non vine reviews.

+------------------+
|  avg(star_rating)|
+------------------+
|3.7149643705463182|
+------------------+



In [153]:
#Number of helpful_votes in vine reviews:
cleaned_df = df_vine_df.withColumn("helpful_votes",df_vine_df["helpful_votes"].cast('int'))

cleaned_df.groupBy("helpful_votes").sum().show()


+-------------+------------------+
|helpful_votes|sum(helpful_votes)|
+-------------+------------------+
|            1|                37|
|            6|                12|
|            3|                15|
|           20|                20|
|            5|                20|
|            4|                24|
|            8|                 8|
|            2|                28|
+-------------+------------------+



In [154]:
#Number of helpful_votes in non vine reviews:
cleaned_non_df = df_non_vine_df.withColumn("helpful_votes",df_non_vine_df["helpful_votes"].cast('int'))

cleaned_non_df.groupBy("helpful_votes").sum().show()
#Helpful votes are higher for the non vine reviews and are more spread.

+-------------+------------------+
|helpful_votes|sum(helpful_votes)|
+-------------+------------------+
|          243|               243|
|           31|              1829|
|           85|               425|
|          137|               137|
|           65|               520|
|           53|               371|
|          133|               133|
|           78|                78|
|          108|               108|
|           34|               782|
|          126|               126|
|          101|               202|
|          115|               115|
|           81|                81|
|           28|              1680|
|          300|               300|
|           76|               380|
|           27|              2025|
|           26|              1898|
|           44|               880|
+-------------+------------------+
only showing top 20 rows



In [155]:
df_votes_df.show(10)

+--------------+-----------+-------------+-----------+----+
|     review_id|star_rating|helpful_votes|total_votes|vine|
+--------------+-----------+-------------+-----------+----+
|R3M9TEU08WAIQX|          5|            1|          1|   N|
|R2LTQL3SBKSYXW|          5|            3|          3|   N|
|R2RDS2RJET8H64|          5|            1|          1|   N|
| R9993YL87NHTQ|          5|            1|          1|   N|
|R38OYHFA0P1PAF|          5|           14|         15|   N|
|R12REXBQ1TK0PG|          5|            1|          1|   N|
| R7SC4UY44OM7X|          5|            5|          8|   N|
|R326GY3LLG96B8|          4|            1|          6|   N|
| RAQGBP7QUSYLV|          1|            5|         12|   N|
|R14TPP4CJCASWX|          2|            1|          3|   N|
+--------------+-----------+-------------+-----------+----+
only showing top 10 rows



In [156]:
df_votes_df.filter(df_votes_df.vine=="Y").count()

70

In [0]:
# Import functions
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF, StringIndexer, CountVectorizerModel

In [0]:
pos_neg_to_num = StringIndexer(inputCol='vine',outputCol='label')

tokenizer = Tokenizer(inputCol="star_rating", outputCol="token_text")

hashingTF = HashingTF(inputCol="token_text", outputCol='hash_token')

idf = IDF(inputCol='hash_token', outputCol='features')

In [0]:
# Create and run a data processing Pipeline
from pyspark.ml import Pipeline
data_prep_pipeline = Pipeline(stages=[pos_neg_to_num, tokenizer, hashingTF, idf])

In [0]:
# Fit and transform the pipeline
cleaner = data_prep_pipeline.fit(df_votes_df)
cleaned = cleaner.transform(df_votes_df)

In [161]:
cleaned.select(["label", "features"]).show(truncate=False)
#Label 0.0 is here means it is a non-vine review

+-----+--------------------------------------+
|label|features                              |
+-----+--------------------------------------+
|0.0  |(262144,[89689],[0.6721867297632954]) |
|0.0  |(262144,[89689],[0.6721867297632954]) |
|0.0  |(262144,[89689],[0.6721867297632954]) |
|0.0  |(262144,[89689],[0.6721867297632954]) |
|0.0  |(262144,[89689],[0.6721867297632954]) |
|0.0  |(262144,[89689],[0.6721867297632954]) |
|0.0  |(262144,[89689],[0.6721867297632954]) |
|0.0  |(262144,[233878],[1.9651758437139097])|
|0.0  |(262144,[236232],[1.687728883113754]) |
|0.0  |(262144,[212053],[2.575242925768841]) |
|0.0  |(262144,[89689],[0.6721867297632954]) |
|0.0  |(262144,[89689],[0.6721867297632954]) |
|0.0  |(262144,[236232],[1.687728883113754]) |
|0.0  |(262144,[233878],[1.9651758437139097])|
|0.0  |(262144,[236232],[1.687728883113754]) |
|0.0  |(262144,[236232],[1.687728883113754]) |
|0.0  |(262144,[89689],[0.6721867297632954]) |
|0.0  |(262144,[89074],[2.4275702546118914]) |
|0.0  |(26214

In [0]:
# Break data down into a training set and a testing set
training, testing = cleaned.randomSplit([0.7, 0.3])

In [0]:
from pyspark.ml.classification import NaiveBayes
# Create a Naive Bayes model and fit training data
nb = NaiveBayes()
predictor = nb.fit(training)

In [164]:
test_results = predictor.transform(testing)

test_results.show(5, truncate=False)

+--------------+-----------+-------------+-----------+----+-----+----------+-----------------------+-------------------------------------+-----------------------------------------+-------------------------------------------+----------+
|review_id     |star_rating|helpful_votes|total_votes|vine|label|token_text|hash_token             |features                             |rawPrediction                            |probability                                |prediction|
+--------------+-----------+-------------+-----------+----+-----+----------+-----------------------+-------------------------------------+-----------------------------------------+-------------------------------------------+----------+
|R10094UDVSDL5T|5          |2            |3          |N   |0.0  |[5]       |(262144,[89689],[1.0]) |(262144,[89689],[0.6721867297632954])|[-1.9025558126289972,-13.662263780786656]|[0.999992186954968,7.813045032046427E-6]   |0.0       |
|R100CRI2TMIS6I|2          |1            |1          |N 

In [165]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
acc_eval = MulticlassClassificationEvaluator()
acc= acc_eval.evaluate(test_results)

print("Accuracy of the model at predicting reviews was : %f" % acc)

Accuracy of the model at predicting reviews was : 0.998511


In [0]:
# 1. The trustworthiness of non vine reviews are way less compared to the vine reviews are the accuracy of predicting non vine reviews is 0.998 ~ 1.
# 2. Also the Number of helpful votes were less for vine reviews than non vine reviews.