In [1]:
import sparknlp
sparknlp.start()

from sparknlp.base import *
from sparknlp.annotator import *

from pyspark.sql import SparkSession
from pyspark.sql import functions as F



In [2]:
#create Spark session
spark = SparkSession.builder.appName('YelpML').getOrCreate()

#change configuration settings on Spark 
conf = spark.sparkContext._conf.setAll([('spark.executor.memory', '5g'), ('spark.app.name', 'Spark Updated Conf'), ('spark.executor.cores', '4'), ('spark.cores.max', '4'), ('spark.driver.memory','8g')])

#print spark configuration settings
#spark.sparkContext.getConf().getAll()

In [3]:
# Import Data
dataDir = "gs://msca-bdp-student-gcs/group2/yelp-datasample2"
business = spark.read.json(dataDir + "/sample_business")
checkin = spark.read.json(dataDir + "/sample_checkin")
review = spark.read.json(dataDir + "/sample_review")
tip = spark.read.json(dataDir + "/sample_tip")
user = spark.read.json(dataDir + "/sample_user")

23/08/11 20:46:03 WARN org.apache.spark.sql.catalyst.util.package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

## Review EDA

In [4]:
review.take(1)

[Row(business_id='arKiXax3ScSM_z3O-0CIyw', cool=0, date='2010-10-17 01:50:46', funny=0, review_id='zCNdcNrkIKefTPbak7CHVA', stars=5.0, text='Great Italian food!!  We have eaten here several times now and each time we have eaten something different.  Everytime the food has been fabulous!  We actually crave their food during the week and want to head over to Philadelphia for our Spasso food fix!', useful=0, user_id='bz2FrqfKrVmS7WwC-7C9aA')]

In [5]:
from pyspark.sql.functions import isnan, when, count, col
review.select([count(when(review[c].isNull(), c)).alias(c) for c in review.columns]).show()

[Stage 7:>                                                          (0 + 4) / 4]

+-----------+----+----+-----+---------+-----+----+------+-------+
|business_id|cool|date|funny|review_id|stars|text|useful|user_id|
+-----------+----+----+-----+---------+-----+----+------+-------+
|          0|   0|   0|    0|        0|    0|   0|     0|      0|
+-----------+----+----+-----+---------+-----+----+------+-------+



                                                                                

In [6]:
review.count()

                                                                                

70241

In [7]:
review_sample = review.sample(fraction=0.02, seed=42)

In [8]:
review_sample.count()

                                                                                

1482

In [9]:
review_sample.show()

+--------------------+----+-------------------+-----+--------------------+-----+--------------------+------+--------------------+
|         business_id|cool|               date|funny|           review_id|stars|                text|useful|             user_id|
+--------------------+----+-------------------+-----+--------------------+-----+--------------------+------+--------------------+
|E03HQDIBBR1UHVd0B...|   0|2020-06-04 17:25:44|    0|Vnfrt7BhWVwBH4Zjq...|  5.0|The experience at...|     0|V8dN1Nvj8bKJ2GL8C...|
|8_VaLzyX-H0nzbFIK...|   1|2014-12-03 00:25:46|    0|vJvjgW8cYExb2VM1o...|  5.0|It truly is as go...|     1|BkMqpJikNc3r5itc-...|
|fTZZih-F-0VPbnv7H...|   0|2020-07-06 05:41:35|    0|QkSPQf4YCPg1Fs8s5...|  1.0|$20 a piece for a...|     0|G7Sabx-ak70f_Zt8O...|
|Mfss88nOCGdyHkZZC...|   0|2014-03-07 23:37:04|    0|W8xEZ7SuEEaFwMdR6...|  4.0|I went to Caffe N...|     2|8MkZ6bpdP7x8Vlm_u...|
|FgnNJt32BcXz7u4Ny...|   0|2020-06-10 16:43:40|    0|SJX5aSGfGJqt8PB5e...|  1.0|Disgusting

In [11]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import Word2Vec
from pyspark.ml.regression import LinearRegression
# from sparknlp.base import DocumentAssembler, Finisher
# from sparknlp.annotator import Tokenizer, Normalizer

In [12]:
review_sample = review_sample.select('text', 'stars')

In [18]:
# Initialize SparkNLP components
document_assembler = DocumentAssembler().setInputCol("text").setOutputCol("document")
tokenizer = Tokenizer().setInputCols(["document"]).setOutputCol("token")
normalizer = Normalizer().setInputCols(["token"]).setOutputCol("normalized").setLowercase(True) \
    .setCleanupPatterns(["""[^\w\d\s]"""])
lemmatizer = LemmatizerModel.pretrained().setInputCols(["normalized"]).setOutputCol("lemmatized")
finisher = Finisher().setInputCols(['lemmatized']).setOutputCols(["finished"])

# Create a SparkNLP pipeline
pipeline = Pipeline(stages=[document_assembler, tokenizer, normalizer, lemmatizer, finisher])



# Fit and transform the data
processed_data = pipeline.fit(review_sample).transform(review_sample)

# processed_data = processed_data.withColumn("normalized_tokens", F.expr("transform(normalized, x -> x.result)"))

lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[OK!]


In [20]:
processed_data = processed_data.drop('text')

In [21]:
processed_data.show()

+-----+--------------------+
|stars|            finished|
+-----+--------------------+
|  5.0|[the, experience,...|
|  5.0|[it, truly, be, a...|
|  1.0|[20, a, piece, fo...|
|  4.0|[i, go, to, caffe...|
|  1.0|[disgusting, cust...|
|  1.0|[this, be, what, ...|
|  1.0|[this, be, not, a...|
|  5.0|[i, have, an, exc...|
|  3.0|[nice, atmosphere...|
|  5.0|[different, chees...|
|  4.0|[delicious, ice, ...|
|  1.0|[we, use, to, lik...|
|  1.0|[be, aware, must,...|
|  5.0|[this, place, be,...|
|  1.0|[i, fianc, take, ...|
|  5.0|[love, this, plac...|
|  5.0|[just, want, to, ...|
|  5.0|[this, familyowne...|
|  4.0|[this, spot, have...|
|  5.0|[this, nail, shop...|
+-----+--------------------+
only showing top 20 rows



In [24]:
# Use Word2Vec to convert text into dense vectors
word2vec = Word2Vec(vectorSize=100, inputCol="normalized", outputCol="features")
word2vec_model = word2vec.fit(processed_data)
word2vec_data = word2vec_model.transform(processed_data)

# Split the data into training and test sets
(training_data, test_data) = word2vec_data.randomSplit([0.8, 0.2], seed=42)

# Initialize and train the linear regression model
lr = LinearRegression(featuresCol="features", labelCol="rating")
lr_model = lr.fit(training_data)

# Make predictions on the test data
predictions = lr_model.transform(test_data)

# Evaluate the model (optional)
from pyspark.ml.evaluation import RegressionEvaluator
evaluator = RegressionEvaluator(labelCol="rating", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE):", rmse)

IllegalArgumentException: requirement failed: Column text must be of type equal to one of the following types: [array<string>, array<string>] but was actually of type string.

In [23]:
user.count()

                                                                                

20006