If you don't have the dataset, run the code below.

In [None]:
import gdown

In [None]:
# Google Drive location of the dataset used in this notebook.
dataset_url = "https://drive.google.com/uc?id=1mW974SwZsSMH-nr89c2Pe9PPHhT1ifDr"
gdown.download(dataset_url)

In [None]:
import threading

# Helper thread that keeps the Spark StreamingContext from blocking Jupyter
        
class StreamingThread(threading.Thread):
    """Thread wrapper around a streaming context.

    ``run`` starts the wrapped context and then waits on its termination,
    so the waiting happens on this thread rather than on the caller's.
    ``stop`` forwards a stop request to the wrapped context.
    """

    def __init__(self, ssc):
        """Store ``ssc``, the streaming context this thread will drive."""
        threading.Thread.__init__(self)
        self.ssc = ssc

    def run(self):
        """Start the context and wait until it terminates."""
        context = self.ssc
        context.start()
        context.awaitTermination()

    def stop(self):
        """Announce the shutdown and ask the context to stop.

        The context is stopped with ``stopSparkContext=False`` and
        ``stopGraceFully=True``.
        """
        print('----- Stopping... this may take a few seconds -----')
        shutdown_options = {'stopSparkContext': False, 'stopGraceFully': True}
        self.ssc.stop(**shutdown_options)

In [None]:
# Echo `sc` so the notebook displays it.
# NOTE(review): `sc` is not defined anywhere in this notebook; presumably the
# PySpark kernel injects the SparkContext under this name — confirm.
sc

In [None]:
# Echo `spark` so the notebook displays it.
# NOTE(review): `spark` is not defined anywhere in this notebook; presumably
# the PySpark kernel injects the SparkSession under this name — confirm.
spark

In [None]:
#import necessary packages
import random
from pyspark.streaming import StreamingContext
from pyspark.sql import Row
from pyspark.sql.functions import udf, struct, array, col, lit, lower
from pyspark.sql.types import StringType
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer, SQLTransformer, NGram
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
from pyspark.ml.feature import HashingTF, IDF
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import StandardScaler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

Load the data:

In [None]:
# Read every JSON file in the local reviews folder into a single DataFrame
# and preview the first ten rows.
reviews_glob = '/Users/christianbutcher/Desktop/spark/reviews/*'
df = spark.read.json(reviews_glob)
df.show(n=10)

In [None]:
# Number of rows in the freshly loaded data.
df.count()

In [None]:
# Show the rows whose review text is the empty string.
empty_reviews = df.where(df['review_text'] == '')
empty_reviews.show()

In [None]:
# Count the distinct review ids; compare with the total row count to spot
# duplicates.
unique_review_ids = df.select('review_id').distinct()
unique_review_ids.count()

Clean the data set:

In [None]:
# Keep a single row per review_id.
df = df.dropDuplicates(subset=['review_id'])

In [None]:
# Row count of the current `df` (run after de-duplicating on review_id).
df.count()

In [None]:
# Discard the rows whose review text is the empty string.
has_text = df['review_text'] != ''
df = df.where(has_text)

In [None]:
# Row count of the current `df` (run after removing empty reviews).
df.count()

In [None]:
# Show how many rows carry each label.
rows_per_label = df.groupBy('label').count()
rows_per_label.show()

Create a balanced data set:

In [None]:
# Target number of rows to keep per label, and the seed for the sampler.
n = 500
seed = 1

# Per-label sampling fraction: n divided by the label's row count.
# sampleBy only accepts fractions in [0, 1], so a label with fewer than n rows
# is clamped to 1.0 (kept whole) instead of making the sampling call fail.
label_counts = df.groupBy("label").count().collect()
fractions = {row["label"]: min(1.0, n / row["count"]) for row in label_counts}

# Stratified sample by label; show the resulting per-label counts to check
# how balanced the sample actually is.
df_bal = df.stat.sampleBy("label", fractions, seed)
df_bal.groupBy("label").count().show()

Split data into training and test sets:

In [None]:
# 70/30 random split of the balanced data, seeded so the split is repeatable.
split_weights = [0.7, 0.3]
trainingData, testData = df_bal.randomSplit(split_weights, seed=100)
print(f"Training Dataset Count: {trainingData.count()}")
print(f"Test Dataset Count: {testData.count()}")

Initialise pipeline stages:

In [None]:
# Tokenizer: split the review text on non-word characters.
regexTokenizer = RegexTokenizer(
    inputCol="review_text",
    outputCol="words",
    pattern=r"\W",
)

# Stop-word removal using the built-in English list.
stops = StopWordsRemover.loadDefaultStopWords('english')
stopwordsRemover = StopWordsRemover(
    inputCol=regexTokenizer.getOutputCol(),
    outputCol="filtered",
    stopWords=stops,
)

# Bigrams built from the filtered tokens (output column name is left at its
# default, so downstream stages read it via getOutputCol()).
ng = NGram(inputCol=stopwordsRemover.getOutputCol(), n=2)

# Bag-of-bigrams counts.
countVectors = CountVectorizer(
    inputCol=ng.getOutputCol(),
    outputCol="rawFeatures",
    vocabSize=30000,
    minDF=5,
)

# TF-IDF weighting; minDocFreq removes sparse terms.
idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=5)

# Classifier.
lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0.2)

Put everything together in the pipeline:

In [None]:
# Chain the stages in data-flow order. The NGram stage `ng` must sit between
# the stop-word remover and the count vectorizer: `countVectors` reads the
# column that `ng` produces, so leaving it out makes the fit fail on a missing
# input column.
pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover, ng, countVectors, idf, lr])

In [None]:
# Hyper-parameter grid for the logistic regression stage:
# 2 regParam values x 4 elasticNetParam values = 8 candidate settings.
paramGrid = (
    ParamGridBuilder()
    .addGrid(lr.regParam, [0.1, 0.01])
    .addGrid(lr.elasticNetParam, [0, 0.2, 0.5, 0.8])
    .build()
)

In [None]:
# 3-fold cross-validation over the whole pipeline, scoring each candidate in
# the grid with a BinaryClassificationEvaluator left at its default settings.
cv_evaluator = BinaryClassificationEvaluator()
crossval = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=cv_evaluator,
    numFolds=3,
)

In [None]:
# Run cross-validation on the training split and choose the best set of
# parameters from the grid; the fitted result is kept in `model`.
model = crossval.fit(trainingData)

In [None]:
# Pull out the winning fitted pipeline and list its stages.
best = model.bestModel
fitted_stages = best.stages
print(fitted_stages)

Obtain predictions for the test data:

In [None]:
# Score the held-out test split with the fitted cross-validation model.
prediction = model.transform(testData)

In [None]:
# List the columns available on the scored DataFrame.
prediction.columns

In [None]:
# Preview ten scored rows: the text, its true label and the model output.
preview_columns = ['review_text', 'label', 'probability', 'prediction']
prediction.select(*preview_columns).show(n=10)

Evaluate the model predictions:

In [None]:
# Multiclass evaluator on the hard predictions, using the evaluator's
# default metric.
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
multiclass_score = evaluator.evaluate(prediction)
multiclass_score

In [None]:
# Binary evaluator with default settings, applied to the scored test set.
evaluator2 = BinaryClassificationEvaluator()
binary_score = evaluator2.evaluate(prediction)
binary_score

In [None]:
# Keep only the predicted and true labels.
dataset = prediction.select("prediction", "label")

In [None]:
# Binary evaluator fed the hard 'prediction' column in place of a raw score
# column, applied to the two-column dataset.
evaluator3 = BinaryClassificationEvaluator(rawPredictionCol='prediction')
hard_label_score = evaluator3.evaluate(dataset)
hard_label_score

Save the model locally to access later:

In [None]:
# Persist the fitted cross-validation model, replacing any earlier save at
# the same location.
model_dir = '/Users/christianbutcher/Desktop/spark/model/'
model_writer = model.write().overwrite()
model_writer.save(model_dir)