If you don't have the dataset, run the code below to download it.

In [None]:
import gdown

In [None]:
# Fetch the dataset from the shared Google Drive link (the file id is embedded in the URL).
gdown.download("https://drive.google.com/uc?id=1mW974SwZsSMH-nr89c2Pe9PPHhT1ifDr")

In [1]:
import threading

# Helper thread that prevents the Spark StreamingContext from blocking Jupyter
        
class StreamingThread(threading.Thread):
    """Background thread that drives a streaming context.

    Running the context on its own thread keeps the calling thread free
    while ``awaitTermination`` blocks.
    """

    def __init__(self, ssc):
        """Remember the streaming context ``ssc`` that this thread will run."""
        super().__init__()
        self.ssc = ssc

    def run(self):
        """Thread body: start the context, then block until it terminates."""
        context = self.ssc
        context.start()
        context.awaitTermination()

    def stop(self):
        """Print a notice and ask the context to stop.

        The context's ``stop`` is called with ``stopSparkContext=False`` and
        ``stopGraceFully=True``.
        """
        notice = '----- Stopping... this may take a few seconds -----'
        print(notice)
        stop_options = {'stopSparkContext': False, 'stopGraceFully': True}
        self.ssc.stop(**stop_options)

In [2]:
# Display `sc`. It is never defined in this notebook, so it is presumably the
# SparkContext injected by the PySpark kernel/shell — TODO confirm.
sc

In [3]:
# Display `spark`. It is never defined in this notebook, so it is presumably the
# SparkSession injected by the PySpark kernel/shell — TODO confirm.
spark

In [4]:
#import necessary packages
import random
from pyspark.streaming import StreamingContext
from pyspark.sql import Row
from pyspark.sql.functions import udf, struct, array, col, lit, lower
from pyspark.sql.types import StringType
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer, SQLTransformer
from pyspark.ml.classification import NaiveBayes
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
from pyspark.ml.feature import HashingTF, IDF
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import StandardScaler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

Load in data

In [5]:
import os

# Location of the review JSON files. The original hard-coded path stays as the
# default so existing runs are unaffected; set the REVIEWS_PATH environment
# variable to point at your own copy of the data instead of editing the cell.
REVIEWS_PATH = os.environ.get(
    "REVIEWS_PATH", '/Users/christianbutcher/Desktop/spark/reviews/*'
)

# Read every JSON file matched by the path into one DataFrame and preview it.
df = spark.read.json(REVIEWS_PATH)
df.show(10)

                                                                                

+-------+-----+---------+--------------------+
| app_id|label|review_id|         review_text|
+-------+-----+---------+--------------------+
|2156300|    1|136759198|The Demo was grea...|
|2372320|    0|136758922|First of, this ga...|
|1498040|    1|136761203|Пример того, как ...|
|1811990|    1|136761840|I have beaten the...|
|1811990|    1|136761635|It really is very...|
|1782810|    1|136633021|Great for its cur...|
|1649740|    1|136629798|THROW YOUR MONEY ...|
|1649740|    1|136629381|I forgot I backed...|
|1649740|    1|136628148|Firstly, if you'r...|
|1649740|    1|136627883|[h1] HUNT THE NIG...|
+-------+-----+---------+--------------------+
only showing top 10 rows



Clean the data set:

In [6]:
# Keep one row per review_id, then discard reviews whose text is the empty string.
deduplicated = df.dropDuplicates(['review_id'])
has_text = deduplicated['review_text'] != ''
df = deduplicated.filter(has_text)

Create a balanced data set:

In [7]:
n = 500   # target number of reviews per label
seed = 1  # fixed seed so the sample is reproducible

# Number of available reviews per label, collected to the driver as {label: count}.
label_counts = df.groupBy("label").count().rdd.collectAsMap()

# Sampling fraction per label that yields roughly n rows. sampleBy only accepts
# fractions in [0, 1], so cap at 1.0: a label with fewer than n rows is kept
# whole (and the result is then not balanced) instead of raising an error.
fractions = {label: min(1.0, n / count) for label, count in label_counts.items()}

# Stratified sample; the per-label counts are approximate, not exactly n.
df_bal = df.stat.sampleBy("label", fractions, seed)
df_bal.groupBy("label").count().show()



+-----+-----+
|label|count|
+-----+-----+
|    0|  499|
|    1|  507|
+-----+-----+



                                                                                

Split data into training and test sets:

In [8]:
# 70/30 random split of the balanced data, seeded for reproducibility.
trainingData, testData = df_bal.randomSplit([0.7, 0.3], seed=100)
print(f"Training Dataset Count: {trainingData.count()}")
print(f"Test Dataset Count: {testData.count()}")

                                                                                

Training Dataset Count: 708




Test Dataset Count: 298


                                                                                

Initialise pipeline stages:

In [9]:
# Tokenise the raw review text using the non-word-character pattern.
regexTokenizer = RegexTokenizer(pattern="\\W", inputCol="review_text", outputCol="words")

# Remove the default English stop words from the token lists.
stops = StopWordsRemover.loadDefaultStopWords('english')
stopwordsRemover = StopWordsRemover(
    stopWords=stops,
    inputCol=regexTokenizer.getOutputCol(),
    outputCol="filtered",
)

# Bag-of-words term counts (vocabSize=30000, minDF=5).
countVectors = CountVectorizer(
    minDF=5,
    vocabSize=30000,
    inputCol=stopwordsRemover.getOutputCol(),
    outputCol="rawFeatures",
)

# Re-weight the raw counts with IDF; minDocFreq filters out sparse terms.
idf = IDF(minDocFreq=5, inputCol=countVectors.getOutputCol(), outputCol="features")

# Naive Bayes classifier; the smoothing value here is overridden by the parameter grid search.
nb = NaiveBayes(smoothing=1.0)

Put everything together in the pipeline:

In [10]:
# Stages in execution order: tokenise -> remove stop words -> count -> IDF -> classify.
pipeline_stages = [regexTokenizer, stopwordsRemover, countVectors, idf, nb]
pipeline = Pipeline(stages=pipeline_stages)

In [11]:
# Candidate Naive Bayes smoothing values to search over during cross-validation.
smoothing_candidates = [1.0,0.5,0]
grid_builder = ParamGridBuilder()
grid_builder.addGrid(nb.smoothing, smoothing_candidates)
paramGrid = grid_builder.build()

In [12]:
# 3-fold cross-validation of the whole pipeline over paramGrid, scored with a
# BinaryClassificationEvaluator. The fold assignment is random, so pass the
# notebook's `seed` (set in the balanced-sampling cell) to make the selected
# model reproducible across kernel restarts, like the seeded sample and split.
crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=BinaryClassificationEvaluator(),
                          numFolds=3,
                          seed=seed)

In [13]:
# Run cross-validation, and choose the best set of parameters.
# Fits `crossval` (the pipeline searched over paramGrid) on trainingData; the
# fitted result is kept in `model` for the prediction cells below.
model = crossval.fit(trainingData)

                                                                                

In [14]:
# Pull out the best model found by cross-validation and list its fitted stages.
best = model.bestModel
print(best.stages)

[RegexTokenizer_da41665598fd, StopWordsRemover_29b8a7ae3a45, CountVectorizerModel: uid=CountVectorizer_ce3274ccca16, vocabularySize=1265, IDFModel: uid=IDF_66f504c7689b, numDocs=708, numFeatures=1265, NaiveBayesModel: uid=NaiveBayes_f76681724789, modelType=multinomial, numClasses=2, numFeatures=1265]


Obtain predictions for the test data:

In [15]:
# Apply the cross-validated model to the held-out test set.
prediction = model.transform(testData)

In [16]:
# List the columns of the prediction DataFrame (inputs plus the columns added by each pipeline stage).
prediction.columns

['app_id',
 'label',
 'review_id',
 'review_text',
 'words',
 'filtered',
 'rawFeatures',
 'features',
 'rawPrediction',
 'probability',
 'prediction']

In [17]:
# Preview 10 test reviews with their true label, class probabilities and predicted label.
prediction.select('review_text','label','probability','prediction').show(10)



+--------------------+-----+--------------------+----------+
|         review_text|label|         probability|prediction|
+--------------------+-----+--------------------+----------+
|hai so i dont wit...|    1|[0.16930921247935...|       1.0|
|Fun game, a worth...|    1|           [0.0,1.0]|       1.0|
|Cute, simple and ...|    1|[4.80382565299217...|       1.0|
|Wow, where did th...|    1|[0.99994094877458...|       0.0|
|This game is just...|    0|[0.99542692816129...|       0.0|
|Refunded in less ...|    0|           [1.0,0.0]|       0.0|
|Tried it for an h...|    0|           [1.0,0.0]|       0.0|
|BattleBlock Theat...|    1|           [0.0,1.0]|       1.0|
|It gets stale ver...|    0|           [1.0,0.0]|       0.0|
|       Oh absolutely|    1|[0.39675528325361...|       1.0|
+--------------------+-----+--------------------+----------+
only showing top 10 rows



                                                                                

Evaluate the model predictions:

In [19]:
# Score the test-set predictions with a default-configured binary evaluator
# (presumably area under the ROC curve, the library default — confirm against the docs).
evaluator = BinaryClassificationEvaluator()
test_score = evaluator.evaluate(prediction)
test_score

0.7026672694394214