In [1]:
import threading

# Helper thread that keeps the Spark StreamingContext from blocking Jupyter
        
class StreamingThread(threading.Thread):
    """Thread that drives a streaming context in the background.

    ``run`` starts the wrapped context and then waits for it to terminate;
    ``stop`` asks the context to shut down.
    """

    def __init__(self, ssc):
        """Store the streaming context ``ssc`` that this thread will drive."""
        threading.Thread.__init__(self)
        self.ssc = ssc

    def run(self):
        """Thread body: start the context, then wait on its termination."""
        context = self.ssc
        context.start()
        context.awaitTermination()

    def stop(self):
        """Print a notice, then request a graceful stop of the context.

        ``stopSparkContext=False`` is passed, i.e. the call asks that the
        underlying SparkContext is not stopped along with the stream.
        """
        print('----- Stopping... this may take a few seconds -----')
        shutdown_options = {'stopSparkContext': False, 'stopGraceFully': True}
        self.ssc.stop(**shutdown_options)

In [2]:
# Display `sc`. It is not defined anywhere in this notebook, so it is
# presumably injected by the PySpark kernel/shell — TODO confirm.
sc

In [3]:
# Display `spark`. It is not defined anywhere in this notebook, so it is
# presumably injected by the PySpark kernel/shell — TODO confirm.
spark

In [25]:
import random
from pyspark.streaming import StreamingContext
from pyspark.sql import Row
from pyspark.sql.functions import udf, struct, array, col, lit
from pyspark.sql.types import StringType
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
from pyspark.ml.feature import HashingTF, IDF
from pyspark.ml.feature import StandardScaler

In [5]:
# Read every file matched by the glob as JSON into one DataFrame.
# NOTE(review): the path is machine-specific; adjust it before re-running elsewhere.
df = spark.read.json(path='/Users/christianbutcher/Desktop/spark/reviews/*')

# Preview the loaded rows (20 is show()'s default row count).
df.show(n=20)

                                                                                

+-------+-----+---------+--------------------+
| app_id|label|review_id|         review_text|
+-------+-----+---------+--------------------+
|2156300|    1|136759198|The Demo was grea...|
|2372320|    0|136758922|First of, this ga...|
|1498040|    1|136761203|Пример того, как ...|
|1811990|    1|136761840|I have beaten the...|
|1811990|    1|136761635|It really is very...|
|1782810|    1|136633021|Great for its cur...|
|1649740|    1|136629798|THROW YOUR MONEY ...|
|1649740|    1|136629381|I forgot I backed...|
|1649740|    1|136628148|Firstly, if you'r...|
|1649740|    1|136627883|[h1] HUNT THE NIG...|
|1798010|    1|136814124|Out of all the “b...|
|2273470|    1|136811469|After just doing ...|
|2273470|    1|136810289|Well developed de...|
|2329130|    1|136812852|Another title pub...|
|2329130|    1|136810307|Rewind or Die is ...|
|1928420|    0|137490805|No option to chan...|
| 986130|    1|137493446|Awesome game! For...|
| 986130|    1|137493372|This is a decent ...|
| 986130|    

# Preparing the data

In [6]:
# Keep a single row per review_id so repeated reviews are not counted twice.
df = df.dropDuplicates(['review_id'])

In [7]:
# Keep only rows whose review_text is not the empty string.
df = df.where(df['review_text'] != '')

In [None]:
#df = df.select(['review_text','label'])

In [None]:
# Preview the first five rows of the current DataFrame.
df.show(n=5)

# Text preprocessing

In [None]:
# Preview the first five rows of the current DataFrame.
df.show(n=5)

In [10]:
# Create a balanced data set: sample roughly n rows from each label.
n = 500
seed = 1

# Per-label sampling fraction = n / (rows with that label).
# sampleBy only accepts fractions in [0, 1], so the value is clamped to 1.0:
# a label with fewer than n rows is kept whole instead of making the call fail.
# The per-label counts are a tiny result (one row per label), so collecting
# them to the driver and building the dict in Python is cheap.
fractions = {
    row["label"]: min(1.0, n / row["count"])
    for row in df.groupBy("label").count().collect()
}

# Stratified sample by label; the fixed seed keeps the draw repeatable.
df_bal = df.stat.sampleBy("label", fractions, seed)

# Show the resulting class sizes (sampling is random, so they are only ~n).
df_bal.groupBy("label").count().show()



+-----+-----+
|label|count|
+-----+-----+
|    0|  499|
|    1|  507|
+-----+-----+



                                                                                

In [11]:
# Split the balanced data 70/30 into training and test sets (fixed seed).
trainingData, testData = df_bal.randomSplit(weights=[0.7, 0.3], seed=100)

# Report the size of each split; each count() is computed by Spark on demand.
print("Training Dataset Count: " + str(trainingData.count()))
print("Test Dataset Count: " + str(testData.count()))

                                                                                

Training Dataset Count: 708




Test Dataset Count: 298


                                                                                

# Create pipeline

In [26]:
# Stage 1: regular-expression tokenizer over the raw review text.
regexTokenizer = RegexTokenizer(
    inputCol="review_text",
    outputCol="words",
    pattern="\\W",
)

# Stage 2: remove the default English stop words from the tokens.
stops = StopWordsRemover.loadDefaultStopWords('english')
stopwordsRemover = StopWordsRemover(
    inputCol=regexTokenizer.getOutputCol(),
    outputCol="filtered",
    stopWords=stops,
)

# Bag-of-words counts.
# NOTE(review): this stage is constructed but is NOT part of the pipeline
# below; it writes to the same "features" column as the IDF stage.
countVectors = CountVectorizer(
    inputCol=stopwordsRemover.getOutputCol(),
    outputCol="features",
    vocabSize=30000,
    minDF=5,
)

# Stages 3-4: hashed term frequencies followed by IDF weighting.
hashingTF = HashingTF(
    inputCol="filtered",
    outputCol="rawFeatures",
    numFeatures=10000,
)
idf = IDF(
    inputCol="rawFeatures",
    outputCol="features",
    minDocFreq=5,  # minDocFreq: remove sparse terms
)

# Stage 5: logistic regression classifier on the "features" column.
lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)

# Assemble the stages in execution order.
pipeline = Pipeline(
    stages=[
        regexTokenizer,
        stopwordsRemover,
        hashingTF,
        idf,
        lr,
    ]
)

# Fit model

In [27]:
# Fit every pipeline stage on the training split.
model = pipeline.fit(dataset=trainingData)

# Apply the fitted pipeline to the held-out test split.
prediction = model.transform(dataset=testData)

                                                                                

In [28]:
# Show the first ten test rows: text, true label, and the model's outputs.
prediction.select(
    'review_text',
    'label',
    'rawPrediction',
    'probability',
    'prediction',
).show(n=10)

+--------------------+-----+--------------------+--------------------+----------+
|         review_text|label|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|hai so i dont wit...|    1|[-0.4733160037383...|[0.38383169089462...|       1.0|
|Fun game, a worth...|    1|[-0.5438055216783...|[0.36730276583699...|       1.0|
|Cute, simple and ...|    1|[-0.8808018891340...|[0.29301163533991...|       1.0|
|Wow, where did th...|    1|[0.50605823199372...|[0.62388197841504...|       0.0|
|This game is just...|    0|[-0.2981013542337...|[0.42602168839814...|       1.0|
|Refunded in less ...|    0|[3.00531139019601...|[0.95281350171842...|       0.0|
|Tried it for an h...|    0|[1.56729156828224...|[0.82739715706425...|       0.0|
|BattleBlock Theat...|    1|[-0.7038035291780...|[0.33096947657906...|       1.0|
|It gets stale ver...|    0|[0.67798488463021...|[0.66328879623877...|       0.0|
|       Oh absol

In [16]:
# List the column names of the prediction DataFrame (input columns plus the
# columns added by the pipeline stages).
prediction.columns

['app_id',
 'label',
 'review_id',
 'review_text',
 'words',
 'filtered',
 'rawFeatures',
 'features',
 'rawPrediction',
 'probability',
 'prediction']

In [29]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Score the test-set predictions against the true labels.
# labelCol and metricName are spelled out at their default values so the
# reported number is unambiguous.
evaluator = MulticlassClassificationEvaluator(
    predictionCol="prediction",
    labelCol="label",
    metricName="f1",
)
evaluator.evaluate(prediction)

                                                                                

0.7504104984853837

In [18]:
# Persist the fitted pipeline model; overwrite() replaces anything already
# saved at the target path.
# NOTE(review): the path is machine-specific; adjust it before re-running elsewhere.
model.write().overwrite().save(
    path='/Users/christianbutcher/Desktop/spark/model/'
)

                                                                                