In [1]:
reviews = spark.read.csv('C:\\E\\Medicaments_Reviews.csv', header=True, inferSchema=True, sep=',' )
reviews.printSchema()

root
 |-- Review_ID: integer (nullable = true)
 |-- review: string (nullable = true)
 |-- rating: integer (nullable = true)



In [2]:
drop_list = ['Review_ID']
reviews = reviews.select([column for column in reviews.columns if column not in drop_list])
reviews.printSchema()

root
 |-- review: string (nullable = true)
 |-- rating: integer (nullable = true)



In [3]:
from pyspark.sql.functions import udf, col, regexp_replace, when 
from pyspark.sql.types import StringType,BooleanType,DateType,NumericType, DoubleType
from pyspark.ml.feature import Tokenizer, RegexTokenizer, StopWordsRemover, HashingTF, IDF
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
from pyspark.ml.feature import CountVectorizer
from pyspark.ml.classification import LinearSVC, RandomForestClassifier, NaiveBayes 
from pyspark.ml import Pipeline
import time
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

In [4]:
reviews.show(5)
reviews = reviews.withColumn("rating", when(reviews.rating >=6  ,"pos") \
      .otherwise("neg"))
reviews.show(5)

+--------------------+------+
|              review|rating|
+--------------------+------+
|About two months ...|     1|
|I've tried a few ...|    10|
|My son has Crohn'...|     8|
|Quick reduction o...|     9|
|Contrave combines...|     9|
+--------------------+------+
only showing top 5 rows

+--------------------+------+
|              review|rating|
+--------------------+------+
|About two months ...|   neg|
|I've tried a few ...|   pos|
|My son has Crohn'...|   pos|
|Quick reduction o...|   pos|
|Contrave combines...|   pos|
+--------------------+------+
only showing top 5 rows



In [5]:
reviews = reviews.dropna()
clean_reviews = reviews.withColumn("review", regexp_replace(col("review"),"[^A-Za-z' ]",""))   
reviews = clean_reviews

In [6]:
regexTokenizer = RegexTokenizer(inputCol="review", outputCol="words", pattern="\\W")
stopwordsRemover = StopWordsRemover(inputCol="words", outputCol="filtered", caseSensitive=False)
countVectors = CountVectorizer(inputCol="filtered", outputCol="features",vocabSize=100000,minDF=5)

In [7]:
label_stringIdx = StringIndexer(inputCol = "rating", outputCol = "label")

In [8]:
pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover, countVectors, label_stringIdx])

In [9]:
pipelineFit = pipeline.fit(reviews)

In [10]:
dataset = pipelineFit.transform(reviews)

In [11]:
dataset.show(5)

+--------------------+------+--------------------+--------------------+--------------------+-----+
|              review|rating|               words|            filtered|            features|label|
+--------------------+------+--------------------+--------------------+--------------------+-----+
|About two months ...|   neg|[about, two, mont...|[two, months, ago...|(11200,[1,2,4,7,1...|  1.0|
|I've tried a few ...|   pos|[i, ve, tried, a,...|[ve, tried, antid...|(11200,[2,4,5,6,7...|  0.0|
|My son has Crohn'...|   pos|[my, son, has, cr...|[son, crohn, dise...|(11200,[0,2,4,7,1...|  0.0|
|Quick reduction o...|   pos|[quick, reduction...|[quick, reduction...|(11200,[87,787,17...|  0.0|
|Contrave combines...|   pos|[contrave, combin...|[contrave, combin...|(11200,[1,3,4,11,...|  0.0|
+--------------------+------+--------------------+--------------------+--------------------+-----+
only showing top 5 rows



In [12]:
(trainingData, testData) = dataset.randomSplit([0.75, 0.25], seed = 2000)
print("Training Dataset Count: " + str(trainingData.count()))
print("Test Dataset Count: " + str(testData.count()))

Training Dataset Count: 40346
Test Dataset Count: 13419


In [18]:
start = time.time()

nb = NaiveBayes(smoothing=1,  labelCol = "label", featuresCol = "features")
model = nb.fit(trainingData)
predictions = model.transform(testData)
predictions.select("review","rating","filtered","features","probability","prediction", "label") \
    .show(n = 5)

end = time.time()

+--------------------+------+--------------------+--------------------+--------------------+----------+-----+
|              review|rating|            filtered|            features|         probability|prediction|label|
+--------------------+------+--------------------+--------------------+--------------------+----------+-----+
| I have nerve dam...|   pos|[nerve, damage, l...|(11200,[9,16,31,4...|[0.64200552725922...|       0.0|  0.0|
| Made my hand sha...|   pos|[made, hand, shak...|(11200,[16,64,93,...|[0.70968052963671...|       0.0|  0.0|
| Maintenance of c...|   pos|[maintenance, cor...|(11200,[2,5,7,21,...|[0.91003143067319...|       0.0|  0.0|
| RO isentress  ti...|   pos|[ro, isentress, t...|(11200,[9,30,77,9...|[0.72255792176882...|       0.0|  0.0|
| days ago I start...|   neg|[days, ago, start...|(11200,[8,10,12,1...|[0.76529456495899...|       0.0|  1.0|
+--------------------+------+--------------------+--------------------+--------------------+----------+-----+
only showi

In [19]:
print("temps d'exécution :" + str(end-start) + " s")
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
print("La précision de Naive Baise = ", evaluator.evaluate(predictions) )

temps d'exécution :4.852296352386475 s
La précision de Naive Baise =  0.601522715903923


In [21]:
start = time.time()
rf = RandomForestClassifier(labelCol ="label", featuresCol = "features",numTrees=500)
model = rf.fit(trainingData)
prediction_rf = model.transform(testData)
prediction_rf.select("review", "rating", 'filtered' , "probability", "prediction","label" ).show(5)
end = time.time()


+--------------------+------+--------------------+--------------------+----------+-----+
|              review|rating|            filtered|         probability|prediction|label|
+--------------------+------+--------------------+--------------------+----------+-----+
| I have nerve dam...|   pos|[nerve, damage, l...|[0.69917992005808...|       0.0|  0.0|
| Made my hand sha...|   pos|[made, hand, shak...|[0.69814315305765...|       0.0|  0.0|
| Maintenance of c...|   pos|[maintenance, cor...|[0.69761917346074...|       0.0|  0.0|
| RO isentress  ti...|   pos|[ro, isentress, t...|[0.69896840098304...|       0.0|  0.0|
| days ago I start...|   neg|[days, ago, start...|[0.70074224190031...|       0.0|  1.0|
+--------------------+------+--------------------+--------------------+----------+-----+
only showing top 5 rows



In [23]:
print("temps d'exécution :" + str(end-start) + " s")
evaluator = BinaryClassificationEvaluator()
print("La précision de Random Forest : " + \
      str(evaluator.evaluate(prediction_rf, {evaluator.metricName: "areaUnderROC"})))

temps d'exécution :128.24345588684082 s
La précision de Random Forest : 0.5017123076665149


In [24]:
start = time.time()

lsvc = LinearSVC(labelCol ="label", featuresCol = "features", maxIter=100, regParam=0.5)
model_svm = lsvc.fit(trainingData)
prediction_svm = model_svm.transform(testData)
prediction_svm.select("review", "rating", "filtered" , "prediction","label" ).show(5)

end = time.time()

+--------------------+------+--------------------+----------+-----+
|              review|rating|            filtered|prediction|label|
+--------------------+------+--------------------+----------+-----+
| I have nerve dam...|   pos|[nerve, damage, l...|       0.0|  0.0|
| Made my hand sha...|   pos|[made, hand, shak...|       0.0|  0.0|
| Maintenance of c...|   pos|[maintenance, cor...|       0.0|  0.0|
| RO isentress  ti...|   pos|[ro, isentress, t...|       0.0|  0.0|
| days ago I start...|   neg|[days, ago, start...|       0.0|  1.0|
+--------------------+------+--------------------+----------+-----+
only showing top 5 rows



In [25]:
print("temps d'exécution :" + str(end-start) + " s")
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
print("La précision de SVM = ", evaluator.evaluate(prediction_svm) )

temps d'exécution :24.013376235961914 s
La précision de SVM =  0.5854622362911159
