In [41]:
from pyspark.sql.functions import *
import string

wine_data =spark.read.option("header", "true").option("mode", "DROPMALFORMED").csv("winemag-data-130k-v2.csv")

In [42]:
wine_data.describe('description').show()


+-------+--------------------+
|summary|         description|
+-------+--------------------+
|  count|              129974|
|   mean|                20.0|
| stddev|                 NaN|
|    min|         """Chremisa|
|    max|“Wow” is the firs...|
+-------+--------------------+



In [43]:
wine_data.describe('points').show()


+-------+--------------------+
|summary|              points|
+-------+--------------------+
|  count|              129966|
|   mean|   88.44617681462783|
| stddev|  3.0478934351221008|
|    min| and it doesn't s...|
|    max|              Umriss|
+-------+--------------------+



In [44]:
wine_data.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- country: string (nullable = true)
 |-- description: string (nullable = true)
 |-- designation: string (nullable = true)
 |-- points: string (nullable = true)
 |-- price: string (nullable = true)
 |-- province: string (nullable = true)
 |-- region_1: string (nullable = true)
 |-- region_2: string (nullable = true)
 |-- taster_name: string (nullable = true)
 |-- taster_twitter_handle: string (nullable = true)
 |-- title: string (nullable = true)
 |-- variety: string (nullable = true)
 |-- winery: string (nullable = true)



# 1 . clustering
# 2 . sarcazem
# 3 . support more features
# 4 . concat last work to this file

# what is the average length of review? 

In [45]:
wine_data.count()

129975

In [46]:
#clean data - missing values and duplications

In [47]:
wine_data = wine_data.select('points','description').dropna()
wine_data.count()

129965

In [48]:
#remove shorts reviews

In [49]:
wine_data = wine_data.where(length(col('description'))>=80)

In [50]:
wine_data.count()

129728

In [57]:
# clean our data, here we’re gonna remove punctuations and empty spaces.
def removePunctuation(column):
#     return column.translate(str.maketrans('', '', string.punctuation))
    return trim(lower(regexp_replace(column, '[^\sa-zA-Z0-9]', ''))).alias('review')

wine = wine_data.select([removePunctuation(wine_data['description']),col("points").alias("label")])

# we choose  to work on positive negtive problem
wine = wine.withColumn("label", when(col("label")>80, 1.0).otherwise(0.0))
# wine.show()

In [58]:
import pyspark.sql.functions as f
df = wine_data.withColumn('descriptionWordCount', f.size(f.split(f.col('description'), ' ')))
df.agg(f.mean('descriptionWordCount'), f.count('descriptionWordCount')).show()

+-------------------------+---------------------------+
|avg(descriptionWordCount)|count(descriptionWordCount)|
+-------------------------+---------------------------+
|        40.43258201776024|                     129728|
+-------------------------+---------------------------+



In [59]:
## preprocecing

from pyspark.ml.feature import Tokenizer
from pyspark.ml.feature import StopWordsRemover
from pyspark.ml.feature import HashingTF, IDF


In [60]:
# Now split each sentence into words, also called word tokenization.

tokenizer = Tokenizer(inputCol="review", outputCol="split_sentence_into_words")
wordsDF = tokenizer.transform(wine)
# wordsDF.show()

In [61]:
# Remove stop words
remover = StopWordsRemover(inputCol="split_sentence_into_words", outputCol="clean_word")
wordsDF2 = remover.transform(wordsDF)
# wordsDF2.show()

In [62]:
# pip install vaderSentiment

In [63]:
# convert to binary
hashingTF_binary = HashingTF(inputCol="clean_word", outputCol="features", binary = True)
words_binary_df = hashingTF_binary.transform(wordsDF2)
words_binary_df.show()

+--------------------+-----+-------------------------+--------------------+--------------------+
|              review|label|split_sentence_into_words|          clean_word|            features|
+--------------------+-----+-------------------------+--------------------+--------------------+
|aromas include tr...|  1.0|     [aromas, include,...|[aromas, include,...|(262144,[11076,21...|
|this is ripe and ...|  1.0|     [this, is, ripe, ...|[ripe, fruity, wi...|(262144,[5460,213...|
|tart and snappy t...|  1.0|     [tart, and, snapp...|[tart, snappy, fl...|(262144,[21336,25...|
|pineapple rind le...|  1.0|     [pineapple, rind,...|[pineapple, rind,...|(262144,[9575,251...|
|much like the reg...|  1.0|     [much, like, the,...|[much, like, regu...|(262144,[16422,32...|
|blackberry and ra...|  1.0|     [blackberry, and,...|[blackberry, rasp...|(262144,[3521,148...|
|heres a bright in...|  1.0|     [heres, a, bright...|[heres, bright, i...|(262144,[11076,12...|
|this dry and rest...|  1.0|  

In [72]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# Split data into training and testing set 
(training, test) = words_binary_df.randomSplit([0.7, 0.3],seed=100)

In [73]:
class Evaluator():
    def __init__(self,model):
        self.model = model
    def run(self,training,test):
        fitModel = self.model.fit(training)
        self.predictions = fitModel.transform(test)
    def classification_evaluators(self):
        # Evaluate result with accuracy
        evaluator1 = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
        accuracy = evaluator1.evaluate(self.predictions)
        print("Model Accuracy: ", accuracy)

        # Evaluate result with ROC
        evaluator2 = BinaryClassificationEvaluator(labelCol="label", metricName="areaUnderROC")
        AUC = evaluator2.evaluate(self.predictions)
        print("area under ROC: ", AUC)
        
    def get_score(self):
        lr_label_predictions_df = self.predictions.select(["label", "prediction"])

        tp_count = lr_label_predictions_df.filter("label=1.0 and prediction=1.0").count()
        
        prediction_one = lr_label_predictions_df.filter("prediction=1.0").count()
        precision = float(tp_count) / prediction_one

        label_one = lr_label_predictions_df.filter("label=1.0").count()
        recall = float(tp_count) / label_one

        print("recall", recall, ", precision", precision)

# NaiveBayes on boolean vector

In [74]:
nb = NaiveBayes(smoothing=1.0, modelType="multinomial")
naiveBayesEvaluator = Evaluator(nb)
naiveBayesEvaluator.run(training,test)
naiveBayesEvaluator.classification_evaluators()
naiveBayesEvaluator.get_score()

Model Accuracy:  0.9971985195846612
area under ROC:  0.23404462093627212
recall 1.0 , precision 0.9971985195846612


# LogisticRegression on boolean vector

In [75]:
lr = LogisticRegression(maxIter=10)
logisticRegressionEvaluator = Evaluator(lr)
logisticRegressionEvaluator.run(training,test)
logisticRegressionEvaluator.classification_evaluators()
logisticRegressionEvaluator.get_score()

Model Accuracy:  0.9955022103423461
area under ROC:  0.8867652646869035
recall 0.9980154127683704 , precision 0.9974755280783102


In [76]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator

class EvaluatorClustering():
    def __init__(self,model):
        self.modelInstance = model
    def run(self,featureTable,reviewTable):
        self.model =  self.modelInstance.fit(featureTable)
        self.predictions = self.model.transform(reviewTable)

    def evaluators(self):
        evaluator = ClusteringEvaluator()
        silhouette = evaluator.evaluate(self.predictions)
        print("Silhouette with squared euclidean distance = " + str(silhouette))
        # Shows the result.
        centers = self.model.clusterCenters()
        print("Cluster Centers: ")
        for center in centers:
            print(center)

In [77]:
ec = EvaluatorClustering(KMeans().setK(2).setSeed(100))
featureTable = words_binary_df.select('features')
reviewTable = words_binary_df.select(['label','review','features'])

ec.run(featureTable,reviewTable)
ec.evaluators()


Silhouette with squared euclidean distance = 0.0403100977032412
Cluster Centers: 
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]


# creating tfidf vectors

In [78]:
wordsDF2.show()

+--------------------+-----+-------------------------+--------------------+
|              review|label|split_sentence_into_words|          clean_word|
+--------------------+-----+-------------------------+--------------------+
|aromas include tr...|  1.0|     [aromas, include,...|[aromas, include,...|
|this is ripe and ...|  1.0|     [this, is, ripe, ...|[ripe, fruity, wi...|
|tart and snappy t...|  1.0|     [tart, and, snapp...|[tart, snappy, fl...|
|pineapple rind le...|  1.0|     [pineapple, rind,...|[pineapple, rind,...|
|much like the reg...|  1.0|     [much, like, the,...|[much, like, regu...|
|blackberry and ra...|  1.0|     [blackberry, and,...|[blackberry, rasp...|
|heres a bright in...|  1.0|     [heres, a, bright...|[heres, bright, i...|
|this dry and rest...|  1.0|     [this, dry, and, ...|[dry, restrained,...|
|savory dried thym...|  1.0|     [savory, dried, t...|[savory, dried, t...|
|this has great de...|  1.0|     [this, has, great...|[great, depth, fl...|
|soft supple

In [79]:
hashingTF = HashingTF(inputCol = "clean_word", outputCol="tf")
wordsDF = hashingTF.transform(wordsDF2)
wordsDF.show()

+--------------------+-----+-------------------------+--------------------+--------------------+
|              review|label|split_sentence_into_words|          clean_word|                  tf|
+--------------------+-----+-------------------------+--------------------+--------------------+
|aromas include tr...|  1.0|     [aromas, include,...|[aromas, include,...|(262144,[11076,21...|
|this is ripe and ...|  1.0|     [this, is, ripe, ...|[ripe, fruity, wi...|(262144,[5460,213...|
|tart and snappy t...|  1.0|     [tart, and, snapp...|[tart, snappy, fl...|(262144,[21336,25...|
|pineapple rind le...|  1.0|     [pineapple, rind,...|[pineapple, rind,...|(262144,[9575,251...|
|much like the reg...|  1.0|     [much, like, the,...|[much, like, regu...|(262144,[16422,32...|
|blackberry and ra...|  1.0|     [blackberry, and,...|[blackberry, rasp...|(262144,[3521,148...|
|heres a bright in...|  1.0|     [heres, a, bright...|[heres, bright, i...|(262144,[11076,12...|
|this dry and rest...|  1.0|  

In [80]:
idf = IDF(inputCol="tf", outputCol="features")
idfModel = idf.fit(wordsDF)
words_tf_idf_DF = idfModel.transform(wordsDF)
words_tf_idf_DF.show()

+--------------------+-----+-------------------------+--------------------+--------------------+--------------------+
|              review|label|split_sentence_into_words|          clean_word|                  tf|            features|
+--------------------+-----+-------------------------+--------------------+--------------------+--------------------+
|aromas include tr...|  1.0|     [aromas, include,...|[aromas, include,...|(262144,[11076,21...|(262144,[11076,21...|
|this is ripe and ...|  1.0|     [this, is, ripe, ...|[ripe, fruity, wi...|(262144,[5460,213...|(262144,[5460,213...|
|tart and snappy t...|  1.0|     [tart, and, snapp...|[tart, snappy, fl...|(262144,[21336,25...|(262144,[21336,25...|
|pineapple rind le...|  1.0|     [pineapple, rind,...|[pineapple, rind,...|(262144,[9575,251...|(262144,[9575,251...|
|much like the reg...|  1.0|     [much, like, the,...|[much, like, regu...|(262144,[16422,32...|(262144,[16422,32...|
|blackberry and ra...|  1.0|     [blackberry, and,...|[b

In [81]:
(training, test) = words_tf_idf_DF.randomSplit([0.7, 0.3],seed=100)

# NaiveBayes on tdidf vectors

In [82]:
nb = NaiveBayes(smoothing=1.0, modelType="multinomial")
naiveBayesEvaluator = Evaluator(nb)
naiveBayesEvaluator.run(training,test)
naiveBayesEvaluator.classification_evaluators()
naiveBayesEvaluator.get_score()

Model Accuracy:  0.9971728179294747
area under ROC:  0.3772172317881077
recall 0.999948452279698 , precision 0.9972240071970184


# LogisticRegression on tdidf vectors

In [83]:
lr = LogisticRegression(maxIter=10)
logisticRegressionEvaluator = Evaluator(lr)
logisticRegressionEvaluator.run(training,test)
logisticRegressionEvaluator.classification_evaluators()
logisticRegressionEvaluator.get_score()

Model Accuracy:  0.9956564202734656
area under ROC:  0.881628463421572
recall 0.9981700559292765 , precision 0.9974759181991449


In [84]:
ec = EvaluatorClustering(KMeans().setK(2).setSeed(1))
featureTable = words_tf_idf_DF.select('features')
reviewTable = words_tf_idf_DF.select(['label','review','features'])

ec.run(featureTable,reviewTable)
ec.evaluators()


Silhouette with squared euclidean distance = 0.0206856609768209
Cluster Centers: 
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
