In [86]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg,when
import hashlib
from pyspark.sql.functions import udf

from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm 

from pyspark.sql.types import *
from pyspark.sql import Row

from pyspark.ml.feature import StringIndexer

### Tokenizers
from pyspark.ml.feature import Tokenizer, RegexTokenizer
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType
from pyspark.ml.feature import StopWordsRemover
## convert to word v review matrix 
from pyspark.ml.feature import HashingTF, IDF
##Naive Bayes
from pyspark.ml.classification import NaiveBayes, NaiveBayesModel
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import MultilabelMetrics,MulticlassMetrics

In [2]:
conf = SparkConf().setAppName("test").setMaster("*")
spark = SparkSession.builder.getOrCreate()

In [3]:
review = spark.read.json('data/dataset/review.json')
(review,spill) = review.randomSplit([0.40,0.60])
review = review.select('user_id',col('stars').cast('int'),'business_id','text' )

In [4]:
review.show(5)

+--------------------+-----+--------------------+--------------------+
|             user_id|stars|         business_id|                text|
+--------------------+-----+--------------------+--------------------+
|HPtjvIrhzAUkKsiVk...|    3|--6MefnULPED_I942...|This restaurant i...|
|FEg8v92qx3kK4Hu4T...|    4|--6MefnULPED_I942...|This is one of my...|
|_5keMAmic7zzrnM5m...|    4|--7zmmkVg-IMGaXbu...|Nice small place ...|
|lBfKHlgTgxjv1SL4Z...|    3|--8LPVSo5i0Oo61X0...|Dr. Purcell is go...|
|ZPspeNuGDw5PYcu0m...|    2|--9QQLMTbFzLJ_oT-...|Went to this loca...|
+--------------------+-----+--------------------+--------------------+
only showing top 5 rows



In [5]:
def fix_ids(s):
    return int(int(hashlib.sha1(s).hexdigest(), 16) % 179426083)

fix_ids_udf = udf(fix_ids)

def fix_decimal_values(s):
    return round(s,2)

fix_round_udf = udf(fix_decimal_values)

def conv_to_int(v):
    return int(v)

to_int_udf = udf(conv_to_int)

In [6]:
business = spark.read.json('data/dataset/business.json')
business = business.select('business_id','name','city','categories')
business.show(5)

+--------------------+--------------------+--------------+--------------------+
|         business_id|                name|          city|          categories|
+--------------------+--------------------+--------------+--------------------+
|FYWN1wneV18bWNgQj...|    Dental by Design|     Ahwatukee|[Dentists, Genera...|
|He-G7vWjzVUysIKrf...| Stephen Szabo Salon|      McMurray|[Hair Stylists, H...|
|KQPW8lFf1y5BT2Mxi...|Western Motor Veh...|       Phoenix|[Departments of M...|
|8DShNS-LuFqpEWIp0...|    Sports Authority|         Tempe|[Sporting Goods, ...|
|PfOCPjBrlQAnz__NX...|Brick House Taver...|Cuyahoga Falls|[American (New), ...|
+--------------------+--------------------+--------------+--------------------+
only showing top 5 rows



In [7]:
df = review.join(business,'business_id')
df.show(5)

+--------------------+--------------------+-----+--------------------+--------------------+---------+--------------------+
|         business_id|             user_id|stars|                text|                name|     city|          categories|
+--------------------+--------------------+-----+--------------------+--------------------+---------+--------------------+
|--9e1ONYQuAa-CB_R...|vtLGogUiDd1Xla-Rt...|    4|Looking for a goo...|Delmonico Steakhouse|Las Vegas|[Cajun/Creole, St...|
|--9e1ONYQuAa-CB_R...|T98RxxU5UB5WL05yl...|    4|Service was great...|Delmonico Steakhouse|Las Vegas|[Cajun/Creole, St...|
|--9e1ONYQuAa-CB_R...|et_GDGFfG2BFVkLzR...|    5|Being from Califo...|Delmonico Steakhouse|Las Vegas|[Cajun/Creole, St...|
|--9e1ONYQuAa-CB_R...|aVOGlN9fZ-BXcbtj6...|    5|Truly Fantastic! ...|Delmonico Steakhouse|Las Vegas|[Cajun/Creole, St...|
|--9e1ONYQuAa-CB_R...|mD-JAucjMeNnp50On...|    2|My wife and I cam...|Delmonico Steakhouse|Las Vegas|[Cajun/Creole, St...|
+---------------

In [10]:
df= df.withColumn('userId',fix_ids_udf(df['user_id']).cast('int'))
df= df.withColumn('businessId',fix_ids_udf(df['business_id']).cast('int'))

In [11]:
df.show(10)

+--------------------+--------------------+-----+--------------------+--------------------+---------+--------------------+---------+----------+
|         business_id|             user_id|stars|                text|                name|     city|          categories|   userId|businessId|
+--------------------+--------------------+-----+--------------------+--------------------+---------+--------------------+---------+----------+
|--9e1ONYQuAa-CB_R...|vtLGogUiDd1Xla-Rt...|    4|Looking for a goo...|Delmonico Steakhouse|Las Vegas|[Cajun/Creole, St...|170488927|  65725733|
|--9e1ONYQuAa-CB_R...|T98RxxU5UB5WL05yl...|    4|Service was great...|Delmonico Steakhouse|Las Vegas|[Cajun/Creole, St...|  1624445|  65725733|
|--9e1ONYQuAa-CB_R...|et_GDGFfG2BFVkLzR...|    5|Being from Califo...|Delmonico Steakhouse|Las Vegas|[Cajun/Creole, St...| 62436065|  65725733|
|--9e1ONYQuAa-CB_R...|aVOGlN9fZ-BXcbtj6...|    5|Truly Fantastic! ...|Delmonico Steakhouse|Las Vegas|[Cajun/Creole, St...| 25606357|  65

In [12]:
df = df.select('userId','businessId',col('stars').alias('rating'),col('text').alias('comment'))

In [13]:
df.show(10)

+---------+----------+------+--------------------+
|   userId|businessId|rating|             comment|
+---------+----------+------+--------------------+
| 76417945|  99011368|     3|This restaurant i...|
| 16647810|  99011368|     4|This is one of my...|
|118810687| 157675875|     4|Nice small place ...|
|124068344|   3918020|     3|Dr. Purcell is go...|
|  3200749|  69783790|     2|Went to this loca...|
|170488927|  65725733|     4|Looking for a goo...|
|  1624445|  65725733|     4|Service was great...|
| 62436065|  65725733|     5|Being from Califo...|
| 25606357|  65725733|     5|Truly Fantastic! ...|
| 12076070|  65725733|     2|My wife and I cam...|
+---------+----------+------+--------------------+
only showing top 10 rows



## Tokenizing comments and Removing stop words

In [14]:
countTokens = udf(lambda words: len(words), IntegerType())

In [15]:
tokenizer = Tokenizer(inputCol="comment", outputCol="words")
tokenized = tokenizer.transform(df)
tokenized.select("comment", "words").withColumn("tokens", countTokens(col("words"))).show()

+--------------------+--------------------+------+
|             comment|               words|tokens|
+--------------------+--------------------+------+
|This restaurant i...|[this, restaurant...|    43|
|This is one of my...|[this, is, one, o...|    39|
|Nice small place ...|[nice, small, pla...|    31|
|Dr. Purcell is go...|[dr., purcell, is...|   144|
|Went to this loca...|[went, to, this, ...|   210|
|Looking for a goo...|[looking, for, a,...|   144|
|Service was great...|[service, was, gr...|    75|
|Being from Califo...|[being, from, cal...|    87|
|Truly Fantastic! ...|[truly, fantastic...|    26|
|My wife and I cam...|[my, wife, and, i...|   357|
|This is mine and ...|[this, is, mine, ...|    59|
|Absolutely impres...|[absolutely, impr...|    38|
|We decided to giv...|[we, decided, to,...|   202|
|(Jan/ 2006)

Tuna...|[(jan/, 2006), , ...|    92|
|I went here for m...|[i, went, here, f...|   137|
|Great steak house...|[great, steak, ho...|    48|
|We had early rese...|[we, had,

In [16]:
remover = StopWordsRemover(inputCol="words", outputCol="filtered")
remover.transform(tokenized).select('comment','words','filtered').withColumn("tokens", countTokens(col("filtered"))).show()

+--------------------+--------------------+--------------------+------+
|             comment|               words|            filtered|tokens|
+--------------------+--------------------+--------------------+------+
|This restaurant i...|[this, restaurant...|[restaurant, famo...|    21|
|This is one of my...|[this, is, one, o...|[one, top, 3, pla...|    23|
|Nice small place ...|[nice, small, pla...|[nice, small, pla...|    17|
|Dr. Purcell is go...|[dr., purcell, is...|[dr., purcell, go...|    66|
|Went to this loca...|[went, to, this, ...|[went, location, ...|    92|
|Looking for a goo...|[looking, for, a,...|[looking, good, s...|    71|
|Service was great...|[service, was, gr...|[service, great.....|    42|
|Being from Califo...|[being, from, cal...|[california, foie...|    45|
|Truly Fantastic! ...|[truly, fantastic...|[truly, fantastic...|    18|
|My wife and I cam...|[my, wife, and, i...|[wife, came, anni...|   188|
|This is mine and ...|[this, is, mine, ...|[mine, fiancé's, ...|

In [17]:
filtered = remover.transform(tokenized).select('userId','businessId','rating','comment','filtered').withColumn("tokens", countTokens(col("filtered")))

In [18]:
filtered.show()

+---------+----------+------+--------------------+--------------------+------+
|   userId|businessId|rating|             comment|            filtered|tokens|
+---------+----------+------+--------------------+--------------------+------+
| 76417945|  99011368|     3|This restaurant i...|[restaurant, famo...|    21|
| 16647810|  99011368|     4|This is one of my...|[one, top, 3, pla...|    23|
|118810687| 157675875|     4|Nice small place ...|[nice, small, pla...|    17|
|124068344|   3918020|     3|Dr. Purcell is go...|[dr., purcell, go...|    66|
|  3200749|  69783790|     2|Went to this loca...|[went, location, ...|    92|
|170488927|  65725733|     4|Looking for a goo...|[looking, good, s...|    71|
|  1624445|  65725733|     4|Service was great...|[service, great.....|    42|
| 62436065|  65725733|     5|Being from Califo...|[california, foie...|    45|
| 25606357|  65725733|     5|Truly Fantastic! ...|[truly, fantastic...|    18|
| 12076070|  65725733|     2|My wife and I cam...|[w

## making word v comment matrix

In [19]:
hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures")
idf = IDF(inputCol="rawFeatures", outputCol="features")

#hashingTF = HashingTF()
tf = hashingTF.transform(filtered)

# While applying HashingTF only needs a single pass to the data, applying IDF needs two passes:
# First to compute the IDF vector and second to scale the term frequencies by IDF.
#tf.cache()
idf = idf.fit(tf)
tfidf = idf.transform(tf)

In [20]:
data = tfidf.select('userId','businessId','comment','filtered','features',col('rating').alias('label'))
data.show(5)

+---------+----------+--------------------+--------------------+--------------------+-----+
|   userId|businessId|             comment|            filtered|            features|label|
+---------+----------+--------------------+--------------------+--------------------+-----+
| 76417945|  99011368|This restaurant i...|[restaurant, famo...|(262144,[34565,37...|    3|
| 16647810|  99011368|This is one of my...|[one, top, 3, pla...|(262144,[6308,680...|    4|
|118810687| 157675875|Nice small place ...|[nice, small, pla...|(262144,[22346,32...|    4|
|124068344|   3918020|Dr. Purcell is go...|[dr., purcell, go...|(262144,[666,1836...|    3|
|  3200749|  69783790|Went to this loca...|[went, location, ...|(262144,[1536,850...|    2|
+---------+----------+--------------------+--------------------+--------------------+-----+
only showing top 5 rows



In [21]:
def c_zero_one(n):
    if n==1:
        return 0
    elif n == 5:
        return 1


conv_zero_one = udf(c_zero_one)

good_bad = data.filter( (col('label') == 1) | (col('label') == 5) ).withColumn('label',conv_zero_one(col('label')).cast('int'))
good_bad.show(5)

+--------+----------+--------------------+--------------------+--------------------+-----+
|  userId|businessId|             comment|            filtered|            features|label|
+--------+----------+--------------------+--------------------+--------------------+-----+
|62436065|  65725733|Being from Califo...|[california, foie...|(262144,[2325,791...|    1|
|25606357|  65725733|Truly Fantastic! ...|[truly, fantastic...|(262144,[991,1199...|    1|
|52090707|  65725733|This is mine and ...|[mine, fiancé's, ...|(262144,[3358,345...|    1|
|30232171|  65725733|Absolutely impres...|[absolutely, impr...|(262144,[35263,35...|    1|
| 5094160|  65725733|We decided to giv...|[decided, give, d...|(262144,[5232,130...|    1|
+--------+----------+--------------------+--------------------+--------------------+-----+
only showing top 5 rows



In [22]:
(training,test) = good_bad.randomSplit([0.80,.20])

In [23]:
#model = NaiveBayes.train(training, 1.0)
nb = NaiveBayes(smoothing=1)
model = nb.fit(training)

# Predictions

## Predictions on polarized ratings 

In [24]:
predictions = model.transform(test)


In [25]:
predictions.filter(predictions['prediction'] == 0).select('comment','filtered','label','prediction').show(10)

+--------------------+--------------------+-----+----------+
|             comment|            filtered|label|prediction|
+--------------------+--------------------+-----+----------+
|Well I hate to gi...|[well, hate, give...|    0|       0.0|
|Paid $10 for 2 hr...|[paid, $10, 2, hr...|    1|       0.0|
|Horrible experien...|[horrible, experi...|    0|       0.0|
|Horrible customer...|[horrible, custom...|    0|       0.0|
|DO NOT COME HERE....|[come, here., , ,...|    0|       0.0|
|Ordered two bacon...|[ordered, two, ba...|    0|       0.0|
|Horrible food, sl...|[horrible, food,,...|    0|       0.0|
|This place is awe...|[place, awesome.,...|    1|       0.0|
|I used this shop ...|[used, shop, bolt...|    0|       0.0|
|I have came to Na...|[came, nail, club...|    0|       0.0|
+--------------------+--------------------+-----+----------+
only showing top 10 rows



In [26]:
predictions.filter(predictions['prediction'] == 1).select('comment','filtered','label','prediction').show(10)

+--------------------+--------------------+-----+----------+
|             comment|            filtered|label|prediction|
+--------------------+--------------------+-----+----------+
|I Moved in Skepti...|[moved, skeptical...|    1|       1.0|
|I 've worked here...|['ve, worked, yea...|    1|       1.0|
|Wow,love this new...|[wow,love, new, p...|    1|       1.0|
|In a word - AMAZI...|[word, -, amazing...|    1|       1.0|
|Great food. I hig...|[great, food., hi...|    1|       1.0|
|Absolutely delici...|[absolutely, deli...|    1|       1.0|
|Capital Grill is ...|[capital, grill, ...|    1|       1.0|
|Well what can I s...|[well, say,, plac...|    1|       1.0|
|This place is ama...|[place, amazing.,...|    1|       1.0|
|Went here 2 days ...|[went, 2, days, a...|    1|       1.0|
+--------------------+--------------------+-----+----------+
only showing top 10 rows



In [27]:
predictions.filter(predictions['prediction'] == 0).select('comment').show(truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [28]:
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions)

0.9301222654134694

### precision, recall, f1score

In [122]:
metrics_rdd = predictions.select(col('label').cast('float'),col('prediction').cast('float')).rdd

In [123]:
metrics = MulticlassMetrics(metrics_rdd)

#### precision, recall, f1score for positve reviews

In [124]:
metrics.precision(1.0)

0.9352343832414456

In [125]:
metrics.recall(1.0)

0.9694733812949641

In [126]:
metrics.fMeasure(1.0)

0.952046142411851

#### precision, recall, f1score for positve reviews

In [127]:
metrics.precision(0.0)

0.9095898817032012

In [128]:
metrics.recall(0.0)

0.8206129761483691

In [129]:
metrics.fMeasure(0.0)

0.8628135787797208

### Predictions on all ratings using user average as threshold

In [29]:
(sample,spill) = data.randomSplit([0.20,.80])

In [30]:
sample.show(5)

+------+----------+--------------------+--------------------+--------------------+-----+
|userId|businessId|             comment|            filtered|            features|label|
+------+----------+--------------------+--------------------+--------------------+-----+
| 11177| 149746170|I Moved in Skepti...|[moved, skeptical...|(262144,[2624,334...|    5|
| 24963| 154324441|I love frys! They...|[love, frys!, nic...|(262144,[14,22346...|    5|
| 31745| 159636968|I bought my weddi...|[bought, wedding,...|(262144,[5381,119...|    5|
| 43859|  82938410|I visited US Pati...|[visited, us, pat...|(262144,[1466,252...|    5|
| 45363| 112823939|My love treated m...|[love, treated, l...|(262144,[2711,306...|    4|
+------+----------+--------------------+--------------------+--------------------+-----+
only showing top 5 rows



In [31]:
avg_rating = sample.groupBy('userId').agg(avg(col('label')).alias('avg-user-rating'))
avg_rating.show(5)

+-------+------------------+
| userId|   avg-user-rating|
+-------+------------------+
|1611041|3.7777777777777777|
|6002391|               2.0|
|6630078|               1.0|
|7597505|               5.0|
|8779122|               5.0|
+-------+------------------+
only showing top 5 rows



In [32]:
sample_test = sample.join(avg_rating,'userId')
sample_test.show(5)

+------+----------+--------------------+--------------------+--------------------+-----+---------------+
|userId|businessId|             comment|            filtered|            features|label|avg-user-rating|
+------+----------+--------------------+--------------------+--------------------+-----+---------------+
| 26583| 123465947|Airy and colorful...|[airy, colorful, ...|(262144,[10879,21...|    4|            4.0|
|109068| 130676348|Want a loud, obno...|[want, loud,, obn...|(262144,[28172,50...|    2|            2.0|
|687716|  45848465|Great food at gre...|[great, food, gre...|(262144,[6979,223...|    5|            5.0|
|694746| 102984270|Do the price sear...|[price, search, b...|(262144,[40861,43...|    1|            1.0|
|922409|  12790888|Sherri Did a real...|[sherri, really, ...|(262144,[14,54961...|    5|            4.0|
+------+----------+--------------------+--------------------+--------------------+-----+---------------+
only showing top 5 rows



In [33]:
sample_test.show(50)

+-------+----------+--------------------+--------------------+--------------------+-----+------------------+
| userId|businessId|             comment|            filtered|            features|label|   avg-user-rating|
+-------+----------+--------------------+--------------------+--------------------+-----+------------------+
|  26583| 123465947|Airy and colorful...|[airy, colorful, ...|(262144,[10879,21...|    4|               4.0|
| 109068| 130676348|Want a loud, obno...|[want, loud,, obn...|(262144,[28172,50...|    2|               2.0|
| 687716|  45848465|Great food at gre...|[great, food, gre...|(262144,[6979,223...|    5|               5.0|
| 694746| 102984270|Do the price sear...|[price, search, b...|(262144,[40861,43...|    1|               1.0|
| 922409|  12790888|Sherri Did a real...|[sherri, really, ...|(262144,[14,54961...|    5|               4.0|
| 922409|  66199679|Very good filet m...|[good, filet, mig...|(262144,[1536,420...|    3|               4.0|
|1001043|  39216299

In [34]:
# def make_label(n):
#     if 

# fix_label = udf(make_label)

sample_labeled = sample_test.withColumn('label', when(col('label') < col('avg-user-rating') , 0 ).otherwise(1)  )

In [36]:
sample_labeled.show(5)

+------+----------+--------------------+--------------------+--------------------+-----+---------------+
|userId|businessId|             comment|            filtered|            features|label|avg-user-rating|
+------+----------+--------------------+--------------------+--------------------+-----+---------------+
| 26583| 123465947|Airy and colorful...|[airy, colorful, ...|(262144,[10879,21...|    1|            4.0|
|109068| 130676348|Want a loud, obno...|[want, loud,, obn...|(262144,[28172,50...|    1|            2.0|
|687716|  45848465|Great food at gre...|[great, food, gre...|(262144,[6979,223...|    1|            5.0|
|694746| 102984270|Do the price sear...|[price, search, b...|(262144,[40861,43...|    1|            1.0|
|922409|  12790888|Sherri Did a real...|[sherri, really, ...|(262144,[14,54961...|    1|            4.0|
+------+----------+--------------------+--------------------+--------------------+-----+---------------+
only showing top 5 rows



In [37]:
predictions_sample = model.transform(sample_labeled)

In [38]:
predictions_sample.show(5)

+------+----------+--------------------+--------------------+--------------------+-----+---------------+--------------------+--------------------+----------+
|userId|businessId|             comment|            filtered|            features|label|avg-user-rating|       rawPrediction|         probability|prediction|
+------+----------+--------------------+--------------------+--------------------+-----+---------------+--------------------+--------------------+----------+
| 26583| 123465947|Airy and colorful...|[airy, colorful, ...|(262144,[10879,21...|    1|            4.0|[-452.29707061457...|[8.28507806653137...|       1.0|
|109068| 130676348|Want a loud, obno...|[want, loud,, obn...|(262144,[28172,50...|    1|            2.0|[-578.17653127418...|[3.22209856627827...|       1.0|
|687716|  45848465|Great food at gre...|[great, food, gre...|(262144,[6979,223...|    1|            5.0|[-1144.5015790820...|[1.86254620001386...|       1.0|
|694746| 102984270|Do the price sear...|[price, sear

In [39]:
predictions_sample.filter(predictions_sample['prediction'] == 0).select('comment','filtered','label','prediction').show(10)

+--------------------+--------------------+-----+----------+
|             comment|            filtered|label|prediction|
+--------------------+--------------------+-----+----------+
|Do the price sear...|[price, search, b...|    1|       0.0|
|This review is sp...|[review, specific...|    0|       0.0|
|Hot meal, flavor ...|[hot, meal,, flav...|    0|       0.0|
|Not really impres...|[really, impresse...|    1|       0.0|
|I read about the ...|[read, 1, hr, par...|    1|       0.0|
|I've been going t...|[going, cleaners,...|    0|       0.0|
|This place sucks ...|[place, sucks, eg...|    1|       0.0|
|Nobody greets you...|[nobody, greets, ...|    1|       0.0|
|It's a shame that...|[shame, thai, opt...|    0|       0.0|
|I had my F-350 de...|[f-350, detailed,...|    1|       0.0|
+--------------------+--------------------+-----+----------+
only showing top 10 rows



In [40]:
evaluator.evaluate(predictions_sample)

0.7279135536807858

### precision, recall, f1score

In [107]:
metrics_rdd1 = predictions_sample.select(col('label').cast('float'),col('prediction').cast('float')).rdd

In [108]:
metrics1 = MulticlassMetrics(metrics_rdd1)

#### precision, recall, f1score for positve reviews

In [112]:
metrics1.precision(1.0)

0.7136437490633051

In [113]:
metrics1.recall(1.0)

0.7136437490633051

In [120]:
metrics1.fMeasure(1.0)

0.8145702811492443

#### precision, recall, f1score for positve reviews

In [None]:
metrics1.precision(0.0)

In [None]:
metrics1.recall(0.0)

In [121]:
metrics1.fMeasure(0.0)

0.37163364915302904

### Predictions on all ratings using 2.5 as threshold

In [41]:
sample_labeled2 = sample_test.withColumn('label', when(col('label') < 2.5 , 0 ).otherwise(1)  )

In [42]:
predictions_sample2 = model.transform(sample_labeled2)

In [43]:
evaluator.evaluate(predictions_sample2)

0.8855857416700659

In [50]:
predictions_sample2.filter(predictions_sample2['prediction'] == 1).select('comment','filtered','label','prediction').show(50)

+--------------------+--------------------+-----+----------+
|             comment|            filtered|label|prediction|
+--------------------+--------------------+-----+----------+
|Airy and colorful...|[airy, colorful, ...|    1|       1.0|
|Want a loud, obno...|[want, loud,, obn...|    0|       1.0|
|Great food at gre...|[great, food, gre...|    1|       1.0|
|Sherri Did a real...|[sherri, really, ...|    1|       1.0|
|Very good filet m...|[good, filet, mig...|    1|       1.0|
|We decided to com...|[decided, come, l...|    1|       1.0|
|This place almost...|[place, almost, r...|    1|       1.0|
|This is about at ...|[3, 5, stars, get...|    1|       1.0|
|Dineen's Mocha's ...|[dineen's, mocha'...|    0|       1.0|
|Went here for the...|[went, ccw, renew...|    1|       1.0|
|The ambiance of t...|[ambiance, place,...|    1|       1.0|
|Mt friend and I w...|[mt, friend, went...|    1|       1.0|
|I've heard a lot ...|[heard, lot, plac...|    1|       1.0|
|What I like abt L...|[l

In [86]:
predictions.filter(predictions['prediction'] == 0).select('comment','filtered','label','prediction').show(50)

+--------------------+--------------------+-----+----------+
|             comment|            filtered|label|prediction|
+--------------------+--------------------+-----+----------+
|It wasn't that th...|[food, awful,, fo...|    2|       0.0|
|The manager, Garr...|[manager,, garret...|    1|       0.0|
|This place was go...|[place, good, bad...|    3|       0.0|
|To be honest with...|[honest, personal...|    2|       0.0|
|Drove by a couple...|[drove, couple, d...|    1|       0.0|
|Literally the day...|[literally, day, ...|    5|       0.0|
|wow...for a high ...|[wow...for, high,...|    1|       0.0|
|I live here my dr...|[live, dryer, got...|    1|       0.0|
|The tablet charge...|[tablet, charger,...|    1|       0.0|
|I will NEVER buy ...|[never, buy, anot...|    1|       0.0|
|Food was ok, it s...|[food, ok,, seems...|    2|       0.0|
|Drove around Home...|[drove, around, h...|    2|       0.0|
|have been receivi...|[receiving, bills...|    1|       0.0|
|Nice pets, reason...|[n

### precision, recall, f1score

In [101]:
metrics_rdd2 = predictions_sample2.select(col('label').cast('float'),col('prediction').cast('float')).rdd

In [103]:
metrics2 = MulticlassMetrics(metrics_rdd2)

#### precision, recall, f1score for positve reviews

In [104]:
metrics2.precision(1.0)

0.899683723204327

In [105]:
metrics2.recall(1.0)

0.9465688859167427

In [115]:
metrics2.fMeasure(1.0)

0.9225309855690518

#### precision, recall, f1score for negetive reviews

In [116]:
metrics2.precision(0.0)

0.8215236908801925

In [117]:
metrics2.recall(0.0)

0.6997210153212897

In [118]:
metrics2.fMeasure(0.0)

0.7557461186222159

## Predictions on all ratings using global average as threshold

In [44]:
sample_labeled3 = sample_test.withColumn('label', when(col('label') < 3.7 , 0 ).otherwise(1)  )

In [45]:
predictions_sample3 = model.transform(sample_labeled3)

In [46]:
evaluator.evaluate(predictions_sample3)

0.8264710966435571

In [73]:
predictions_sample3.filter(predictions_sample3['prediction'] == 1).select('comment','label','prediction').show(50)

+--------------------+-----+----------+
|             comment|label|prediction|
+--------------------+-----+----------+
|Airy and colorful...|    1|       1.0|
|Want a loud, obno...|    0|       1.0|
|Great food at gre...|    1|       1.0|
|Sherri Did a real...|    1|       1.0|
|Very good filet m...|    0|       1.0|
|We decided to com...|    1|       1.0|
|This place almost...|    0|       1.0|
|This is about at ...|    0|       1.0|
|Dineen's Mocha's ...|    0|       1.0|
|Went here for the...|    1|       1.0|
|The ambiance of t...|    1|       1.0|
|Mt friend and I w...|    1|       1.0|
|I've heard a lot ...|    1|       1.0|
|What I like abt L...|    1|       1.0|
|I've hiked this m...|    1|       1.0|
|I'd never heard o...|    0|       1.0|
|I stopped by this...|    1|       1.0|
|I visited Slickab...|    0|       1.0|
|Love love love th...|    1|       1.0|
|Sushi this good a...|    1|       1.0|
|Okay, this is the...|    1|       1.0|
|Not really a fan ...|    0|       1.0|


### precision, recall, f1score

In [83]:
metrics_rdd3 = predictions_sample3.select(col('label').cast('float'),col('prediction').cast('float')).rdd

In [85]:
metrics_rdd3.first()

Row(label=1.0, prediction=1.0)

In [87]:
metrics3 = MulticlassMetrics(metrics_rdd3)

#### precision, recall, f1score for positve reviews

In [93]:
metrics3.precision(1.0)

0.932425780814929

In [99]:
metrics3.recall(1.0)

0.8343825152072429

In [95]:
metrics3.fMeasure(1.0)

0.8806838523364898

#### precision, recall, f1score for negetive reviews

In [94]:
metrics3.precision(0.0)

0.6373090192212912

In [100]:
metrics3.recall(0.0)

0.8279624971415505

In [96]:
metrics3.fMeasure(0.0)

0.7202323407133338