In [3]:
sc

In [4]:
from pyspark import SparkContext,SparkConf
from pyspark.sql.types import *
from pyspark.sql import Row,SQLContext
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.feature import CountVectorizer, Tokenizer, StringIndexer, \
    NGram, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.sql.functions import countDistinct, udf, col

In [5]:
sqlContext = SQLContext(sc)

# Load Data

In [7]:
# Loads parquet file located in AWS S3 into RDD Data Frame
parquetFileTip = sqlContext.read.parquet("s3://dknsyelp/tip.parquet")
# Stores the DataFrame into an "in-memory temporary table"
parquetFileTip.registerTempTable("parquetFileTip")
# Run standard SQL queries against temporary table
df_tip = sqlContext.sql("SELECT * FROM parquetFileTip")

parquetFileReview = sqlContext.read.parquet("s3://dknsyelp/review.parquet")
parquetFileReview.registerTempTable("parquetFileReview")
df_review = sqlContext.sql("SELECT * FROM parquetFileReview")

# Clean Data

In [8]:
df_review = df_review.select(df_review['text'], df_review['stars']).dropDuplicates()
# Replace all punctuation
import string
import re
to_remove = string.punctuation + '0-9\\r\\t\\n'
to_remove = r"[{}]".format(to_remove)       # correct format for regex
my_regex = re.compile(to_remove)
# Replace instances of every element in to_remove with empty string
text_clean = udf(lambda text: my_regex.sub('', text))
df_review = df_review.withColumn('clean_text', text_clean(col('text'))).drop('text').withColumnRenamed("clean_text", "text").cache()

# Prepare Data for Training the Naive Bayes Model

## Create Labels for Sentiment Analysis

In [6]:
print 'counts of reviews by stars:'
df_review.groupBy(df_review.stars).agg(countDistinct(df_review.text)).show()

counts of reviews by stars:
+-----+--------------------+
|stars|count(DISTINCT text)|
+-----+--------------------+
|    1|              122600|
|    3|              104366|
|    5|              357558|
|    4|              204525|
|    2|               73779|
+-----+--------------------+



In [9]:
# Treat 1, 2, 3 star ratings as 'bad' reviews & 4, 5 star ratings as 'good' reviews
binning_udf = udf(lambda x: 1 if x > 3 else 0)
df_review = df_review.withColumn('label', binning_udf(col('stars')))
# convert label to integer type
df_review = df_review.withColumn("label", df_review["label"].cast(IntegerType())).drop('stars')

## Split into Training and Validation Sets

In [8]:
train_data, valid_data = df_review.randomSplit([0.8, 0.2])
train_data.cache()
valid_data.cache()

DataFrame[text: string, label: int]

# Training the Naive Bayes Model

In [10]:
def nb_train_pipeline(n=3):
    # convert the input string to lowercase and split it by spaces
    tokenizer = [Tokenizer(inputCol="text", outputCol="words")]
    ngrams = [NGram(n=i, inputCol="words", outputCol="{0}_grams".format(i)) for i in range(1, n + 1)]
    # create vocabulary (use binary=True for Binarized Multinomial Naive Bayes)
    vectorizers = [CountVectorizer(inputCol="{0}_grams".format(i), 
                                   outputCol="{0}_counts".format(i), binary=True) for i in range(1, n + 1)]
    # combine all n-grams
    assembler = [VectorAssembler(inputCols=["{0}_counts".format(i) for i in range(1, n + 1)], outputCol="features")]
    nb = [NaiveBayes(smoothing=1.0, modelType="multinomial")]
    return Pipeline(stages=tokenizer + ngrams + vectorizers + assembler + nb)

In [10]:
model = nb_train_pipeline(n=3).fit(train_data)
preds_valid = model.transform(valid_data)

# Model Evaluation

### Area Under ROC

In [11]:
bceval = BinaryClassificationEvaluator()
print bceval.getMetricName() +":" + str(round(bceval.evaluate(preds_valid), 3))

areaUnderROC:0.613


### Area Under PR

In [12]:
bceval.setMetricName("areaUnderPR")
print bceval.getMetricName() +":" + str(round(bceval.evaluate(preds_valid), 3))

areaUnderPR:0.729


### F1 Score

In [13]:
#Evaluate the model. metric : F1 score...... f1:0.865
# with text_clean: 0.858
# with text_clean + nb_train_pipeline(n=2): 0.882
mceval = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="f1")
print mceval.getMetricName() +":" + str(round(mceval.evaluate(preds_valid), 3))

f1:0.889


### Accuracy

In [14]:
mceval.setMetricName("accuracy")
print (mceval.getMetricName() +":" + str(round(mceval.evaluate(preds_valid), 3)))

accuracy:0.89


# Train Model using All Training Data

In [11]:
model = nb_train_pipeline(n=3).fit(df_review)

# Make Predictions on the Tips Data

The `tips.json` file includes tips (short reviews) for restaurants.  However, ratings are not available for each tip/review. Here, we use the Naive Bayes model we trained on the data from `reviews.json` (containing full reviews + ratings for restaurants) and use the model to make predictions of the sentiment of the "tips" left by users.  Finally, we can convert the predicted probability of a tip being positive into star reviews.

In [18]:
df_tip.show(5)

+--------------------+--------------------+--------------------+----------+-----+--------------------+--------+-------+
|         business_id|             user_id|                 _id|      date|likes|                text| user_ix| biz_ix|
+--------------------+--------------------+--------------------+----------+-----+--------------------+--------+-------+
|-8fOqUWFX_1qiKggI...|UcAQjopfBULit3uVL...|[5a5eb6ea07002a9b...|2013-01-19|    0|"It's our pleasur...|454892.0|28595.0|
|-8fOqUWFX_1qiKggI...|mmnKrqV1W8WSVH5vK...|[5a5eb6ea07002a9b...|2015-08-03|    0|Quick service, ho...|430157.0|28595.0|
|-8fOqUWFX_1qiKggI...|sjSXoxakmlFuVHQlp...|[5a5eb6e207002a9b...|2010-08-10|    0|Kids can play, cl...|190165.0|28595.0|
|-8fOqUWFX_1qiKggI...|Cl2pyB8mWn3RCMV5n...|[5a5eb6e207002a9b...|2012-06-22|    0|Watch out for tra...|429184.0|28595.0|
|-8fOqUWFX_1qiKggI...|Cl2pyB8mWn3RCMV5n...|[5a5eb6e207002a9b...|2012-07-16|    0|Milkshake Mondays...|429184.0|28595.0|
+--------------------+------------------

## Clean Tips Data

In [12]:
df_tip = df_tip.select('user_ix', 'biz_ix', 'text').dropDuplicates()

In [13]:
# Replace instances of every element in to_remove with empty string
df_tip = df_tip.withColumn('clean_text', text_clean(col('text'))).drop('text').withColumnRenamed("clean_text", "text").cache()

In [14]:
df_tip.show()

+--------+--------+--------------------+
| user_ix|  biz_ix|                text|
+--------+--------+--------------------+
|249438.0| 25645.0|Pork Adobo with s...|
|144775.0| 23984.0|         Great gyros|
|207763.0| 33687.0|Great doctor Goin...|
|523391.0| 93635.0|Bulls choked  Ros...|
|152442.0| 45971.0|AYCE is the only ...|
| 62412.0| 79963.0|Hit the dessert b...|
|195045.0|118722.0|Make sure the hea...|
|132911.0|118722.0|              Smelly|
|178406.0|118722.0|Nice rooms the ma...|
| 75242.0|118722.0|Last stop on the ...|
|125925.0| 15184.0|Cool atmosphere a...|
|365503.0| 74326.0|Yipee Its my Aloh...|
|439386.0| 74326.0|Late night shoppi...|
|217030.0|  4773.0|Owl of Minerva in...|
|253770.0| 84624.0|Most sickening re...|
|149331.0| 84624.0|No Dont do it Go ...|
|146661.0| 69781.0|Excellent burgers...|
|259265.0| 17132.0|The red house win...|
|220735.0|   607.0|Ask for Jamie whe...|
| 77784.0|  1438.0|Its not closed to...|
+--------+--------+--------------------+
only showing top

## Predict Ratings using Tips

In [15]:
preds_test = model.transform(df_tip)

In [19]:
preds_test.printSchema()

root
 |-- user_ix: double (nullable = true)
 |-- biz_ix: double (nullable = true)
 |-- text: string (nullable = true)
 |-- words: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- 1_grams: array (nullable = true)
 |    |-- element: string (containsNull = false)
 |-- 2_grams: array (nullable = true)
 |    |-- element: string (containsNull = false)
 |-- 3_grams: array (nullable = true)
 |    |-- element: string (containsNull = false)
 |-- 1_counts: vector (nullable = true)
 |-- 2_counts: vector (nullable = true)
 |-- 3_counts: vector (nullable = true)
 |-- features: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = true)



In [16]:
preds_test.select('probability', 'prediction').show(truncate=False)

+-------------------------------------------+----------+
|probability                                |prediction|
+-------------------------------------------+----------+
|[0.5278635068641435,0.47213649313585637]   |0.0       |
|[0.04387179250090863,0.9561282074990914]   |1.0       |
|[0.08834707434778237,0.9116529256522177]   |1.0       |
|[0.07224206044503252,0.9277579395549674]   |1.0       |
|[0.23726898700743274,0.7627310129925673]   |1.0       |
|[0.09332752385244389,0.9066724761475562]   |1.0       |
|[0.013316465612433367,0.9866835343875665]  |1.0       |
|[0.6139301677384645,0.3860698322615355]    |0.0       |
|[0.025529287134888737,0.9744707128651112]  |1.0       |
|[0.0024199363550010816,0.997580063644999]  |1.0       |
|[5.119764214083161E-23,1.0]                |1.0       |
|[0.005206494423857925,0.994793505576142]   |1.0       |
|[0.24394183604055572,0.7560581639594443]   |1.0       |
|[2.2412511803097664E-7,0.999999775874882]  |1.0       |
|[0.6474195341771555,0.35258046

In [24]:
preds_test.select('probability', 'prediction').show(truncate=False)

[Row(probability=DenseVector([0.5279, 0.4721])),
 Row(probability=DenseVector([0.0439, 0.9561])),
 Row(probability=DenseVector([0.0883, 0.9117])),
 Row(probability=DenseVector([0.0722, 0.9278])),
 Row(probability=DenseVector([0.2373, 0.7627]))]

In [25]:
type(preds_test)

pyspark.sql.dataframe.DataFrame

In [27]:
preds_test.write.parquet("s3://dknsyelp/preds_test.parquet", mode='overwrite')

In [6]:
preds_test = sqlContext.read.parquet("s3://dknsyelp/preds_test.parquet")

In [10]:
def get_ratings(problist):
    prob_positive = problist[1]
    rating = 1
    if 0.2 < prob_positive <= 0.4:
        rating = 2
    elif 0.4 < prob_positive <= 0.6:
        rating = 3
    elif 0.6 < prob_positive <= 0.8:
        rating = 4
    elif 0.8 < prob_positive:
        rating = 5
    return rating

get_ratings_udf = udf(lambda x: get_ratings(x))

In [11]:
pred_df = preds_test.select('biz_ix', 'user_ix', 'probability')
pred_df = pred_df.withColumn('stars', get_ratings_udf(col('probability')))

In [15]:
pred_df.show(truncate = False)

+--------+--------+-------------------------------------------+-----+
|biz_ix  |user_ix |probability                                |stars|
+--------+--------+-------------------------------------------+-----+
|32321.0 |201498.0|[2.644997420539402E-4,0.999735500257946]   |5    |
|114236.0|12019.0 |[0.13097987356047133,0.8690201264395286]   |5    |
|2671.0  |439127.0|[0.26264037708196536,0.7373596229180346]   |4    |
|14767.0 |179368.0|[0.007764449068323095,0.9922355509316769]  |5    |
|83636.0 |291490.0|[1.1974511383295448E-10,0.9999999998802549]|5    |
|79593.0 |281722.0|[6.187727967169642E-9,0.9999999938122719]  |5    |
|76403.0 |532075.0|[3.433558889126534E-6,0.9999965664411109]  |5    |
|13165.0 |535189.0|[1.3304242765850174E-5,0.9999866957572342] |5    |
|51748.0 |250862.0|[7.062982599137161E-11,0.9999999999293703] |5    |
|54948.0 |75633.0 |[2.1453685483600898E-5,0.9999785463145163] |5    |
|61308.0 |350016.0|[0.013486783177591982,0.986513216822408]   |5    |
|98539.0 |281879.0|[

In [16]:
tips_stars = pred_df.drop('probability')

In [17]:
tips_stars.write.parquet("s3://dknsyelp/tips_stars.parquet", mode='overwrite')