# Part1

In [5]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

In [6]:
## Importing Basic Packages
import matplotlib.pyplot as plt
import numpy as np

# Import NLP Packages
import re
import nltk
from wordcloud import WordCloud

# NLTK Stop words
# nltk.download('stopwords')
# nltk.download('wordnet')
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use', 'not', 'would', 'say', 'could', '_', 'be', 'know', 'good', 'go', 'get', 'do', 'done', 'try', 'many', 'some', 'nice', 'thank', 'think', 'see', 'rather', 'easy', 'easily', 'lot', 'lack', 'make', 'want', 'seem', 'run', 'need', 'even', 'right', 'line', 'even', 'also', 'may', 'take', 'come'])


In [7]:
spark = SparkSession.builder.config('spark.driver.memory', '16g').config('spark.executor.memory','16g').config('spark.driver.maxResultSize','0').config('spark.sql.autoBroadcastJoinThreshold','-1').config('spark.sql.broadcastTimeout','1200').config('spark.default.parallelism','8').appName("part123").getOrCreate()

# Bussiness

In [8]:
path = 'yelp_academic_dataset_business.json'
business = spark.read.json(path)
business.count()

160585

In [9]:
business_col = ['business_id','name','city','state','stars','review_count','categories','latitude','longitude','is_open', 'postal_code']
business = business.select(business_col)
#business = business.withColumn('category', split(business['categories'],',')).withColumn('category',explode('category')).withColumn('category', trim('category'))


In [10]:
tmp = business.withColumn('category', explode(split(business['categories'],','))).withColumn('category', trim('category'))
tmp.groupBy('category').count().orderBy('count',ascending=False).show()
#selected_category = ['Restaurants','Food','Coffee & Tea','Sandwiches','Breakfast & Brunch']
regex_expr = r'\b(Restaurants|Food|Coffee|Tea|Sandwiches|Breakfast|Brunch)\b'
business = business.filter(business['categories'].rlike(regex_expr))


+--------------------+-----+
|            category|count|
+--------------------+-----+
|         Restaurants|50763|
|                Food|29469|
|            Shopping|26205|
|       Beauty & Spas|16574|
|       Home Services|16465|
|    Health & Medical|15102|
|      Local Services|12192|
|           Nightlife|11990|
|                Bars|10741|
|          Automotive|10119|
|Event Planning & ...| 9644|
|         Active Life| 9231|
|        Coffee & Tea| 7725|
|          Sandwiches| 7272|
|             Fashion| 6599|
|American (Traditi...| 6541|
|         Hair Salons| 5900|
|               Pizza| 5756|
|     Hotels & Travel| 5703|
|  Breakfast & Brunch| 5505|
+--------------------+-----+
only showing top 20 rows



In [11]:
business.count()

64092

In [12]:
business.groupBy('city').count().orderBy('count',ascending = False).show()
selected_city = ['Portland']
business = business.filter(col('city').isin(selected_city))


+---------------+-----+
|           city|count|
+---------------+-----+
|       Portland| 7212|
|      Vancouver| 6495|
|         Austin| 6373|
|        Atlanta| 5092|
|        Orlando| 4592|
|         Boston| 3536|
|       Columbus| 3164|
|       Richmond| 1127|
|      Cambridge|  957|
|        Burnaby|  919|
|        Boulder|  882|
|      Kissimmee|  846|
|      Beaverton|  735|
|        Decatur|  627|
|     Somerville|  510|
|North Vancouver|  500|
|    Winter Park|  495|
|         Quincy|  414|
|      Coquitlam|  383|
|         Surrey|  344|
+---------------+-----+
only showing top 20 rows



In [13]:
business = business[business['is_open']==1]
business.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in business.columns]).show()

+-----------+----+----+-----+-----+------------+----------+--------+---------+-------+-----------+
|business_id|name|city|state|stars|review_count|categories|latitude|longitude|is_open|postal_code|
+-----------+----+----+-----+-----+------------+----------+--------+---------+-------+-----------+
|          0|   0|   0|    0|    0|           0|         0|       0|        0|      0|          0|
+-----------+----+----+-----+-----+------------+----------+--------+---------+-------+-----------+



In [14]:
business.count()

4127

# Review

In [15]:
path = 'yelp_academic_dataset_review.json'
review = spark.read.json(path)

In [16]:
review.count()

8635403

In [17]:
review.printSchema()

root
 |-- business_id: string (nullable = true)
 |-- cool: long (nullable = true)
 |-- date: string (nullable = true)
 |-- funny: long (nullable = true)
 |-- review_id: string (nullable = true)
 |-- stars: double (nullable = true)
 |-- text: string (nullable = true)
 |-- useful: long (nullable = true)
 |-- user_id: string (nullable = true)



In [18]:
review.show(10)
review = review.withColumn('year', substring('date',1,4).astype('int'))
review = review.withColumn('month', substring('date',6,2).astype('int'))
review = review.withColumn('day', substring('date',9,2).astype('int'))

+--------------------+----+-------------------+-----+--------------------+-----+--------------------+------+--------------------+
|         business_id|cool|               date|funny|           review_id|stars|                text|useful|             user_id|
+--------------------+----+-------------------+-----+--------------------+-----+--------------------+------+--------------------+
|buF9druCkbuXLX526...|   1|2014-10-11 03:34:02|    1|lWC-xP3rd6obsecCY...|  4.0|Apparently Prides...|     3|ak0TdVmGKo4pwqdJS...|
|RA4V8pr014UyUbDvI...|   0|2015-07-03 20:38:25|    0|8bFej1QE5LXp4O05q...|  4.0|This store is pre...|     1|YoVfDbnISlW0f7abN...|
|_sS2LBIGNT5NQb6PD...|   0|2013-05-28 20:38:06|    0|NDhkzczKjLshODbqD...|  5.0|I called WVM on t...|     0|eC5evKn1TWDyHCyQA...|
|0AzLzHfOJgL7ROwhd...|   1|2010-01-08 02:29:15|    1|T5fAqjjFooT4V0OeZ...|  2.0|I've stayed at ma...|     1|SFQ1jcnGguO0LYWnb...|
|8zehGz9jnxPqXtOc7...|   0|2011-07-28 18:05:01|    0|sjm_uUcQVxab_EeLC...|  4.0|The food i

In [19]:
review_col = ['review_id','business_id','user_id','year','month','day','stars','useful','funny','cool','text']
review = review.select(review_col)

In [20]:
review.show(10)

+--------------------+--------------------+--------------------+----+-----+---+-----+------+-----+----+--------------------+
|           review_id|         business_id|             user_id|year|month|day|stars|useful|funny|cool|                text|
+--------------------+--------------------+--------------------+----+-----+---+-----+------+-----+----+--------------------+
|lWC-xP3rd6obsecCY...|buF9druCkbuXLX526...|ak0TdVmGKo4pwqdJS...|2014|   10| 11|  4.0|     3|    1|   1|Apparently Prides...|
|8bFej1QE5LXp4O05q...|RA4V8pr014UyUbDvI...|YoVfDbnISlW0f7abN...|2015|    7|  3|  4.0|     1|    0|   0|This store is pre...|
|NDhkzczKjLshODbqD...|_sS2LBIGNT5NQb6PD...|eC5evKn1TWDyHCyQA...|2013|    5| 28|  5.0|     0|    0|   0|I called WVM on t...|
|T5fAqjjFooT4V0OeZ...|0AzLzHfOJgL7ROwhd...|SFQ1jcnGguO0LYWnb...|2010|    1|  8|  2.0|     1|    1|   1|I've stayed at ma...|
|sjm_uUcQVxab_EeLC...|8zehGz9jnxPqXtOc7...|0kA0PAJ8QFMeveQWH...|2011|    7| 28|  4.0|     0|    0|   0|The food is alway...|


In [21]:
review.count()

8635403

# filter review that in bussiness dataset

In [22]:
business_id_set = set()
for i in list(business.select('business_id').collect()):
    business_id_set.add(i[0])

In [23]:
review.count()

8635403

In [24]:
review = review.filter(review['business_id'].isin(business_id_set))

In [25]:
review.count()

605597

# remove reviews before 2010

In [26]:
review.groupBy('year').count().orderBy('year').show()

+----+-----+
|year|count|
+----+-----+
|2004|    1|
|2005|   72|
|2006|  607|
|2007| 3369|
|2008| 8183|
|2009|13023|
|2010|20037|
|2011|27092|
|2012|30442|
|2013|38252|
|2014|52003|
|2015|66122|
|2016|69181|
|2017|74100|
|2018|80903|
|2019|76977|
|2020|42588|
|2021| 2645|
+----+-----+



In [27]:
review = review.filter(review.year>2010)

In [28]:
review.count()

560305

## Part 2 :Sentiment Analysis

In [29]:
review.select(['text']).show(5)

+--------------------+
|                text|
+--------------------+
|The ramen here is...|
|"Even the mad Cap...|
|It's crazy how es...|
|5 stars for the l...|
|That was very gra...|
+--------------------+
only showing top 5 rows



### Negative Reviews Forsight

In [30]:
neg_reviews = review.filter(review.stars <= 2.5)  # reviews with star <= 2.5
# print(neg_reviews.count(), len(neg_reviews.columns))

In [31]:
neg_reviews.show(5)

+--------------------+--------------------+--------------------+----+-----+---+-----+------+-----+----+--------------------+
|           review_id|         business_id|             user_id|year|month|day|stars|useful|funny|cool|                text|
+--------------------+--------------------+--------------------+----+-----+---+-----+------+-----+----+--------------------+
|EO5rALvJMkK8QEvUN...|9P-lp3AWDXGayDqJz...|u2xPfv6_wcKt-lW-C...|2018|    2| 11|  2.0|     0|    0|   0|The ramen here is...|
|ggecU8oSt68aGEuiE...|7EbGTD7ZF30vEFBiH...|uQSBQI8hKuNRxyxhP...|2013|   10|  3|  2.0|     2|    0|   0|I don't remember ...|
|UaxxixKaWiAL7_Oc_...|szCpLKuocAQnErkNi...|MrA1ib9jw_tw-uDzn...|2013|    8|  3|  2.0|     1|    0|   0|The renovation is...|
|Q9Jh7uDqUYpM8aR3F...|ftc6tzrCBJVbuIi_y...|UNgMSeVC-Jk2q6ZhI...|2015|   10| 17|  1.0|     3|    0|   0|This pharmacy has...|
|Q9qveSMALhyPZn7st...|Wv1A_nvyUuMEThZFu...|tI8Lve0J6JPklfUcJ...|2016|    6| 22|  2.0|     0|    0|   0|I actually love W...|


In [32]:
from pyspark.ml.feature import CountVectorizer, StopWordsRemover, RegexTokenizer

neg_reviews_text = neg_reviews.select(['text'])

tokenizer = RegexTokenizer().setInputCol("text").setOutputCol("words").setPattern("\\W")
wordsData = tokenizer.transform(neg_reviews_text)


remover = StopWordsRemover(inputCol="words", outputCol="filtered")
filter_data = remover.transform(wordsData)


cv = CountVectorizer(inputCol='filtered', outputCol='vectors')
model = cv.fit(filter_data)

counts = model.transform(filter_data).select(['filtered','vectors'])

neg_words_top10 = counts.select('vectors', explode("filtered").alias("word")).groupBy("word").count()
neg_words_top10 = neg_words_top10.sort("count", ascending=False)
neg_words_top10.show(10) # list

+-------+-----+
|   word|count|
+-------+-----+
|   food|70991|
|  place|43913|
|   like|42428|
|service|39138|
|   good|38361|
|    one|36118|
|    get|33493|
|   time|33280|
|   back|32405|
|     us|31669|
+-------+-----+
only showing top 10 rows



In [33]:
# Remove non-informative words
tokenizer = RegexTokenizer().setInputCol("text").setOutputCol("words").setPattern("\\W")
wordsData = tokenizer.transform(neg_reviews_text)


remover = StopWordsRemover(inputCol="words", outputCol="filtered")
filter_data = remover.transform(wordsData)


cv = CountVectorizer(inputCol='filtered', outputCol='vectors',minDF=2.0, vocabSize=1000)
model = cv.fit(filter_data)

counts = model.transform(filter_data).select(['filtered','vectors'])

non_informative_words = ['food', 'place', 'like', 'service','good','one','get','time','back','us']

neg_words_top150 = counts.select('vectors', explode("filtered").alias("word")).where(col("word").isin(non_informative_words)==False).groupBy("word").count()
neg_words_top150 = neg_words_top150.sort("count", ascending=False)
neg_words_top150 = neg_words_top150.select(['word']).take(150) # list

In [34]:
neg_words_top150_list = []
for nwt150 in neg_words_top150:
    if nwt150['word'].isnumeric():
        continue
    neg_words_top150_list.append(nwt150['word'])

In [35]:
print(neg_words_top150_list)

['order', 'go', 'ordered', 'even', 'got', 'really', 'didn', 'never', 'said', 'minutes', 'came', 've', 'm', 'people', 'asked', 'restaurant', 'great', 'went', 'table', 'bad', 'two', 'better', 'first', 'also', 'told', 'way', 'chicken', 'know', 'much', 'going', 'wait', 'experience', 'bar', 'come', 'make', 'portland', 'staff', 'want', 'well', 'took', 'customer', 'menu', 'made', 'eat', 'another', 'wasn', 'take', 'ever', 'give', 'pizza', 'server', 're', 'say', 'still', 'nice', 'pretty', 'times', 'left', 'long', 'try', 'think', 'last', 'rude', 'sauce', 'hour', 'around', 'something', 'night', 'wanted', 'drinks', 'little', 'drink', 'right', 'day', 'nothing', 'coffee', 'see', 'sure', 'd', 'meal', 'taste', 'maybe', 'many', 'waiting', 'since', 'waitress', 'flavor', 'small', 'll', 'ok', 'salad', 'disappointed', 'cheese', 'tasted', 'won', 'away', 'waited', 'worst', 'customers', 'money', 'meat', 'thing', 'though', 'ask', 'location', 'next', 'line', 'new', 'tried', 'cold', 'half', 'quality', 'put', 'be

In [None]:
wordcloud_low = WordCloud(background_color="white").generate(' '.join(neg_words_top150_list))
plt.figure(figsize = (12,10))
plt.imshow(wordcloud_low, interpolation='bilinear')
plt.title('Word Cloud - Negative Yelp Restaurant Reviews', fontsize=16, y=1.01)
plt.axis("off")

(-0.5, 399.5, 199.5, -0.5)

### Positive Reviews Forsight

In [None]:
pos_reviews = review.filter(review.stars >= 4.5)  # reviews with star >= 4.5
print(pos_reviews.count(), len(pos_reviews.columns))

In [None]:
pos_reviews.show(5)

In [None]:
pos_reviews_text = pos_reviews.select(['text'])

tokenizer = RegexTokenizer().setInputCol("text").setOutputCol("words").setPattern("\\W")
wordsData = tokenizer.transform(pos_reviews_text)


remover = StopWordsRemover(inputCol="words", outputCol="filtered")
filter_data = remover.transform(wordsData)


cv = CountVectorizer(inputCol='filtered', outputCol='vectors')
model = cv.fit(filter_data)

counts = model.transform(filter_data).select(['filtered','vectors'])

pos_words_top10 = counts.select('vectors', explode("filtered").alias("word")).groupBy("word").count()
pos_words_top10 = pos_words_top10.sort("count", ascending=False)
pos_words_top10.show(10) # list

In [None]:
# Remove non-informative words
tokenizer = RegexTokenizer().setInputCol("text").setOutputCol("words").setPattern("\\W")
wordsData = tokenizer.transform(pos_reviews_text)


remover = StopWordsRemover(inputCol="words", outputCol="filtered")
filter_data = remover.transform(wordsData)


cv = CountVectorizer(inputCol='filtered', outputCol='vectors',minDF=2.0, vocabSize=1000)
model = cv.fit(filter_data)

counts = model.transform(filter_data).select(['filtered','vectors'])

non_informative_words = ['food', 'place', 'like', 'service','portland','one']

pos_words_top150 = counts.select('vectors', explode("filtered").alias("word")).where(col("word").isin(non_informative_words)==False).groupBy("word").count()
pos_words_top150 = pos_words_top150.sort("count", ascending=False)
pos_words_top150 = pos_words_top150.select(['word']).take(150) # list

In [None]:
pos_words_top150_list = []
for pwt150 in pos_words_top150:
    if pwt150['word'].isnumeric():
        continue
    pos_words_top150_list.append(pwt150['word'])

In [None]:
print(pos_words_top150_list)

In [None]:
wordcloud_low = WordCloud(background_color="white").generate(' '.join(pos_words_top150_list))
plt.figure(figsize = (12,10))
plt.imshow(wordcloud_low, interpolation='bilinear')
plt.title('Word Cloud - Positive Yelp Restaurant Reviews', fontsize=16, y=1.01)
plt.axis("off");

### Rich Features for Review

In [None]:
from pyspark.sql.types import *
from textblob import Word

# review = review.withColumn('char_count', length(review['text']))

# def cal_word_count(value):
#     return len(value.split(" "))

# udfcal_word_count = udf(cal_word_count, IntegerType())
# review = review.withColumn("word_count", lit(udfcal_word_count("text")))

# def cal_avg_word_len(value):
#     words = value.split()
#     word_sum = 0
#     for word in words:
#         word_sum += len(word)
#     result = word_sum / len(words)
#     return result

# udfcal_avg_word_len = udf(cal_avg_word_len, DoubleType())
# review = review.withColumn("avg_word_len", lit(udfcal_avg_word_len("text")))

# def cal_stopword_count(value):
#     words = value.split(" ")
#     spcount = 0
#     for word in words:
#         if word in stop_words:
#             spcount += 1
#     return spcount

# udfcal_stopword_count = udf(cal_stopword_count, IntegerType())
# review = review.withColumn("stopword_count", lit(udfcal_stopword_count("text")))

# review.show(3)


In [None]:
def get_cleaned_text(value):
    words = re.split("(?:[^a-zA-Z']+)", value.lower())
    result = " "
    result = result.join(Word(w.lower()).lemmatize() for w in words if w.lower() not in stop_words)
    return result

udfget_cleaned_text = udf(get_cleaned_text, StringType())
review = review.withColumn("cleaned_text", lit(udfget_cleaned_text("text")))

# review.show(3, truncate=False)

In [None]:
review.printSchema()

### Text Classification

In [None]:
# Add column 'polarity' and 'subjectivity'
from textblob import TextBlob

def cal_polarity(value):
    return TextBlob(value).sentiment[0]

udfcal_polarity = udf(cal_polarity, DoubleType())
review = review.withColumn("polarity", lit(udfcal_polarity("cleaned_text")))

def cal_subjectivity(value):
    return TextBlob(value).sentiment[1]

udfcal_subjectivity = udf(cal_subjectivity, DoubleType())
review = review.withColumn("subjectivity", lit(udfcal_subjectivity("cleaned_text")))

# review.show(3, truncate=False)

In [None]:
# Calculate Vader Sentiment Analysis Scores

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()

# test = analyzer.polarity_scores("The ramen here is less than great. It came out luke warm and oily. The service we received was NOT good. I will not be returning.")

# print(test)

def cal_compound(value):
    return analyzer.polarity_scores(value)['compound']

udfcal_compound = udf(cal_compound, DoubleType())
review = review.withColumn("compound", lit(udfcal_compound("text")))

### Vader Sentiment

In [None]:
# def cal_neg(value):
#     return analyzer.polarity_scores(value)['neg']

# udfcal_neg = udf(cal_neg, DoubleType())
# review = review.withColumn("neg", lit(udfcal_neg("text")))

# def cal_neu(value):
#     return analyzer.polarity_scores(value)['neu']

# udfcal_neu = udf(cal_neu, DoubleType())
# review = review.withColumn("neu", lit(udfcal_neu("text")))

# def cal_pos(value):
#     return analyzer.polarity_scores(value)['pos']

# udfcal_pos = udf(cal_pos, DoubleType())
# review = review.withColumn("pos", lit(udfcal_pos("text")))

review.show(2, truncate=False)

In [None]:
review.printSchema()

In [None]:
print(review.count(), len(review.columns))

## Part 3: Topic Modelling

In [None]:
dropped_review = review.filter(review.polarity == 0.0)
dropped_review = dropped_review.filter(dropped_review.subjectivity == 0.0)
dropped_review = dropped_review.filter(dropped_review.compound == 0.0)

In [None]:
dropped_id_set = set()
for i in list(dropped_review.select('review_id').collect()):
    dropped_id_set.add(i[0])

In [None]:
print(len(dropped_id_set))

In [None]:
combined_review = review.filter(review['review_id'].isin(dropped_id_set) == False)
combined_review = combined_review.dropna()

In [None]:
print(combined_review.count()) # 559334 + 971 = 560305

In [None]:
combined_review.show(3)

In [None]:
dropped_review = dropped_review.dropna()
dropped_review = dropped_review.select(['text'])

In [None]:
dropped_review.show(10, truncate=False)

In [None]:
print(dropped_review.count())

In [None]:
combined_review.printSchema()

In [None]:
# Topic Modeling
from pyspark.sql import SQLContext, Row
from pyspark.mllib.clustering import LDA, LDAModel
from pyspark.mllib.linalg import Vector, Vectors
from pyspark.ml.feature import CountVectorizer, StopWordsRemover, RegexTokenizer

tp_reviews = combined_review.select(['text'])

tokenizer = RegexTokenizer().setInputCol("text").setOutputCol("words").setPattern("\\W")
wordsData = tokenizer.transform(tp_reviews)


remover = StopWordsRemover(inputCol="words", outputCol="filtered")
filter_data = remover.transform(wordsData)


cv = CountVectorizer(inputCol='filtered', outputCol='vectors',minDF=2.0, vocabSize=1000)
model = cv.fit(filter_data)


In [None]:
result = model.transform(filter_data)

In [None]:
result = result.select("*").withColumn("id", monotonically_increasing_id())
result.show(3)

In [None]:
result = result.repartition(100)

In [None]:
from pyspark.ml.feature import IDF
idf = IDF(inputCol="vectors", outputCol="features")
idfModel = idf.fit(result)
result_tfidf = idfModel.transform(result)

In [None]:
corpus = result.select(['id','vectors']).rdd.map(lambda x: [x[0],Vectors.fromML(x[1])]).cache()

In [None]:
ldaModel = LDA.train(corpus, k=7,maxIterations=50,optimizer='online')
topics = ldaModel.topicsMatrix()
vocabArray = model.vocabulary

In [None]:
wordNumbers = 10  # number of words per topic
topicIndices = spark.sparkContext.parallelize(ldaModel.describeTopics(maxTermsPerTopic = wordNumbers))

In [None]:
def topic_render(topic):  # specify vector id of words to actual words
    terms = topic[0]
    result = []
    for i in range(wordNumbers):
        term = vocabArray[terms[i]]
        result.append(term)
    return result

topics_final = topicIndices.map(lambda topic: topic_render(topic)).collect()

In [None]:
for topic in range(len(topics_final)):
    print ("Topic" + str(topic) + ":")
    for term in topics_final[topic]:
        print (term)
    print ('\n')

In [None]:
from matplotlib import pyplot as plt
from wordcloud import WordCloud

fig, axes = plt.subplots(7, 1, figsize=(20,20), sharex=True, sharey=True)

for i, ax in enumerate(axes.flatten()):
    fig.add_subplot(ax)
    topic_words = topics_final[i]
    wordcloud_low = WordCloud(background_color="white").generate(' '.join(topic_words))
    plt.gca().imshow(wordcloud_low)
    plt.gca().set_title('Topic ' + str(i), fontdict=dict(size=16))
    plt.gca().axis('off')


plt.subplots_adjust(wspace=0, hspace=0)
plt.axis('off')
plt.margins(x=5, y=5)
plt.tight_layout()
plt.show()

In [None]:
from pyspark.ml.clustering import LDA

dataset = result.select(['text','vectors'])

mllda = LDA(featuresCol="vectors",k=7, maxIter=20)
mlmodel = mllda.fit(dataset)

In [None]:
# Describe topics.
topics = mlmodel.describeTopics()
topics.show(truncate=False)

In [None]:
from operator import itemgetter 
key_topics = [[] for _ in range(7)]

for i in topics['topic','termIndices'].collect():
    key_topics[i['topic']] = list(itemgetter(*i['termIndices'])(model.vocabulary))

print(key_topics)

In [None]:
# Shows the result
transformed = mlmodel.transform(dataset)
transformed.show(3,truncate=False)

In [None]:
transformed = mlmodel.transform(dataset)

In [None]:
transformed.printSchema()

In [None]:
from pyspark.ml.functions import vector_to_array
transformed = transformed.withColumn("topicDistribution", vector_to_array("topicDistribution"))

In [None]:
transformed = transformed.drop('vectors')

In [None]:
transformed = transformed.drop('topicDistribution')

In [None]:
def tran_keywords(value):
    max_index = int(np.argmax(value))
    return key_topics[max_index]

udftran_keywords = udf(tran_keywords, ArrayType(StringType()))
transformed = transformed.withColumn("Keywords", lit(udftran_keywords("topicDistribution")))


In [None]:
transformed.show(2,truncate=False)

In [None]:
final_review = combined_review.join(transformed, combined_review.text == transformed.text, 'inner')

In [None]:
final_review = final_review.drop('text')
final_review = final_review.drop('year')
final_review = final_review.drop('month')
final_review = final_review.drop('day')
final_review = final_review.drop('useful')
final_review = final_review.drop('funny')
final_review = final_review.drop('cool')

In [None]:
final_review.printSchema()

# User

In [None]:
path = 'yelp_academic_dataset_user.json'
user = spark.read.json(path)

In [None]:
user.printSchema()

In [None]:
user_col = ['user_id','name','review_count','yelping_since','friends','useful','funny','cool','fans','average_stars']
user = user.select(user_col)

In [None]:
user.show(10)

In [None]:
# get active user
user = user[user.review_count>10]

In [None]:
user.count()

# save to csv

In [None]:
# col = ['business_id','name','city','state','stars','review_count','categories','latitude','longitude','is_open', 'postal_code']
# city = ['Portland']
# category =  ['Restaurants','Food','Coffee & Tea','Sandwiches','Breakfast & Brunch']
# is_open = 1
business_df = business.toPandas()
business_df.to_csv('business.csv',index=False)

In [None]:
# col = ['review_id','business_id','user_id','year','month','day','stars','useful','funny','cool','text']
# only include reviews which "business_id" in "bussiness.json"
review_df = review.toPandas()
review_df.to_csv('review.csv',index=False)

In [None]:
# col = ['user_id','name','review_count','yelping_since','friends','useful','funny','cool','fans','average_stars']
user_df = user.toPandas()
user_df.to_csv('user.csv',index=False)