# Part 4 Topic Modeling

### Setup

In [5]:
from operator import add
from pyspark.ml.clustering import LDA, LocalLDAModel
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.ml.feature import *
import matplotlib.pyplot as plt
%matplotlib inline
from wordcloud import WordCloud
import numpy as np
from pyspark.ml.linalg import DenseVector
from pyspark.sql.types import IntegerType

In [28]:
spark = SparkSession.builder\
    .config("spark.driver.memory", "32g")\
    .config("spark.executor.memory","32g")\
    .config("spark.driver.maxResultSize","0")\
    .config("spark.sql.autoBroadcastJoinThreshold","-1")\
    .config("spark.sql.broadcastTimeout","1200")\
    .config("spark.default.parallelism", "180")\
    .config("spark.executor.heartbeatInterval", "3600")\
    .appName("part4").getOrCreate()

In [None]:
business = spark.read.json("yelp_academic_dataset_business.json")
review = spark.read.json("yelp_academic_dataset_review.json")

### Join Review Text by Business ID

In [7]:
# filter review by year
review = review.withColumn("year", substring("date",1,4).astype("int"))
review = review.filter(review.year>2010)

# convert rdd to df
reviews_text_rdd = review.select("business_id", "text").rdd
reviews_by_business_rdd = reviews_text_rdd.map(tuple).reduceByKey(add)  
reviews_by_business_df = spark.createDataFrame(reviews_by_business_rdd)
reviews_by_business_df = reviews_by_business_df \
                            .withColumnRenamed("_1", "business_id") \
                            .withColumnRenamed("_2", "text")

### Text Processing
- Tokenize
- Remove Stopwords
- IDF
- Word2Vector
- Topic Modeling

In [8]:
# tokenize

regex_tokenizer = RegexTokenizer(gaps = False, pattern = "\w+", inputCol = "text", outputCol = "token")
reviews_by_business_token_df = regex_tokenizer.transform(reviews_by_business_df)
reviews_by_business_token_df.show(3)

+--------------------+--------------------+--------------------+
|         business_id|                text|               token|
+--------------------+--------------------+--------------------+
|Irp5sgl7XASH5ZTw2...|Enjoyed the food,...|[enjoyed, the, fo...|
|39vR4dh70QwBqoY-Q...|Absolutely overpr...|[absolutely, over...|
|J44x_m383C2GWtzj6...|This place is an ...|[this, place, is,...|
+--------------------+--------------------+--------------------+
only showing top 3 rows



In [9]:
# remove Stopwords

stopWordsRemover = StopWordsRemover(inputCol = "token", outputCol = "non_stop_word")
reviews_by_business_token_non_stop_word_df = stopWordsRemover.transform(reviews_by_business_token_df)
reviews_by_business_token_non_stop_word_df.show(3)

+--------------------+--------------------+--------------------+--------------------+
|         business_id|                text|               token|       non_stop_word|
+--------------------+--------------------+--------------------+--------------------+
|Irp5sgl7XASH5ZTw2...|Enjoyed the food,...|[enjoyed, the, fo...|[enjoyed, food, m...|
|39vR4dh70QwBqoY-Q...|Absolutely overpr...|[absolutely, over...|[absolutely, over...|
|J44x_m383C2GWtzj6...|This place is an ...|[this, place, is,...|[place, interesti...|
+--------------------+--------------------+--------------------+--------------------+
only showing top 3 rows



In [10]:
# create feature vectors

count_vectorizer = CountVectorizer(inputCol="non_stop_word", outputCol="raw_feature")
cv_model = count_vectorizer.fit(reviews_by_business_token_non_stop_word_df)

# save all words
all_words = cv_model.vocabulary

# show result
reviews_featurized_df = cv_model.transform(reviews_by_business_token_non_stop_word_df)
reviews_featurized_df.show(3)

+--------------------+--------------------+--------------------+--------------------+--------------------+
|         business_id|                text|               token|       non_stop_word|         raw_feature|
+--------------------+--------------------+--------------------+--------------------+--------------------+
|Irp5sgl7XASH5ZTw2...|Enjoyed the food,...|[enjoyed, the, fo...|[enjoyed, food, m...|(262144,[0,1,2,3,...|
|39vR4dh70QwBqoY-Q...|Absolutely overpr...|[absolutely, over...|[absolutely, over...|(262144,[5,10,12,...|
|J44x_m383C2GWtzj6...|This place is an ...|[this, place, is,...|[place, interesti...|(262144,[0,1,2,3,...|
+--------------------+--------------------+--------------------+--------------------+--------------------+
only showing top 3 rows



In [11]:
# create features using IDF

idf = IDF(inputCol="raw_feature", outputCol="idf_vector")
idf_model = idf.fit(reviews_featurized_df)
reviews_rescaled_df = idf_model.transform(reviews_featurized_df) 
reviews_rescaled_df.show(3)

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|         business_id|                text|               token|       non_stop_word|         raw_feature|          idf_vector|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|Irp5sgl7XASH5ZTw2...|Enjoyed the food,...|[enjoyed, the, fo...|[enjoyed, food, m...|(262144,[0,1,2,3,...|(262144,[0,1,2,3,...|
|39vR4dh70QwBqoY-Q...|Absolutely overpr...|[absolutely, over...|[absolutely, over...|(262144,[5,10,12,...|(262144,[5,10,12,...|
|J44x_m383C2GWtzj6...|This place is an ...|[this, place, is,...|[place, interesti...|(262144,[0,1,2,3,...|(262144,[0,1,2,3,...|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
only showing top 3 rows



### Word2Vec Model

In [12]:
# create features using Word2Vec
"""
print("start building Word2Vec model")
word_2_vector = Word2Vec(vectorSize = 100, minCount = 5, 
                         numPartitions=10,
                         inputCol = "non_stop_word", outputCol = "word_vector")
word_2_vector_model = word_2_vector.fit(reviews_rescaled_df)

# save the Word2Vec model

word_2_vector_model.write().overwrite().save("word_2_vector_model")
print("end building Word2Vec model")
"""

start building Word2Vec model
end building Word2Vec model


In [29]:
# load Word2Vec model

word_2_vector_model = Word2VecModel.load("word_2_vector_model")
reviews_vector_df = word_2_vector_model.transform(reviews_rescaled_df)
reviews_vector_df.show(3)

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|         business_id|                text|               token|       non_stop_word|         raw_feature|          idf_vector|         word_vector|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|Irp5sgl7XASH5ZTw2...|Enjoyed the food,...|[enjoyed, the, fo...|[enjoyed, food, m...|(262144,[0,1,2,3,...|(262144,[0,1,2,3,...|[0.02270535124461...|
|39vR4dh70QwBqoY-Q...|Absolutely overpr...|[absolutely, over...|[absolutely, over...|(262144,[5,10,12,...|(262144,[5,10,12,...|[-0.0736666149279...|
|J44x_m383C2GWtzj6...|This place is an ...|[this, place, is,...|[place, interesti...|(262144,[0,1,2,3,...|(262144,[0,1,2,3,...|[0.00487508992722...|
+--------------------+--------------------+--------------------+--------------------+--------------------+

In [14]:
# test Word2Vec model
print("chinese")

word_2_vector_model.findSynonyms("chinese", 5).show()

print("western")
word_2_vector_model.findSynonyms("western", 5).show()

print("seafood")
word_2_vector_model.findSynonyms("seafood", 5).show()

chinese
+------------+------------------+
|        word|        similarity|
+------------+------------------+
|   cantonese|0.8301430940628052|
|   taiwanese|0.8289218544960022|
|shanghainese|0.8207446932792664|
|americanized|0.7875234484672546|
|    filipino|0.7843402624130249|
+------------+------------------+

western
+------------+------------------+
|        word|        similarity|
+------------+------------------+
|    american|0.7176516056060791|
|southeastern|0.6309979557991028|
|  hemisphere|0.6186148524284363|
|    northern|0.6179028749465942|
|        fare| 0.601344108581543|
+------------+------------------+

seafood
+-----------+------------------+
|       word|        similarity|
+-----------+------------------+
|   cioppino|0.7607556581497192|
|      clams| 0.743668794631958|
|   seafoods|0.7104874849319458|
|spaghettoni|0.7079223394393921|
|    newburg| 0.704081118106842|
+-----------+------------------+



### Topic Modeling

In [17]:
# build lda model
"""
print("start building LDA model")
lda = LDA(k=10, seed=1, optimizer="online", featuresCol="idf_vector")
lda_model = lda.fit(reviews_rescaled_df)

# save lda model

lda_model.write().overwrite().save("lda_model")
print("end building LDA model")
"""

start building LDA model
end building LDA model


In [18]:
# load the LDA_model

lda_model = LocalLDAModel.load("lda_model")

topics = lda_model.describeTopics()
topics_words = topics.rdd\
       .map(lambda row: row["termIndices"])\
       .map(lambda idx_list: [all_words[i] for i in idx_list])\
       .collect()

for idx, topic in enumerate(topics_words):
    print("####################")
    topic_words = ""
    for word in topic:
        topic_words = topic_words + " " + word
    print("topic "+ str(idx) + ": " + topic_words)

####################
topic 0:  food cupcakes gyro falafel lamb pita greek cupcake hummus chicken
####################
topic 1:  hair burger beer dog haircut vet salon store stylist burgers
####################
topic 2:  pho thai food chicken pad vietnamese shrimp fried grits biscuit
####################
topic 3:  car nails massage nail salon pedicure appointment company manicure vehicle
####################
topic 4:  hotel disney room park rooms tour pool ride dress stay
####################
topic 5:  dr gym classes dentist class dental doctor yoga studio workout
####################
topic 6:  pizza tacos food taco bar mexican wings salsa cheese drinks
####################
topic 7:  ramen donuts indian donut naan doughnuts voodoo food masala doughnut
####################
topic 8:  food coffee delicious menu restaurant breakfast sandwich brunch cheese cream
####################
topic 9:  sushi food chicken rice pork restaurant fried sauce bbq menu


In [21]:
# add the token count 

token_len = udf(lambda s: len(s), IntegerType())
reviews_vector_df = reviews_vector_df.withColumn("token_count", token_len(reviews_vector_df.non_stop_word))
reviews_vector_df.show(3)

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----------+
|         business_id|                text|               token|       non_stop_word|         raw_feature|          idf_vector|         word_vector|token_count|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----------+
|Irp5sgl7XASH5ZTw2...|Enjoyed the food,...|[enjoyed, the, fo...|[enjoyed, food, m...|(262144,[0,1,2,3,...|(262144,[0,1,2,3,...|[0.02270535124461...|      18024|
|39vR4dh70QwBqoY-Q...|Absolutely overpr...|[absolutely, over...|[absolutely, over...|(262144,[5,10,12,...|(262144,[5,10,12,...|[-0.0736666149279...|        233|
|J44x_m383C2GWtzj6...|This place is an ...|[this, place, is,...|[place, interesti...|(262144,[0,1,2,3,...|(262144,[0,1,2,3,...|[0.00487508992722...|       8524|
+--------------------+------------

In [1]:
# create a business summary for more analysis

a = reviews_vector_df.select("business_id","idf_vector", "word_vector", "non_stop_word").alias("a")
b = business.select("business_id", "name", "categories", "review_count").alias("b")

summary = a.join(b, col("a.business_id") == col("b.business_id"), "inner") 
summary.show(1)

NameError: name 'reviews_vector_df' is not defined

### Word Cloud Analysis

In [None]:
# prepare for generating word cloud

def word_cloud(text):
    wc = WordCloud().generate(text)
    plt.imshow(wc)
    plt.axis("off")
    plt.show()
summary.select("business_id", "business_name", "categories") \
        .filter((summary.stars == 5) & (summary.review_count > 30)) \
        .show(5, truncate=False)

In [None]:
# test word cloud

bid = "J9vAdD2dCpFuGsxPIn184w"
all_reviews = "".join(summary.select("non_stop_word").filter(summary.business_id == bid).rdd.take(1)[0][0])
word_cloud(all_reviews)


bid = "ixfpsy7M6vLAe0Xf-EWH4g"
all_reviews = "".join(summary.select("non_stop_word").filter(summary.business_id == bid).rdd.take(1)[0][0])
word_cloud(all_reviews)

### Check Similarity Score

In [None]:
# define similarity function
def cosine_similarity(vector1, vector2): 
    return np.dot(vector1, vector2) / np.sqrt(np.dot(vector1, vector1)) / np.sqrt(np.dot(vector2, vector2)) 

def check_similarity(business_id1, business_id2):
    summary.select("token_count", "business_name", "categories") \
        .filter(summary.business_id.isin(business_id1, business_id2)) \
        .show(truncate=False)
        
    bus_review_text1 = " ".join(summary.select("non_stop_word") \
                                .filter(summary.business_id == business_id1).rdd.take(1)[0][0])
    bus_review_text2 = " ".join(summary.select("non_stop_word") \
                                .filter(summary.business_id == business_id2).rdd.take(1)[0][0])
    
 
    word_cloud(bus_review_text1)
    word_cloud(bus_review_text2)
    
    # cosine similarity from IDF vectors
    bus_vector1 = summary.select("idf_vector").filter(summary.business_id == business_id1).rdd.take(1)[0][0]
    bus_vector2 = summary.select("idf_vector").filter(summary.business_id == business_id2).rdd.take(1)[0][0]
    
    # convert Sparse vectors to Dense vectors
    vector1 = DenseVector(bus_vector1.toArray())
    vector2 = DenseVector(bus_vector2.toArray())
    print("cosine similarity based on IDF vectors     : " + str(cosine_similarity(vector1, vector2)))
    
    # cosine similarity from Word2Vec vectors
    vector1 = summary.select("word_vector").filter(summary.business_id == business_id1).rdd.take(1)[0][0]
    vector2 = summary.select("word_vector").filter(summary.business_id == business_id2).rdd.take(1)[0][0]
    print("cosine similarity based on Word2Vec vectors: " + str(cosine_similarity(vector1, vector2)))

In [None]:
# test similarity score
# test case 1

business1 = "RtUvSWO_UZ8V3Wpj0n077w" 
business2 = "CN5nuUQod0f8g3oh99qq0w"
check_similarity(business1, business2)

In [None]:
# test case 2

business1 = 'ZumOnWbstgsIE6bJlxw0_Q'
business2 = 'JJ8ypBu3b--fy4HA5RB1gg'
check_similarity(business1, business2)