# Part 4 Topic Modeling

### Setup

In [16]:
from operator import add
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.ml.feature import *

In [9]:
# Build (or reuse) the Spark session. One option per line so each setting is
# easy to read and tune.
spark = (
    SparkSession.builder
    .config('spark.driver.memory', '16g')
    .config('spark.executor.memory', '16g')
    .config('spark.driver.maxResultSize', '0')            # 0 = no cap on collected result size
    .config('spark.sql.autoBroadcastJoinThreshold', '-1')  # -1 = never auto-broadcast a join side
    .config('spark.sql.broadcastTimeout', '1200')
    .config('spark.default.parallelism', '8')
    .appName("part123")
    .getOrCreate()
)

# Load the Yelp business and review dumps (paths are relative to the
# notebook's working directory).
business = spark.read.json('yelp_academic_dataset_business.json')
review = spark.read.json('yelp_academic_dataset_review.json')


### Join Review Text by Business ID

In [10]:
# Collapse all reviews of a business into a single text document keyed by
# business_id.
reviews_text_rdd = review.select("business_id", "text").rdd

# Join consecutive reviews with a single space. Plain string addition would
# glue the last word of one review onto the first word of the next whenever a
# review does not end in whitespace/punctuation, producing bogus merged tokens
# once the text is tokenized.
reviews_by_business_rdd = reviews_text_rdd.map(tuple).reduceByKey(
    lambda left, right: left + ' ' + right
)

# Back to a DataFrame; the tuple fields arrive as `_1` / `_2`, so give them
# their real column names.
reviews_by_business_df = (
    spark.createDataFrame(reviews_by_business_rdd)
    .withColumnRenamed('_1', 'business_id')
    .withColumnRenamed('_2', 'text')
)

160585

### Text Processing
- Tokenize
- Remove Stopwords
- Topic Modeling
reviews_by_business_df.take(3)

In [15]:
# Tokenize
#
# With gaps=False the pattern describes the tokens themselves (not the
# separators), so every run of word characters becomes one token.
# The pattern is a raw string: '\w' in a normal string literal is an invalid
# escape sequence and triggers a warning on modern Python versions.
regexTokenizer = RegexTokenizer(
    gaps=False,
    pattern=r'\w+',
    inputCol='text',
    outputCol='token',
)
reviews_by_business_token_df = regexTokenizer.transform(reviews_by_business_df)
reviews_by_business_token_df.show(3)

+--------------------+--------------------+--------------------+
|         business_id|                text|               token|
+--------------------+--------------------+--------------------+
|or-Kyw7kmNin1pWbu...|As a native of th...|[as, a, native, o...|
|R40JzBT7jrPVbXCr6...|checked out Symph...|[checked, out, sy...|
|pVBbYcNqoYT1ZXNUv...|This place is del...|[this, place, is,...|
+--------------------+--------------------+--------------------+
only showing top 3 rows



In [17]:
# Remove Stopwords
#
# Reads the token lists from `token` and writes the filtered lists to
# `nostopwrd`, using StopWordsRemover's default stop-word list.
stopWordsRemover = StopWordsRemover(
    inputCol='token',
    outputCol='nostopwrd',
)
reviews_by_business_token_nostopwrd_df = stopWordsRemover.transform(
    reviews_by_business_token_df
)
reviews_by_business_token_nostopwrd_df.show(3)

+--------------------+--------------------+--------------------+--------------------+
|         business_id|                text|               token|           nostopwrd|
+--------------------+--------------------+--------------------+--------------------+
|Sxr9FykZWGK3QL_od...|My husband is fro...|[my, husband, is,...|[husband, bolivia...|
|R40JzBT7jrPVbXCr6...|checked out Symph...|[checked, out, sy...|[checked, symphon...|
|JX0-7dN3i34Fe_uJ6...|Don't live here. ...|[don, t, live, he...|[live, worth, ser...|
+--------------------+--------------------+--------------------+--------------------+
only showing top 3 rows



In [None]:
# Create term-count vectors (features) from the stop-word-filtered tokens.
count_vectorizer = CountVectorizer(
    inputCol="nostopwrd",
    outputCol="rawFeature",
)

# Fitting learns the vocabulary from the corpus.
cv_model = count_vectorizer.fit(reviews_by_business_token_nostopwrd_df)

# Get the vocabulary (all words) held by the fitted model.
vocab = cv_model.vocabulary

# Add the `rawFeature` count-vector column to every business document.
reviews_by_business_featurized_df = cv_model.transform(
    reviews_by_business_token_nostopwrd_df
)

reviews_by_business_featurized_df.show(3)