In [75]:
# Import our SparkSession so we can use it
from pyspark.sql import SparkSession, SQLContext, functions as F
from pyspark.sql.functions import col

# Build (or reuse) the SparkSession; starting it can take a couple of minutes locally.
# Both the SQL broadcast timeout and the network timeout are raised to 24000,
# presumably so long-running steps on the large inputs do not time out.
spark = (
    SparkSession.builder
    .appName("UserJSON")
    .config('spark.sql.broadcastTimeout', '24000')
    .config('spark.network.timeout', '24000')
    .getOrCreate()
)


In [1]:
# Load the three raw JSON inputs; no schema is supplied, so Spark infers
# one for each file.
df_user = spark.read.json(path="../data_source/user.json")
df_review = spark.read.json(path="../data_source/review.json")
df_business = spark.read.json(path="../data_source/business.json")

In [77]:
# Print the schema Spark inferred for the user data.
df_user.printSchema()

root
 |-- average_stars: double (nullable = true)
 |-- compliment_cool: long (nullable = true)
 |-- compliment_cute: long (nullable = true)
 |-- compliment_funny: long (nullable = true)
 |-- compliment_hot: long (nullable = true)
 |-- compliment_list: long (nullable = true)
 |-- compliment_more: long (nullable = true)
 |-- compliment_note: long (nullable = true)
 |-- compliment_photos: long (nullable = true)
 |-- compliment_plain: long (nullable = true)
 |-- compliment_profile: long (nullable = true)
 |-- compliment_writer: long (nullable = true)
 |-- cool: long (nullable = true)
 |-- elite: string (nullable = true)
 |-- fans: long (nullable = true)
 |-- friends: string (nullable = true)
 |-- funny: long (nullable = true)
 |-- name: string (nullable = true)
 |-- review_count: long (nullable = true)
 |-- useful: long (nullable = true)
 |-- user_id: string (nullable = true)
 |-- yelping_since: string (nullable = true)



In [78]:
# Print the schema Spark inferred for the review data.
df_review.printSchema()

root
 |-- business_id: string (nullable = true)
 |-- cool: long (nullable = true)
 |-- date: string (nullable = true)
 |-- funny: long (nullable = true)
 |-- review_id: string (nullable = true)
 |-- stars: double (nullable = true)
 |-- text: string (nullable = true)
 |-- useful: long (nullable = true)
 |-- user_id: string (nullable = true)



In [79]:
# Keep only the review columns used below: the business key, the star
# rating and the review text.
df_clean_review = df_review.select(["business_id", "stars", "text"])

In [80]:
# Add a boolean flag column: a review counts as positive when it has more
# than 3 stars.
df_clean_review = df_clean_review.withColumn("Positive_Review", col("stars") > 3)

In [81]:
# Keep only the positive reviews and drop the helper columns.
# A boolean column is itself a valid filter predicate, so it is passed
# directly instead of being compared against True.
df_positive_review = df_clean_review.where(col("Positive_Review")).select("business_id", "text")

In [82]:
# Preview the positive reviews with long cell values truncated.
df_positive_review.show(truncate=True)

+--------------------+--------------------+
|         business_id|                text|
+--------------------+--------------------+
|NZnhc2sEQy3RmzKTZ...|I *adore* Travis ...|
|WTqjgwHlXbSFevF32...|I have to say tha...|
|ikCg8xy5JIg_NGPx-...|Went in for a lun...|
|eU_713ec6fTGNO4Be...|I'll be the first...|
|8mIrX_LrOnAqWsB5J...|Like walking back...|
|FxLfqxdYPA6Z85PFK...|Wow. So surprised...|
|LUN6swQYa4xJKaM_U...|Michael from Red ...|
|YvrylyuWgbP90RgMq...|You can't really ...|
|NyLYY8q1-H3hfsTwu...|Great lunch today...|
|6lj2BJ4tJeu7db5as...|We've been a huge...|
|qx6WhZ42eDKmBchZD...|Our family LOVES ...|
|Mem13A3C202RzT53n...|If you are lookin...|
|I4Nr-MVc26qWr08-S...|The food is alway...|
|d_L-rfS1vT3JMzgCU...|Pick any meat on ...|
|Sfc8Haz2Yri8Mo1L0...|Great food, great...|
|FQ1wBQb3aNeRMThSQ...|PlumbSmart provid...|
|Gyrez6K8f1AyR7dzW...|their pettuccine ...|
|dm6sO_Y8JdKTE1ZM9...|ended up here bec...|
|3JxKzWquEbPC3yPIf...|Best chinese rest...|
|sMzNLdhJZGzYirIWt...|This place

In [83]:
# Reduce the business data to its key and display name.
df_clean_business = df_business.select(["business_id", "name"])

In [89]:
### Save the join into a file.
df_clean_business.coalesce(1).write.format('json').save('raw_data/clean_business')



In [90]:
json_clean_business = spark.read.json("raw_data/clean_business/part-00000-52a71286-af99-4501-809e-828aaf0a661e-c000.json")


In [91]:
# Attach each business's columns to its positive reviews by joining on
# business_id. No join type is passed, so Spark's default join type applies.
# The result keeps every business column plus the review text.
join_business_review = (
    json_clean_business.alias("business")
    .join(
        df_positive_review.alias("review"),
        on=json_clean_business.business_id == df_positive_review.business_id,
    )
    .select(
        [col('business.' + field_name) for field_name in json_clean_business.columns]
        + [col("review.text")]
    )
)



In [92]:
# Preview the joined business/review rows.
join_business_review.show()

+--------------------+--------------------+--------------------+
|         business_id|                name|                text|
+--------------------+--------------------+--------------------+
|--9e1ONYQuAa-CB_R...|Delmonico Steakhouse|Walked in on a Su...|
|--9e1ONYQuAa-CB_R...|Delmonico Steakhouse|On yelp 5 stars =...|
|--9e1ONYQuAa-CB_R...|Delmonico Steakhouse|this place hasnt ...|
|--9e1ONYQuAa-CB_R...|Delmonico Steakhouse|This is part of m...|
|--9e1ONYQuAa-CB_R...|Delmonico Steakhouse|A great culinary ...|
|--9e1ONYQuAa-CB_R...|Delmonico Steakhouse|A Las Vegas class...|
|--9e1ONYQuAa-CB_R...|Delmonico Steakhouse|World class servi...|
|--9e1ONYQuAa-CB_R...|Delmonico Steakhouse|I ate the duck an...|
|--9e1ONYQuAa-CB_R...|Delmonico Steakhouse|Had the steak sal...|
|--9e1ONYQuAa-CB_R...|Delmonico Steakhouse|Had Dinner at Del...|
|--9e1ONYQuAa-CB_R...|Delmonico Steakhouse|Third visit here ...|
|--9e1ONYQuAa-CB_R...|Delmonico Steakhouse|There's a reason ...|
|--9e1ONYQuAa-CB_R...|Del