In [1]:
# Import our SparkSession so we can use it
from pyspark.sql import SparkSession, SQLContext, functions as F
from pyspark.sql.functions import col

# Create our SparkSession, this can take a couple minutes locally
spark = SparkSession.builder.appName("TipsJSON").config('spark.sql.broadcastTimeout',-1).getOrCreate()


In [2]:
df_tip = spark.read.json("../data_source/tip.json")
df_business = spark.read.json("../data_source/business.json")


In [4]:
df_tip.printSchema()


root
 |-- business_id: string (nullable = true)
 |-- compliment_count: long (nullable = true)
 |-- date: string (nullable = true)
 |-- text: string (nullable = true)
 |-- user_id: string (nullable = true)



In [11]:
data = df_tip.select("business_id").groupBy("business_id").count()

data.where("count > 1").orderBy(col("count").desc()).show(200, False)


+----------------------+-----+
|business_id           |count|
+----------------------+-----+
|FaHADZARwnY4yvlvpnsfGA|3614 |
|JmI9nslLD7KZqRr__Bg6NQ|2440 |
|DkYS3arLOhA8si5uUEmHOw|1503 |
|5LNZ67Yw9RD6nf4_UhXOjw|1487 |
|K7lWdNUhCbcnEvI0NhGewg|1386 |
|hihud--QRriCYZw1zZvW4g|1371 |
|RESDUcs7fIiihp38-d6_6g|1264 |
|4JNXUYY8wbaaDmk3BPzlWw|1121 |
|yfxDa8RFOvJPQh0rNtakHA|1117 |
|iCQpiavjjPzJ5_3gPD5Ebg|1105 |
|SMPbvZLSMMb7KU76YNYMGg|1025 |
|7sPNbCx7vGAaH7SbNPZ6oA|1018 |
|UPIYuRaZvknINOd1w8kqRQ|1013 |
|eoHdUeQDNgQ6WYEnP2aiRw|954  |
|yQab5dxZzgBLTEHCw9V7_w|938  |
|LNGBEEelQx4zbfWnlc66cw|904  |
|JyxHvtj-syke7m9rbza7mA|903  |
|f4x1YBxkLrZg652xt2KR5g|860  |
|WUq8HJHIZU4uteB154XN7w|846  |
|Wxxvi3LZbHNIDwJ-ZimtnA|792  |
|El4FC8jcawUVgw_0EIcbaQ|783  |
|KskYqH1Bi7Z_61pH6Om8pg|783  |
|cYwJA2A6I12KNkm2rtXd5g|776  |
|JzOp695tclcNCNMuBl7oxA|762  |
|na4Th5DrNauOv-c43QQFvA|759  |
|YBLVD61RFdP5H-RGLSIPUw|752  |
|RwMLuOkImBIqqYj4SSKSPg|748  |
|eAc9Vd6loOgRQolMXQt6FA|719  |
|g8OnV26ywJlZpezdBnOWUQ|711  |
|awI4hHM

In [13]:
df_business.printSchema()


root
 |-- address: string (nullable = true)
 |-- attributes: struct (nullable = true)
 |    |-- AcceptsInsurance: string (nullable = true)
 |    |-- AgesAllowed: string (nullable = true)
 |    |-- Alcohol: string (nullable = true)
 |    |-- Ambience: string (nullable = true)
 |    |-- BYOB: string (nullable = true)
 |    |-- BYOBCorkage: string (nullable = true)
 |    |-- BestNights: string (nullable = true)
 |    |-- BikeParking: string (nullable = true)
 |    |-- BusinessAcceptsBitcoin: string (nullable = true)
 |    |-- BusinessAcceptsCreditCards: string (nullable = true)
 |    |-- BusinessParking: string (nullable = true)
 |    |-- ByAppointmentOnly: string (nullable = true)
 |    |-- Caters: string (nullable = true)
 |    |-- CoatCheck: string (nullable = true)
 |    |-- Corkage: string (nullable = true)
 |    |-- DietaryRestrictions: string (nullable = true)
 |    |-- DogsAllowed: string (nullable = true)
 |    |-- DriveThru: string (nullable = true)
 |    |-- GoodForDancing: str

In [31]:
df_business.where("business_id == 'FaHADZARwnY4yvlvpnsfGA'").select("name").show(truncate = False)


+------------------------------+
|name                          |
+------------------------------+
|McCarran International Airport|
+------------------------------+



In [27]:
df_tip.filter(col("compliment_count") > 0).select("compliment_count").groupBy("compliment_count").count().show(200, truncate = True)



+----------------+-----+
|compliment_count|count|
+----------------+-----+
|               7|    4|
|               6|    8|
|               9|    1|
|               5|   18|
|               1|16239|
|               3|  227|
|              12|    1|
|               8|    1|
|              11|    1|
|               2| 1402|
|               4|   51|
|              15|    1|
+----------------+-----+



In [32]:
df_tip.filter(col("business_id") == 'FaHADZARwnY4yvlvpnsfGA').filter(col("compliment_count") > 0).select("text").show(3, truncate = False)


+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|text                                                                                                                                                                                             |
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|Just park in the lot at pickup and pay the $2.  It's not worth the hassle to drive through over and over. Plus, you get brownie points from whoever you pick up for meeting them in the terminal.|
|Major construction going on avoid this shit hole                                                                                                                                                 |
|:( leaving the 702.