In [36]:
import ast 
import json
from pyspark.sql import SparkSession, SQLContext, functions, types
from google.cloud import storage

In [11]:
# Name of the GCS bucket that holds the dataset files; the same name is passed
# to write_to_bq() as the temporary bucket for BigQuery loads.
bucket_name = 'sa-yelp-dataset'
# gs:// URI of that bucket, derived from bucket_name so the two cannot drift apart.
bucket_path = f'gs://{bucket_name}'

In [12]:
# Sanity check: print every object in the bucket whose name starts with 'yelp_academic'.
gcs_client = storage.Client()
bucket = gcs_client.bucket(bucket_name)

# list_blobs() already returns an iterator, so it is consumed directly instead of
# being copied into a list first.
for obj in bucket.list_blobs(prefix='yelp_academic'):
    print(obj)

<Blob: sa-yelp-dataset, yelp_academic_ca_provinces.json, 1606496025167812>
<Blob: sa-yelp-dataset, yelp_academic_dataset_business.json, 1605935606649005>
<Blob: sa-yelp-dataset, yelp_academic_dataset_checkin.json, 1605936241859751>
<Blob: sa-yelp-dataset, yelp_academic_dataset_review.json, 1605980374655635>
<Blob: sa-yelp-dataset, yelp_academic_dataset_tip.json, 1605980751339428>
<Blob: sa-yelp-dataset, yelp_academic_dataset_user.json, 1605987064578747>


In [16]:
!hdfs dfs -ls 'gs://sa-yelp-dataset'

Found 9 items
drwx------   - root root          0 1970-01-01 00:00 gs://sa-yelp-dataset/.ipynb_checkpoints
drwx------   - root root          0 1970-01-01 00:00 gs://sa-yelp-dataset/google-cloud-dataproc-metainfo
drwx------   - root root          0 2020-11-21 06:14 gs://sa-yelp-dataset/notebooks
-rwx------   3 root root        535 2020-11-27 16:53 gs://sa-yelp-dataset/yelp_academic_ca_provinces.json
-rwx------   3 root root  152898689 2020-11-21 05:13 gs://sa-yelp-dataset/yelp_academic_dataset_business.json
-rwx------   3 root root  449663480 2020-11-21 05:24 gs://sa-yelp-dataset/yelp_academic_dataset_checkin.json
-rwx------   3 root root 6325565224 2020-11-21 17:39 gs://sa-yelp-dataset/yelp_academic_dataset_review.json
-rwx------   3 root root  263489322 2020-11-21 17:45 gs://sa-yelp-dataset/yelp_academic_dataset_tip.json
-rwx------   3 root root 3268069927 2020-11-21 19:31 gs://sa-yelp-dataset/yelp_academic_dataset_user.json


In [32]:
# Print the installed Scala version. Presumably this is what the `_2.12` suffix of the
# spark-bigquery connector artifact in the SparkSession config must match — TODO confirm.
!scala -version

Scala code runner version 2.12.10 -- Copyright 2002-2019, LAMP/EPFL and Lightbend, Inc.


In [4]:
# Maven coordinates of the spark-bigquery connector, handed to Spark via spark.jars.packages.
bigquery_connector = 'com.google.cloud.spark:spark-bigquery-with-dependencies_2.12:0.15.1-beta'

# Create (or reuse) the session used by every Spark cell in this notebook.
spark = (
    SparkSession.builder
    .appName('YelpAnalysis')
    .config('spark.jars.packages', bigquery_connector)
    .getOrCreate()
)

In [18]:
# Province lookup (code -> province name), exposed to Spark SQL as VW_Province.
provinces_path = f'{bucket_path}/yelp_academic_ca_provinces.json'
df_provinces = spark.read.json(provinces_path)
df_provinces.createOrReplaceTempView('VW_Province')

# Show the whole lookup without truncating the province names.
spark.sql('SELECT * FROM VW_Province').show(truncate=False)

+----+-------------------------+
|code|province                 |
+----+-------------------------+
|ON  |Ontario                  |
|QC  |Quebec                   |
|NS  |Nova Scotia              |
|NB  |New Brunswich            |
|MB  |Manitoba                 |
|BC  |British Columbia         |
|PE  |Prince Edward Island     |
|SK  |Saskatchewan             |
|AB  |Alberta                  |
|NL  |Newfoundland and Labrador|
|NT  |Northwest Territories    |
|YT  |Yukon                    |
|NU  |Nunavut                  |
+----+-------------------------+



In [19]:
# Explicit schema for the business records, built from (field name, Spark type) pairs
# in field order.
# NOTE(review): none of the visible cells pass SCHEMA to a reader — the business file is
# loaded with an inferred schema, where `attributes` is a struct rather than a string.
# Confirm whether this definition is still needed.
SCHEMA = types.StructType([
    types.StructField(field_name, field_type)
    for field_name, field_type in (
        ("address", types.StringType()),
        ("attributes", types.StringType()),
        ("business_id", types.StringType()),
        ("categories", types.StringType()),
        ("city", types.StringType()),
        ("hours", types.StringType()),
        ("is_open", types.LongType()),
        ("latitude", types.DoubleType()),
        ("longitude", types.DoubleType()),
        ("name", types.StringType()),
        ("postal_code", types.StringType()),
        ("review_count", types.LongType()),
        ("stars", types.DoubleType()),
        ("state", types.StringType()),
    )
])

In [70]:
# Load the business records (schema inferred by Spark) and expose them as VW_Business.
business_path = f'{bucket_path}/yelp_academic_dataset_business.json'
df_business = spark.read.json(business_path)
df_business.createOrReplaceTempView("VW_Business")

In [71]:
# Inspect the inferred schema; `attributes` comes back as a struct whose sub-fields
# shown below are strings.
df_business.printSchema()

root
 |-- address: string (nullable = true)
 |-- attributes: struct (nullable = true)
 |    |-- AcceptsInsurance: string (nullable = true)
 |    |-- AgesAllowed: string (nullable = true)
 |    |-- Alcohol: string (nullable = true)
 |    |-- Ambience: string (nullable = true)
 |    |-- BYOB: string (nullable = true)
 |    |-- BYOBCorkage: string (nullable = true)
 |    |-- BestNights: string (nullable = true)
 |    |-- BikeParking: string (nullable = true)
 |    |-- BusinessAcceptsBitcoin: string (nullable = true)
 |    |-- BusinessAcceptsCreditCards: string (nullable = true)
 |    |-- BusinessParking: string (nullable = true)
 |    |-- ByAppointmentOnly: string (nullable = true)
 |    |-- Caters: string (nullable = true)
 |    |-- CoatCheck: string (nullable = true)
 |    |-- Corkage: string (nullable = true)
 |    |-- DietaryRestrictions: string (nullable = true)
 |    |-- DogsAllowed: string (nullable = true)
 |    |-- DriveThru: string (nullable = true)
 |    |-- GoodForDancing: str

In [23]:
# Number of businesses per STATE code across the whole dataset, largest first.
state_count_sql = '''
            SELECT STATE, COUNT(*) AS STATE_COUNT
            FROM VW_Business 
            GROUP BY STATE
            ORDER BY 2 DESC'''

spark.sql(state_count_sql).show()

+-----+-----------+
|STATE|STATE_COUNT|
+-----+-----------+
|   AZ|      60803|
|   NV|      39084|
|   ON|      36627|
|   OH|      16392|
|   NC|      16218|
|   PA|      12376|
|   QC|      10233|
|   AB|       8682|
|   WI|       5525|
|   IL|       2034|
|   SC|       1328|
|   CA|         23|
|   NY|         22|
|   TX|          6|
|   WA|          5|
|   AL|          3|
|   GA|          3|
|   FL|          3|
|   CT|          2|
|   VT|          2|
+-----+-----------+
only showing top 20 rows



In [72]:
# Open businesses located in a Canadian province, with complete location/rating fields.
# - The INNER JOIN against VW_Province keeps only rows whose STATE is a province code.
# - Every column is qualified with its table alias, so the query stays unambiguous even
#   if the province lookup later gains a column with an overlapping name.
# - IS_OPEN is compared with the integer literal 1 (it is declared as a long in SCHEMA)
#   instead of the string '1', so the filter no longer relies on an implicit cast.
# - LENGTH(POSTAL_CODE) = 7 keeps postal codes written like 'M8Z 5G3' (3 + space + 3).
# The result is cached because several later cells query it.
df_business_CA = spark.sql('''
                            SELECT
                                VB.BUSINESS_ID,
                                VB.NAME,
                                VB.STATE,
                                VB.CITY,
                                VB.POSTAL_CODE,
                                VB.LATITUDE,
                                VB.LONGITUDE,
                                VB.CATEGORIES,
                                VB.ATTRIBUTES
                            FROM
                                VW_Business VB
                                    INNER JOIN VW_Province VP ON VB.STATE = VP.CODE
                            WHERE
                                    VB.IS_OPEN = 1
                                AND VB.POSTAL_CODE IS NOT NULL
                                AND LENGTH(VB.POSTAL_CODE) = 7
                                AND VB.LATITUDE IS NOT NULL
                                AND VB.LONGITUDE IS NOT NULL
                                AND VB.STARS IS NOT NULL
                                AND VB.REVIEW_COUNT IS NOT NULL
                                AND VB.CATEGORIES IS NOT NULL
                                AND VB.ATTRIBUTES IS NOT NULL
                            ''').cache()

df_business_CA.createOrReplaceTempView('VW_BUSINESS_CA')

spark.sql('SELECT * FROM VW_BUSINESS_CA').show(10)

+--------------------+--------------------+-----+-------------+-----------+-------------+--------------+--------------------+--------------------+
|         BUSINESS_ID|                NAME|STATE|         CITY|POSTAL_CODE|     LATITUDE|     LONGITUDE|          CATEGORIES|          ATTRIBUTES|
+--------------------+--------------------+-----+-------------+-----------+-------------+--------------+--------------------+--------------------+
|EosRKXIGeSWFYWwpk...|      Xtreme Couture|   ON|      Toronto|    M8Z 5G3|43.6245394916|-79.5291079302|Martial Arts, Gym...|[,,,,,,, False,,,...|
|eBEfgOPG7pvFhb2wc...|     Philthy Phillys|   ON|       Aurora|    L4G 7J1|   44.0109618|    -79.448677|Restaurants, Chee...|[,, u'none',,,,, ...|
|lu7vtrp_bE9PnxWfA...|        Banzai Sushi|   ON|    Thornhill|    L3T 5W4|   43.8204923|   -79.3984661|Japanese, Fast Fo...|[,, u'none',,,,,,...|
|1wWneWD_E1pBIyVpd...|    Air Jordan Store|   ON|      Toronto|    M5B 1R4|   43.6565424|   -79.3813076|Shopping, Shoe

In [25]:
# Number of filtered Canadian businesses per province code, largest first.
ca_state_count_sql = '''
            SELECT STATE, COUNT(*) AS STATE_COUNT 
            FROM VW_BUSINESS_CA 
            GROUP BY STATE 
            ORDER BY 2 DESC'''

spark.sql(ca_state_count_sql).show()

+-----+-----------+
|STATE|STATE_COUNT|
+-----+-----------+
|   ON|      22575|
|   QC|       7099|
|   AB|       5317|
|   BC|          2|
|   MB|          1|
|   YT|          1|
+-----+-----------+



In [26]:
# Number of filtered Canadian businesses per city, largest first.
ca_city_count_sql = '''
            SELECT CITY, COUNT(*) AS CITY_COUNT 
            FROM VW_BUSINESS_CA
            GROUP BY CITY 
            ORDER BY 2 DESC'''

spark.sql(ca_city_count_sql).show()

+-------------+----------+
|         CITY|CITY_COUNT|
+-------------+----------+
|      Toronto|     12074|
|      Calgary|      5128|
|     Montréal|      4695|
|  Mississauga|      2324|
|      Markham|      1225|
|   North York|       818|
|  Scarborough|       774|
|     Brampton|       768|
|Richmond Hill|       691|
|      Vaughan|       686|
|    Etobicoke|       558|
|        Laval|       365|
|    Thornhill|       275|
|    Newmarket|       273|
|     Oakville|       272|
|    Pickering|       249|
|         Ajax|       228|
|       Whitby|       217|
|       Aurora|       193|
|   Woodbridge|       182|
+-------------+----------+
only showing top 20 rows



In [31]:
# One row per (business, category). CATEGORIES is a comma-separated string, so the inner
# query splits and explodes it, and the outer query trims the space left after each comma.
category_sql = '''
                                SELECT 
                                    BUSINESS_ID,
                                    TRIM(CATEGORY) as CATEGORY
                                FROM
                                    (
                                    SELECT 
                                        BUSINESS_ID, 
                                        EXPLODE(SPLIT(CATEGORIES, ',')) as CATEGORY
                                    FROM
                                        VW_BUSINESS_CA)
                                '''
df_business_CA_cat = spark.sql(category_sql)

df_business_CA_cat.createOrReplaceTempView('VW_BUSINESS_CA_CAT')

# Preview the first 20 rows without truncating the values.
spark.sql('''SELECT * FROM VW_BUSINESS_CA_CAT''').show(20, False)

+----------------------+---------------------+
|BUSINESS_ID           |CATEGORY             |
+----------------------+---------------------+
|EosRKXIGeSWFYWwpkbhNnA|Martial Arts         |
|EosRKXIGeSWFYWwpkbhNnA|Gyms                 |
|EosRKXIGeSWFYWwpkbhNnA|Fitness & Instruction|
|EosRKXIGeSWFYWwpkbhNnA|Active Life          |
|eBEfgOPG7pvFhb2wcG9I7w|Restaurants          |
|eBEfgOPG7pvFhb2wcG9I7w|Cheesesteaks         |
|eBEfgOPG7pvFhb2wcG9I7w|Poutineries          |
|lu7vtrp_bE9PnxWfA8g4Pg|Japanese             |
|lu7vtrp_bE9PnxWfA8g4Pg|Fast Food            |
|lu7vtrp_bE9PnxWfA8g4Pg|Food Court           |
|lu7vtrp_bE9PnxWfA8g4Pg|Restaurants          |
|1wWneWD_E1pBIyVpdHMaQg|Shopping             |
|1wWneWD_E1pBIyVpdHMaQg|Shoe Stores          |
|1wWneWD_E1pBIyVpdHMaQg|Fashion              |
|9sRGfSVEfLhN_km60YruTA|Persian/Iranian      |
|9sRGfSVEfLhN_km60YruTA|Turkish              |
|9sRGfSVEfLhN_km60YruTA|Middle Eastern       |
|9sRGfSVEfLhN_km60YruTA|Restaurants          |
|9sRGfSVEfLhN

In [29]:
# Number of filtered Canadian businesses per category, largest first.
category_count_sql = '''
            SELECT CATEGORY, COUNT(*) AS CATEGORY_COUNT 
            FROM VW_BUSINESS_CA_CAT 
            GROUP BY CATEGORY
            ORDER BY 2 DESC'''

spark.sql(category_count_sql).show()

+--------------------+--------------+
|            CATEGORY|CATEGORY_COUNT|
+--------------------+--------------+
|         Restaurants|         16711|
|                Food|          8525|
|            Shopping|          6077|
|       Beauty & Spas|          3810|
|           Nightlife|          2949|
|        Coffee & Tea|          2665|
|                Bars|          2623|
|    Health & Medical|          2071|
|Event Planning & ...|          1841|
|           Fast Food|          1697|
|               Pizza|          1680|
|      Specialty Food|          1672|
|  Breakfast & Brunch|          1668|
|         Active Life|          1637|
|          Sandwiches|          1596|
|             Fashion|          1565|
|             Chinese|          1476|
|         Hair Salons|          1381|
|               Cafes|          1365|
|             Burgers|          1350|
+--------------------+--------------+
only showing top 20 rows



In [150]:
def get_attribute_keys(struct_name, df=None):
    """Return the names of the sub-fields of a struct column.

    Args:
        struct_name: Column name to look up; the comparison is exact (case-sensitive).
        df: Object exposing ``schema.json()`` (a Spark DataFrame). Defaults to the
            notebook-level ``df_business_CA`` so existing one-argument calls keep working.

    Returns:
        A list of sub-field names in schema order. The list is empty when no column is
        called ``struct_name`` or when that column is not a struct.
    """
    if df is None:
        df = df_business_CA

    fields = json.loads(df.schema.json())['fields']

    return [
        sub_field['name']
        for field in fields
        # In the schema JSON a primitive column carries its type as a plain string
        # (e.g. "string"); only a struct column carries a dict with a nested 'fields'
        # list, so anything else is skipped instead of raising.
        if field['name'] == struct_name
        and isinstance(field['type'], dict)
        and field['type'].get('type') == 'struct'
        for sub_field in field['type']['fields']
    ]

# Sub-field names of the ATTRIBUTES struct. Kept as a notebook-level list because the
# flatten_attributes UDF reads `attributes` as a global.
attributes = get_attribute_keys('ATTRIBUTES')

print(attributes)

['AcceptsInsurance', 'AgesAllowed', 'Alcohol', 'Ambience', 'BYOB', 'BYOBCorkage', 'BestNights', 'BikeParking', 'BusinessAcceptsBitcoin', 'BusinessAcceptsCreditCards', 'BusinessParking', 'ByAppointmentOnly', 'Caters', 'CoatCheck', 'Corkage', 'DietaryRestrictions', 'DogsAllowed', 'DriveThru', 'GoodForDancing', 'GoodForKids', 'GoodForMeal', 'HairSpecializesIn', 'HappyHour', 'HasTV', 'Music', 'NoiseLevel', 'Open24Hours', 'OutdoorSeating', 'RestaurantsAttire', 'RestaurantsCounterService', 'RestaurantsDelivery', 'RestaurantsGoodForGroups', 'RestaurantsPriceRange2', 'RestaurantsReservations', 'RestaurantsTableService', 'RestaurantsTakeOut', 'Smoking', 'WheelchairAccessible', 'WiFi']


In [184]:
def _parse_attribute_literal(raw_text):
    """Decode one raw attribute string (e.g. "u'none'", "True", "{'lot': False}").

    The text is read as a Python literal, so quoting and ``u`` prefixes are removed
    properly. Text that is not a valid literal is returned unchanged.
    """
    try:
        return ast.literal_eval(raw_text)
    except (ValueError, TypeError, SyntaxError):
        return raw_text


def _as_token(value):
    """Render a key or value as text that cannot contain the ',' / ':' separators."""
    return str(value).replace(',', ' ').replace(':', ' ').strip()


@functions.udf(returnType=types.StringType())
def flatten_attributes(col_data):
    """Flatten the attributes struct of one business into ``"key: value, key: value"`` text.

    For every name in the notebook-level ``attributes`` list:
      * a missing (None) field is emitted as ``<name>: None``;
      * a field holding a dict literal is expanded into one ``<name>_<sub key>`` entry
        per item of that dict;
      * any other field is emitted as ``<name>: <value>`` with quoting removed.

    Values are decoded with ``ast.literal_eval`` rather than by deleting quote
    characters, so ``u'none'`` becomes ``none`` (not ``"unone"``) and a nested key
    written as ``u'divey'`` becomes ``divey`` (not ``udivey``).

    Args:
        col_data: Struct value (row-like, indexable by field name), or None.

    Returns:
        The flattened text, which callers split on ',' and ':'. Returns None when
        ``col_data`` itself is None, so a NULL struct yields NULL instead of an error.
    """
    if col_data is None:
        return None

    output = {}

    for attribute in attributes:
        raw_value = col_data[attribute]

        if raw_value is None:
            output[attribute] = None
            continue

        parsed = _parse_attribute_literal(str(raw_value))

        if isinstance(parsed, dict):
            # Nested dict: promote each entry to its own "<attribute>_<sub key>" key.
            for sub_key, sub_value in parsed.items():
                output[f'{attribute}_{sub_key}'] = sub_value
        else:
            output[attribute] = parsed

    # The text is split on ',' and ':' downstream, so those characters are stripped
    # from keys and values to keep every pair intact.
    return ', '.join(
        f'{_as_token(key)}: {_as_token(value)}' for key, value in output.items()
    )

# Make the UDF callable from Spark SQL under the name FLATTEN.
spark.udf.register("FLATTEN", flatten_attributes)

# One row per (business, attribute). The inner query explodes the comma-separated text
# returned by FLATTEN; the outer query splits each piece on ':' into key and value.
attribute_sql = '''
                                SELECT
                                    BUSINESS_ID,
                                    REPLACE(SPLIT(TRIM(ATTRIBUTE), ':')[0], "'", "") AS ATTR_KEY,
                                    TRIM(SPLIT(ATTRIBUTE, ':')[1]) AS ATTR_VAL
                                FROM
                                (
                                    SELECT
                                        BUSINESS_ID,
                                        EXPLODE(SPLIT(FLATTEN(ATTRIBUTES), ',')) AS ATTRIBUTE
                                    FROM 
                                        VW_BUSINESS_CA
                                )'''
df_business_CA_attr = spark.sql(attribute_sql)

df_business_CA_attr.createOrReplaceTempView('VW_BUSINESS_CA_ATTR')

# Preview the first 30 rows without truncating the values.
spark.sql("SELECT * FROM VW_BUSINESS_CA_ATTR").show(30, False)

+----------------------+--------------------------+--------+
|BUSINESS_ID           |ATTR_KEY                  |ATTR_VAL|
+----------------------+--------------------------+--------+
|EosRKXIGeSWFYWwpkbhNnA|AcceptsInsurance          |None    |
|EosRKXIGeSWFYWwpkbhNnA|AgesAllowed               |None    |
|EosRKXIGeSWFYWwpkbhNnA|Alcohol                   |None    |
|EosRKXIGeSWFYWwpkbhNnA|Ambience                  |None    |
|EosRKXIGeSWFYWwpkbhNnA|BYOB                      |None    |
|EosRKXIGeSWFYWwpkbhNnA|BYOBCorkage               |None    |
|EosRKXIGeSWFYWwpkbhNnA|BestNights                |None    |
|EosRKXIGeSWFYWwpkbhNnA|BikeParking               |False   |
|EosRKXIGeSWFYWwpkbhNnA|BusinessAcceptsBitcoin    |None    |
|EosRKXIGeSWFYWwpkbhNnA|BusinessAcceptsCreditCards|None    |
|EosRKXIGeSWFYWwpkbhNnA|BusinessParking_garage    |False   |
|EosRKXIGeSWFYWwpkbhNnA|BusinessParking_street    |False   |
|EosRKXIGeSWFYWwpkbhNnA|BusinessParking_validated |False   |
|EosRKXIGeSWFYWwpkbhNnA|

In [187]:
# Distribution of the HasTV attribute values across the filtered Canadian businesses.
has_tv_count_sql = '''
            SELECT ATTR_VAL, COUNT(*) AS BUSINESS_COUNT
            FROM VW_BUSINESS_CA_ATTR
            WHERE ATTR_KEY = 'HasTV'
            GROUP BY ATTR_VAL
        '''

spark.sql(has_tv_count_sql).show()

+--------+--------------+
|ATTR_VAL|BUSINESS_COUNT|
+--------+--------------+
|    None|         21338|
|   False|          2405|
|    True|         11252|
+--------+--------------+



In [55]:
def write_to_bq(df, temp_bucket_name, ds_name, tbl_name):
    """Write ``df`` to the BigQuery table ``<ds_name>.<tbl_name>``, replacing its contents.

    Args:
        df: DataFrame to write; only its ``write`` attribute is used.
        temp_bucket_name: Value passed as the ``temporaryGcsBucket`` option.
        ds_name: Dataset part of the destination table name.
        tbl_name: Table part of the destination table name.
    """
    destination = f'{ds_name}.{tbl_name}'

    writer = df.write.format('bigquery')
    writer = writer.option('table', destination)
    writer = writer.option("temporaryGcsBucket", temp_bucket_name)

    # 'overwrite' mode: an existing table is replaced rather than appended to.
    writer.mode('overwrite').save()

In [56]:
# Business columns exported to BigQuery. CATEGORIES and ATTRIBUTES are left out here;
# they are exported through their own exploded DataFrames.
business_export_sql = '''
                                    SELECT
                                        BUSINESS_ID,
                                        NAME,
                                        STATE,
                                        CITY,
                                        POSTAL_CODE,
                                        LATITUDE,
                                        LONGITUDE
                                    FROM
                                        VW_BUSINESS_CA'''
df_business_CA_final = spark.sql(business_export_sql)

write_to_bq(df_business_CA_final, bucket_name, 'yelp_dataset', 'businesses')

In [49]:
# Export the province lookup to yelp_dataset.provinces.
# NOTE(review): the lookup output above shows 'New Brunswich' for code NB — presumably
# a typo for 'New Brunswick' in the source file; fix upstream before relying on it.
write_to_bq(df_provinces, bucket_name, 'yelp_dataset', 'provinces')

In [57]:
# Export the exploded (business, category) rows to yelp_dataset.categories.
write_to_bq(df_business_CA_cat, bucket_name, 'yelp_dataset', 'categories')

In [58]:
# Export the flattened (business, attribute key, attribute value) rows to yelp_dataset.attributes.
write_to_bq(df_business_CA_attr, bucket_name, 'yelp_dataset', 'attributes')