In [189]:
import ast 
import json
from pyspark.sql import SparkSession, SQLContext, functions, types
from google.cloud import storage
import seaborn as sns
import matplotlib.pyplot as plt

In [11]:
# Name of the GCS bucket that holds the Yelp JSON files; later cells also pass it
# to write_to_bq as the temporary bucket.
bucket_name = 'sa-yelp-dataset'
# gs:// URI of the same bucket, derived from bucket_name so the two cannot drift apart.
bucket_path = f'gs://{bucket_name}'

In [12]:
# Create a GCS client (no explicit credentials are passed) and get a handle on
# the dataset bucket.
gcs_client = storage.Client()
bucket = gcs_client.bucket(bucket_name)

# Print every object whose name starts with the Yelp dataset prefix.
yelp_blobs = list(bucket.list_blobs(prefix='yelp_academic'))
for obj in yelp_blobs:
    print(obj)

<Blob: sa-yelp-dataset, yelp_academic_ca_provinces.json, 1606496025167812>
<Blob: sa-yelp-dataset, yelp_academic_dataset_business.json, 1605935606649005>
<Blob: sa-yelp-dataset, yelp_academic_dataset_checkin.json, 1605936241859751>
<Blob: sa-yelp-dataset, yelp_academic_dataset_review.json, 1605980374655635>
<Blob: sa-yelp-dataset, yelp_academic_dataset_tip.json, 1605980751339428>
<Blob: sa-yelp-dataset, yelp_academic_dataset_user.json, 1605987064578747>


In [16]:
# List the bucket root through the Hadoop filesystem CLI.
!hdfs dfs -ls 'gs://sa-yelp-dataset'

Found 9 items
drwx------   - root root          0 1970-01-01 00:00 gs://sa-yelp-dataset/.ipynb_checkpoints
drwx------   - root root          0 1970-01-01 00:00 gs://sa-yelp-dataset/google-cloud-dataproc-metainfo
drwx------   - root root          0 2020-11-21 06:14 gs://sa-yelp-dataset/notebooks
-rwx------   3 root root        535 2020-11-27 16:53 gs://sa-yelp-dataset/yelp_academic_ca_provinces.json
-rwx------   3 root root  152898689 2020-11-21 05:13 gs://sa-yelp-dataset/yelp_academic_dataset_business.json
-rwx------   3 root root  449663480 2020-11-21 05:24 gs://sa-yelp-dataset/yelp_academic_dataset_checkin.json
-rwx------   3 root root 6325565224 2020-11-21 17:39 gs://sa-yelp-dataset/yelp_academic_dataset_review.json
-rwx------   3 root root  263489322 2020-11-21 17:45 gs://sa-yelp-dataset/yelp_academic_dataset_tip.json
-rwx------   3 root root 3268069927 2020-11-21 19:31 gs://sa-yelp-dataset/yelp_academic_dataset_user.json


In [32]:
# Print the installed Scala version; the connector coordinate configured for the
# Spark session carries a `_2.12` suffix.
!scala -version

Scala code runner version 2.12.10 -- Copyright 2002-2019, LAMP/EPFL and Lightbend, Inc.


In [4]:
# Create (or reuse) the Spark session. spark.jars.packages names the BigQuery
# connector artifact that the `bigquery` write format relies on.
spark = (
    SparkSession.builder
    .appName('YelpAnalysis')
    .config(
        'spark.jars.packages',
        'com.google.cloud.spark:spark-bigquery-with-dependencies_2.12:0.15.1-beta',
    )
    .getOrCreate()
)

## Process yelp_academic_dataset_business.json file

In [18]:
# Load the province lookup file and expose it to Spark SQL as VW_Province.
provinces_path = f'{bucket_path}/yelp_academic_ca_provinces.json'
df_provinces = spark.read.json(provinces_path)
df_provinces.createOrReplaceTempView('VW_Province')

# Show the lookup without truncating the province names.
spark.sql('SELECT * FROM VW_Province').show(truncate=False)

+----+-------------------------+
|code|province                 |
+----+-------------------------+
|ON  |Ontario                  |
|QC  |Quebec                   |
|NS  |Nova Scotia              |
|NB  |New Brunswich            |
|MB  |Manitoba                 |
|BC  |British Columbia         |
|PE  |Prince Edward Island     |
|SK  |Saskatchewan             |
|AB  |Alberta                  |
|NL  |Newfoundland and Labrador|
|NT  |Northwest Territories    |
|YT  |Yukon                    |
|NU  |Nunavut                  |
+----+-------------------------+



In [19]:
# Explicit schema for business records, built one field at a time. Every field
# keeps the default nullability.
# NOTE(review): no reader in the visible cells is given this schema, and it types
# `attributes` as a plain string -- confirm whether it is still needed.
SCHEMA = (
    types.StructType()
    .add("address", types.StringType())
    .add("attributes", types.StringType())
    .add("business_id", types.StringType())
    .add("categories", types.StringType())
    .add("city", types.StringType())
    .add("hours", types.StringType())
    .add("is_open", types.LongType())
    .add("latitude", types.DoubleType())
    .add("longitude", types.DoubleType())
    .add("name", types.StringType())
    .add("postal_code", types.StringType())
    .add("review_count", types.LongType())
    .add("stars", types.DoubleType())
    .add("state", types.StringType())
)

In [70]:
# Read the business file (no schema is passed, so Spark infers it from the JSON)
# and register it as the VW_Business SQL view.
df_business = spark.read.json(f'{bucket_path}/yelp_academic_dataset_business.json')
df_business.createOrReplaceTempView("VW_Business")

In [71]:
# Print the schema of the business DataFrame.
df_business.printSchema()

root
 |-- address: string (nullable = true)
 |-- attributes: struct (nullable = true)
 |    |-- AcceptsInsurance: string (nullable = true)
 |    |-- AgesAllowed: string (nullable = true)
 |    |-- Alcohol: string (nullable = true)
 |    |-- Ambience: string (nullable = true)
 |    |-- BYOB: string (nullable = true)
 |    |-- BYOBCorkage: string (nullable = true)
 |    |-- BestNights: string (nullable = true)
 |    |-- BikeParking: string (nullable = true)
 |    |-- BusinessAcceptsBitcoin: string (nullable = true)
 |    |-- BusinessAcceptsCreditCards: string (nullable = true)
 |    |-- BusinessParking: string (nullable = true)
 |    |-- ByAppointmentOnly: string (nullable = true)
 |    |-- Caters: string (nullable = true)
 |    |-- CoatCheck: string (nullable = true)
 |    |-- Corkage: string (nullable = true)
 |    |-- DietaryRestrictions: string (nullable = true)
 |    |-- DogsAllowed: string (nullable = true)
 |    |-- DriveThru: string (nullable = true)
 |    |-- GoodForDancing: str

In [393]:
# Count businesses per state code and show the ten largest groups.
state_counts_sql = '''SELECT STATE, COUNT(*) AS STATE_COUNT
                FROM VW_Business 
                GROUP BY STATE
                ORDER BY 2 DESC'''
spark.sql(state_counts_sql).show(10)

+-----+-----------+
|STATE|STATE_COUNT|
+-----+-----------+
|   AZ|      60803|
|   NV|      39084|
|   ON|      36627|
|   OH|      16392|
|   NC|      16218|
|   PA|      12376|
|   QC|      10233|
|   AB|       8682|
|   WI|       5525|
|   IL|       2034|
+-----+-----------+
only showing top 10 rows



In [394]:
# Businesses whose STATE matches a code in VW_Province, restricted to open
# businesses with non-null location, rating, category and attribute data.
# LENGTH(POSTAL_CODE) = 7 keeps only seven-character postal codes.
business_ca_sql = '''
                            SELECT
                                BUSINESS_ID,
                                NAME,
                                STATE,
                                CITY,
                                POSTAL_CODE,
                                LATITUDE,
                                LONGITUDE,
                                CATEGORIES,
                                ATTRIBUTES
                            FROM
                                VW_Business VB 
                                    INNER JOIN VW_Province VP ON VB.STATE = VP.CODE
                            WHERE
                                    IS_OPEN = '1'
                                AND POSTAL_CODE IS NOT NULL
                                AND LENGTH(POSTAL_CODE) = 7
                                AND LATITUDE IS NOT NULL
                                AND LONGITUDE IS NOT NULL
                                AND STARS IS NOT NULL
                                AND REVIEW_COUNT IS NOT NULL
                                AND CATEGORIES IS NOT NULL
                                AND ATTRIBUTES IS NOT NULL
                            '''

# Cached because several later cells query this result through VW_BUSINESS_CA.
df_business_CA = spark.sql(business_ca_sql).cache()
df_business_CA.createOrReplaceTempView('VW_BUSINESS_CA')

In [396]:
# Preview five rows of the filtered business view.
spark.sql('SELECT * FROM VW_BUSINESS_CA').show(5)

+--------------------+------------------+-----+-------------+-----------+-------------+--------------+--------------------+--------------------+
|         BUSINESS_ID|              NAME|STATE|         CITY|POSTAL_CODE|     LATITUDE|     LONGITUDE|          CATEGORIES|          ATTRIBUTES|
+--------------------+------------------+-----+-------------+-----------+-------------+--------------+--------------------+--------------------+
|EosRKXIGeSWFYWwpk...|    Xtreme Couture|   ON|      Toronto|    M8Z 5G3|43.6245394916|-79.5291079302|Martial Arts, Gym...|[,,,,,,, False,,,...|
|eBEfgOPG7pvFhb2wc...|   Philthy Phillys|   ON|       Aurora|    L4G 7J1|   44.0109618|    -79.448677|Restaurants, Chee...|[,, u'none',,,,, ...|
|lu7vtrp_bE9PnxWfA...|      Banzai Sushi|   ON|    Thornhill|    L3T 5W4|   43.8204923|   -79.3984661|Japanese, Fast Fo...|[,, u'none',,,,,,...|
|1wWneWD_E1pBIyVpd...|  Air Jordan Store|   ON|      Toronto|    M5B 1R4|   43.6565424|   -79.3813076|Shopping, Shoe St...|[,,,,,,

In [398]:
# Count the filtered businesses per state code, largest first.
ca_state_counts_sql = '''SELECT STATE, COUNT(*) AS STATE_COUNT 
                FROM VW_BUSINESS_CA 
                GROUP BY STATE 
                ORDER BY 2 DESC'''
spark.sql(ca_state_counts_sql).show(10)

+-----+-----------+
|STATE|STATE_COUNT|
+-----+-----------+
|   ON|      22575|
|   QC|       7099|
|   AB|       5317|
|   BC|          2|
|   MB|          1|
|   YT|          1|
+-----+-----------+



In [400]:
# Count the filtered businesses per (state, city) pair, largest first.
ca_city_counts_sql = '''SELECT STATE, CITY, COUNT(*) AS CITY_COUNT 
                FROM VW_BUSINESS_CA
                GROUP BY STATE, CITY 
                ORDER BY 3 DESC'''
spark.sql(ca_city_counts_sql).show(10)

+-----+-------------+----------+
|STATE|         CITY|CITY_COUNT|
+-----+-------------+----------+
|   ON|      Toronto|     12073|
|   AB|      Calgary|      5127|
|   QC|     Montréal|      4695|
|   ON|  Mississauga|      2324|
|   ON|      Markham|      1224|
|   ON|   North York|       817|
|   ON|  Scarborough|       774|
|   ON|     Brampton|       768|
|   ON|Richmond Hill|       690|
|   ON|      Vaughan|       686|
+-----+-------------+----------+
only showing top 10 rows



In [31]:
# One row per (business, category): split the comma-separated CATEGORIES string,
# explode the pieces, and trim the whitespace left around each one.
business_categories_sql = '''
                                SELECT 
                                    BUSINESS_ID,
                                    TRIM(CATEGORY) as CATEGORY
                                FROM
                                    (
                                    SELECT 
                                        BUSINESS_ID, 
                                        EXPLODE(SPLIT(CATEGORIES, ',')) as CATEGORY
                                    FROM
                                        VW_BUSINESS_CA)
                                '''
df_business_CA_cat = spark.sql(business_categories_sql)

df_business_CA_cat.createOrReplaceTempView('VW_BUSINESS_CA_CAT')

# Preview the first 20 rows without truncating the values.
category_preview = spark.sql('''SELECT * FROM VW_BUSINESS_CA_CAT''')
category_preview.show(20, False)

+----------------------+---------------------+
|BUSINESS_ID           |CATEGORY             |
+----------------------+---------------------+
|EosRKXIGeSWFYWwpkbhNnA|Martial Arts         |
|EosRKXIGeSWFYWwpkbhNnA|Gyms                 |
|EosRKXIGeSWFYWwpkbhNnA|Fitness & Instruction|
|EosRKXIGeSWFYWwpkbhNnA|Active Life          |
|eBEfgOPG7pvFhb2wcG9I7w|Restaurants          |
|eBEfgOPG7pvFhb2wcG9I7w|Cheesesteaks         |
|eBEfgOPG7pvFhb2wcG9I7w|Poutineries          |
|lu7vtrp_bE9PnxWfA8g4Pg|Japanese             |
|lu7vtrp_bE9PnxWfA8g4Pg|Fast Food            |
|lu7vtrp_bE9PnxWfA8g4Pg|Food Court           |
|lu7vtrp_bE9PnxWfA8g4Pg|Restaurants          |
|1wWneWD_E1pBIyVpdHMaQg|Shopping             |
|1wWneWD_E1pBIyVpdHMaQg|Shoe Stores          |
|1wWneWD_E1pBIyVpdHMaQg|Fashion              |
|9sRGfSVEfLhN_km60YruTA|Persian/Iranian      |
|9sRGfSVEfLhN_km60YruTA|Turkish              |
|9sRGfSVEfLhN_km60YruTA|Middle Eastern       |
|9sRGfSVEfLhN_km60YruTA|Restaurants          |
|9sRGfSVEfLhN

In [401]:
# Count how many filtered businesses carry each category, largest first.
category_counts_sql = '''SELECT CATEGORY, COUNT(*) AS CATEGORY_COUNT 
                FROM VW_BUSINESS_CA_CAT 
                GROUP BY CATEGORY
                ORDER BY 2 DESC'''
spark.sql(category_counts_sql).show(10)

+--------------------+--------------+
|            CATEGORY|CATEGORY_COUNT|
+--------------------+--------------+
|         Restaurants|         16711|
|                Food|          8525|
|            Shopping|          6077|
|       Beauty & Spas|          3810|
|           Nightlife|          2949|
|        Coffee & Tea|          2665|
|                Bars|          2623|
|    Health & Medical|          2071|
|Event Planning & ...|          1841|
|           Fast Food|          1697|
+--------------------+--------------+
only showing top 10 rows



In [402]:
def get_attribute_keys(struct_name):
    """Return the sub-field names of a struct column of ``df_business_CA``.

    The DataFrame schema is serialised to JSON and its top-level fields are
    scanned for an exact (case-sensitive) name match.

    Args:
        struct_name: Name of the top-level struct column to inspect.

    Returns:
        list: Sub-field names in schema order; empty when no top-level field
        is called ``struct_name``.
    """
    schema_fields = json.loads(df_business_CA.schema.json())['fields']

    return [
        nested['name']
        for top_level in schema_fields
        if top_level['name'] == struct_name
        for nested in top_level['type']['fields']
    ]

# Keep the sub-field names of the ATTRIBUTES struct in a module-level list; the
# flatten_attributes UDF reads this global when it runs.
attributes = get_attribute_keys('ATTRIBUTES')

print(attributes)

['AcceptsInsurance', 'AgesAllowed', 'Alcohol', 'Ambience', 'BYOB', 'BYOBCorkage', 'BestNights', 'BikeParking', 'BusinessAcceptsBitcoin', 'BusinessAcceptsCreditCards', 'BusinessParking', 'ByAppointmentOnly', 'Caters', 'CoatCheck', 'Corkage', 'DietaryRestrictions', 'DogsAllowed', 'DriveThru', 'GoodForDancing', 'GoodForKids', 'GoodForMeal', 'HairSpecializesIn', 'HappyHour', 'HasTV', 'Music', 'NoiseLevel', 'Open24Hours', 'OutdoorSeating', 'RestaurantsAttire', 'RestaurantsCounterService', 'RestaurantsDelivery', 'RestaurantsGoodForGroups', 'RestaurantsPriceRange2', 'RestaurantsReservations', 'RestaurantsTableService', 'RestaurantsTakeOut', 'Smoking', 'WheelchairAccessible', 'WiFi']


In [184]:
@functions.udf(returnType=types.StringType())
def flatten_attributes(col_data):
    """Flatten one business's ATTRIBUTES struct into a ``key: value`` string.

    The module-level ``attributes`` list supplies the sub-field names read from
    ``col_data``. Each non-null sub-field is parsed as a Python literal, so
    ``"u'none'"`` becomes ``none`` and ``"{'garage': False}"`` becomes a dict.
    Dict values are expanded into ``<attribute>_<sub key>`` entries.

    Args:
        col_data: The ATTRIBUTES struct of a single row, indexable by sub-field name.

    Returns:
        str: ``key: value`` pairs joined with ``", "``. Null sub-fields are
        emitted with the value ``None``. The FLATTEN query splits this string on
        ``,`` and ``:``, so that layout is preserved.
    """
    def _clean(text):
        # Drop brace and quote characters so they cannot leak into the output;
        # the consumer splits on ',' and ':' only.
        return (str(text)
                .replace('{', '')
                .replace('}', '')
                .replace('\'', '')
                .replace('"', '')
                .strip())

    def _split_pairs(text):
        # Fallback for brace-wrapped text that is not a valid literal: split on
        # ',' and keep only the pieces that contain exactly one ':'.
        pairs = {}
        for piece in text.split(','):
            parts = piece.split(':')
            if len(parts) == 2:
                pairs[_clean(parts[0])] = _clean(parts[1])
        return pairs

    output = {}

    for attribute in attributes:
        raw_value = col_data[attribute]

        if raw_value is None:
            output[attribute] = None
            continue

        text = str(raw_value)
        try:
            # literal_eval only evaluates literals, so it is safe on data values
            # and removes the u'' / quote wrappers that plain stripping mangled.
            parsed = ast.literal_eval(text)
        except (ValueError, TypeError, SyntaxError):
            parsed = _split_pairs(text) if text.startswith('{') else text

        if isinstance(parsed, dict):
            for sub_key, sub_val in parsed.items():
                output[attribute + "_" + _clean(sub_key)] = _clean(sub_val)
        else:
            output[attribute] = _clean(parsed)

    return ', '.join(f'{key}: {value}' for key, value in output.items())

# Make the Python UDF callable from Spark SQL under the name FLATTEN.
spark.udf.register("FLATTEN", flatten_attributes)

# One row per (business, attribute): FLATTEN returns a comma-separated list of
# "key: value" pairs, which is exploded and then split on ':' into key and value.
business_attributes_sql = '''
                                SELECT
                                    BUSINESS_ID,
                                    REPLACE(SPLIT(TRIM(ATTRIBUTE), ':')[0], "'", "") AS ATTR_KEY,
                                    TRIM(SPLIT(ATTRIBUTE, ':')[1]) AS ATTR_VAL
                                FROM
                                (
                                    SELECT
                                        BUSINESS_ID,
                                        EXPLODE(SPLIT(FLATTEN(ATTRIBUTES), ',')) AS ATTRIBUTE
                                    FROM 
                                        VW_BUSINESS_CA
                                )'''
df_business_CA_attr = spark.sql(business_attributes_sql)

df_business_CA_attr.createOrReplaceTempView('VW_BUSINESS_CA_ATTR')

# Preview the first 30 rows without truncating the values.
attribute_preview = spark.sql("SELECT * FROM VW_BUSINESS_CA_ATTR")
attribute_preview.show(30, False)

+----------------------+--------------------------+--------+
|BUSINESS_ID           |ATTR_KEY                  |ATTR_VAL|
+----------------------+--------------------------+--------+
|EosRKXIGeSWFYWwpkbhNnA|AcceptsInsurance          |None    |
|EosRKXIGeSWFYWwpkbhNnA|AgesAllowed               |None    |
|EosRKXIGeSWFYWwpkbhNnA|Alcohol                   |None    |
|EosRKXIGeSWFYWwpkbhNnA|Ambience                  |None    |
|EosRKXIGeSWFYWwpkbhNnA|BYOB                      |None    |
|EosRKXIGeSWFYWwpkbhNnA|BYOBCorkage               |None    |
|EosRKXIGeSWFYWwpkbhNnA|BestNights                |None    |
|EosRKXIGeSWFYWwpkbhNnA|BikeParking               |False   |
|EosRKXIGeSWFYWwpkbhNnA|BusinessAcceptsBitcoin    |None    |
|EosRKXIGeSWFYWwpkbhNnA|BusinessAcceptsCreditCards|None    |
|EosRKXIGeSWFYWwpkbhNnA|BusinessParking_garage    |False   |
|EosRKXIGeSWFYWwpkbhNnA|BusinessParking_street    |False   |
|EosRKXIGeSWFYWwpkbhNnA|BusinessParking_validated |False   |
|EosRKXIGeSWFYWwpkbhNnA|

In [187]:
# Distribution of the HasTV attribute values across the filtered businesses.
has_tv_counts_sql = '''
            SELECT ATTR_VAL, COUNT(*) AS BUSINESS_COUNT
            FROM VW_BUSINESS_CA_ATTR
            WHERE ATTR_KEY = 'HasTV'
            GROUP BY ATTR_VAL
        '''
spark.sql(has_tv_counts_sql).show()

+--------+--------------+
|ATTR_VAL|BUSINESS_COUNT|
+--------+--------------+
|    None|         21338|
|   False|          2405|
|    True|         11252|
+--------+--------------+



In [274]:
def write_to_bq(df, temp_bucket_name, ds_name, tbl_name):
    """Write a DataFrame to a BigQuery table, overwriting existing contents.

    Args:
        df: DataFrame to write; only its ``.write`` interface is used.
        temp_bucket_name: Bucket name passed as the ``temporaryGcsBucket`` option.
        ds_name: Dataset part of the target table name.
        tbl_name: Table part of the target table name.

    Returns:
        None.
    """
    target_table = f'{ds_name}.{tbl_name}'

    writer = df.write.format('bigquery')
    writer = writer.option('table', target_table)
    writer = writer.option("temporaryGcsBucket", temp_bucket_name)

    # 'overwrite' mode is fixed: each call replaces the target table.
    writer.mode('overwrite').save()

In [403]:
# Drop the two columns that were exploded into their own DataFrames
# (df_business_CA_cat and df_business_CA_attr), leaving the flat business columns.
df_business_CA_final = df_business_CA.drop('CATEGORIES', 'ATTRIBUTES')

# Registered as a view so later cells can join other datasets against it.
df_business_CA_final.createOrReplaceTempView('VW_BUSINESS_CA_FINAL')

df_business_CA_final.show(10)

+--------------------+--------------------+-----+-------------+-----------+-------------+--------------+
|         BUSINESS_ID|                NAME|STATE|         CITY|POSTAL_CODE|     LATITUDE|     LONGITUDE|
+--------------------+--------------------+-----+-------------+-----------+-------------+--------------+
|EosRKXIGeSWFYWwpk...|      Xtreme Couture|   ON|      Toronto|    M8Z 5G3|43.6245394916|-79.5291079302|
|eBEfgOPG7pvFhb2wc...|     Philthy Phillys|   ON|       Aurora|    L4G 7J1|   44.0109618|    -79.448677|
|lu7vtrp_bE9PnxWfA...|        Banzai Sushi|   ON|    Thornhill|    L3T 5W4|   43.8204923|   -79.3984661|
|1wWneWD_E1pBIyVpd...|    Air Jordan Store|   ON|      Toronto|    M5B 1R4|   43.6565424|   -79.3813076|
|9sRGfSVEfLhN_km60...|  Apadana Restaurant|   ON|Richmond Hill|    L4E 1A5|43.9470107964| -79.454861645|
|pcaQDBM6r0PWTXfYZ...|           IGA Extra|   QC|      Lasalle|    H8P 1B4|   45.4256452|   -73.6119339|
|LoRef3ChgZKbxUio-...|                Amir|   QC|     M

In [345]:
# Write the flat business rows to yelp_dataset.businesses, using the dataset
# bucket as the temporary bucket.
write_to_bq(df_business_CA_final, bucket_name, 'yelp_dataset', 'businesses')

In [276]:
# Write the province lookup to yelp_dataset.provinces.
write_to_bq(df_provinces, bucket_name, 'yelp_dataset', 'provinces')

In [277]:
# Write the (business, category) rows to yelp_dataset.categories.
write_to_bq(df_business_CA_cat, bucket_name, 'yelp_dataset', 'categories')

In [278]:
# Write the (business, attribute key, attribute value) rows to yelp_dataset.attributes.
write_to_bq(df_business_CA_attr, bucket_name, 'yelp_dataset', 'attributes')

## Process yelp_academic_dataset_checkin.json file

In [289]:
# Read the check-in file (schema inferred by Spark) and register it as VW_Checkin.
df_checkin = spark.read.json(f'{bucket_path}/yelp_academic_dataset_checkin.json')
df_checkin.createOrReplaceTempView("VW_Checkin")

In [290]:
# Print the schema of the check-in DataFrame.
df_checkin.printSchema()

root
 |-- business_id: string (nullable = true)
 |-- date: string (nullable = true)



In [404]:
# Preview ten raw check-in rows; `date` is a single string column per business.
df_checkin.show(10)

+--------------------+--------------------+
|         business_id|                date|
+--------------------+--------------------+
|--1UhMGODdWsrMast...|2016-04-26 19:49:...|
|--6MefnULPED_I942...|2011-06-04 18:22:...|
|--7zmmkVg-IMGaXbu...|2014-12-29 19:25:...|
|--8LPVSo5i0Oo61X0...| 2016-07-08 16:43:30|
|--9QQLMTbFzLJ_oT-...|2010-06-26 17:39:...|
|--9e1ONYQuAa-CB_R...|2010-02-08 05:56:...|
|--DaPTJW3-tB1vP-P...|2012-06-03 17:46:...|
|--DdmeR16TRb3LsjG...|2012-11-02 21:26:...|
|--EF5N7P70J_UYBTP...|2018-05-25 19:52:...|
|--EX4rRznJrltyn-3...|2010-02-26 17:05:...|
+--------------------+--------------------+
only showing top 10 rows



In [346]:
# Keep only the check-in rows whose business appears in VW_BUSINESS_CA_FINAL.
# The view names must match the registered ones (VW_Checkin and
# VW_BUSINESS_CA_FINAL); the previous text referenced views that do not exist.
df_checkin_valid = spark.sql('''SELECT 
                                    VW_CHECKIN.*
                                  FROM VW_CHECKIN 
                                      INNER JOIN VW_BUSINESS_CA_FINAL ON VW_BUSINESS_CA_FINAL.BUSINESS_ID = VW_CHECKIN.BUSINESS_ID
                              ''')

df_checkin_valid.createOrReplaceTempView('VW_CHECKIN_VALID')

In [405]:
# One row per (business, check-in time): split the comma-separated DATE string,
# explode the pieces, and normalise each to 'yyyy-MM-dd HH:mm:ss' after trimming.
checkin_explode_sql = '''
                            SELECT 
                                BUSINESS_ID,
                                FROM_UNIXTIME(UNIX_TIMESTAMP(TRIM(DATE), 'yyyy-MM-dd HH:mm:ss')) AS DATE
                            FROM (
                                SELECT
                                    BUSINESS_ID,
                                    EXPLODE(SPLIT(DATE, ',')) AS DATE
                                FROM 
                                    VW_CHECKIN_VALID)
                            WHERE
                                DATE IS NOT NULL
                                AND DATE != ''
                            '''
df_checkin_exp = spark.sql(checkin_explode_sql)

df_checkin_exp.show(10)

+--------------------+-------------------+
|         BUSINESS_ID|               DATE|
+--------------------+-------------------+
|-VAsjhmAbKF3Pb_-8...|2013-09-09 21:24:17|
|-VAsjhmAbKF3Pb_-8...|2013-09-11 16:10:00|
|-VAsjhmAbKF3Pb_-8...|2013-12-07 07:54:18|
|-VAsjhmAbKF3Pb_-8...|2014-03-07 00:14:43|
|-VAsjhmAbKF3Pb_-8...|2014-05-28 05:52:20|
|-VAsjhmAbKF3Pb_-8...|2014-07-06 04:49:53|
|-VAsjhmAbKF3Pb_-8...|2014-08-18 03:37:18|
|-VAsjhmAbKF3Pb_-8...|2014-10-21 03:39:24|
|-VAsjhmAbKF3Pb_-8...|2015-08-23 03:42:21|
|-VAsjhmAbKF3Pb_-8...|2015-08-23 23:07:39|
+--------------------+-------------------+
only showing top 10 rows



In [348]:
# Write the exploded check-in rows to yelp_dataset.checkins.
write_to_bq(df_checkin_exp, bucket_name, 'yelp_dataset', 'checkins')

## Process yelp_academic_dataset_review.json file

In [325]:
# Read the review file (schema inferred by Spark) and register it as VW_Review.
df_review = spark.read.json(f'{bucket_path}/yelp_academic_dataset_review.json')
df_review.createOrReplaceTempView("VW_Review")

In [370]:
# Print the schema of the review DataFrame.
df_review.printSchema()

root
 |-- business_id: string (nullable = true)
 |-- cool: long (nullable = true)
 |-- date: string (nullable = true)
 |-- funny: long (nullable = true)
 |-- review_id: string (nullable = true)
 |-- stars: double (nullable = true)
 |-- text: string (nullable = true)
 |-- useful: long (nullable = true)
 |-- user_id: string (nullable = true)



In [None]:
# Reviews of businesses present in VW_BUSINESS_CA_FINAL, with newlines in the text
# replaced by spaces, the date string converted with TO_DATE, and rows dropped
# when a vote count is null or STARS is null or outside 0..5.
review_valid_sql = '''SELECT 
                                    VW_REVIEW.BUSINESS_ID,
                                    USER_ID,
                                    REGEXP_REPLACE(TEXT, '\n', ' ') AS TEXT, 
                                    TO_DATE(DATE) AS DATE,
                                    STARS,
                                    COOL,
                                    FUNNY,
                                    USEFUL
                                  FROM VW_REVIEW 
                                      INNER JOIN VW_BUSINESS_CA_FINAL ON VW_BUSINESS_CA_FINAL.BUSINESS_ID = VW_REVIEW.BUSINESS_ID
                                  WHERE
                                          TEXT IS NOT NULL
                                      AND USEFUL IS NOT NULL
                                      AND COOL IS NOT NULL
                                      AND FUNNY IS NOT NULL
                                      AND STARS IS NOT NULL AND STARS >= 0 AND STARS <= 5
                              '''
df_review_valid = spark.sql(review_valid_sql)

# Registered as a view so the user section can join against the kept reviews.
df_review_valid.createOrReplaceTempView('VW_REVIEW_VALID')

df_review_valid.show(5)

In [357]:
# Write the filtered reviews to yelp_dataset.reviews.
write_to_bq(df_review_valid, bucket_name, 'yelp_dataset', 'reviews')

## Process yelp_academic_dataset_user.json file

In [358]:
# Read the user file (schema inferred by Spark) and register it as VW_User.
df_user = spark.read.json(f'{bucket_path}/yelp_academic_dataset_user.json')
df_user.createOrReplaceTempView("VW_User")

In [368]:
# Print the schema of the user DataFrame.
df_user.printSchema()

root
 |-- average_stars: double (nullable = true)
 |-- compliment_cool: long (nullable = true)
 |-- compliment_cute: long (nullable = true)
 |-- compliment_funny: long (nullable = true)
 |-- compliment_hot: long (nullable = true)
 |-- compliment_list: long (nullable = true)
 |-- compliment_more: long (nullable = true)
 |-- compliment_note: long (nullable = true)
 |-- compliment_photos: long (nullable = true)
 |-- compliment_plain: long (nullable = true)
 |-- compliment_profile: long (nullable = true)
 |-- compliment_writer: long (nullable = true)
 |-- cool: long (nullable = true)
 |-- elite: string (nullable = true)
 |-- fans: long (nullable = true)
 |-- friends: string (nullable = true)
 |-- funny: long (nullable = true)
 |-- name: string (nullable = true)
 |-- review_count: long (nullable = true)
 |-- useful: long (nullable = true)
 |-- user_id: string (nullable = true)
 |-- yelping_since: string (nullable = true)



In [407]:
# Preview five raw user rows (the output includes the `name` column).
df_user.show(5)

+-------------+---------------+---------------+----------------+--------------+---------------+---------------+---------------+-----------------+----------------+------------------+-----------------+----+--------------------+----+--------------------+-----+--------+------------+------+--------------------+-------------------+
|average_stars|compliment_cool|compliment_cute|compliment_funny|compliment_hot|compliment_list|compliment_more|compliment_note|compliment_photos|compliment_plain|compliment_profile|compliment_writer|cool|               elite|fans|             friends|funny|    name|review_count|useful|             user_id|      yelping_since|
+-------------+---------------+---------------+----------------+--------------+---------------+---------------+---------------+-----------------+----------------+------------------+-----------------+----+--------------------+----+--------------------+-----+--------+------------+------+--------------------+-------------------+
|         3.57| 

In [412]:
# Users who wrote at least one review kept in VW_REVIEW_VALID.
# A LEFT SEMI JOIN returns each matching VW_USER row exactly once; an INNER JOIN
# would repeat a user once per review and put duplicate rows in the users table.
# Cached because the friends, elites and final user DataFrames all read it.
df_user_valid = spark.sql('''
                            SELECT
                                VW_USER.USER_ID,
                                NAME,
                                COMPLIMENT_WRITER,
                                COMPLIMENT_PROFILE,
                                COMPLIMENT_PLAIN,
                                COMPLIMENT_PHOTOS,
                                COMPLIMENT_NOTE,
                                COMPLIMENT_MORE,
                                COMPLIMENT_LIST,
                                COMPLIMENT_HOT,
                                COMPLIMENT_FUNNY,
                                COMPLIMENT_CUTE,
                                COMPLIMENT_COOL,
                                VW_USER.USEFUL,
                                VW_USER.FUNNY,
                                FRIENDS,
                                FANS,
                                ELITE,
                                VW_USER.COOL,
                                REVIEW_COUNT,
                                AVERAGE_STARS,
                                YELPING_SINCE
                            FROM VW_USER 
                                LEFT SEMI JOIN VW_REVIEW_VALID ON VW_REVIEW_VALID.USER_ID = VW_USER.USER_ID
                        ''').cache()

df_user_valid.createOrReplaceTempView('VW_USER_VALID')

In [413]:
# Preview five of the selected user rows.
df_user_valid.show(5)

+--------------------+---------+-----------------+------------------+----------------+-----------------+---------------+---------------+---------------+--------------+----------------+---------------+---------------+------+-----+--------------------+----+-----+----+------------+-------------+-------------------+
|             USER_ID|     NAME|COMPLIMENT_WRITER|COMPLIMENT_PROFILE|COMPLIMENT_PLAIN|COMPLIMENT_PHOTOS|COMPLIMENT_NOTE|COMPLIMENT_MORE|COMPLIMENT_LIST|COMPLIMENT_HOT|COMPLIMENT_FUNNY|COMPLIMENT_CUTE|COMPLIMENT_COOL|USEFUL|FUNNY|             FRIENDS|FANS|ELITE|COOL|REVIEW_COUNT|AVERAGE_STARS|      YELPING_SINCE|
+--------------------+---------+-----------------+------------------+----------------+-----------------+---------------+---------------+---------------+--------------+----------------+---------------+---------------+------+-----+--------------------+----+-----+----+------------+-------------+-------------------+
|-4Anvj46CWf57KWI9...|   Cookie|                0|        

In [416]:
# Distinct (user, friend) pairs: split the comma-separated FRIENDS string, explode
# the pieces, and drop null entries and the literal 'None' placeholder.
user_friends_sql = '''
                            SELECT DISTINCT
                                USER_ID,
                                TRIM(FRIEND_USER_ID) AS FRIEND_USER_ID
                            FROM (SELECT 
                                    USER_ID,
                                    EXPLODE(SPLIT(FRIENDS, ',')) AS FRIEND_USER_ID
                                FROM 
                                    VW_USER_VALID)
                            WHERE
                                    FRIEND_USER_ID IS NOT NULL
                                AND FRIEND_USER_ID <> 'None'
                '''
df_user_friends = spark.sql(user_friends_sql)

df_user_friends.show(10)

+--------------------+--------------------+
|             USER_ID|      FRIEND_USER_ID|
+--------------------+--------------------+
|-4Anvj46CWf57KWI9...|kUWW9YR-2xC9YUSav...|
|-BUamlG3H-7yqpAl1...|nI1M9-fatJdgiSZ-v...|
|-BUamlG3H-7yqpAl1...|KA47Ih5vwcYjAVV2X...|
|-BUamlG3H-7yqpAl1...|RjAzmU7wGVaG3yz3L...|
|-BUamlG3H-7yqpAl1...|xt0sVZVbUXiiP3Lc4...|
|-BUamlG3H-7yqpAl1...|ajuHTq0d4NUJg2L0F...|
|-BUamlG3H-7yqpAl1...|qp4DG-Id72CxXoXWC...|
|-BUamlG3H-7yqpAl1...|p9lh9_5l1c3xEQnHz...|
|-BUamlG3H-7yqpAl1...|M3-ye7FfdNkwGB-52...|
|-BUamlG3H-7yqpAl1...|AZNbOt_Rm7M5LxfBb...|
+--------------------+--------------------+
only showing top 10 rows



In [417]:
# Distinct (user, elite value) pairs: split the comma-separated ELITE string,
# explode the pieces, and drop null, 'None' and empty entries.
user_elites_sql = '''
                            SELECT DISTINCT
                                USER_ID,
                                TRIM(ELITE) AS ELITE
                            FROM (SELECT 
                                    USER_ID,
                                    EXPLODE(SPLIT(ELITE, ',')) AS ELITE
                                FROM 
                                    VW_USER_VALID)
                            WHERE
                                    ELITE IS NOT NULL
                                AND ELITE <> 'None'
                                AND ELITE <> ''
                '''
df_user_elites = spark.sql(user_elites_sql)

df_user_elites.show(10)

+--------------------+-----+
|             USER_ID|ELITE|
+--------------------+-----+
|-bgszoDnhaUEuVydd...| 2017|
|-bgszoDnhaUEuVydd...| 2018|
|0lbUv-31EVxr8isqk...| 2017|
|0lbUv-31EVxr8isqk...| 2018|
|1pic7w8rRsVO5GJ_q...| 2017|
|26M4SlyVW9RCXuoWH...| 2010|
|26M4SlyVW9RCXuoWH...| 2011|
|26M4SlyVW9RCXuoWH...| 2012|
|26M4SlyVW9RCXuoWH...| 2013|
|26M4SlyVW9RCXuoWH...| 2014|
+--------------------+-----+
only showing top 10 rows



In [418]:
# Drop the two list-like string columns that were exploded into df_user_elites
# and df_user_friends, leaving the flat user columns.
df_user_valid_final = df_user_valid.drop('ELITE', 'FRIENDS')

df_user_valid_final.show(10)

+--------------------+-----------+-----------------+------------------+----------------+-----------------+---------------+---------------+---------------+--------------+----------------+---------------+---------------+------+-----+----+----+------------+-------------+-------------------+
|             USER_ID|       NAME|COMPLIMENT_WRITER|COMPLIMENT_PROFILE|COMPLIMENT_PLAIN|COMPLIMENT_PHOTOS|COMPLIMENT_NOTE|COMPLIMENT_MORE|COMPLIMENT_LIST|COMPLIMENT_HOT|COMPLIMENT_FUNNY|COMPLIMENT_CUTE|COMPLIMENT_COOL|USEFUL|FUNNY|FANS|COOL|REVIEW_COUNT|AVERAGE_STARS|      YELPING_SINCE|
+--------------------+-----------+-----------------+------------------+----------------+-----------------+---------------+---------------+---------------+--------------+----------------+---------------+---------------+------+-----+----+----+------------+-------------+-------------------+
|-4Anvj46CWf57KWI9...|     Cookie|                0|                 0|               0|                0|              0|           

In [419]:
# Write the flat user rows to yelp_dataset.users.
write_to_bq(df_user_valid_final, bucket_name, 'yelp_dataset', 'users')

In [420]:
# Write the (user, friend) pairs to yelp_dataset.user_friends.
write_to_bq(df_user_friends, bucket_name, 'yelp_dataset', 'user_friends')

In [421]:
# Write the (user, elite value) pairs to yelp_dataset.user_elites.
write_to_bq(df_user_elites, bucket_name, 'yelp_dataset', 'user_elites')