In [128]:
import ast 
import json
from pyspark.sql import SparkSession, functions, types
from google.cloud import storage

In [7]:
# GCS bucket that holds the Yelp JSON files, plus its gs:// URI form.
bucket_name = 'cmpt732-project-bucket'
bucket_path = f'gs://{bucket_name}'

In [8]:
# Show every object in the bucket whose name starts with 'yelp_academic'.
gcs_client = storage.Client()
bucket = gcs_client.bucket(bucket_name)

yelp_blobs = bucket.list_blobs(prefix='yelp_academic')
for obj in yelp_blobs:
    print(obj)

<Blob: cmpt732-project-bucket, yelp_academic_ca_provinces.json, 1608694499755886>
<Blob: cmpt732-project-bucket, yelp_academic_dataset_business.json, 1608694499981573>
<Blob: cmpt732-project-bucket, yelp_academic_dataset_checkin.json, 1608694500209921>
<Blob: cmpt732-project-bucket, yelp_academic_dataset_review.json, 1608694500560158>
<Blob: cmpt732-project-bucket, yelp_academic_dataset_tip.json, 1608694500824867>
<Blob: cmpt732-project-bucket, yelp_academic_dataset_user.json, 1608694501169468>


In [10]:
!hdfs dfs -ls 'gs://cmpt732-project-bucket/*.json' 

-rwx------   3 root root        535 2020-12-23 03:34 gs://cmpt732-project-bucket/yelp_academic_ca_provinces.json
-rwx------   3 root root  152898689 2020-12-23 03:34 gs://cmpt732-project-bucket/yelp_academic_dataset_business.json
-rwx------   3 root root  449663480 2020-12-23 03:35 gs://cmpt732-project-bucket/yelp_academic_dataset_checkin.json
-rwx------   3 root root 6325565224 2020-12-23 03:35 gs://cmpt732-project-bucket/yelp_academic_dataset_review.json
-rwx------   3 root root  263489322 2020-12-23 03:35 gs://cmpt732-project-bucket/yelp_academic_dataset_tip.json
-rwx------   3 root root 3268069927 2020-12-23 03:35 gs://cmpt732-project-bucket/yelp_academic_dataset_user.json


In [11]:
!scala -version

Scala code runner version 2.12.10 -- Copyright 2002-2019, LAMP/EPFL and Lightbend, Inc.


In [12]:
# Create (or reuse) the Spark session; the BigQuery connector jar is requested
# through spark.jars.packages.
spark = (
    SparkSession.builder
    .appName('YelpAnalysis')
    .config(
        'spark.jars.packages',
        'com.google.cloud.spark:spark-bigquery-with-dependencies_2.12:0.15.1-beta',
    )
    .getOrCreate()
)

In [98]:
# Logical dataset name -> object name inside the bucket.
dataset_files = dict(
    provinces='yelp_academic_ca_provinces.json',
    businesses='yelp_academic_dataset_business.json',
    checkins='yelp_academic_dataset_checkin.json',
    reviews='yelp_academic_dataset_review.json',
    users='yelp_academic_dataset_user.json',
)

## Process yelp_academic_dataset_business.json file

In [25]:
# Load the province lookup file from the bucket and expose it to Spark SQL.
df_provinces = spark.read.json('/'.join((bucket_path, dataset_files['provinces'])))
df_provinces.createOrReplaceTempView('VW_Province')

In [44]:
#spark.sql('SELECT * FROM VW_Province').show(truncate=False)

In [47]:
# Load the raw business file from the bucket and expose it to Spark SQL.
df_business = spark.read.json('/'.join((bucket_path, dataset_files['businesses'])))
df_business.createOrReplaceTempView("VW_Business")

In [43]:
#df_business.printSchema()

In [38]:
#spark.sql('''SELECT STATE, COUNT(*) AS STATE_COUNT FROM VW_Business GROUP BY STATE ORDER BY 2 DESC''').show(10)

In [48]:
# Businesses whose STATE matches a CODE in VW_Province (inner join) and that
# have IS_OPEN = '1', a non-null 7-character POSTAL_CODE, and non-null
# coordinates, STARS, REVIEW_COUNT, CATEGORIES and ATTRIBUTES.
# NOTE(review): LENGTH(POSTAL_CODE) = 7 presumably targets the "A1A 1A1"
# postal-code layout -- confirm.
# The result is cached because later cells read VW_BUSINESS_CA / df_business_CA
# more than once.
df_business_CA = spark.sql('''
                            SELECT
                                BUSINESS_ID, NAME, STATE, CITY, POSTAL_CODE, LATITUDE, LONGITUDE, CATEGORIES, ATTRIBUTES
                            FROM
                                VW_Business VB 
                                    INNER JOIN VW_Province VP ON VB.STATE = VP.CODE
                            WHERE
                                    IS_OPEN = '1'
                                AND POSTAL_CODE IS NOT NULL
                                AND LENGTH(POSTAL_CODE) = 7
                                AND LATITUDE IS NOT NULL
                                AND LONGITUDE IS NOT NULL
                                AND STARS IS NOT NULL
                                AND REVIEW_COUNT IS NOT NULL
                                AND CATEGORIES IS NOT NULL
                                AND ATTRIBUTES IS NOT NULL
                            ''').cache()

df_business_CA.createOrReplaceTempView('VW_BUSINESS_CA')

In [42]:
#spark.sql('SELECT * FROM VW_BUSINESS_CA').show(5)

In [37]:
#spark.sql('''SELECT STATE, COUNT(*) AS STATE_COUNT FROM VW_BUSINESS_CA GROUP BY STATE ORDER BY 2 DESC''').show(10)

In [36]:
#spark.sql('''SELECT STATE, CITY, COUNT(*) AS CITY_COUNT FROM VW_BUSINESS_CA GROUP BY STATE, CITY ORDER BY 3 DESC''').show(10)

In [49]:
# One row per (business, category): CATEGORIES is split on ',' and exploded in
# the inner query, then each piece is trimmed in the outer query.
df_business_CA_cat = spark.sql('''
                                SELECT 
                                    BUSINESS_ID,
                                    TRIM(CATEGORY) as CATEGORY
                                FROM
                                    (
                                    SELECT 
                                        BUSINESS_ID, 
                                        EXPLODE(SPLIT(CATEGORIES, ',')) as CATEGORY
                                    FROM
                                        VW_BUSINESS_CA)
                                ''')

df_business_CA_cat.createOrReplaceTempView('VW_BUSINESS_CA_CAT')

In [52]:
#spark.sql('''SELECT * FROM VW_BUSINESS_CA_CAT''').show(10, False)

In [54]:
#spark.sql('''SELECT CATEGORY, COUNT(*) AS CATEGORY_COUNT FROM VW_BUSINESS_CA_CAT GROUP BY CATEGORY ORDER BY 2 DESC''').show(10)

In [55]:
def get_attribute_keys(struct_name):
    """Return the names of the fields nested under a struct column.

    The schema of ``df_business_CA`` is serialised to JSON and searched for
    top-level fields called ``struct_name``; the names of their nested fields
    are returned in schema order. An empty list is returned when no top-level
    field has that name.
    """
    schema_fields = json.loads(df_business_CA.schema.json())['fields']

    return [
        nested['name']
        for top_level in schema_fields
        if top_level['name'] == struct_name
        for nested in top_level['type']['fields']
    ]

attributes = get_attribute_keys('ATTRIBUTES')

In [57]:
#print(attributes)

In [59]:
def _decode_attribute_value(raw):
    """Decode a value stored as Python-literal text.

    Strings such as "u'free'" or "{'garage': False}" are parsed with
    ``ast.literal_eval``. Only ``str`` and ``dict`` results are used; any other
    input (including text that does not parse) is returned unchanged, so values
    like "True", "None" or "2" keep their original text.
    """
    if not isinstance(raw, str):
        return raw
    try:
        parsed = ast.literal_eval(raw.strip())
    except (ValueError, SyntaxError, TypeError):
        return raw
    return parsed if isinstance(parsed, (dict, str)) else raw


def _legacy_split_pairs(text):
    """Best-effort 'key: value' extraction for dict-like text that does not parse."""
    for chunk in text.split(','):
        parts = chunk.split(':')
        if len(parts) == 2:
            key = parts[0].replace('{', '').replace('\'', '').strip()
            val = parts[1].replace('}', '').replace('\'', '').strip()
            yield key, val


@functions.udf(returnType=types.StringType())
def flatten_attributes(col_data):
    """Flatten the attributes struct into one 'key: value, key: value' string.

    For every name in the module-level ``attributes`` list:
      * a dict-like value is expanded to '<attribute>_<sub_key>: <sub_value>' entries;
      * any other value becomes a single '<attribute>: <value>' entry
        (missing values are rendered as 'None').

    Values are decoded with ``ast.literal_eval`` so that quoted or ``u''``-prefixed
    literals lose their quoting instead of leaking a stray 'u' or '"' into the
    output. Returns None when the struct itself is NULL.

    NOTE(review): entries are later split on ',' and ':'; a value containing
    either character would still be split incorrectly downstream.
    """
    if col_data is None:
        return None

    output = {}

    for attribute in attributes:
        value = _decode_attribute_value(col_data[attribute])

        if isinstance(value, dict):
            for sub_attr_key, sub_attr_val in value.items():
                output[f'{attribute}_{sub_attr_key}'] = sub_attr_val
        elif isinstance(value, str) and value.startswith('{'):
            # Dict-looking text that is not a valid literal: keep the old parsing.
            for sub_attr_key, sub_attr_val in _legacy_split_pairs(value):
                output[attribute + "_" + sub_attr_key] = sub_attr_val
        else:
            output[attribute] = value

    # Same 'key: value, key: value' layout that str(dict) with braces and quotes
    # stripped used to produce.
    return ', '.join(f'{key}: {val}' for key, val in output.items())

spark.udf.register("FLATTEN", flatten_attributes);

In [60]:
# One row per (business, attribute entry): the string returned by
# FLATTEN(ATTRIBUTES) is split on ',' and exploded; each entry is then split on
# ':' into ATTR_KEY (trimmed before the split, single quotes removed) and
# ATTR_VAL (trimmed).
df_business_CA_attr = spark.sql('''
                                SELECT
                                    BUSINESS_ID,
                                    REPLACE(SPLIT(TRIM(ATTRIBUTE), ':')[0], "'", "") AS ATTR_KEY,
                                    TRIM(SPLIT(ATTRIBUTE, ':')[1]) AS ATTR_VAL
                                FROM
                                (
                                    SELECT
                                        BUSINESS_ID,
                                        EXPLODE(SPLIT(FLATTEN(ATTRIBUTES), ',')) AS ATTRIBUTE
                                    FROM 
                                        VW_BUSINESS_CA
                                )''')

df_business_CA_attr.createOrReplaceTempView('VW_BUSINESS_CA_ATTR')

In [63]:
#spark.sql("SELECT * FROM VW_BUSINESS_CA_ATTR").show(10, False)

In [66]:
#spark.sql('''SELECT ATTR_VAL, COUNT(*) AS BUSINESS_COUNT FROM VW_BUSINESS_CA_ATTR WHERE ATTR_KEY = "HasTV" GROUP BY ATTR_VAL ''').show()

In [82]:
# Business table without the CATEGORIES and ATTRIBUTES columns; the view is
# used by later cells to restrict check-ins and reviews to these businesses.
df_business_CA_final = df_business_CA.drop('CATEGORIES', 'ATTRIBUTES')
df_business_CA_final.createOrReplaceTempView('VW_BUSINESS_CA_FINAL')

In [84]:
#df_business_CA_final.show(10)

## Process yelp_academic_dataset_checkin.json file

In [85]:
# Load the raw check-in file from the bucket and expose it to Spark SQL.
df_checkin = spark.read.json('/'.join((bucket_path, dataset_files['checkins'])))
df_checkin.createOrReplaceTempView("VW_Checkin")

In [74]:
#df_checkin.printSchema()

In [78]:
#df_checkin.show(5)

In [86]:
# Check-in rows whose BUSINESS_ID also appears in VW_BUSINESS_CA_FINAL
# (inner join); only the check-in columns are kept.
df_checkin_valid = spark.sql('''SELECT 
                                    VW_CHECKIN.*
                                  FROM VW_CHECKIN 
                                      INNER JOIN VW_BUSINESS_CA_FINAL ON VW_BUSINESS_CA_FINAL.BUSINESS_ID = VW_CHECKIN.BUSINESS_ID
                              ''')

df_checkin_valid.createOrReplaceTempView('VW_CHECKIN_VALID')

In [87]:
# One row per check-in: the DATE string is split on ',' and exploded in the
# inner query; the outer query trims each piece, parses it with the
# 'yyyy-MM-dd HH:mm:ss' pattern via UNIX_TIMESTAMP and converts it back with
# FROM_UNIXTIME. Null and empty pieces are filtered out by the WHERE clause.
df_checkin_exp = spark.sql('''
                            SELECT 
                                BUSINESS_ID,
                                FROM_UNIXTIME(UNIX_TIMESTAMP(TRIM(DATE), 'yyyy-MM-dd HH:mm:ss')) AS DATE
                            FROM (
                                SELECT
                                    BUSINESS_ID,
                                    EXPLODE(SPLIT(DATE, ',')) AS DATE
                                FROM 
                                    VW_CHECKIN_VALID)
                            WHERE
                                DATE IS NOT NULL
                                AND DATE != ''
                            ''')

In [89]:
#df_checkin_exp.show(10)

## Process yelp_academic_dataset_review.json file

In [91]:
# Load the raw review file from the bucket and expose it to Spark SQL.
df_review = spark.read.json('/'.join((bucket_path, dataset_files['reviews'])))
df_review.createOrReplaceTempView("VW_Review")

In [93]:
#df_review.printSchema()

In [95]:
# Reviews whose BUSINESS_ID appears in VW_BUSINESS_CA_FINAL (inner join).
# Rows with a null TEXT, USEFUL, COOL or FUNNY, or with STARS null or outside
# 0..5, are dropped; DATE is converted with TO_DATE.
# Because this is a non-raw Python string, '\n' reaches Spark as an actual
# newline character, so REGEXP_REPLACE swaps newlines in TEXT for spaces.
df_review_valid = spark.sql('''SELECT 
                                    VW_REVIEW.BUSINESS_ID,
                                    USER_ID,
                                    REGEXP_REPLACE(TEXT, '\n', ' ') AS TEXT, 
                                    TO_DATE(DATE) AS DATE,
                                    STARS,
                                    COOL,
                                    FUNNY,
                                    USEFUL
                                  FROM VW_REVIEW 
                                      INNER JOIN VW_BUSINESS_CA_FINAL ON VW_BUSINESS_CA_FINAL.BUSINESS_ID = VW_REVIEW.BUSINESS_ID
                                  WHERE
                                          TEXT IS NOT NULL
                                      AND USEFUL IS NOT NULL
                                      AND COOL IS NOT NULL
                                      AND FUNNY IS NOT NULL
                                      AND STARS IS NOT NULL AND STARS >= 0 AND STARS <= 5
                              ''')

df_review_valid.createOrReplaceTempView('VW_REVIEW_VALID')

In [97]:
#df_review_valid.show(5)

## Process yelp_academic_dataset_user.json file

In [99]:
# Load the raw user file from the bucket and expose it to Spark SQL.
df_user = spark.read.json('/'.join((bucket_path, dataset_files['users'])))
df_user.createOrReplaceTempView("VW_User")

In [102]:
#df_user.printSchema()

In [103]:
#df_user.show(5)

In [104]:
# Users who wrote at least one review in VW_REVIEW_VALID.
# VW_REVIEW_VALID holds one row per review, so an INNER JOIN would emit each
# user once per review and write duplicated rows to the users table. A
# LEFT SEMI JOIN keeps each VW_USER row at most once and exposes only the
# VW_USER columns, so every name in the select list resolves to the user row.
# Cached because the friends, elites and final user frames all read it.
df_user_valid = spark.sql('''
                            SELECT
                                VW_USER.USER_ID,
                                NAME,
                                COMPLIMENT_WRITER,
                                COMPLIMENT_PROFILE,
                                COMPLIMENT_PLAIN,
                                COMPLIMENT_PHOTOS,
                                COMPLIMENT_NOTE,
                                COMPLIMENT_MORE,
                                COMPLIMENT_LIST,
                                COMPLIMENT_HOT,
                                COMPLIMENT_FUNNY,
                                COMPLIMENT_CUTE,
                                COMPLIMENT_COOL,
                                VW_USER.USEFUL,
                                VW_USER.FUNNY,
                                FRIENDS,
                                FANS,
                                ELITE,
                                VW_USER.COOL,
                                REVIEW_COUNT,
                                AVERAGE_STARS,
                                YELPING_SINCE
                            FROM VW_USER
                                LEFT SEMI JOIN VW_REVIEW_VALID ON VW_REVIEW_VALID.USER_ID = VW_USER.USER_ID
                        ''').cache()

df_user_valid.createOrReplaceTempView('VW_USER_VALID')

In [105]:
#df_user_valid.show(5)

In [106]:
# Distinct (user, friend) pairs: FRIENDS is split on ',' and exploded in the
# inner query; pieces that are null or equal to 'None' are filtered out and the
# remaining ids are trimmed.
df_user_friends = spark.sql('''
                            SELECT DISTINCT
                                USER_ID,
                                TRIM(FRIEND_USER_ID) AS FRIEND_USER_ID
                            FROM (SELECT 
                                    USER_ID,
                                    EXPLODE(SPLIT(FRIENDS, ',')) AS FRIEND_USER_ID
                                FROM 
                                    VW_USER_VALID)
                            WHERE
                                    FRIEND_USER_ID IS NOT NULL
                                AND FRIEND_USER_ID <> 'None'
                ''')

In [108]:
#df_user_friends.show(10)

In [109]:
# Distinct (user, elite entry) pairs: ELITE is split on ',' and exploded in the
# inner query; pieces that are null, 'None' or empty are filtered out and the
# remaining values are trimmed.
df_user_elites = spark.sql('''
                            SELECT DISTINCT
                                USER_ID,
                                TRIM(ELITE) AS ELITE
                            FROM (SELECT 
                                    USER_ID,
                                    EXPLODE(SPLIT(ELITE, ',')) AS ELITE
                                FROM 
                                    VW_USER_VALID)
                            WHERE
                                    ELITE IS NOT NULL
                                AND ELITE <> 'None'
                                AND ELITE <> ''
                ''')

In [111]:
#df_user_elites.show(10)

In [113]:
# User table without the ELITE and FRIENDS columns, which are exploded into
# df_user_elites and df_user_friends instead.
df_user_valid_final = df_user_valid.drop('ELITE', 'FRIENDS')

In [115]:
#df_user_valid_final.show(10)

In [116]:
def write_to_bq(df, temp_bucket_name, ds_name, tbl_name):
    """Write ``df`` to the table ``<ds_name>.<tbl_name>`` in overwrite mode.

    The DataFrame writer is configured with the 'bigquery' format, the target
    table name, and ``temp_bucket_name`` as the ``temporaryGcsBucket`` option
    before ``save()`` is called.
    """
    target_table = f'{ds_name}.{tbl_name}'
    writer = (
        df.write
        .format('bigquery')
        .option('table', target_table)
        .option("temporaryGcsBucket", temp_bucket_name)
        .mode('overwrite')
    )
    writer.save()

In [123]:
# Province lookup table.
write_to_bq(df=df_provinces, temp_bucket_name=bucket_name, ds_name='yelp_dataset', tbl_name='provinces')

In [124]:
# Business table plus its exploded category and attribute tables.
write_to_bq(df=df_business_CA_final, temp_bucket_name=bucket_name, ds_name='yelp_dataset', tbl_name='businesses')
write_to_bq(df=df_business_CA_cat, temp_bucket_name=bucket_name, ds_name='yelp_dataset', tbl_name='categories')
write_to_bq(df=df_business_CA_attr, temp_bucket_name=bucket_name, ds_name='yelp_dataset', tbl_name='attributes')

In [125]:
# Exploded check-in table.
write_to_bq(df=df_checkin_exp, temp_bucket_name=bucket_name, ds_name='yelp_dataset', tbl_name='checkins')

In [126]:
# Filtered review table.
write_to_bq(df=df_review_valid, temp_bucket_name=bucket_name, ds_name='yelp_dataset', tbl_name='reviews')

In [127]:
# User table plus its exploded friend and elite tables.
write_to_bq(df=df_user_valid_final, temp_bucket_name=bucket_name, ds_name='yelp_dataset', tbl_name='users')
write_to_bq(df=df_user_friends, temp_bucket_name=bucket_name, ds_name='yelp_dataset', tbl_name='user_friends')
write_to_bq(df=df_user_elites, temp_bucket_name=bucket_name, ds_name='yelp_dataset', tbl_name='user_elites')