**Install Pyspark and create spark session**

In [21]:
!pip install pyspark
from pyspark.sql import SparkSession

spark = SparkSession.builder\
        .master("local")\
        .appName("Colab")\
        .config('spark.ui.port', '4050')\
        .config("dfs.client.read.shortcircuit.skip.checksum", "true")\
        .getOrCreate()
spark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


**Mount google drive**

In [22]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

!ls /content/drive/MyDrive/ColabNotebooks/data

Mounted at /content/drive
google_reviews	health_grade  yelp_reviews


**Importing Libraries**

In [23]:
from pyspark.sql.functions import col, desc,sum,count
from pyspark.sql import functions as F
from pyspark.sql.window import Window
import datetime 
import sys

**Read health_grade data**

In [24]:
health_df = spark.read.csv("/content/drive/MyDrive/ColabNotebooks/data/health_grade/raw_data/health_grade.csv",header=True,inferSchema=True,)

health_history = "/content/drive/MyDrive/ColabNotebooks/data/health_grade/raw_data_history/health_grade-" + str(datetime.datetime.now())+'.csv'
health_df.write.csv(health_history)

health_df = health_df.withColumn("grade_date", F.to_date(F.col("grade_date").cast("string"), 'yyyyMMdd'))
health_df.show()

+----------+-------------+----------------+---------+-------+------------+
|grade_date|restaurant_id| restaurant_name|  cuisine| county|health_grade|
+----------+-------------+----------------+---------+-------+------------+
|2022-11-01|        56849|   Namaste India|   Indian| Bergen|           3|
|2022-11-01|        58976|  Malibu Kitchen| Hawaiian|Passaic|           1|
|2022-11-01|        52314|Carribean Dreams|Carribean|Passaic|           2|
|2022-11-01|        64892| Spanish Delight|  Spanish| Bergen|           3|
|2022-11-01|        54781|      Greek Love|    Greek| Bergen|           3|
|2022-11-01|        67432|    Little Italy|  Italian| Bergen|           4|
|2022-12-01|        56849|   Namaste India|   Indian| Bergen|           3|
|2022-12-01|        58976|  Malibu Kitchen| Hawaiian|Passaic|           2|
|2022-12-01|        52314|Carribean Dreams|Carribean|Passaic|           2|
|2022-12-01|        64892| Spanish Delight|  Spanish| Bergen|           4|
|2022-12-01|        54781

In [25]:
health_df.printSchema()

root
 |-- grade_date: date (nullable = true)
 |-- restaurant_id: integer (nullable = true)
 |-- restaurant_name: string (nullable = true)
 |-- cuisine: string (nullable = true)
 |-- county: string (nullable = true)
 |-- health_grade: integer (nullable = true)



In [26]:
#Store daily data in parquet overwrite mode
health_df.write.partitionBy('grade_date').mode('overwrite').parquet("/content/drive/MyDrive/ColabNotebooks/data/health_grade/input_parquet")


In [27]:
healthDF = spark.read.parquet("/content/drive/MyDrive/ColabNotebooks/data/health_grade/input_parquet")

In [28]:
healthDF.show()
healthDF.count()

+-------------+----------------+---------+-------+------------+----------+
|restaurant_id| restaurant_name|  cuisine| county|health_grade|grade_date|
+-------------+----------------+---------+-------+------------+----------+
|        56849|   Namaste India|   Indian| Bergen|           3|2023-02-01|
|        58976|  Malibu Kitchen| Hawaiian|Passaic|           1|2023-02-01|
|        52314|Carribean Dreams|Carribean|Passaic|           2|2023-02-01|
|        64892| Spanish Delight|  Spanish| Bergen|           3|2023-02-01|
|        54781|      Greek Love|    Greek| Bergen|           3|2023-02-01|
|        67432|    Little Italy|  Italian| Bergen|           4|2023-02-01|
|        56849|   Namaste India|   Indian| Bergen|           3|2022-11-01|
|        58976|  Malibu Kitchen| Hawaiian|Passaic|           1|2022-11-01|
|        52314|Carribean Dreams|Carribean|Passaic|           2|2022-11-01|
|        64892| Spanish Delight|  Spanish| Bergen|           3|2022-11-01|
|        54781|      Gree

42

In [29]:
from pyspark.sql.types import StringType

healthDF.createOrReplaceTempView("health")
health_df2 = spark.sql("select grade_date, restaurant_id, restaurant_name, cuisine, health_grade " \
                "from health ")
health_df2 = health_df2.withColumn("restaurant_id",col("restaurant_id").cast(StringType())) 


In [30]:
health_df2.printSchema()

root
 |-- grade_date: date (nullable = true)
 |-- restaurant_id: string (nullable = true)
 |-- restaurant_name: string (nullable = true)
 |-- cuisine: string (nullable = true)
 |-- health_grade: integer (nullable = true)



In [31]:
health_df2.show()
health_df2.summary()


+----------+-------------+----------------+---------+------------+
|grade_date|restaurant_id| restaurant_name|  cuisine|health_grade|
+----------+-------------+----------------+---------+------------+
|2023-02-01|        56849|   Namaste India|   Indian|           3|
|2023-02-01|        58976|  Malibu Kitchen| Hawaiian|           1|
|2023-02-01|        52314|Carribean Dreams|Carribean|           2|
|2023-02-01|        64892| Spanish Delight|  Spanish|           3|
|2023-02-01|        54781|      Greek Love|    Greek|           3|
|2023-02-01|        67432|    Little Italy|  Italian|           4|
|2022-11-01|        56849|   Namaste India|   Indian|           3|
|2022-11-01|        58976|  Malibu Kitchen| Hawaiian|           1|
|2022-11-01|        52314|Carribean Dreams|Carribean|           2|
|2022-11-01|        64892| Spanish Delight|  Spanish|           3|
|2022-11-01|        54781|      Greek Love|    Greek|           3|
|2022-11-01|        67432|    Little Italy|  Italian|         

DataFrame[summary: string, restaurant_id: string, restaurant_name: string, cuisine: string, health_grade: string]

In [32]:
health_df2.count()

42

In [33]:
add_dates = health_df2. \
    groupBy('restaurant_id','restaurant_name', 'restaurant_name', 'cuisine'). \
    agg(F.min('grade_date').alias('min_dt'), 
        F.max('grade_date').alias('max_dt')
        ). \
    withColumn('dt_arr', F.expr('sequence(min_dt, max_dt, interval 1 day)')). \
    withColumn('exploded_date', F.explode('dt_arr')). \
    select('restaurant_id', 'restaurant_name', 'cuisine', F.col('exploded_date').alias('grade_date'))

add_dates.show()

+-------------+---------------+--------+----------+
|restaurant_id|restaurant_name| cuisine|grade_date|
+-------------+---------------+--------+----------+
|        58976| Malibu Kitchen|Hawaiian|2022-11-01|
|        58976| Malibu Kitchen|Hawaiian|2022-11-02|
|        58976| Malibu Kitchen|Hawaiian|2022-11-03|
|        58976| Malibu Kitchen|Hawaiian|2022-11-04|
|        58976| Malibu Kitchen|Hawaiian|2022-11-05|
|        58976| Malibu Kitchen|Hawaiian|2022-11-06|
|        58976| Malibu Kitchen|Hawaiian|2022-11-07|
|        58976| Malibu Kitchen|Hawaiian|2022-11-08|
|        58976| Malibu Kitchen|Hawaiian|2022-11-09|
|        58976| Malibu Kitchen|Hawaiian|2022-11-10|
|        58976| Malibu Kitchen|Hawaiian|2022-11-11|
|        58976| Malibu Kitchen|Hawaiian|2022-11-12|
|        58976| Malibu Kitchen|Hawaiian|2022-11-13|
|        58976| Malibu Kitchen|Hawaiian|2022-11-14|
|        58976| Malibu Kitchen|Hawaiian|2022-11-15|
|        58976| Malibu Kitchen|Hawaiian|2022-11-16|
|        589

In [34]:
# add_dates.summary()

In [35]:
health_df_all_dates = add_dates. \
    join(health_df2, ['restaurant_id', 'grade_date', 'restaurant_name', 'cuisine'], 'left'). \
    withColumn("health_grade", F.last('health_grade',True).over(Window.partitionBy('restaurant_id').orderBy('grade_date').rowsBetween(-sys.maxsize, 0))).\
    select( 'restaurant_id', 'grade_date', 'restaurant_name', 'cuisine', 'health_grade' )


In [36]:
health_df_all_dates.write.mode('append').format('parquet').save("/content/drive/MyDrive/ColabNotebooks/data/health_grade/output_parquet")


In [37]:
read_health_data = spark.read.parquet("/content/drive/MyDrive/ColabNotebooks/data/health_grade/output_parquet")
read_health_data.summary()

DataFrame[summary: string, restaurant_id: string, restaurant_name: string, cuisine: string, health_grade: string]

In [38]:
read_health_data.show()

+-------------+----------+----------------+---------+------------+
|restaurant_id|grade_date| restaurant_name|  cuisine|health_grade|
+-------------+----------+----------------+---------+------------+
|        52314|2022-01-01|Carribean Dreams|Carribean|           2|
|        52314|2022-01-02|Carribean Dreams|Carribean|           2|
|        52314|2022-01-03|Carribean Dreams|Carribean|           2|
|        52314|2022-01-04|Carribean Dreams|Carribean|           2|
|        52314|2022-01-05|Carribean Dreams|Carribean|           2|
|        52314|2022-01-06|Carribean Dreams|Carribean|           2|
|        52314|2022-01-07|Carribean Dreams|Carribean|           2|
|        52314|2022-01-08|Carribean Dreams|Carribean|           2|
|        52314|2022-01-09|Carribean Dreams|Carribean|           2|
|        52314|2022-01-10|Carribean Dreams|Carribean|           2|
|        52314|2022-01-11|Carribean Dreams|Carribean|           2|
|        52314|2022-01-12|Carribean Dreams|Carribean|         

In [39]:
read_health_data.count()

2736

In [40]:
read_health_data.printSchema()

root
 |-- restaurant_id: string (nullable = true)
 |-- grade_date: date (nullable = true)
 |-- restaurant_name: string (nullable = true)
 |-- cuisine: string (nullable = true)
 |-- health_grade: integer (nullable = true)

