##Choosing data source

In [22]:
data_source = "yelp_reviews" #@param ["google_reviews", "yelp_reviews"]

##Installing Pyspark and creating the spark session

In [23]:

!pip install pyspark
from pyspark.sql import SparkSession

spark = SparkSession.builder\
        .master("local")\
        .appName("Colab")\
        .config('spark.ui.port', '4050')\
        .config("dfs.client.read.shortcircuit.skip.checksum", "true")\
        .getOrCreate()
spark



Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [24]:
print (data_source)


yelp_reviews


##Importing google drive

In [25]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

!ls /content/drive/MyDrive/ColabNotebooks/data

Mounted at /content/drive
google_reviews	health_grade  yelp_reviews


**Importing Libraries**

In [26]:
from pyspark.sql.functions import col, desc,sum,count
from pyspark.sql import functions as F
from pyspark.sql.window import Window
import datetime 

**Reading Raw Data**

In [27]:
input_csv_path = f"/content/drive/MyDrive/ColabNotebooks/data/{data_source}/raw_data/reviews.csv"

review_df = spark.read.csv(input_csv_path,header=True,inferSchema=True,)

review_history = f'/content/drive/MyDrive/ColabNotebooks/data/{data_source}/raw_data_history/reviews-' + str(datetime.datetime.now())+'.csv'
review_df.write.csv(review_history,header=True)

review_df = review_df.withColumn("review_date", F.to_date(F.col("review_date").cast("string"), 'yyyyMMdd'))
review_df.show()


+--------------------+-----------+-------------+---------------+-------+------+-------------+--------+---------+
|           review_id|review_date|restaurant_id|restaurant_name| county|rating|reviewer_name|comments|operation|
+--------------------+-----------+-------------+---------------+-------+------+-------------+--------+---------+
|9dc7a06a-d002-11e...| 2022-01-01|        67432|   Little Italy| Bergen|     4|         ****|   *****|      add|
|9dc7a394-d002-11e...| 2022-01-01|        67432|   Little Italy| Bergen|     3|         ****|   *****|      add|
|9dc7a4ca-d002-11e...| 2022-01-01|        58976| Malibu Kitchen|Passaic|     3|         ****|   *****|      add|
|9dc7a5b0-d002-11e...| 2022-01-01|        54781|     Greek Love| Bergen|     5|         ****|   *****|      add|
|9dc7a696-d002-11e...| 2022-01-01|        56849|  Namaste India| Bergen|     3|         ****|   *****|      add|
|9dc7a772-d002-11e...| 2022-01-01|        67432|   Little Italy| Bergen|     3|         ****|   

In [28]:
input_parquet_path = f"/content/drive/MyDrive/ColabNotebooks/data/{data_source}/input_parquet"

review_df.write.partitionBy('review_date').mode('overwrite').parquet(input_parquet_path)

In [29]:
reviewDF = spark.read.parquet(input_parquet_path) 


In [30]:

reviewDF.show(5)
reviewDF.count()


+--------------------+-------------+----------------+-------+------+-------------+--------+---------+-----------+
|           review_id|restaurant_id| restaurant_name| county|rating|reviewer_name|comments|operation|review_date|
+--------------------+-------------+----------------+-------+------+-------------+--------+---------+-----------+
|9dcd47c2-d002-11e...|        64892| Spanish Delight| Bergen|     2|         ****|   *****|      add| 2022-06-04|
|9dcd488a-d002-11e...|        56849|   Namaste India| Bergen|     3|         ****|   *****|      add| 2022-06-04|
|9dcd4948-d002-11e...|        67432|    Little Italy| Bergen|     4|         ****|   *****|      add| 2022-06-04|
|9dcd4a06-d002-11e...|        52314|Carribean Dreams|Passaic|     3|         ****|   *****|      add| 2022-06-04|
|9dcd4aba-d002-11e...|        58976|  Malibu Kitchen|Passaic|     2|         ****|   *****|      add| 2022-06-04|
+--------------------+-------------+----------------+-------+------+-------------+------

2836

In [31]:
from pyspark.sql.types import StringType

reviewDF.createOrReplaceTempView("reviews")
review_df2 = spark.sql("select review_date, restaurant_id, restaurant_name, county, sum(rating) rating_sum, count(rating) rating_count " \
                "from reviews group by restaurant_id, restaurant_name, county, review_date order by review_date")
review_df2 = review_df2.withColumn("restaurant_id",col("restaurant_id").cast(StringType())) 



In [32]:
review_df2.show()


+-----------+-------------+----------------+-------+----------+------------+
|review_date|restaurant_id| restaurant_name| county|rating_sum|rating_count|
+-----------+-------------+----------------+-------+----------+------------+
| 2022-01-01|        67432|    Little Italy| Bergen|        20|           6|
| 2022-01-01|        54781|      Greek Love| Bergen|         5|           1|
| 2022-01-01|        56849|   Namaste India| Bergen|         9|           3|
| 2022-01-01|        58976|  Malibu Kitchen|Passaic|         5|           2|
| 2022-01-02|        56849|   Namaste India| Bergen|         9|           4|
| 2022-01-02|        58976|  Malibu Kitchen|Passaic|         7|           3|
| 2022-01-02|        52314|Carribean Dreams|Passaic|         4|           1|
| 2022-01-02|        67432|    Little Italy| Bergen|         8|           2|
| 2022-01-02|        64892| Spanish Delight| Bergen|         2|           1|
| 2022-01-03|        56849|   Namaste India| Bergen|        11|           5|

In [33]:
add_dates = review_df2. \
    groupBy('restaurant_id','restaurant_name', 'restaurant_name', 'county'). \
    agg(F.min('review_date').alias('min_dt'), 
        F.max('review_date').alias('max_dt')
        ). \
    withColumn('dt_arr', F.expr('sequence(min_dt, max_dt, interval 1 day)')). \
    withColumn('exploded_date', F.explode('dt_arr')). \
    select('restaurant_id', 'restaurant_name', 'county', F.col('exploded_date').alias('review_date'))

add_dates.show()

+-------------+---------------+-------+-----------+
|restaurant_id|restaurant_name| county|review_date|
+-------------+---------------+-------+-----------+
|        58976| Malibu Kitchen|Passaic| 2022-01-01|
|        58976| Malibu Kitchen|Passaic| 2022-01-02|
|        58976| Malibu Kitchen|Passaic| 2022-01-03|
|        58976| Malibu Kitchen|Passaic| 2022-01-04|
|        58976| Malibu Kitchen|Passaic| 2022-01-05|
|        58976| Malibu Kitchen|Passaic| 2022-01-06|
|        58976| Malibu Kitchen|Passaic| 2022-01-07|
|        58976| Malibu Kitchen|Passaic| 2022-01-08|
|        58976| Malibu Kitchen|Passaic| 2022-01-09|
|        58976| Malibu Kitchen|Passaic| 2022-01-10|
|        58976| Malibu Kitchen|Passaic| 2022-01-11|
|        58976| Malibu Kitchen|Passaic| 2022-01-12|
|        58976| Malibu Kitchen|Passaic| 2022-01-13|
|        58976| Malibu Kitchen|Passaic| 2022-01-14|
|        58976| Malibu Kitchen|Passaic| 2022-01-15|
|        58976| Malibu Kitchen|Passaic| 2022-01-16|
|        589

In [34]:
review_df_all_dates = add_dates. \
    join(review_df2, ['restaurant_id', 'review_date', 'restaurant_name', 'county'], 'left'). \
    fillna(0, subset=['rating_sum','rating_count'])

review_df_all_dates.orderBy('restaurant_id', 'review_date').show()

+-------------+-----------+----------------+-------+----------+------------+
|restaurant_id|review_date| restaurant_name| county|rating_sum|rating_count|
+-------------+-----------+----------------+-------+----------+------------+
|        52314| 2022-01-02|Carribean Dreams|Passaic|         4|           1|
|        52314| 2022-01-03|Carribean Dreams|Passaic|         0|           0|
|        52314| 2022-01-04|Carribean Dreams|Passaic|         0|           0|
|        52314| 2022-01-05|Carribean Dreams|Passaic|         3|           1|
|        52314| 2022-01-06|Carribean Dreams|Passaic|         3|           1|
|        52314| 2022-01-07|Carribean Dreams|Passaic|         4|           1|
|        52314| 2022-01-08|Carribean Dreams|Passaic|         3|           1|
|        52314| 2022-01-09|Carribean Dreams|Passaic|         0|           0|
|        52314| 2022-01-10|Carribean Dreams|Passaic|         0|           0|
|        52314| 2022-01-11|Carribean Dreams|Passaic|         0|           0|

In [35]:
review_df_all_dates.count()

1795

In [36]:
rating_sum_path = f"/content/drive/MyDrive/ColabNotebooks/data/{data_source}/output_parquet/rating_sum"

review_df_all_dates.write.partitionBy('review_date').mode('overwrite').parquet(rating_sum_path)

In [37]:
review_df_final = spark.read.parquet(rating_sum_path)

In [38]:
w = Window().partitionBy(['restaurant_id']).orderBy('review_date').rowsBetween(-6,0)
review_df_final = review_df_final.withColumn('rating_sum_rolling', F.sum("rating_sum").over(w)) \
       .withColumn('rating_count_rolling', F.sum("rating_count").over(w))
review_df_final = review_df_final.withColumn('final_rating', review_df_final['rating_sum_rolling']/review_df_final['rating_count_rolling'])


In [39]:
review_df_final = review_df_final.drop("rating_sum_rolling","rating_count_rolling")

In [40]:
review_df_final.show()

+-------------+----------------+-------+----------+------------+-----------+------------------+
|restaurant_id| restaurant_name| county|rating_sum|rating_count|review_date|      final_rating|
+-------------+----------------+-------+----------+------------+-----------+------------------+
|        52314|Carribean Dreams|Passaic|         4|           1| 2022-01-02|               4.0|
|        52314|Carribean Dreams|Passaic|         0|           0| 2022-01-03|               4.0|
|        52314|Carribean Dreams|Passaic|         0|           0| 2022-01-04|               4.0|
|        52314|Carribean Dreams|Passaic|         3|           1| 2022-01-05|               3.5|
|        52314|Carribean Dreams|Passaic|         3|           1| 2022-01-06|3.3333333333333335|
|        52314|Carribean Dreams|Passaic|         4|           1| 2022-01-07|               3.5|
|        52314|Carribean Dreams|Passaic|         3|           1| 2022-01-08|               3.4|
|        52314|Carribean Dreams|Passaic|

In [41]:
final_rating_path = f"/content/drive/MyDrive/ColabNotebooks/data/{data_source}/output_parquet/final_rating"

review_df_final.write.partitionBy('restaurant_id').mode('overwrite').parquet(final_rating_path)


In [42]:
new_df = spark.read.parquet(final_rating_path)
new_df.count()

1795

In [43]:
new_df.summary()

DataFrame[summary: string, restaurant_name: string, county: string, rating_sum: string, rating_count: string, final_rating: string, restaurant_id: string]

In [44]:
new_df.printSchema()

root
 |-- restaurant_name: string (nullable = true)
 |-- county: string (nullable = true)
 |-- rating_sum: long (nullable = true)
 |-- rating_count: long (nullable = true)
 |-- review_date: date (nullable = true)
 |-- final_rating: double (nullable = true)
 |-- restaurant_id: integer (nullable = true)

