## Session Setup

In [1]:
from pyspark.sql import SparkSession

# Obtain the Spark session used by every cell below (created on first run).
spark = (
    SparkSession.builder
    .appName("Spark aggregation functions")
    .getOrCreate()
)

### Loading Data

In [2]:
# Read the gzipped listings CSV. Fields are double-quoted, embedded quotes are
# escaped by doubling them, and quoted fields may span several lines, so the
# reader is configured for multi-line parsing. Column types are inferred and
# malformed records are tolerated (PERMISSIVE) rather than aborting the load.
listings = (
    spark.read
    .options(
        header=True,
        inferSchema=True,
        sep=",",
        quote='"',
        escape='"',
        multiLine=True,
        mode="PERMISSIVE",
    )
    .csv("../data/listings.csv.gz")
)

# Print the inferred column names and types.
listings.printSchema()

root
 |-- id: long (nullable = true)
 |-- listing_url: string (nullable = true)
 |-- scrape_id: long (nullable = true)
 |-- last_scraped: timestamp (nullable = true)
 |-- source: string (nullable = true)
 |-- name: string (nullable = true)
 |-- description: string (nullable = true)
 |-- neighborhood_overview: string (nullable = true)
 |-- picture_url: string (nullable = true)
 |-- host_id: integer (nullable = true)
 |-- host_url: string (nullable = true)
 |-- host_name: string (nullable = true)
 |-- host_since: timestamp (nullable = true)
 |-- host_location: string (nullable = true)
 |-- host_about: string (nullable = true)
 |-- host_response_time: string (nullable = true)
 |-- host_response_rate: string (nullable = true)
 |-- host_acceptance_rate: string (nullable = true)
 |-- host_is_superhost: string (nullable = true)
 |-- host_thumbnail_url: string (nullable = true)
 |-- host_picture_url: string (nullable = true)
 |-- host_neighbourhood: string (nullable = true)
 |-- host_listings_

In [3]:
# Read the gzipped reviews CSV. Fields are double-quoted, embedded quotes are
# escaped by doubling them, and quoted fields may span several lines, so the
# reader is configured for multi-line parsing. Column types are inferred and
# malformed records are tolerated (PERMISSIVE) rather than aborting the load.
reviews = (
    spark.read
    .options(
        header=True,
        inferSchema=True,
        sep=",",
        quote='"',
        escape='"',
        multiLine=True,
        mode="PERMISSIVE",
    )
    .csv("../data/reviews.csv.gz")
)

# Print the inferred column names and types.
reviews.printSchema()

root
 |-- listing_id: long (nullable = true)
 |-- id: long (nullable = true)
 |-- date: timestamp (nullable = true)
 |-- reviewer_id: integer (nullable = true)
 |-- reviewer_name: string (nullable = true)
 |-- comments: string (nullable = true)



In [4]:
# 1. Count the number of reviews per listing using the "reviews" dataset

# Build the aggregation first and display it in a separate statement:
# `.show()` only prints rows and returns None, so ending the assigned chain
# with it would leave `reviews_per_listing` bound to None instead of the
# DataFrame with one row per listing_id and its review count.
reviews_per_listing = reviews \
  .groupBy('listing_id') \
  .count()

reviews_per_listing.show(10)

+----------+-----+
|listing_id|count|
+----------+-----+
|     78606|    2|
|    444886|   12|
|    466017|   28|
|   2736493|    4|
|   2557853|   89|
|   3132302|    3|
|   3917692|    1|
|   3734796|    5|
|   3997029|    7|
|   4361078|   70|
+----------+-----+
only showing top 10 rows



In [5]:
# 2. Compute the total number of listings and average review score per host

from pyspark.sql.functions import avg, count

# Listings without a review score are filtered out before grouping, so
# `total_listings` counts only a host's rated listings and
# `average_review_score` is the mean rating over those same listings.
#
# The aggregated DataFrame is assigned first and displayed separately:
# `.show()` only prints rows and returns None, so ending the assigned chain
# with it would leave `host_stats` bound to None.
host_stats = listings \
  .filter(listings.review_scores_rating.isNotNull()) \
  .groupBy('host_id') \
  .agg(
    count('id').alias('total_listings'),
    avg('review_scores_rating').alias('average_review_score')
  )

host_stats.show(10)

+--------+--------------+--------------------+
| host_id|total_listings|average_review_score|
+--------+--------------+--------------------+
| 2358441|             1|                4.86|
| 2876123|             2|  4.9399999999999995|
| 2038199|             1|                 5.0|
| 4157822|             2|               4.925|
|  719504|             1|                4.96|
| 7950720|             1|                4.86|
| 6572018|             1|                 5.0|
|12122942|             1|                4.93|
|13851928|             1|                4.97|
|13739634|             2|                4.74|
+--------+--------------+--------------------+
only showing top 10 rows



In [6]:
# 3. Find the top ten listings with the highest number of reviews

# Count reviews per listing, rank by that count from highest to lowest,
# keep the first ten rows and print them.
(
    reviews
    .groupBy('listing_id')
    .count()
    .sort('count', ascending=False)
    .limit(10)
    .show()
)

+----------+-----+
|listing_id|count|
+----------+-----+
|  47408549| 1855|
|  30760930| 1682|
|  43120947| 1615|
|  19670926| 1436|
|  45006692| 1433|
|   1436172| 1195|
|   2126708| 1122|
|   1436177| 1005|
|  47438714|  978|
|   3855375|  973|
+----------+-----+

