In [None]:
from pyspark.sql import SparkSession

# Obtain the Spark session used by every cell below: getOrCreate() hands back
# an already-active session if there is one, otherwise it starts a new one
# under the application name "Data Aggregation".
spark = (
    SparkSession.builder
    .appName("Data Aggregation")
    .getOrCreate()
)

In [None]:
# Load the gzipped listings CSV into a DataFrame.
# The first row supplies column names and column types are inferred from the
# data. Fields are double-quoted with a doubled quote as the escape, and
# multiLine lets a quoted field span several physical lines.
listings = (
    spark.read
    .options(
        header=True,
        inferSchema=True,
        sep=",",
        quote='"',
        escape='"',
        multiLine=True,
        mode="PERMISSIVE",
    )
    .csv("data/listings.csv.gz")
)

In [None]:
# How many listing rows fall under each property type (unsorted, full-width
# cell values).
(
    listings
    .groupBy("property_type")
    .count()
    .show(truncate=False)
)

In [None]:
import pyspark.sql.functions as F

# Per-property-type tally expressed as an explicit aggregate so the result can
# be sorted, most frequent type first. F.count on a column counts its
# non-null values.
(
    listings
    .groupBy("property_type")
    .agg(F.count("property_type").alias("count"))
    .orderBy("count", ascending=False)
    .show(truncate=False)
)

In [None]:

# For each property type: the number of listings and the mean of
# review_scores_location, ordered with the most frequent type first.
# The average is deliberately left unaliased, so it keeps Spark's generated
# column name.
(
    listings
    .groupBy("property_type")
    .agg(
        F.count("property_type").alias("count"),
        F.avg("review_scores_location"),
    )
    .orderBy("count", ascending=False)
    .show(truncate=False)
)

In [None]:
# Load the gzipped reviews CSV into a DataFrame.
# Header row gives column names, types are inferred, and multiLine allows a
# double-quoted field to contain embedded line breaks.
reviews = (
    spark.read
    .options(
        header=True,
        inferSchema=True,
        sep=",",
        quote='"',
        escape='"',
        multiLine=True,
        mode="PERMISSIVE",
    )
    .csv("data/reviews.csv.gz")
)

In [None]:
# Pair every review with the listing it refers to. With an inner join, rows
# from either side that have no match on the other side are dropped.
listings_reviews = listings.join(
    reviews,
    on=(listings.id == reviews.listing_id),
    how="inner",
)

In [None]:
# Number of reviews per listing.
# The joined frame carries two columns named `id` (one from `listings`, one
# from `reviews`), so the bare string 'id' cannot be resolved and Spark raises
# an "ambiguous reference" AnalysisException. Each column is therefore
# referenced through the DataFrame it came from: group on the listing's id,
# count the review ids.
(
    listings_reviews
    .groupBy(listings.id)
    .agg(F.count(reviews.id).alias("num_reviews"))
    .show()
)

In [None]:
# Review count for each listing (id + name), most-reviewed listings first.
# Keep the aggregated DataFrame in `reviews_per_listing`: DataFrame.show()
# only prints and returns None, so it must not be the last call in the
# assigned expression.
reviews_per_listing = (
    listings_reviews
    .groupBy(listings.id, listings.name)
    .agg(F.count(reviews.id).alias("num_reviews"))
    .orderBy("num_reviews", ascending=False)
)

# Display the result without truncating long listing names.
reviews_per_listing.show(truncate=False)