## SparkSession Initialization

In [1]:
from pyspark.sql import SparkSession

# Build the Spark session used by every later cell; getOrCreate() reuses an
# already-running session instead of starting a second one.
spark = (
    SparkSession.builder
    .appName("Data Aggregation")
    .getOrCreate()
)


In [3]:
# Load the gzipped listings CSV.
# - header / inferSchema: take column names from the first row and let Spark
#   infer column types.
# - quote and escape are both '"', and multiLine is enabled so quoted fields
#   may contain embedded quotes and line breaks (presumably needed for the
#   free-text columns -- confirm against the raw file).
# - mode="PERMISSIVE": do not fail the load on malformed records.
listings = (
    spark.read
    .options(
        header=True,
        inferSchema=True,
        sep=",",
        quote='"',
        escape='"',
        multiLine=True,
        mode="PERMISSIVE",
    )
    .csv("../data/listings.csv.gz")
)

In [4]:
# Number of listings per property type. No ordering is requested, so the
# row order of the displayed table is not meaningful.
(
    listings
    .groupBy("property_type")
    .count()
    .show(truncate=False)
)

+----------------------------------+-----+
|property_type                     |count|
+----------------------------------+-----+
|Private room in lighthouse        |2    |
|Private room in loft              |154  |
|Private room in earthen home      |2    |
|Entire chalet                     |5    |
|Earthen home                      |1    |
|Farm stay                         |4    |
|Entire rental unit                |40799|
|Shared room in hostel             |61   |
|Shared room                       |1    |
|Private room in condo             |3255 |
|Room in boutique hotel            |229  |
|Private room in religious building|4    |
|Room in bed and breakfast         |18   |
|Private room in casa particular   |62   |
|Private room in bungalow          |63   |
|Entire cabin                      |50   |
|Entire guesthouse                 |226  |
|Hut                               |4    |
|Private room in nature lodge      |4    |
|Entire guest suite                |175  |
+----------

In [16]:
import pyspark.sql.functions as F

# Same per-property-type count as before, but expressed through agg() so the
# result column can be named and sorted on: most common property types first.
(
    listings
    .groupBy("property_type")
    .agg(F.count("property_type").alias("count"))
    .orderBy("count", ascending=False)
    .show(truncate=False)
)

+----------------------------------+-----+
|property_type                     |count|
+----------------------------------+-----+
|Entire rental unit                |40799|
|Private room in rental unit       |14573|
|Private room in home              |11826|
|Entire home                       |8938 |
|Entire condo                      |8438 |
|Private room in condo             |3255 |
|Entire serviced apartment         |1840 |
|Private room in townhouse         |1189 |
|Entire townhouse                  |1069 |
|Room in hotel                     |1042 |
|Private room in bed and breakfast |495  |
|Private room in guesthouse        |371  |
|Entire loft                       |339  |
|Room in boutique hotel            |229  |
|Entire guesthouse                 |226  |
|Entire guest suite                |175  |
|Private room in guest suite       |170  |
|Private room in loft              |154  |
|Private room in serviced apartment|154  |
|Private room                      |103  |
+----------

In [7]:
# Per property type: how many listings there are and their mean location
# review score, ordered from the most to the least common property type.
(
    listings
    .groupBy("property_type")
    .agg(
        F.count("property_type").alias("count"),
        F.avg("review_scores_location").alias("avg_review_scores_location"),
    )
    .orderBy("count", ascending=False)
    .show(truncate=False)
)

+----------------------------------+-----+--------------------------+
|property_type                     |count|avg_review_scores_location|
+----------------------------------+-----+--------------------------+
|Entire rental unit                |40799|4.733095834431922         |
|Private room in rental unit       |14573|4.723925292814142         |
|Private room in home              |11826|4.701917440156306         |
|Entire home                       |8938 |4.726180264430801         |
|Entire condo                      |8438 |4.778066335437068         |
|Private room in condo             |3255 |4.7702538787023885        |
|Entire serviced apartment         |1840 |4.718870662460569         |
|Private room in townhouse         |1189 |4.762151898734181         |
|Entire townhouse                  |1069 |4.817585836909877         |
|Room in hotel                     |1042 |4.618212996389897         |
|Private room in bed and breakfast |495  |4.741178247734139         |
|Private room in gue

### Loading the reviews dataset

In [8]:
# Load the gzipped reviews CSV with the same parsing options as the listings
# file: header row, inferred column types, '"' as both quote and escape
# character, multi-line quoted fields allowed, and PERMISSIVE handling of
# malformed records.
reviews = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .option("sep", ",")
    .option("quote", '"')
    .option("escape", '"')
    .option("multiLine", True)
    .option("mode", "PERMISSIVE")
    .csv("../data/reviews.csv.gz")
)

In [9]:
# Print one StructField per line to inspect the inferred column names and types.
for column_field in reviews.schema.fields:
    print(column_field)

StructField('listing_id', LongType(), True)
StructField('id', LongType(), True)
StructField('date', TimestampType(), True)
StructField('reviewer_id', IntegerType(), True)
StructField('reviewer_name', StringType(), True)
StructField('comments', StringType(), True)


### Joining the listings and reviews DataFrames

In [10]:
# Inner join: one output row per (listing, review) pair where the review's
# listing_id matches the listing's id. Listings without a matching review and
# reviews without a matching listing are dropped.
listings_reviews = listings.join(
    reviews,
    on=(listings.id == reviews.listing_id),
    how="inner",
)

In [12]:
# Show the column names and types of the joined DataFrame (listings columns
# appear first in the printed tree).
listings_reviews.printSchema()

root
 |-- id: long (nullable = true)
 |-- listing_url: string (nullable = true)
 |-- scrape_id: long (nullable = true)
 |-- last_scraped: timestamp (nullable = true)
 |-- source: string (nullable = true)
 |-- name: string (nullable = true)
 |-- description: string (nullable = true)
 |-- neighborhood_overview: string (nullable = true)
 |-- picture_url: string (nullable = true)
 |-- host_id: integer (nullable = true)
 |-- host_url: string (nullable = true)
 |-- host_name: string (nullable = true)
 |-- host_since: timestamp (nullable = true)
 |-- host_location: string (nullable = true)
 |-- host_about: string (nullable = true)
 |-- host_response_time: string (nullable = true)
 |-- host_response_rate: string (nullable = true)
 |-- host_acceptance_rate: string (nullable = true)
 |-- host_is_superhost: string (nullable = true)
 |-- host_thumbnail_url: string (nullable = true)
 |-- host_picture_url: string (nullable = true)
 |-- host_neighbourhood: string (nullable = true)
 |-- host_listings_

In [23]:

# Same aggregation as on `listings`, now over the joined frame. Each listing
# is repeated once per review, so `count` is the number of joined
# (listing, review) rows and the average is taken over those repeated rows.
(
    listings_reviews
    .groupBy("property_type")
    .agg(
        F.count("property_type").alias("count"),
        F.avg("review_scores_location").alias("avg_review_scores_location"),
    )
    .orderBy("count", ascending=False)
    .show(truncate=False)
)

+---------------------------------+------+--------------------------+
|property_type                    |count |avg_review_scores_location|
+---------------------------------+------+--------------------------+
|Entire rental unit               |668227|4.780720457813537         |
|Private room in rental unit      |308685|4.767027015644179         |
|Private room in home             |304327|4.74466879822161          |
|Entire condo                     |201570|4.795146624729405         |
|Private room in condo            |138882|4.794089544286227         |
|Entire home                      |90808 |4.76103684859707          |
|Private room in townhouse        |75468 |4.781735858058077         |
|Entire serviced apartment        |47849 |4.761400238249481         |
|Entire townhouse                 |24784 |4.81590340542301          |
|Room in hotel                    |19464 |4.779507809288822         |
|Private room in bed and breakfast|14843 |4.762267735632631         |
|Entire loft        

In [27]:
# Count joined (listing, review) rows per property type and label the result
# `num_reviews`; only the first 20 groups are displayed, in no defined order.
(
    listings_reviews
    .groupBy(listings_reviews.property_type)
    .agg(F.count(listings_reviews.property_type).alias("num_reviews"))
    .show(n=20, truncate=False)
)

+----------------------------------+-----------+
|property_type                     |num_reviews|
+----------------------------------+-----------+
|Private room in loft              |7775       |
|Entire chalet                     |8          |
|Earthen home                      |596        |
|Entire rental unit                |668227     |
|Shared room in hostel             |4915       |
|Shared room                       |27         |
|Private room in condo             |138882     |
|Room in boutique hotel            |10933      |
|Private room in religious building|547        |
|Room in bed and breakfast         |571        |
|Private room in casa particular   |1386       |
|Private room in bungalow          |2056       |
|Entire cabin                      |1630       |
|Entire guesthouse                 |8116       |
|Hut                               |102        |
|Private room in nature lodge      |3          |
|Entire guest suite                |10061      |
|Private room in hom

In [28]:
# Number of reviews per listing, most-reviewed listings first.
# Grouping uses the listings-side id and name; the count is taken over the
# reviews-side id column so it reflects matched reviews.
#
# DataFrame.show() only prints and returns None, so it must not be part of
# the assigned expression -- otherwise `reviews_per_listing` would be None
# instead of the aggregated DataFrame.
reviews_per_listing = (
    listings_reviews
    .groupBy(listings.id, listings.name)
    .agg(F.count(reviews.id).alias('num_reviews'))
    .orderBy('num_reviews', ascending=False)
)

reviews_per_listing.show(truncate=False)

+--------+--------------------------------------------------+-----------+
|id      |name                                              |num_reviews|
+--------+--------------------------------------------------+-----------+
|47408549|Double Room+ Ensuite                              |1855       |
|30760930|Double Garden View room - London House Hotel***   |1682       |
|43120947|Private double room with en suite facilities      |1615       |
|19670926|Locke Studio Apartment at Leman Locke             |1436       |
|45006692|Budget Double Room In Colliers Hotel.             |1433       |
|1436172 |Cosy Double in Kings Cross Houseshare nr Eurostar |1195       |
|2126708 |London's best transport hub 5 mins walk! Safe too!|1122       |
|1436177 |En-suite Double in Kings Cross Houseshare Eurostar|1005       |
|47438714|KX Basic- Small Double- shared bathroom           |978        |
|3855375 |Double in Kings Cross Houseshare nr Eurostar      |973        |
|46233904|Superior Studio, avg size 23