In [1]:
import findspark
findspark.init()

from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("Aggregating").getOrCreate()
spark

In [2]:
airbnb = spark.read.csv("Datasets/nyc_air_bnb.csv", inferSchema=True, header=True)

In [3]:
airbnb.limit(5).toPandas()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,,1,365
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.1,1,0


In [5]:
airbnb.printSchema()

root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- host_id: string (nullable = true)
 |-- host_name: string (nullable = true)
 |-- neighbourhood_group: string (nullable = true)
 |-- neighbourhood: string (nullable = true)
 |-- latitude: string (nullable = true)
 |-- longitude: string (nullable = true)
 |-- room_type: string (nullable = true)
 |-- price: string (nullable = true)
 |-- minimum_nights: string (nullable = true)
 |-- number_of_reviews: string (nullable = true)
 |-- last_review: string (nullable = true)
 |-- reviews_per_month: string (nullable = true)
 |-- calculated_host_listings_count: string (nullable = true)
 |-- availability_365: integer (nullable = true)



In [6]:
from pyspark.sql.types import *
from pyspark.sql.functions import *

df = airbnb.withColumn("price", airbnb["price"].cast(IntegerType())) \
        .withColumn("minimum_nights", airbnb["minimum_nights"].cast(IntegerType())) \
        .withColumn("number_of_reviews", airbnb["number_of_reviews"].cast(IntegerType())) \
        .withColumn("reviews_per_month", airbnb["reviews_per_month"].cast(IntegerType())) \
        .withColumn("calculated_host_listings_count", airbnb["calculated_host_listings_count"].cast(IntegerType()))
#QA
print(df.printSchema())
df.limit(5).toPandas()

root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- host_id: string (nullable = true)
 |-- host_name: string (nullable = true)
 |-- neighbourhood_group: string (nullable = true)
 |-- neighbourhood: string (nullable = true)
 |-- latitude: string (nullable = true)
 |-- longitude: string (nullable = true)
 |-- room_type: string (nullable = true)
 |-- price: integer (nullable = true)
 |-- minimum_nights: integer (nullable = true)
 |-- number_of_reviews: integer (nullable = true)
 |-- last_review: string (nullable = true)
 |-- reviews_per_month: integer (nullable = true)
 |-- calculated_host_listings_count: integer (nullable = true)
 |-- availability_365: integer (nullable = true)

None


Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.0,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.0,2,355
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,,1,365
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.0,1,194
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.0,1,0


In [8]:
# count()
df.groupBy("neighbourhood_group").count().show(10)

+-------------------+-----+
|neighbourhood_group|count|
+-------------------+-----+
|         Douglaston|    1|
|             Queens| 5630|
|              Nadia|    1|
|            Midtown|    4|
|    Jackson Heights|    2|
|     Hell's Kitchen|    7|
|  Greenwich Village|    2|
|       Clinton Hill|    1|
| Washington Heights|    4|
|   Ditmars Steinway|    3|
+-------------------+-----+
only showing top 10 rows



In [9]:
# min()
df.groupBy("neighbourhood_group").min("price").show(10)

+-------------------+----------+
|neighbourhood_group|min(price)|
+-------------------+----------+
|         Douglaston|         1|
|             Queens|        10|
|              Nadia|      null|
|            Midtown|         2|
|    Jackson Heights|         2|
|     Hell's Kitchen|         1|
|  Greenwich Village|        31|
|       Clinton Hill|        14|
| Washington Heights|         2|
|   Ditmars Steinway|         1|
+-------------------+----------+
only showing top 10 rows



In [10]:
# agg()
# min()
df.groupBy("neighbourhood_group").agg({'price':'mean'}).show(10)

+-------------------+------------------+
|neighbourhood_group|        avg(price)|
+-------------------+------------------+
|         Douglaston|               1.0|
|             Queens| 99.57690941385435|
|              Nadia|              null|
|            Midtown|               9.0|
|    Jackson Heights|              16.0|
|     Hell's Kitchen|1.2857142857142858|
|  Greenwich Village|              55.5|
|       Clinton Hill|              14.0|
| Washington Heights|              2.75|
|   Ditmars Steinway|3.3333333333333335|
+-------------------+------------------+
only showing top 10 rows



In [12]:
from pyspark.sql.functions import *
df.groupBy("neighbourhood_group").agg(min(df.price), max(df.price)).show(10)

+-------------------+----------+----------+
|neighbourhood_group|min(price)|max(price)|
+-------------------+----------+----------+
|         Douglaston|         1|         1|
|             Queens|        10|     10000|
|              Nadia|      null|      null|
|            Midtown|         2|        30|
|    Jackson Heights|         2|        30|
|     Hell's Kitchen|         1|         3|
|  Greenwich Village|        31|        80|
|       Clinton Hill|        14|        14|
| Washington Heights|         2|         3|
|   Ditmars Steinway|         1|         6|
+-------------------+----------+----------+
only showing top 10 rows



In [15]:
# summary()
summary = df.select("price", "minimum_nights", "number_of_reviews").summary("count", "min", "25%", "75%", "max")
summary.limit(5).toPandas()

Unnamed: 0,summary,price,minimum_nights,number_of_reviews
0,count,48887,48891,48738
1,min,-74,0,0
2,25%,69,1,1
3,75%,175,5,23
4,max,10000,1250,629


In [18]:
# avg() for all, stddev() for all
df.select(countDistinct("neighbourhood_group").alias("unique_group"), avg('price'), stddev('price')).toPandas()

Unnamed: 0,unique_group,avg(price),stddev_samp(price)
0,77,152.222984,238.541467


In [20]:
# pivot()
# We could select only the "Share room" types if we wanted to
df.filter("room_type='Shared room'").groupBy("room_type").pivot("neighbourhood_group", ["Queens", "Brooklyn"]).count().show(100)

+-----------+------+--------+
|  room_type|Queens|Brooklyn|
+-----------+------+--------+
|Shared room|   198|     413|
+-----------+------+--------+



In [21]:
df.groupBy("neighbourhood").pivot("neighbourhood_group", ["Queens", "Brooklyn"])\
.agg(min(df.price).alias("Min Price"),max(df.price).alias("Max Price")).limit(5).toPandas()

Unnamed: 0,neighbourhood,Queens_Min Price,Queens_Max Price,Brooklyn_Min Price,Brooklyn_Max Price
0,Corona,23.0,359.0,,
1,Prince's Bay,,,,
2,Richmondtown,,,,
3,Mill Basin,,,85.0,299.0
4,Westerleigh,,,,
