## Aggregating data in Dataframes

In [1]:
#Importing pysark and creating a session
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('AggregatingDf').getOrCreate()
spark

## About this dataset

This dataset describes the listing activity and metrics for Air BNB bookers in NYC, NY for 2019. Each line in the dataset is a booking. 

**Source:** https://www.kaggle.com/dgomonov/new-york-city-airbnb-open-data/data

In [3]:
#Importing the dataset 
path ='datasets-intro/'
airbnb = spark.read.csv(path+'nyc_air_bnb.csv', inferSchema=True, header=True)

In [4]:
airbnb.limit(5).toPandas()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,,1,365
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.1,1,0


In [5]:
airbnb.printSchema()

root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- host_id: string (nullable = true)
 |-- host_name: string (nullable = true)
 |-- neighbourhood_group: string (nullable = true)
 |-- neighbourhood: string (nullable = true)
 |-- latitude: string (nullable = true)
 |-- longitude: string (nullable = true)
 |-- room_type: string (nullable = true)
 |-- price: string (nullable = true)
 |-- minimum_nights: string (nullable = true)
 |-- number_of_reviews: string (nullable = true)
 |-- last_review: string (nullable = true)
 |-- reviews_per_month: string (nullable = true)
 |-- calculated_host_listings_count: string (nullable = true)
 |-- availability_365: integer (nullable = true)



There is a wrong data type map for columns, I'm converting those into the actual data types

In [6]:
from pyspark.sql.types import *
from pyspark.sql.functions import *

df = airbnb.withColumn("price", airbnb["price"].cast(IntegerType())) \
        .withColumn("minimum_nights", airbnb["minimum_nights"].cast(IntegerType())) \
        .withColumn("number_of_reviews", airbnb["number_of_reviews"].cast(IntegerType())) \
        .withColumn("reviews_per_month", airbnb["reviews_per_month"].cast(IntegerType())) \
        .withColumn("calculated_host_listings_count", airbnb["calculated_host_listings_count"].cast(IntegerType()))

df.printSchema()
df.limit(5).toPandas()

root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- host_id: string (nullable = true)
 |-- host_name: string (nullable = true)
 |-- neighbourhood_group: string (nullable = true)
 |-- neighbourhood: string (nullable = true)
 |-- latitude: string (nullable = true)
 |-- longitude: string (nullable = true)
 |-- room_type: string (nullable = true)
 |-- price: integer (nullable = true)
 |-- minimum_nights: integer (nullable = true)
 |-- number_of_reviews: integer (nullable = true)
 |-- last_review: string (nullable = true)
 |-- reviews_per_month: integer (nullable = true)
 |-- calculated_host_listings_count: integer (nullable = true)
 |-- availability_365: integer (nullable = true)



Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.0,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.0,2,355
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,,1,365
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.0,1,194
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.0,1,0


In [7]:
#Obaining the number of rows in the dataset
df.count()

49079

In [8]:
#Obtaining the total reviews each host have
df.groupBy("host_id").sum('number_of_reviews').show(10)

+-------+----------------------+
|host_id|sum(number_of_reviews)|
+-------+----------------------+
| 716306|                   197|
|1203500|                    35|
| 368528|                     1|
|1577493|                    16|
|1390555|                    50|
|1317588|                     3|
|2472680|                   219|
|2155832|                   266|
|2426404|                     6|
|2740824|                    22|
+-------+----------------------+
only showing top 10 rows



In [10]:
#Showing min and max of all the numeric variables in the dataset
minmax = df.select("price","minimum_nights","number_of_reviews","last_review","reviews_per_month","calculated_host_listings_count","availability_365").summary("min","max")
minmax.toPandas()

Unnamed: 0,summary,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,min,-74,0,0,-73.94134,0,0,0
1,max,10000,1250,629,9.66,58,365,365


In [11]:
#Host with the highest number of reviews
from pyspark.sql import functions
df.groupBy("host_id").agg(sum("number_of_reviews").alias("Reviews")).orderBy(sum("number_of_reviews").desc()).show(1) 

+--------+-------+
| host_id|Reviews|
+--------+-------+
|37312959|   2273|
+--------+-------+
only showing top 1 row



In [12]:
#Getting the average nights did most hosts specify for a minimum
df.agg(mean(df.minimum_nights)).show()

+-------------------+
|avg(minimum_nights)|
+-------------------+
| 7.1286126280910596|
+-------------------+



In [15]:
#Most expensive neighbourhood to stay in on average
result = df.groupBy("neighbourhood").agg(avg(df.price).alias('avg_price'))
result.orderBy(result.avg_price.desc()).show(1) 

+--------------+---------+
| neighbourhood|avg_price|
+--------------+---------+
|Fort Wadsworth|    800.0|
+--------------+---------+
only showing top 1 row



In [16]:
#Displaying a two by two table that shows the avg price by room type (private and shared) and neighborhood group(Manhattan and Brooklyn )
df.filter("room_type IN('Private room','Shared room')").groupBy("room_type").pivot("neighbourhood_group", ["Manhattan", "Brooklyn"]).avg('price').show(100)

+------------+------------------+-----------------+
|   room_type|         Manhattan|         Brooklyn|
+------------+------------------+-----------------+
| Shared room| 89.06903765690376|50.52784503631961|
|Private room|116.05400302114803|76.47234042553191|
+------------+------------------+-----------------+

