In [1]:
from pathlib import Path

import pyspark
import pyspark.sql.functions as F

In [2]:
# Locations of the input dataset and of the text file this exercise writes.
infile = 'sf-airbnb-clean.parquet'
outfile = '../out/out_2_2.txt'

# Obtain the Spark SQL session (getOrCreate reuses one if it already exists)
# and set the Spark log level to WARN.
spark = (
    pyspark.sql.SparkSession
    .builder
    .getOrCreate()
)
spark.sparkContext.setLogLevel('WARN')

In [3]:
# Load the parquet dataset at `infile` into a DataFrame.
inputDF = spark.read.format('parquet').load(infile)

In [4]:
# Print the column names, types and nullability of the loaded DataFrame.
inputDF.printSchema()

root
 |-- host_is_superhost: string (nullable = true)
 |-- cancellation_policy: string (nullable = true)
 |-- instant_bookable: string (nullable = true)
 |-- host_total_listings_count: double (nullable = true)
 |-- neighbourhood_cleansed: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- property_type: string (nullable = true)
 |-- room_type: string (nullable = true)
 |-- accommodates: double (nullable = true)
 |-- bathrooms: double (nullable = true)
 |-- bedrooms: double (nullable = true)
 |-- beds: double (nullable = true)
 |-- bed_type: string (nullable = true)
 |-- minimum_nights: double (nullable = true)
 |-- number_of_reviews: double (nullable = true)
 |-- review_scores_rating: double (nullable = true)
 |-- review_scores_accuracy: double (nullable = true)
 |-- review_scores_cleanliness: double (nullable = true)
 |-- review_scores_checkin: double (nullable = true)
 |-- review_scores_communication: double (nullable = true)

In [5]:
# Single-row summary over the whole dataset: price extremes and total row count.
finalDF = inputDF.agg(
    F.max('price').alias('max_price'),  # Maximum of the column 'price'
    F.min('price').alias('min_price'),  # Minimum of the column 'price'
    # count('*') counts every row. count('price') only counts rows whose
    # price is non-null, so it would under-report if any price were missing.
    F.count('*').alias('row_count'),
)

In [6]:
# Display the aggregated result as a text table.
finalDF.show()

+---------+---------+---------+
|max_price|min_price|row_count|
+---------+---------+---------+
|  10000.0|     10.0|     7146|
+---------+---------+---------+



In [7]:
# The exercise asks for the result to be written to a file, so the aggregated
# row is collected to the driver and dumped as text with plain Python.
# Spark's own .write.text(<path>) would instead treat the path as a location
# for part-0000... files. The same approach is used in the following exercises.
out_path = Path(outfile)
# Create the output directory if it is missing; otherwise open() raises
# FileNotFoundError on a fresh checkout.
out_path.parent.mkdir(parents=True, exist_ok=True)
with out_path.open('w', encoding='utf-8') as f:
    f.write(str(finalDF.collect()[0]))