In [1]:
import pyspark
import pyspark.sql.functions as F

In [2]:
# Input dataset and the file the answer is written to.
infile = 'sf-airbnb-clean.parquet'
outfile = '../out/out_2_4.txt'

# Obtain the Spark session (reusing an existing one if present) and
# limit Spark's log output to warnings and above.
spark = (
    pyspark.sql.SparkSession
    .builder
    .getOrCreate()
)
spark.sparkContext.setLogLevel('WARN')

In [3]:
# Load the parquet dataset named by `infile` into a DataFrame.
inputDF = spark.read.format('parquet').load(infile)

In [4]:
# Show each column's name, type and nullability for the loaded DataFrame.
inputDF.printSchema()

root
 |-- host_is_superhost: string (nullable = true)
 |-- cancellation_policy: string (nullable = true)
 |-- instant_bookable: string (nullable = true)
 |-- host_total_listings_count: double (nullable = true)
 |-- neighbourhood_cleansed: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- property_type: string (nullable = true)
 |-- room_type: string (nullable = true)
 |-- accommodates: double (nullable = true)
 |-- bathrooms: double (nullable = true)
 |-- bedrooms: double (nullable = true)
 |-- beds: double (nullable = true)
 |-- bed_type: string (nullable = true)
 |-- minimum_nights: double (nullable = true)
 |-- number_of_reviews: double (nullable = true)
 |-- review_scores_rating: double (nullable = true)
 |-- review_scores_accuracy: double (nullable = true)
 |-- review_scores_cleanliness: double (nullable = true)
 |-- review_scores_checkin: double (nullable = true)
 |-- review_scores_communication: double (nullable = true)

In [5]:
# Find the listing with the lowest price, breaking ties by the highest
# review_scores_value, and keep only its 'accommodates' column.
# NOTE: .take(1) returns a Python list of Row objects (not a DataFrame);
# the list is empty when inputDF has no rows.
finalDF = (
    inputDF
    .sort(
        # Spark's plain .asc() places NULLs first, so a listing with a missing
        # price would be reported as the cheapest; push NULL prices to the end.
        inputDF['price'].asc_nulls_last(),
        # Tie-break on the review score, best first (NULL scores sort last).
        inputDF['review_scores_value'].desc_nulls_last(),
    )
    .select('accommodates')   # keep only the accommodates column
    .take(1)                  # first element of the sorted result, as a list
)

In [6]:
# Display the first (and only) Row collected by .take(1) above.
finalDF[0]

Row(accommodates=2.0)

In [7]:
# The exercise asks for the answer in a file, so the collected Row is written
# out with plain Python file I/O. Per the original note, Spark's own writer
# (DataFrame.write.text(<path>)) takes a path and creates part-0000... files.
# The same approach is used for all the following exercises.
with open(outfile, 'w') as out_fh:
    answer_text = str(finalDF[0])
    out_fh.write(answer_text)