## Session Setup

In [1]:
from pyspark.sql import SparkSession

# Build the Spark session used by every cell below. getOrCreate() hands back
# an already-running session if one exists, so re-running this cell is safe.
spark = (
    SparkSession.builder
    .appName("Read Inside Airbnb data")
    .getOrCreate()
)

### Loading Data

In [2]:
# Load the compressed listings export into a DataFrame.
# - header / inferSchema: take column names from the first record and let
#   Spark derive the column types from the data.
# - quote / escape are both '"', and multiLine is enabled so that quoted
#   free-text fields may contain embedded quotes and line breaks.
# - mode="PERMISSIVE" is passed explicitly as the parse mode.
listings = (
    spark.read
    .options(
        header=True,
        inferSchema=True,
        sep=",",
        quote='"',
        escape='"',
        multiLine=True,
        mode="PERMISSIVE",
    )
    .csv("../data/listings.csv.gz")
)

### DataFrame Schema

In [3]:
# Print each column's name, type and nullability as a tree, to check what the
# CSV reader's schema inference produced.
listings.printSchema()

root
 |-- id: long (nullable = true)
 |-- listing_url: string (nullable = true)
 |-- scrape_id: long (nullable = true)
 |-- last_scraped: timestamp (nullable = true)
 |-- source: string (nullable = true)
 |-- name: string (nullable = true)
 |-- description: string (nullable = true)
 |-- neighborhood_overview: string (nullable = true)
 |-- picture_url: string (nullable = true)
 |-- host_id: integer (nullable = true)
 |-- host_url: string (nullable = true)
 |-- host_name: string (nullable = true)
 |-- host_since: timestamp (nullable = true)
 |-- host_location: string (nullable = true)
 |-- host_about: string (nullable = true)
 |-- host_response_time: string (nullable = true)
 |-- host_response_rate: string (nullable = true)
 |-- host_acceptance_rate: string (nullable = true)
 |-- host_is_superhost: string (nullable = true)
 |-- host_thumbnail_url: string (nullable = true)
 |-- host_picture_url: string (nullable = true)
 |-- host_neighbourhood: string (nullable = true)
 |-- host_listings_

In [4]:
# 1. Get a non-null picture URL for any property ("picture_url" field)
# Keep only rows that have a picture URL, then print one of them in full
# (truncate=False stops the long URL from being cut off).
(
    listings
    .where(listings["picture_url"].isNotNull())
    .select("picture_url")
    .limit(1)
    .show(truncate=False)
)

+----------------------------------------------------------------------------------------------------------+
|picture_url                                                                                               |
+----------------------------------------------------------------------------------------------------------+
|https://a0.muscache.com/pictures/hosting/Hosting-264776/original/3cc7b93f-dbda-4ded-ac15-e9d96691e7ca.jpeg|
+----------------------------------------------------------------------------------------------------------+



In [5]:
# 2. Get number of properties that get more than 10 reviews per month
# The count is the cell's last expression, so the notebook displays it.
listings.where(listings["reviews_per_month"] > 10).count()

57

In [6]:
# 3. Get properties that have more bathrooms than bedrooms
# Show the first 10 matches with full (untruncated) names.
(
    listings
    .where(listings["bathrooms"] > listings["bedrooms"])
    .select("name", "bathrooms", "bedrooms")
    .show(10, truncate=False)
)

+--------------------------------------------------+---------+--------+
|name                                              |bathrooms|bedrooms|
+--------------------------------------------------+---------+--------+
|Central London with Stunning Views!               |1.5      |1       |
|Also five minutes to South Bank                   |1.5      |1       |
|Battersea live/work artist house                  |1.5      |1       |
|Large double bedroom in Shoreditch w/garden       |1.5      |1       |
|Bedroom In Great Location Stratford               |1.5      |1       |
|Spacious luxury 2 bedroom apartment               |1.5      |1       |
|Very Central! Bayswater Apartment                 |2.0      |1       |
|Room in London with a family                      |1.5      |1       |
|Stunning large room (double sofa bed), Hackney, E9|1.5      |1       |
|Cosy Double studio in Zone 2 Hammersmith (1)      |1.5      |1       |
+--------------------------------------------------+---------+--

In [7]:
# 4. Get properties where the price is greater than 5,000. Collect the result as a Python list
from pyspark.sql.functions import regexp_replace

# "price" is a formatted string (e.g. "$8,000.00"): remove the "$" and ","
# characters and cast what is left to a float so it can be compared numerically.
listings_with_price = listings.withColumn(
    'price_numeric',
    regexp_replace('price', '[$,]', '').cast('float'),
)

# collect() brings the matching rows back to the driver as a list of Row objects.
res = (
    listings_with_price
    .where(listings_with_price['price_numeric'] > 5000)
    .select('name', 'price')
    .collect()
)

res

[Row(name='Room in a cosy flat. Central, clean', price='$8,000.00'),
 Row(name='Spacious Private Ground Floor Room', price='$6,308.00'),
 Row(name='No Longer Available', price='$53,588.00'),
 Row(name='Bright & airy DoubleBed with EnSuite in Zone 2!', price='$74,100.00'),
 Row(name='Stunning home overlook canary wharf', price='$7,360.00'),
 Row(name='The Apartments by The Sloane Club, L 2 Bedroom Apt', price='$7,377.00'),
 Row(name='Kensington- Luxury 2 bedroom ground floor flat', price='$7,796.00'),
 Row(name='Spacious London Flat', price='$5,034.00'),
 Row(name='Single room. 7ft x 9ft - Over looking garden', price='$5,700.00'),
 Row(name='Luxury modern apartment in Dulwich Village', price='$5,372.00'),
 Row(name='Beautiful 2 BR flat in Kilburn with free parking', price='$6,000.00'),
 Row(name='Semi-detached mews house in Knightsbridge.', price='$7,007.00'),
 Row(name='Bright & Comfortable Angel Apartment', price='$9,999.00'),
 Row(name='Affordable Spacious  Room on the edge of the ci

In [8]:
# 5. Get a list of properties with the following characteristics:
# * price < 150
# * more than 20 reviews
# * review_scores_rating > 4.5
# Consider using the "&" operator

# Each comparison is wrapped in parentheses because Python's "&" binds more
# tightly than "<" and ">".
(
    listings_with_price
    .where(
        (listings_with_price['price_numeric'] < 150)
        & (listings_with_price['number_of_reviews'] > 20)
        & (listings_with_price['review_scores_rating'] > 4.5)
    )
    .select('name', 'price_numeric', 'number_of_reviews', 'review_scores_rating')
    .show(truncate=False)
)

+--------------------------------------------------+-------------+-----------------+--------------------+
|name                                              |price_numeric|number_of_reviews|review_scores_rating|
+--------------------------------------------------+-------------+-----------------+--------------------+
|One Bedroom Apartment                             |98.0         |24               |4.58                |
|Refurbished Two Bedroom Apartment                 |144.0        |36               |4.64                |
|Central London with Stunning Views!               |62.0         |532              |4.9                 |
|Also five minutes to South Bank                   |66.0         |563              |4.63                |
|Patio Apartment in London (Twickenham)            |73.0         |88               |4.64                |
|Lovely 2 bedroom flat near Brixton, zone 2, London|135.0        |23               |4.77                |
|Hackney Stylish & light 1 bedroom Victorian f