## Session Setup

In [1]:
from pyspark.sql import SparkSession

# Build the Spark session used by every cell below; getOrCreate() hands back
# an already-running session when one exists instead of starting a second one.
spark = (
    SparkSession.builder
    .appName("Read Inside Airbnb data")
    .getOrCreate()
)

### Loading Data

In [2]:
# Reader options for the Inside Airbnb listings export.
csv_options = {
    "header": True,         # first row carries the column names
    "inferSchema": True,    # let Spark derive column types from the data
    "sep": ",",
    "quote": '"',
    "escape": '"',          # same character as `quote`: embedded quotes are doubled
    "multiLine": True,      # quoted fields may contain line breaks
    "mode": "PERMISSIVE",
}

# Load the gzipped CSV into a DataFrame that the rest of the notebook queries.
listings = spark.read.csv("../data/listings.csv.gz", **csv_options)

### DataFrame Schema

In [3]:
# Print the column tree (name, type, nullability) of the loaded listings DataFrame.
listings.printSchema()

root
 |-- id: long (nullable = true)
 |-- listing_url: string (nullable = true)
 |-- scrape_id: long (nullable = true)
 |-- last_scraped: timestamp (nullable = true)
 |-- source: string (nullable = true)
 |-- name: string (nullable = true)
 |-- description: string (nullable = true)
 |-- neighborhood_overview: string (nullable = true)
 |-- picture_url: string (nullable = true)
 |-- host_id: integer (nullable = true)
 |-- host_url: string (nullable = true)
 |-- host_name: string (nullable = true)
 |-- host_since: timestamp (nullable = true)
 |-- host_location: string (nullable = true)
 |-- host_about: string (nullable = true)
 |-- host_response_time: string (nullable = true)
 |-- host_response_rate: string (nullable = true)
 |-- host_acceptance_rate: string (nullable = true)
 |-- host_is_superhost: string (nullable = true)
 |-- host_thumbnail_url: string (nullable = true)
 |-- host_picture_url: string (nullable = true)
 |-- host_neighbourhood: string (nullable = true)
 |-- host_listings_

In [4]:
# 1. Get a non-null picture URL for any property ("picture_url" field)
# Keep only rows that have a picture URL, then print a single one untruncated.
has_picture = listings.picture_url.isNotNull()
(
    listings
    .filter(has_picture)
    .select('picture_url')
    .limit(1)
    .show(truncate=False)
)

+----------------------------------------------------------------------------------------------------------+
|picture_url                                                                                               |
+----------------------------------------------------------------------------------------------------------+
|https://a0.muscache.com/pictures/hosting/Hosting-264776/original/3cc7b93f-dbda-4ded-ac15-e9d96691e7ca.jpeg|
+----------------------------------------------------------------------------------------------------------+



In [5]:
# 2. Get number of properties that get more than 10 reviews per month
busy_listings = listings.filter(listings.reviews_per_month > 10)
# The count is the cell's last expression, so the notebook displays it.
busy_listings.count()

57

In [6]:
# 3. Get properties that have more bathrooms than bedrooms
more_baths_than_beds = listings.bathrooms > listings.bedrooms
# Show the first 10 matches with full (untruncated) listing names.
(
    listings
    .filter(more_baths_than_beds)
    .select('name', 'bathrooms', 'bedrooms')
    .show(10, truncate=False)
)

+--------------------------------------------------+---------+--------+
|name                                              |bathrooms|bedrooms|
+--------------------------------------------------+---------+--------+
|Central London with Stunning Views!               |1.5      |1       |
|Also five minutes to South Bank                   |1.5      |1       |
|Battersea live/work artist house                  |1.5      |1       |
|Large double bedroom in Shoreditch w/garden       |1.5      |1       |
|Bedroom In Great Location Stratford               |1.5      |1       |
|Spacious luxury 2 bedroom apartment               |1.5      |1       |
|Very Central! Bayswater Apartment                 |2.0      |1       |
|Room in London with a family                      |1.5      |1       |
|Stunning large room (double sofa bed), Hackney, E9|1.5      |1       |
|Cosy Double studio in Zone 2 Hammersmith (1)      |1.5      |1       |
+--------------------------------------------------+---------+--