## SparkSession Initialization

In [1]:
from pyspark.sql import SparkSession

# Create the notebook's SparkSession, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName("Inside Airbnb data processing")
    .getOrCreate()
)


### Loading data

In [2]:
# Read the gzipped listings CSV into a DataFrame.
# Quote and escape are both the double-quote character and multiLine is on,
# so quoted fields may contain embedded quotes and line breaks.
listings = spark.read.csv(
    "../data/listings.csv.gz",
    sep=",",
    quote='"',
    escape='"',
    multiLine=True,
    header=True,
    inferSchema=True,
    mode="PERMISSIVE",
)


In [3]:
# Project just the location review score column and preview it.
review_locations = listings.select("review_scores_location")
review_locations.show()


+----------------------+
|review_scores_location|
+----------------------+
|                  4.62|
|                  4.54|
|                  4.36|
|                   4.5|
|                  4.37|
|                   4.5|
|                  4.53|
|                  4.52|
|                  4.49|
|                  4.94|
|                  4.82|
|                  4.92|
|                  4.14|
|                  4.78|
|                  4.43|
|                  4.73|
|                  null|
|                  null|
|                  4.48|
|                  4.75|
+----------------------+
only showing top 20 rows



In [4]:
# Same projection written as a single chained expression, with no
# intermediate variable.
(
    listings
    .select(listings["review_scores_location"])
    .show()
)


+----------------------+
|review_scores_location|
+----------------------+
|                  4.62|
|                  4.54|
|                  4.36|
|                   4.5|
|                  4.37|
|                   4.5|
|                  4.53|
|                  4.52|
|                  4.49|
|                  4.94|
|                  4.82|
|                  4.92|
|                  4.14|
|                  4.78|
|                  4.43|
|                  4.73|
|                  null|
|                  null|
|                  4.48|
|                  4.75|
+----------------------+
only showing top 20 rows



### Listings with high location ratings

In [5]:
# Keep only the listings whose location review score is above 4.5, then
# project the columns of interest.
high_score_listings = (
    listings
    .filter(listings["review_scores_location"] > 4.5)
    .select(["id", "price", "name", "review_scores_location"])
)

# Preview the first 20 rows without truncating long cell values.
high_score_listings.show(n=20, truncate=False)


+------+-------+------------------------------------------------+----------------------+
|id    |price  |name                                            |review_scores_location|
+------+-------+------------------------------------------------+----------------------+
|264776|$297.00|Huge Four Bedroom Apartment                     |4.62                  |
|264777|$98.00 |One Bedroom Apartment                           |4.54                  |
|264782|$120.00|One Bedroom Garden Apartment                    |4.53                  |
|264783|$216.00|Four Bedroom Garden Apartment                   |4.52                  |
|266037|$62.00 |Central London with Stunning Views!             |4.94                  |
|268398|$66.00 |Also five minutes to South Bank                 |4.82                  |
|270600|$73.00 |Patio Apartment in London (Twickenham)          |4.92                  |
|425143|null   |luxury 1bed in Chelsea Bridge Wharf             |4.78                  |
|426354|$200.00|1 bed

#### Showing listings where none of the selected columns is null

In [6]:
# Drop every row that has a null in any of the selected columns, then preview
# the first 20 remaining rows without truncation.
high_score_listings.dropna(how="any").show(n=20, truncate=False)


+------+-------+------------------------------------------------+----------------------+
|id    |price  |name                                            |review_scores_location|
+------+-------+------------------------------------------------+----------------------+
|264776|$297.00|Huge Four Bedroom Apartment                     |4.62                  |
|264777|$98.00 |One Bedroom Apartment                           |4.54                  |
|264782|$120.00|One Bedroom Garden Apartment                    |4.53                  |
|264783|$216.00|Four Bedroom Garden Apartment                   |4.52                  |
|266037|$62.00 |Central London with Stunning Views!             |4.94                  |
|268398|$66.00 |Also five minutes to South Bank                 |4.82                  |
|270600|$73.00 |Patio Apartment in London (Twickenham)          |4.92                  |
|426354|$200.00|1 bedroom flat with big balcony!                |4.73                  |
|427584|$129.00|Hackn

In [9]:
# Print the column names, types and nullability of the filtered selection.
high_score_listings.printSchema()

root
 |-- id: long (nullable = true)
 |-- price: string (nullable = true)
 |-- name: string (nullable = true)
 |-- review_scores_location: double (nullable = true)



In [8]:
# Look up the schema entry for 'price' to inspect the type it was loaded as.
high_score_listings.schema['price']


StructField('price', StringType(), True)

#### Removing `$` and `,` from the 'price' column and casting it to float

In [10]:
from pyspark.sql.functions import regexp_replace

# Derive a numeric 'price_num' column from the raw 'price' string: remove the
# '$' and ',' characters, then cast what is left to float.
# The call is wrapped in parentheses rather than ended with a trailing
# backslash, so it no longer relies on a blank line following the statement.
price_num_df = listings.withColumn(
    'price_num',
    regexp_replace('price', '[$,]', '').cast('float'),
)

# Display the schema entry of the new column to confirm the cast.
price_num_df.schema['price_num']


StructField('price_num', FloatType(), True)

In [11]:
# Preview the numeric price next to the listing name, 20 rows, untruncated.
(
    price_num_df
    .select("price_num", "name")
    .show(n=20, truncate=False)
)

+---------+--------------------------------------------------+
|price_num|name                                              |
+---------+--------------------------------------------------+
|297.0    |Huge Four Bedroom Apartment                       |
|98.0     |One Bedroom Apartment                             |
|148.0    |Two Bedroom Newly Refurbished Apartment           |
|144.0    |Refurbished Two Bedroom Apartment                 |
|157.0    |Spacious refurbished 2 bedroom apt with balcony   |
|148.0    |Two Bedrooms Garden Maisonette                    |
|120.0    |One Bedroom Garden Apartment                      |
|216.0    |Four Bedroom Garden Apartment                     |
|238.0    |Huge Three Bedroom Flat with parking and terrace  |
|62.0     |Central London with Stunning Views!               |
|66.0     |Also five minutes to South Bank                   |
|73.0     |Patio Apartment in London (Twickenham)            |
|null     |Heathrow BNB - Home Away From Home!         

In [12]:
# Print the full schema of the DataFrame that carries the added 'price_num' column.
price_num_df.printSchema()

root
 |-- id: long (nullable = true)
 |-- listing_url: string (nullable = true)
 |-- scrape_id: long (nullable = true)
 |-- last_scraped: timestamp (nullable = true)
 |-- source: string (nullable = true)
 |-- name: string (nullable = true)
 |-- description: string (nullable = true)
 |-- neighborhood_overview: string (nullable = true)
 |-- picture_url: string (nullable = true)
 |-- host_id: integer (nullable = true)
 |-- host_url: string (nullable = true)
 |-- host_name: string (nullable = true)
 |-- host_since: timestamp (nullable = true)
 |-- host_location: string (nullable = true)
 |-- host_about: string (nullable = true)
 |-- host_response_time: string (nullable = true)
 |-- host_response_rate: string (nullable = true)
 |-- host_acceptance_rate: string (nullable = true)
 |-- host_is_superhost: string (nullable = true)
 |-- host_thumbnail_url: string (nullable = true)
 |-- host_picture_url: string (nullable = true)
 |-- host_neighbourhood: string (nullable = true)
 |-- host_listings_