In [None]:
from pyspark.sql import SparkSession

# Obtain the Spark session for this notebook (getOrCreate reuses an
# existing session when one is already active).
spark = (
    SparkSession.builder
    .appName("Inside Airbnb data processing")
    .getOrCreate()
)

In [None]:
# Load the gzipped listings CSV.
# - header / inferSchema: take column names from the first row and let
#   Spark infer column types from the data.
# - quote and escape are both '"', and multiLine is enabled, so quoted
#   free-text fields may contain embedded quotes and line breaks
#   (presumably needed for the listing descriptions; confirm against the data).
# - mode="PERMISSIVE": malformed rows are kept rather than failing the read.
listings = (
    spark.read
    .options(
        header=True,
        inferSchema=True,
        sep=",",
        quote='"',
        escape='"',
        multiLine=True,
        mode="PERMISSIVE",
    )
    .csv("data/listings.csv.gz")
)

In [None]:
# Project just the location review score and preview the result.
review_locations = listings.select("review_scores_location")
review_locations.show()

In [None]:
# Select the location review score and print it in one chained expression,
# without binding an intermediate DataFrame.
(
    listings
    .select("review_scores_location")
    .show()
)

In [None]:
# Listings whose location review score is above 4.5, reduced to the
# identifying columns plus the score itself.
high_score_listings = (
    listings
    .where(listings["review_scores_location"] > 4.5)
    .select("id", "price", "name", "review_scores_location")
)

# Print up to 20 rows without truncating long cell values.
high_score_listings.show(n=20, truncate=False)


In [None]:
# Drop rows that have a null in any of the selected columns, then print
# up to 20 of the remaining rows untruncated.
high_score_listings.na.drop().show(n=20, truncate=False)

In [None]:
# Look up the schema entry for `price` to see which type was inferred for it.
high_score_listings.schema["price"]

In [None]:
from pyspark.sql.functions import regexp_replace

# Add a numeric `price_num` column: strip the '$' and ',' characters from
# the textual `price` column and cast what is left to float.
# The expression is wrapped in a call's parentheses instead of ending in a
# trailing backslash, so the statement no longer depends on the blank line
# that follows it.
price_num_df = listings.withColumn(
    'price_num',
    regexp_replace('price', '[$,]', '').cast('float'),
)

# Display the schema entry of the new column to confirm the cast.
price_num_df.schema['price_num']

In [None]:
# Preview the derived numeric price next to the listing name
# (up to 20 rows, values not truncated).
(
    price_num_df
    .select("price_num", "name")
    .show(n=20, truncate=False)
)

In [None]:
# Listings under 100 (numeric price) with a location score above 4.5,
# with the predicate built from Column expressions combined by `&`.
(
    price_num_df
    .where(
        (price_num_df["price_num"] < 100)
        & (price_num_df["review_scores_location"] > 4.5)
    )
    .select("name", "price", "review_scores_location")
    .show(truncate=False)
)

In [None]:
# Listings under 100 (numeric price) with a location score above 4.5,
# with the predicate given as a SQL expression string.
(
    price_num_df
    .where('price_num < 100 AND review_scores_location > 4.5')
    .select("name", "price", "review_scores_location")
    .show(truncate=False)
)

In [None]:
# List every distinct property type present in the listings.
(
    listings
    .select("property_type")
    .distinct()
    .show(truncate=False)
)

In [None]:
# List every distinct (property type, room type) combination.
(
    listings
    .select("property_type", "room_type")
    .distinct()
    .show(truncate=False)
)

In [None]:
# Persist the distinct property types as CSV under data/property_types.
# The writer's default save mode raises when the target path already
# exists, which made this cell fail on every re-run of the notebook;
# 'overwrite' replaces the previous output so Restart & Run All works.
(
    listings
    .select(listings.property_type)
    .distinct()
    .write
    .mode('overwrite')
    .csv('data/property_types')
)