## Spark Session Setup

In [2]:
from pyspark.sql import SparkSession


# Build the notebook's Spark session, or reuse one that already exists.
# Adaptive query execution and partition coalescing are switched on, and
# the driver may receive up to 2g of collected results.
spark = (
    SparkSession.builder
    .appName("Advanced Spark")
    .config("spark.sql.adaptive.enabled", "true")
    .config("spark.sql.adaptive.coalescePartitions.enabled", "true")
    .config("spark.driver.maxResultSize", "2g")
    .getOrCreate()
)

### Loading Data

In [3]:
# Parser settings for the gzipped listings CSV:
#   * the first row is the header and column types are inferred,
#   * fields are comma separated, with '"' used both as the quote and
#     as the escape character,
#   * multiLine lets a quoted field contain line breaks,
#   * PERMISSIVE mode keeps malformed records instead of failing the read.
listings_csv_options = {
    "header": True,
    "inferSchema": True,
    "sep": ",",
    "quote": '"',
    "escape": '"',
    "multiLine": True,
    "mode": "PERMISSIVE",
}
listings = spark.read.csv("../data/listings.csv.gz", **listings_csv_options)
listings.printSchema()

root
 |-- id: long (nullable = true)
 |-- listing_url: string (nullable = true)
 |-- scrape_id: long (nullable = true)
 |-- last_scraped: timestamp (nullable = true)
 |-- source: string (nullable = true)
 |-- name: string (nullable = true)
 |-- description: string (nullable = true)
 |-- neighborhood_overview: string (nullable = true)
 |-- picture_url: string (nullable = true)
 |-- host_id: integer (nullable = true)
 |-- host_url: string (nullable = true)
 |-- host_name: string (nullable = true)
 |-- host_since: timestamp (nullable = true)
 |-- host_location: string (nullable = true)
 |-- host_about: string (nullable = true)
 |-- host_response_time: string (nullable = true)
 |-- host_response_rate: string (nullable = true)
 |-- host_acceptance_rate: string (nullable = true)
 |-- host_is_superhost: string (nullable = true)
 |-- host_thumbnail_url: string (nullable = true)
 |-- host_picture_url: string (nullable = true)
 |-- host_neighbourhood: string (nullable = true)
 |-- host_listings_

In [4]:
# Parser settings for the gzipped reviews CSV:
#   * the first row is the header and column types are inferred,
#   * fields are comma separated, with '"' used both as the quote and
#     as the escape character,
#   * multiLine lets a quoted field (e.g. a review comment) contain line breaks,
#   * PERMISSIVE mode keeps malformed records instead of failing the read.
reviews_csv_options = {
    "header": True,
    "inferSchema": True,
    "sep": ",",
    "quote": '"',
    "escape": '"',
    "multiLine": True,
    "mode": "PERMISSIVE",
}
reviews = spark.read.csv("../data/reviews.csv.gz", **reviews_csv_options)
reviews.printSchema()

root
 |-- listing_id: long (nullable = true)
 |-- id: long (nullable = true)
 |-- date: timestamp (nullable = true)
 |-- reviewer_id: integer (nullable = true)
 |-- reviewer_name: string (nullable = true)
 |-- comments: string (nullable = true)



In [5]:
# 1. For each listing compute string category depending on its price, and add it as a new column.
# A category is defined in the following way:
#
# * price < 50 -> "Budget"
# * 50 <= price < 150 -> "Mid-range"
# * price >= 150 -> "Luxury"
# 
# Only include listings where the price is not null.
# Count the number of listings in each category

from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
from pyspark.sql.functions import regexp_replace

# Derive a numeric price: remove every '$' and ',' from the `price` column
# and cast what is left to float. Values that cannot be cast become null.
# NOTE(review): assumes `price` is a string such as "$1,234.00" -- the printed
# schema above is truncated before that column, so confirm.
listings = listings.withColumn('price_numeric', regexp_replace('price', '[$,]', '').cast('float'))

def categorize_price(price):
    """Map a numeric price to its category label.

    Args:
        price: The listing price as a number, or None.

    Returns:
        'Budget' when price < 50, 'Mid-range' when 50 <= price < 150,
        'Luxury' when price >= 150, and 'Unknown' when the price is None
        or does not compare with any bound (for example NaN).
    """
    if price is not None:
        if price < 50:
            return 'Budget'
        if price >= 50 and price < 150:
            return 'Mid-range'
        if price >= 150:
            return 'Luxury'
    # Reached for None and for values such as NaN, where every comparison is False.
    return 'Unknown'


# Wrap the Python function as a Spark UDF that returns a string column,
# so it can be applied to DataFrame columns.
categorize_price_udf = udf(categorize_price, StringType())


In [6]:
# Number of listings per price category, largest category first.
# Listings without a usable numeric price are dropped before categorising.
# This only defines the (lazy) DataFrame; nothing is displayed here.
listings_with_category = (
    listings
    .where(listings['price_numeric'].isNotNull())
    .withColumn('price_category', categorize_price_udf(listings['price_numeric']))
    .groupBy('price_category')
    .count()
    .orderBy('count', ascending=False)
)




In [14]:
# 2. In this task you will need to compute a sentiment score per review, and then an average sentiment score per listing.
# A sentiment score indicates how "positive" or "negative" a review is. The higher the score, the more positive the review, and vice versa.
#
# To compute a sentiment score per review compute the number of positive words in a review and subtract the number of negative
# words in the same review (the list of words is already provided)
#
# To complete this task, compute a DataFrame that contains the following fields:
# * name - the name of a listing
# * average_sentiment - average sentiment of reviews computed using the algorithm described above

from pyspark.sql.functions import avg
from pyspark.sql.types import FloatType

positive_words = ['good', 'great', 'excellent', 'amazing', 'fantastic', 'wonderful', 'pleasant', 'lovely', 'nice', 'enjoyed']
negative_words = ['bad', 'terrible', 'awful', 'horrible', 'disappointing', 'poor', 'hate', 'unpleasant', 'dirty', 'noisy']

def sentiment_score(comment):
    """Score one review comment as (#positive words) - (#negative words).

    Each word from ``positive_words`` / ``negative_words`` counts at most
    once, and only when it appears as a whole word (case-insensitive).
    Whole-word matching avoids false hits such as "hate" inside "whatever",
    "nice" inside "Venice" or "good" inside "goodbye".

    Args:
        comment: The review text, or None.

    Returns:
        The score as a float; 0.0 when ``comment`` is None.
    """
    # Imported locally so the function carries its own dependency when it
    # is shipped to Spark workers as a UDF.
    import re

    if comment is None:
        return 0.0

    # Distinct lower-cased word tokens; membership in this set is equivalent
    # to a \bword\b regex match for the plain alphabetic words listed above.
    tokens = set(re.findall(r'\w+', comment.lower()))

    positives = sum(1 for word in positive_words if word in tokens)
    negatives = sum(1 for word in negative_words if word in tokens)
    return float(positives - negatives)

# Wrap the Python scorer as a Spark UDF that returns a float column.
sentiment_score_udf = udf(sentiment_score, FloatType())



In [15]:
# Quick local check of the plain Python function (no Spark involved):
# "great" and "wonderful" are both in positive_words, so the expected score is 2.0.
test_comment = "This place was great and wonderful!"
result = sentiment_score(test_comment)
print(f"Result: {result}")


Result: 2.0


In [16]:
reviews.printSchema()

root
 |-- listing_id: long (nullable = true)
 |-- id: long (nullable = true)
 |-- date: timestamp (nullable = true)
 |-- reviewer_id: integer (nullable = true)
 |-- reviewer_name: string (nullable = true)
 |-- comments: string (nullable = true)



In [None]:
# Score every review with the Python UDF.
reviews_with_sentiment = reviews.withColumn(
    'sentiment_score',
    sentiment_score_udf(reviews.comments),
)

# Average the per-review scores for each listing, most positive first.
# The join condition refers to `reviews_with_sentiment` -- the frame that is
# actually joined -- rather than its parent `reviews`, so the column
# reference does not rely on the two frames sharing lineage.
(
    listings
    .join(
        reviews_with_sentiment,
        listings.id == reviews_with_sentiment.listing_id,
        'inner',
    )
    .groupBy('listing_id', 'name')
    .agg(avg('sentiment_score').alias('average_sentiment'))
    .orderBy('average_sentiment', ascending=False)
    .select('listing_id', 'name', 'average_sentiment')
    .show(truncate=False)
)

### Alternative approach using built-in column functions (the UDF version threw a pickling error)

In [32]:
from pyspark.sql.functions import when, col, lower, lit

positive_words = ['good', 'great', 'excellent', 'amazing', 'fantastic', 'wonderful', 'pleasant', 'lovely', 'nice', 'enjoyed']
negative_words = ['bad', 'terrible', 'awful', 'horrible', 'disappointing', 'poor', 'hate', 'unpleasant', 'dirty', 'noisy']


def _word_presence_total(words):
    """Build a column that counts how many of ``words`` occur in `comments`.

    Each word contributes 1 when the lower-cased comment matches it as a
    whole word (regex ``\\bword\\b``) and 0 otherwise.
    """
    comment_text = lower(col('comments'))
    total = lit(0)
    for term in words:
        total = total + when(comment_text.rlike(f'\\b{term}\\b'), 1).otherwise(0)
    return total


positive_score = _word_presence_total(positive_words)
negative_score = _word_presence_total(negative_words)

# Sentiment = positives - negatives, as a float; null comments score 0.0.
raw_sentiment = (positive_score - negative_score).cast('float')

# Only the first 20 reviews are scored here.
reviews_with_sentiment = reviews.limit(20).withColumn(
    'sentiment_score',
    when(col('comments').isNull(), lit(0.0)).otherwise(raw_sentiment),
)

reviews_with_sentiment.show()

+----------+---------+-------------------+-----------+-------------+--------------------+---------------+
|listing_id|       id|               date|reviewer_id|reviewer_name|            comments|sentiment_score|
+----------+---------+-------------------+-----------+-------------+--------------------+---------------+
|     13913|    80770|2010-08-18 00:00:00|     177109|      Michael|My girlfriend and...|            2.0|
|     13913|   367568|2011-07-11 00:00:00|   19835707|      Mathias|Alina was a reall...|            1.0|
|     13913|   529579|2011-09-13 00:00:00|    1110304|      Kristin|Alina is an amazi...|            1.0|
|     13913|   595481|2011-10-03 00:00:00|    1216358|      Camilla|Alina's place is ...|            2.0|
|     13913|   612947|2011-10-09 00:00:00|     490840|        Jorik|Nice location in ...|            2.0|
|     13913|  4847959|2013-05-28 00:00:00|    6405442|         Vera|I'm very happy to...|            3.0|
|     13913|  8142329|2013-10-17 00:00:00|    

### Using SparkSQL

In [33]:
# 3. Rewrite the following code from the previous exercise using SparkSQL:
#
# ```
# from pyspark.sql.functions import length, avg, count
# 
# reviews_with_comment_length = reviews.withColumn('comment_length', length('comments'))
# reviews_with_comment_length \
#   .join(listings, reviews_with_comment_length.listing_id == listings.id, 'inner') \
#   .groupBy('listing_id').agg(
#       avg(reviews_with_comment_length.comment_length).alias('average_comment_length'),
#       count(reviews_with_comment_length.id).alias('reviews_count')
#   ) \
#   .filter('reviews_count >= 5') \
#   .orderBy('average_comment_length', ascending=False) \
#   .show()
# ```
# This was a solution for the task:
#
# "Get top five listings with the highest average review comment length. Only return listings with at least 5 reviews"

# Register both DataFrames as temporary views so the SQL below can refer
# to them by name ("reviews" and "listings").
reviews.createOrReplaceTempView("reviews")
listings.createOrReplaceTempView("listings")

# SQL equivalent of the DataFrame pipeline quoted above:
#   * inner join of reviews to listings on listing_id = id,
#   * per listing: average comment length and number of reviews,
#   * HAVING keeps listings with at least 5 reviews
#     (the DataFrame version used .filter('reviews_count >= 5')),
#   * longest average comment first.
# NOTE(review): the task statement asks for the top five listings, but neither
# this query nor the DataFrame code it mirrors limits the row count -- consider
# adding LIMIT 5 if exactly five rows are wanted.
sql_query = """
SELECT
  r.listing_id,
  AVG(LENGTH(r.comments)) AS average_comment_length,
  COUNT(r.id) AS reviews_count
FROM
  reviews r
JOIN
  listings l
  ON r.listing_id = l.id
GROUP BY
  r.listing_id
HAVING
  COUNT(r.id) >= 5
ORDER BY
  average_comment_length DESC
"""

# Run the query and print the first rows of the result.
spark \
  .sql(sql_query) \
  .show()

+------------------+----------------------+-------------+
|        listing_id|average_comment_length|reviews_count|
+------------------+----------------------+-------------+
|618608352812465378|    1300.1666666666667|            6|
|          28508447|    1089.3333333333333|            6|
|627425975703032358|     951.7777777777778|            9|
|           2197681|                 939.2|            5|
|          13891813|                 905.0|            5|
|            979753|     893.9230769230769|           13|
|630150178279666225|     890.7272727272727|           11|
|           8856894|     890.1666666666666|            6|
|          29469389|                 885.0|            6|
|          22524075|                 885.0|            5|
|           5555679|     878.7169811320755|          106|
|          33385444|                 848.0|            5|
|            565214|     834.0833333333334|           12|
|          53493254|                 831.0|            7|
|          126