In [67]:
import timeit
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, TimestampType

In [68]:
start_time = timeit.default_timer()

spark = SparkSession.builder \
    .appName("IcebergLocalDevelopment") \
    .master("local[*]") \
    .config('spark.jars.packages', 'org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.5.2') \
    .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions") \
    .config("spark.sql.catalog.local", "org.apache.iceberg.spark.SparkCatalog") \
    .config("spark.sql.catalog.local.type", "hadoop") \
    .config("spark.sql.catalog.local.warehouse", "spark-warehouse/iceberg") \
    .config("spark.sql.catalogImplementation", "hive") \
    .enableHiveSupport() \
    .getOrCreate()


end_time = timeit.default_timer()
elapsed_time = end_time - start_time

print("Elapsed time : ",elapsed_time)

Elapsed time :  0.016270587000690284


In [69]:
start_time = timeit.default_timer()

import os, json
df1=spark.read.option("multiline","true").json("amenities_sample_output.json")
df2=spark.read.option("multiline","true").json("amenity_category.json")
df3=spark.read.json("expedia-lodging-amenities-en_us-1-all.jsonl")

end_time = timeit.default_timer()
elapsed_time = end_time - start_time

print("Elapsed time : ",elapsed_time)



Elapsed time :  18.173789105999276


                                                                                

In [70]:
df3=df3.limit(1000)

In [71]:
df3.count()

1000

In [72]:
df1.show(truncate=False)

+-------------------+---------------+--------------------------+----------------+
|amenities          |amenities_count|amenity_categories        |themes          |
+-------------------+---------------+--------------------------+----------------+
|[list of amenities]|45             |[list of amenity category]|[list of themes]|
+-------------------+---------------+--------------------------+----------------+



In [73]:
from pyspark.sql.functions import col, concat_ws, lower, split, size

# Flatten propertyAmenities and roomAmenities
flattened_df = df3.select(
    col("propertyId.expedia").alias("expedia_id"),
    col("popularAmenities").alias("themes"),
    concat_ws(", ", *[
        col(f"propertyAmenities.{col_name}") for col_name in df3.schema["propertyAmenities"].dataType.fieldNames()
    ]).alias("property_amenities"),
    concat_ws(", ", *[
        col(f"roomAmenities.{col_name}") for col_name in df3.schema["roomAmenities"].dataType.fieldNames()
    ]).alias("room_amenities")
)

# Combine both amenities into one field
combined_df = flattened_df.select(
    col("expedia_id"),
    col("themes"),
    lower(concat_ws(", ", col("property_amenities"), col("room_amenities"))).alias("combined_amenities")
)

# Add amenities_count column by splitting combined_amenities into an array and calculating its size
# Instead of recreating combined_amenities, use the existing column
result_df = combined_df.withColumn(
    "combined_amenities",
    split(col("combined_amenities"), ",\\s*")  # Split the existing combined_amenities column
)

# Step 2: Create the 'amenities_count' column based on the size of the 'combined_amenities' list
result_df = result_df.withColumn(
    "amenities_count",
    size(col("combined_amenities"))
)

# Show the result
result_df.show(truncate=False)

+----------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [6]:
'''
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, explode, collect_list, array_distinct, lower, regexp_replace, broadcast
import json

# Load the data into a PySpark DataFrame
data = [
    (33554978, ["AC", "BARBECUE", "DRYER", "HOT_TUB", "KITCHEN", "MICROWAVE", "OUTDOOR_SPACE", "POOL", "TENNIS_COURT", "WASHER"],
     ["hair dryer", "shampoo", "soap", "toilet paper", "towels provided", "beach sun loungers", "beach towels", "near the beach", "bed sheets provided", "air conditioning", "dining table", "books", "dvd player", "music library", "stereo", "video library", "tv with cable/satellite service", "wifi available", "blender", "dishwasher", "ice maker", "microwave", "oven", "paper towels", "refrigerator", "stovetop", "toaster", "washing machine and dryer", "near the sea", "barbecue grill", "deck or patio", "garden", "car required", "no pets allowed", "fence around the pool", "outdoor pool", "private pool", "spa tub", "housekeeping (on request)", "iron/ironing board", "phone", "if you have requests for specific accessibility needs, please contact the property using the information on the reservation confirmation received after booking. ", "smoke-free property", "wheelchair accessible", "birdwatching nearby", "cycling nearby", "golf nearby", "hiking nearby", "kayaking nearby", "mountain biking nearby", "scuba diving nearby", "swimming nearby", "tennis on site", "whale watching nearby"], 56),
    (33554974, ["BARBECUE", "KITCHEN", "MICROWAVE"],
     ["hair dryer", "shampoo", "toilet paper", "towels provided", "bed sheets provided", "heating", "tv", "wifi available", "coffee/tea maker", "cookware/dishes/utensils", "electric kettle", "microwave", "oven", "refrigerator", "toaster", "barbecue grill", "car not required", "no pets allowed", "fire extinguisher", "smoke detector", "iron/ironing board", "smoke-free property"], 23)
]
schema = ["expedia_id", "themes", "combined_amenities", "amenities_count"]

result_df = spark.createDataFrame(data, schema)
'''

'\nfrom pyspark.sql import SparkSession\nfrom pyspark.sql.functions import col, explode, collect_list, array_distinct, lower, regexp_replace, broadcast\nimport json\n\n# Load the data into a PySpark DataFrame\ndata = [\n    (33554978, ["AC", "BARBECUE", "DRYER", "HOT_TUB", "KITCHEN", "MICROWAVE", "OUTDOOR_SPACE", "POOL", "TENNIS_COURT", "WASHER"],\n     ["hair dryer", "shampoo", "soap", "toilet paper", "towels provided", "beach sun loungers", "beach towels", "near the beach", "bed sheets provided", "air conditioning", "dining table", "books", "dvd player", "music library", "stereo", "video library", "tv with cable/satellite service", "wifi available", "blender", "dishwasher", "ice maker", "microwave", "oven", "paper towels", "refrigerator", "stovetop", "toaster", "washing machine and dryer", "near the sea", "barbecue grill", "deck or patio", "garden", "car required", "no pets allowed", "fence around the pool", "outdoor pool", "private pool", "spa tub", "housekeeping (on request)", "iro

In [76]:
# Load the amenity_category.json file
with open("amenity_category.json", "r") as file:
    amenity_categories = json.load(file)

# Convert the amenity category dictionary into a DataFrame
# Flatten the categories if they are lists
amenity_category_data = []

In [77]:
for key, value in amenity_categories.items():
    if isinstance(value, list):
        for category in value:
            amenity_category_data.append((key, category))
    else:
        amenity_category_data.append((key, value))

In [78]:
amenity_category_data

[('1_game_drive_per_night', 'Entertainment'),
 ('24_hour_business_center', 'Business Services'),
 ('24_hour_fitness_facilities', 'Wellness Facilities'),
 ('24_hour_front_desk', 'Guest Services'),
 ('24_hour_health_club', 'Wellness Facilities'),
 ('24_hour_pool_access', 'Pool'),
 ('2_for_1_buffet', 'Restaurant'),
 ('2_game_drives_per_night', 'Entertainment'),
 ('300_thread_count_linen', 'Bedding/linens'),
 ('400_count_egyptian_100%_cotton_sheets', 'Bedding/linens'),
 ('a/c_or_climate_control', 'Air Conditioner'),
 ('above_ground_pool', 'Pool'),
 ('ac', 'Air Conditioner'),
 ('access', 'Wheelchair Accessible'),
 ('access', 'Accessibility'),
 ('access_to_nearby_health_club', 'Wellness Facilities'),
 ('access_to_nearby_indoor_pool', 'Pool'),
 ('access_to_nearby_outdoor_pool', 'Pool'),
 ('access_via_exterior_corridors', 'Accessibility'),
 ('accessible', 'Wheelchair Accessible'),
 ('accessible', 'Accessibility'),
 ('accessible_bathtub', 'Accessibility'),
 ('accessible_bathtub', 'Wheelchair Ac

In [79]:
# Now create the DataFrame with flattened categories
amenity_category_df = spark.createDataFrame(amenity_category_data, ["amenity", "category"])


In [80]:
amenity_category_df.show(truncate=False)

+-------------------------------------+---------------------+
|amenity                              |category             |
+-------------------------------------+---------------------+
|1_game_drive_per_night               |Entertainment        |
|24_hour_business_center              |Business Services    |
|24_hour_fitness_facilities           |Wellness Facilities  |
|24_hour_front_desk                   |Guest Services       |
|24_hour_health_club                  |Wellness Facilities  |
|24_hour_pool_access                  |Pool                 |
|2_for_1_buffet                       |Restaurant           |
|2_game_drives_per_night              |Entertainment        |
|300_thread_count_linen               |Bedding/linens       |
|400_count_egyptian_100%_cotton_sheets|Bedding/linens       |
|a/c_or_climate_control               |Air Conditioner      |
|above_ground_pool                    |Pool                 |
|ac                                   |Air Conditioner      |
|access 

In [81]:
result_df.show(truncate=False)

+----------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [82]:
from pyspark.sql import functions as F

# Exploding the combined_amenities column into individual rows
result_df_exploded = result_df.withColumn("amenity", F.explode("combined_amenities"))

In [83]:
# Joining the exploded result_df with the amenity_category_df to get the category
result_df_mapped = result_df_exploded.join(
    amenity_category_df, 
    result_df_exploded["amenity"] == amenity_category_df["amenity"], 
    "left"
)

In [84]:
result_df_mapped.show(5,truncate=False)



+----------+---------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

                                                                                

In [85]:
# Grouping by expedia_id and aggregating the categories into a list
result_df_final = result_df_mapped.groupBy("expedia_id", "themes","combined_amenities", "amenities_count") \
                                  .agg(F.collect_list("category").alias("categories"))
result_df_final = result_df_final.withColumn(
    "categories", F.array_distinct(F.col("categories"))
)
# Show the result
result_df_final.show(truncate=False)



+----------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

                                                                                

In [86]:
specific_expedia_id=2515

specific_sample = result_df.filter(result_df.expedia_id == specific_expedia_id)

specific_sample.show(truncate=False)



+----------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

                                                                                

In [87]:
check=specific_sample.select("combined_amenities").show(truncate=False)



+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

                                                                                

In [88]:
specific_row = result_df.filter(F.col("expedia_id") == 2515)


In [89]:
specific_row = result_df.filter(F.col("expedia_id") == 2515)
combined_amenities_list = specific_row.select("combined_amenities").collect()


                                                                                

In [90]:
combined_amenities_list

[Row(combined_amenities=['accessible bathroom (select rooms)', 'assistive listening devices available', 'braille signage', 'elevator', 'if you have requests for specific accessibility needs', 'please contact the property using the information on the reservation confirmation received after booking. ', 'in-room accessibility (select rooms)', 'well-lit path to entrance', 'wheelchair-accessible parking', 'wheelchair-accessible path of travel', 'wheelchair-accessible path to elevator', 'wheelchair-accessible registration desk', 'wheelchair-accessible van parking', 'cave exploring', 'health/beauty spa', 'hiking/biking trails', 'mountain climbing', 'outlet shopping', 'rock climbing', 'segway rentals/tours', 'theme parks', 'winery tours', 'zoo', 'business center', 'meeting room', 'elevator', 'free newspapers in lobby', 'art supplies', 'free cribs/infant beds', 'laundry facilities', 'microwave', 'pool gate', 'refrigerator', 'seasonal outdoor pool', 'free buffet breakfast available daily', '24-h