In [46]:
from pyspark.sql import SparkSession

# Build the Spark session used by every cell below. getOrCreate() hands back
# the already-running session when one exists instead of starting another.
spark = (
    SparkSession.builder
    .appName("Hotel reservations")
    .getOrCreate()
)

ConnectionRefusedError: [Errno 61] Connection refused

Traceback (most recent call last):
  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/Users/ivanmushketyk/Development/courses-creation/ztm-data-engineering/module-06/lib/python3.12/site-packages/ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "/Users/ivanmushketyk/Development/courses-creation/ztm-data-engineering/module-06/lib/python3.12/site-packages/traitlets/config/application.py", line 1075, in launch_instance
    app.start()
  File "/Users/ivanmushketyk/Development/courses-creation/ztm-data-engineering/module-06/lib/python3.12/site-packages/ipykernel/kernelapp.py", line 739, in start
    self.io_loop.start()
  File "/Users/ivanmushketyk/Development/courses-creation/ztm-data-engineering/module-06/lib/python3.12/site-packages/tornado/platform/asyncio.py", line 205, in start
    self.asyncio_loop.run_forever()
  File "/opt/homebrew/Cellar/python@3.12/3.12.7/Frameworks/Python.framework/Ver

In [2]:
# Load the reservations CSV: the first line supplies the column names and
# Spark infers each column's type from the file contents.
data = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("data/hotel-reservations.csv")
)

# Show the inferred schema so the column types can be checked by eye.
data.printSchema()

root
 |-- Booking_ID: string (nullable = true)
 |-- no_of_adults: integer (nullable = true)
 |-- no_of_children: integer (nullable = true)
 |-- no_of_weekend_nights: integer (nullable = true)
 |-- no_of_week_nights: integer (nullable = true)
 |-- type_of_meal_plan: string (nullable = true)
 |-- required_car_parking_space: integer (nullable = true)
 |-- room_type_reserved: string (nullable = true)
 |-- lead_time: integer (nullable = true)
 |-- arrival_year: integer (nullable = true)
 |-- arrival_month: integer (nullable = true)
 |-- arrival_date: integer (nullable = true)
 |-- market_segment_type: string (nullable = true)
 |-- repeated_guest: integer (nullable = true)
 |-- no_of_previous_cancellations: integer (nullable = true)
 |-- no_of_previous_bookings_not_canceled: integer (nullable = true)
 |-- avg_price_per_room: double (nullable = true)
 |-- no_of_special_requests: integer (nullable = true)
 |-- booking_status: string (nullable = true)



In [3]:
# Number of rows in the loaded reservations DataFrame.
data.count()

36275

In [4]:
# Preview the first rows of the dataset (show() prints 20 rows by default).
data.show()

+----------+------------+--------------+--------------------+-----------------+-----------------+--------------------------+------------------+---------+------------+-------------+------------+-------------------+--------------+----------------------------+------------------------------------+------------------+----------------------+--------------+
|Booking_ID|no_of_adults|no_of_children|no_of_weekend_nights|no_of_week_nights|type_of_meal_plan|required_car_parking_space|room_type_reserved|lead_time|arrival_year|arrival_month|arrival_date|market_segment_type|repeated_guest|no_of_previous_cancellations|no_of_previous_bookings_not_canceled|avg_price_per_room|no_of_special_requests|booking_status|
+----------+------------+--------------+--------------------+-----------------+-----------------+--------------------------+------------------+---------+------------+-------------+------------+-------------------+--------------+----------------------------+------------------------------------+--

In [None]:
# Hold out roughly 20% of the rows for evaluation. The fixed seed makes the
# split repeatable across runs on the same input data.
train_data, test_data = data.randomSplit(weights=[0.8, 0.2], seed=42)

# Report how many rows landed in each split.
for split_label, split_df in (("Train size: ", train_data), ("Test size: ", test_data)):
    print(split_label, split_df.count())

In [31]:
# Ordered list of pipeline stages; later cells append the feature
# transformers and the classifier to it before the Pipeline is built.
# NOTE: re-run this cell before re-running the cells that append to it,
# otherwise the same stages are added twice.
stages = []

In [32]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder 

# Columns treated as categories. arrival_month is numeric in the schema but is
# encoded as a category here rather than used as a number.
categorical_cols = [
    'type_of_meal_plan',
    'room_type_reserved',
    'arrival_month',
    'market_segment_type',
]

# For every categorical column add two stages:
#   1. StringIndexer: raw value      -> numeric index  (<col>_index)
#   2. OneHotEncoder: numeric index  -> one-hot vector (<col>_vec)
for column_name in categorical_cols:
    string_indexer = StringIndexer(inputCol=column_name, outputCol=column_name + '_index')
    print(f'StringIndexer {string_indexer.getInputCol()} -> {string_indexer.getOutputCol()}')

    # The encoder consumes exactly the column the indexer produces.
    encoder = OneHotEncoder(inputCol=string_indexer.getOutputCol(), outputCol=column_name + '_vec')
    print(f'OneHotEncoder {encoder.getInputCol()} -> {encoder.getOutputCol()}')
    print()

    stages.extend([string_indexer, encoder])


StringIndexer type_of_meal_plan -> type_of_meal_plan_index
OneHotEncoder type_of_meal_plan_index -> type_of_meal_plan_vec

StringIndexer room_type_reserved -> room_type_reserved_index
OneHotEncoder room_type_reserved_index -> room_type_reserved_vec

StringIndexer arrival_month -> arrival_month_index
OneHotEncoder arrival_month_index -> arrival_month_vec

StringIndexer market_segment_type -> market_segment_type_index
OneHotEncoder market_segment_type_index -> market_segment_type_vec



In [33]:
# Index the target column so the classifier receives a numeric label.
# With StringIndexer's default ordering (frequencyDesc) the most frequent
# booking_status value becomes label 0.0.
stages.append(
    StringIndexer(inputCol='booking_status', outputCol='booking_status_index')
)

In [34]:
# Names of the one-hot vector columns produced by the OneHotEncoder stages,
# in the same order as categorical_cols.
encoded_categorical_cols = [
    categorical_name + "_vec"
    for categorical_name in categorical_cols
]
encoded_categorical_cols

['type_of_meal_plan_vec',
 'room_type_reserved_vec',
 'arrival_month_vec',
 'market_segment_type_vec']

In [35]:
# Numeric columns fed to the model unchanged (no indexing or encoding).
numeric_cols = [
    'no_of_adults',
    'no_of_children',
    'no_of_weekend_nights',
    'no_of_week_nights',
    'required_car_parking_space',
    'lead_time',
    'repeated_guest',
    'no_of_previous_cancellations',
    'no_of_previous_bookings_not_canceled',
    'avg_price_per_room',
    'no_of_special_requests',
]

In [36]:
# Full list of model inputs: the one-hot encoded categorical vectors first,
# followed by the raw numeric columns.
input_columns = [*encoded_categorical_cols, *numeric_cols]
input_columns

['type_of_meal_plan_vec',
 'room_type_reserved_vec',
 'arrival_month_vec',
 'market_segment_type_vec',
 'no_of_adults',
 'no_of_children',
 'no_of_weekend_nights',
 'no_of_week_nights',
 'required_car_parking_space',
 'lead_time',
 'repeated_guest',
 'no_of_previous_cancellations',
 'no_of_previous_bookings_not_canceled',
 'avg_price_per_room',
 'no_of_special_requests']

In [37]:
from pyspark.ml.feature import VectorAssembler

# Concatenate every input column into the single 'features' vector column
# that the classifier reads.
assembler = VectorAssembler(
    inputCols=input_columns,
    outputCol='features',
)

stages.append(assembler)

Train size:  29040
Test size:  7235


In [39]:
from pyspark.ml.classification import GBTClassifier

# Gradient-boosted trees classifier, added as the final pipeline stage.
# It reads the assembled 'features' vector and the indexed booking status.
gbt = GBTClassifier(
    featuresCol='features',
    labelCol='booking_status_index',
    maxIter=10,  # number of boosting iterations
)

stages.append(gbt)

In [40]:
from pyspark.ml import Pipeline

# Chain all collected stages (indexers, encoders, label indexer, assembler,
# classifier) into one Pipeline so they are fitted and applied together.
pipeline = Pipeline(stages=stages)

In [41]:
# Fit every stage on the training split only; the result is the fitted
# pipeline used for prediction below.
model = pipeline.fit(train_data)

In [42]:
# Run the fitted pipeline on the held-out rows; this adds the feature,
# indexed-label and prediction columns to the test data.
predictions = model.transform(test_data)

# Compare the true (indexed) label with the model's prediction for a few rows.
(
    predictions
    .select('features', 'booking_status_index', 'prediction')
    .show(10)
)

+--------------------+--------------------+----------+
|            features|booking_status_index|prediction|
+--------------------+--------------------+----------+
|(35,[0,3,19,20,24...|                 1.0|       0.0|
|(35,[0,3,9,20,24,...|                 0.0|       0.0|
|(35,[0,3,15,21,24...|                 0.0|       0.0|
|(35,[0,3,14,20,24...|                 1.0|       0.0|
|(35,[0,3,9,21,24,...|                 0.0|       0.0|
|(35,[0,3,9,21,24,...|                 0.0|       0.0|
|(35,[2,3,10,21,24...|                 0.0|       0.0|
|(35,[1,3,16,20,24...|                 0.0|       0.0|
|(35,[0,3,13,21,24...|                 0.0|       0.0|
|(35,[0,3,14,21,24...|                 0.0|       0.0|
+--------------------+--------------------+----------+
only showing top 10 rows



In [43]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Accuracy: fraction of test rows whose prediction equals the indexed label.
# (Evaluator variable renamed from the misspelled `accuracty_evaluator`.)
accuracy_evaluator = MulticlassClassificationEvaluator(
    labelCol='booking_status_index',
    predictionCol='prediction',
    metricName='accuracy',
)
accuracy = accuracy_evaluator.evaluate(predictions)
print(f'Test Accuracy = {accuracy}')

Test Accuracy = 0.8395300621976504


In [44]:
# Precision for a single class. precisionByLabel scores the class selected by
# metricLabel; 0.0 is the evaluator's default and is spelled out here so it is
# clear which booking_status_index value the number refers to.
precision_evaluator = MulticlassClassificationEvaluator(
    labelCol='booking_status_index',
    predictionCol='prediction',
    metricName='precisionByLabel',
    metricLabel=0.0,
)
precision = precision_evaluator.evaluate(predictions)
print(f'Test Precision = {precision}')

Test Precision = 0.848318851530034


In [45]:
# Recall for a single class. recallByLabel scores the class selected by
# metricLabel; 0.0 is the evaluator's default and is spelled out here so it is
# clear which booking_status_index value the number refers to.
recall_evaluator = MulticlassClassificationEvaluator(
    labelCol='booking_status_index',
    predictionCol='prediction',
    metricName='recallByLabel',
    metricLabel=0.0,
)
recall = recall_evaluator.evaluate(predictions)
# Fixed label: this value is recall, but it was previously printed as "Test Precision".
print(f'Test Recall = {recall}')

Test Precision = 0.9261703444009074
