In [33]:
from pyspark.sql import SparkSession
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, StandardScaler
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql.functions import isnan, when, count, col
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml import Pipeline

In [13]:
# Initialize SparkSession
spark = SparkSession.builder.appName("HotelReservations").getOrCreate()

In [14]:
# Load the dataset
data = spark.read.csv("data/hotel-reservations.csv", header=True, inferSchema=True)

# Display the schema of the dataset
data.printSchema()

root
 |-- Booking_ID: string (nullable = true)
 |-- no_of_adults: integer (nullable = true)
 |-- no_of_children: integer (nullable = true)
 |-- no_of_weekend_nights: integer (nullable = true)
 |-- no_of_week_nights: integer (nullable = true)
 |-- type_of_meal_plan: string (nullable = true)
 |-- required_car_parking_space: integer (nullable = true)
 |-- room_type_reserved: string (nullable = true)
 |-- lead_time: integer (nullable = true)
 |-- arrival_year: integer (nullable = true)
 |-- arrival_month: integer (nullable = true)
 |-- arrival_date: integer (nullable = true)
 |-- market_segment_type: string (nullable = true)
 |-- repeated_guest: integer (nullable = true)
 |-- no_of_previous_cancellations: integer (nullable = true)
 |-- no_of_previous_bookings_not_canceled: integer (nullable = true)
 |-- avg_price_per_room: double (nullable = true)
 |-- no_of_special_requests: integer (nullable = true)
 |-- booking_status: string (nullable = true)



In [15]:
data.count()

36275

In [16]:
data.show()

+----------+------------+--------------+--------------------+-----------------+-----------------+--------------------------+------------------+---------+------------+-------------+------------+-------------------+--------------+----------------------------+------------------------------------+------------------+----------------------+--------------+
|Booking_ID|no_of_adults|no_of_children|no_of_weekend_nights|no_of_week_nights|type_of_meal_plan|required_car_parking_space|room_type_reserved|lead_time|arrival_year|arrival_month|arrival_date|market_segment_type|repeated_guest|no_of_previous_cancellations|no_of_previous_bookings_not_canceled|avg_price_per_room|no_of_special_requests|booking_status|
+----------+------------+--------------+--------------------+-----------------+-----------------+--------------------------+------------------+---------+------------+-------------+------------+-------------------+--------------+----------------------------+------------------------------------+--

In [76]:
stages = []

In [77]:
categorical_cols = ['type_of_meal_plan', 'room_type_reserved', 'arrival_month', 'market_segment_type']

for col in categorical_cols:
    string_indexer = StringIndexer(inputCol=col, outputCol= col + '_index')
    encoder = OneHotEncoder(inputCol=string_indexer.getOutputCol(), outputCol=col + '_vec')
    stages += [string_indexer, encoder]

stages += [StringIndexer(inputCol='booking_status', outputCol= 'booking_status_index')]

In [78]:
numeric_cols = ['no_of_adults', 'no_of_children', 'no_of_weekend_nights', 'no_of_week_nights',
               'required_car_parking_space', 'lead_time', 'arrival_year', 'arrival_date',
               'repeated_guest', 'no_of_previous_cancellations', 'no_of_previous_bookings_not_canceled',
               'avg_price_per_room', 'no_of_special_requests'] 

In [79]:
input_columns = [c + "_vec" for c in categorical_cols] + numeric_cols

In [80]:
assembler = VectorAssembler(inputCols=input_columns, outputCol='features')
stages += [assembler]


In [81]:
# pipeline = Pipeline(stages=stages)

# model = pipeline.fit(train_data)
# model.transform(test_data).show()

In [82]:
train_data, test_data = data.randomSplit([0.8, 0.2], seed=42)
print("Train size: ", train_data.count())
print("Test size: ", test_data.count())

Train size:  29040
Test size:  7235


In [83]:
# Create GBTClassifier
gbt = GBTClassifier(featuresCol='features', labelCol='booking_status_index', maxIter=10)
stages.append(gbt)

In [84]:

pipeline = Pipeline(stages=stages)

model = pipeline.fit(train_data)

In [86]:
# Make predictions
predictions = model.transform(test_data)

# Evaluate model
evaluator = MulticlassClassificationEvaluator(labelCol='booking_status_index', predictionCol='prediction', metricName='accuracy')
accuracy = evaluator.evaluate(predictions)
print("Test Accuracy = %g" % accuracy)
print("Test Error = %g" % (1.0 - accuracy))


Test Accuracy = 0.842018
Test Error = 0.157982


24/09/25 23:12:57 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
24/09/25 23:12:57 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS
