In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, StandardScaler
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql.functions import isnan, when, count, col
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml import Pipeline

In [None]:
# Start a Spark session for this notebook, or reuse one that is already running
session_builder = SparkSession.builder.appName("HotelReservations")
spark = session_builder.getOrCreate()

In [None]:
# Read the reservations CSV: the first row is the header and column types are inferred
data = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("data/hotel-reservations.csv")
)

# Print the column names and the types Spark inferred for them
data.printSchema()

In [None]:
# Total number of rows in the loaded dataset (displayed as the cell output)
data.count()

In [None]:
# Preview the data; show() prints only the first 20 rows by default
data.show()

In [None]:
# Ordered container for the ML pipeline stages (filled in by the cells that follow)
stages = list()

In [None]:
# Categorical feature columns that are indexed and then one-hot encoded
categorical_cols = ['type_of_meal_plan', 'room_type_reserved', 'arrival_month', 'market_segment_type']

# Build one StringIndexer -> OneHotEncoder pair per categorical column.
# - The loop variable is deliberately NOT called `col`: that name is imported from
#   pyspark.sql.functions at the top of the notebook and would be overwritten.
# - handleInvalid='keep' routes categories that were not seen while fitting (for example
#   a rare value missing from a cross-validation training fold) to an extra index
#   instead of raising an error at transform time.
categorical_stages = []
for cat_col in categorical_cols:
    string_indexer = StringIndexer(inputCol=cat_col, outputCol=cat_col + '_index', handleInvalid='keep')
    encoder = OneHotEncoder(inputCol=string_indexer.getOutputCol(), outputCol=cat_col + '_vec')
    categorical_stages += [string_indexer, encoder]

# The label column keeps the default (strict) handling: an unknown label should fail loudly.
label_indexer = StringIndexer(inputCol='booking_status', outputCol='booking_status_index')

# Rebind rather than extend, so re-running this cell does not append duplicate stages.
stages = categorical_stages + [label_indexer]

In [None]:
# Numeric feature columns passed to the assembler as-is (no encoding step)
numeric_cols = [
    'no_of_adults',
    'no_of_children',
    'no_of_weekend_nights',
    'no_of_week_nights',
    'required_car_parking_space',
    'lead_time',
    'arrival_year',
    'arrival_date',
    'repeated_guest',
    'no_of_previous_cancellations',
    'no_of_previous_bookings_not_canceled',
    'avg_price_per_room',
    'no_of_special_requests',
]

In [None]:
# Model inputs: the one-hot vector of each categorical column, followed by the numeric columns
encoded_cols = [name + "_vec" for name in categorical_cols]
input_columns = encoded_cols + numeric_cols

In [None]:
# Combine every input column into the single 'features' vector column
assembler = VectorAssembler(outputCol='features', inputCols=input_columns)
stages.append(assembler)


In [None]:
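# NOTE: early draft kept for reference only. train_data / test_data are not defined
# until the split cell below, and the pipeline is actually built and fitted through
# CrossValidator later in the notebook.
# pipeline = Pipeline(stages=stages)

# model = pipeline.fit(train_data)
# model.transform(test_data).show()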

In [None]:
# 80/20 hold-out split with a fixed seed
train_data, test_data = data.randomSplit(weights=[0.8, 0.2], seed=42)

# Report the number of rows in each split
for label, subset in (("Train size: ", train_data), ("Test size: ", test_data)):
    print(label, subset.count())

In [None]:
# Create the gradient-boosted tree classifier.
# It reads the assembled 'features' vector and predicts the indexed booking status.
# The explicit seed (same value as the train/test split) keeps training reproducible
# across kernel restarts instead of relying on the estimator's default seed.
gbt = GBTClassifier(featuresCol='features', labelCol='booking_status_index', maxIter=10, seed=42)

# Register the classifier as the final stage. Any classifier left over from a previous
# execution of this cell is dropped first, so re-running the cell does not stack
# several GBT stages in the pipeline.
stages = [stage for stage in stages if not isinstance(stage, GBTClassifier)] + [gbt]

In [None]:
# Chain all collected stages (encoders, label indexer, assembler, classifier) into one estimator
pipeline = Pipeline().setStages(stages)


In [None]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# Hyper-parameter grid: 2 x 2 x 2 = 8 candidate settings for the GBT classifier.
# Values listed here for maxIter take precedence over the maxIter given when `gbt` was created.
paramGrid = (
    ParamGridBuilder()
    .addGrid(gbt.maxDepth, [5, 10])
    .addGrid(gbt.maxIter, [10, 20])
    .addGrid(gbt.stepSize, [0.1, 0.2])
    .build()
)

# Score candidates by accuracy of 'prediction' against the indexed booking status
evaluator = MulticlassClassificationEvaluator(labelCol='booking_status_index', predictionCol='prediction', metricName='accuracy')

# 3-fold cross-validation over the whole pipeline, so the indexers/encoders are re-fitted
# on each training fold. The explicit seed fixes the fold assignment, which makes the
# selected hyper-parameters reproducible between runs.
cross_validator = CrossValidator(estimator=pipeline,
                                 estimatorParamMaps=paramGrid,
                                 evaluator=evaluator,
                                 numFolds=3,
                                 seed=42)

# Run cross-validation (8 settings x 3 folds) and keep the model refit with the best setting.
best_model = cross_validator.fit(train_data)

In [None]:
# Score the held-out test split with the cross-validated model
predictions = best_model.transform(test_data)

# Accuracy on the test split, and the matching error rate
accuracy = evaluator.evaluate(predictions)
test_error = 1.0 - accuracy
print("Test Accuracy = %g" % accuracy)
print("Test Error = %g" % test_error)