In [1]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.2.tar.gz (317.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.2-py2.py3-none-any.whl size=317812365 sha256=c0c089d0a6eb750a2d0346bced2b845c52c1d58a0691341b8cc0ef0de748b45f
  Stored in directory: /root/.cache/pip/wheels/34/34/bd/03944534c44b677cd5859f248090daa9fb27b3c8f8e5f49574
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.2


In [2]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, VectorAssembler, StandardScaler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql.functions import col

# Initialize Spark Session
spark = SparkSession.builder.appName("HotelBookingClassification").getOrCreate()

# Load the dataset
data = spark.read.csv("/content/merged_file.csv", header=True, inferSchema=True)

# Index categorical features (String to Numeric) with handleInvalid='skip'
indexers = [StringIndexer(inputCol=column, outputCol=column+"_index", handleInvalid='skip').fit(data)
            for column in ['deposit_type', 'country', 'customer_type', 'hotel', 'meal', 'distribution_channel', 'reserved_room_type', 'assigned_room_type']]  # Include all string columns here

# Apply StringIndexer to convert categorical variables
for indexer in indexers:
    data = indexer.transform(data)

# Selecting the top features
selected_features = [
    "hotel_index",  # Use indexed columns
    "lead_time",
    "stays_in_weekend_nights",
    "stays_in_week_nights",
    "adults",
    "children",
    "babies",
    "meal_index",  # Use indexed columns
    "country_index",  # Use indexed columns
    "distribution_channel_index",  # Use indexed columns
    "is_repeated_guest",
    "previous_cancellations",
    "previous_bookings_not_canceled",
    "reserved_room_type_index",  # Use indexed columns
    "assigned_room_type_index",  # Use indexed columns
    "booking_changes",
    "deposit_type_index",  # Use indexed columns
    "days_in_waiting_list",
    "customer_type_index",  # Use indexed columns
    "adr",
    "required_car_parking_spaces",
    "total_of_special_requests",
    "Distribution_Id"
]

# Assemble features into a single vector
assembler = VectorAssembler(inputCols=selected_features, outputCol="features")
data = assembler.transform(data)

# Standardizing the features
scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures")
scaler_model = scaler.fit(data)
data = scaler_model.transform(data)

# Select only necessary columns
data = data.select("scaledFeatures", col("is_canceled").alias("label"))

# Train-test split
train_data, test_data = data.randomSplit([0.7, 0.3], seed=42)



In [3]:
# Train Logistic Regression model
lr = LogisticRegression(featuresCol='scaledFeatures', labelCol='label', maxIter=100)
lr_model = lr.fit(train_data)

# Make predictions
predictions = lr_model.transform(test_data)

# Evaluate the model
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print(f"LogisticRegression Accuracy: {accuracy}")

# Feature importance (coefficients)
coefficients = lr_model.coefficients.toArray()
feature_importance = [(selected_features[i], coefficients[i]) for i in range(len(coefficients))]
sorted_importance = sorted(feature_importance, key=lambda x: abs(x[1]), reverse=True)

print("\nFeature Importance (Coefficients):")
for feature, coeff in sorted_importance:
    print(f"{feature}: {coeff}")

LogisticRegression Accuracy: 0.7910188043783328

Feature Importance (Coefficients):
required_car_parking_spaces: -9.648986848456225
previous_cancellations: 2.5578133668982623
deposit_type_index: 1.2475470323101092
previous_bookings_not_canceled: -0.7125351595653298
assigned_room_type_index: -0.5154199507666305
customer_type_index: -0.45003437741049607
total_of_special_requests: -0.44501484930844554
reserved_room_type_index: 0.3770767407561245
lead_time: 0.35191739782943315
adr: 0.2613300698767953
booking_changes: -0.2519872849323774
distribution_channel_index: -0.22287723924900876
is_repeated_guest: -0.13875229450737947
country_index: -0.07635709598923597
Distribution_Id: 0.07119107291494627
adults: 0.0666361697630424
stays_in_week_nights: 0.0637046146022792
children: 0.06114398101395882
hotel_index: -0.05374166806378894
meal_index: 0.043544868551667754
days_in_waiting_list: -0.032244350132075245
stays_in_weekend_nights: 0.019960858034741086
babies: 0.00841409780043186
