In [1]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.2.tar.gz (317.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.2-py2.py3-none-any.whl size=317812365 sha256=6bd8bbdb0c182186f330504dfe077cfa1313e53019f800123e5b73d3e8659def
  Stored in directory: /root/.cache/pip/wheels/34/34/bd/03944534c44b677cd5859f248090daa9fb27b3c8f8e5f49574
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.2


In [4]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, VectorAssembler, StandardScaler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql.functions import col

# ---------------------------------------------------------------------------
# Data preparation for the hotel-booking cancellation classifier.
#
# Produces `train_data` and `test_data`, each with exactly two columns:
#   scaledFeatures - standardized feature vector built from `selected_features`
#   label          - the `is_canceled` column, renamed
# ---------------------------------------------------------------------------

# Initialize Spark Session (getOrCreate reuses a session that is already running,
# so re-executing this cell does not start a second one)
spark = SparkSession.builder.appName("HotelBookingClassification").getOrCreate()

# Load the dataset: the first row is the header, column types are inferred
data = spark.read.csv("/content/merged_file (2).csv", header=True, inferSchema=True)

# String-typed columns that have to be turned into numeric indices
categorical_columns = [
    'deposit_type', 'country', 'customer_type', 'hotel', 'meal',
    'distribution_channel', 'reserved_room_type', 'assigned_room_type',
]  # Include all string columns here

# Fit one StringIndexer per categorical column; each writes "<column>_index".
# handleInvalid='skip' drops rows whose value is invalid for the indexer.
# NOTE(review): the indexers are deliberately still fitted on the full dataset.
# Fitting them on the training split only would, with handleInvalid='skip',
# silently drop every test row that carries a category unseen during training.
indexers = [
    StringIndexer(inputCol=column, outputCol=column + "_index", handleInvalid='skip').fit(data)
    for column in categorical_columns
]

# Apply the fitted indexers one after another to add the *_index columns
for indexer in indexers:
    data = indexer.transform(data)

# Feature columns fed to the model (indexed columns replace the raw strings).
# The order matters: position i in the assembled vector is selected_features[i].
selected_features = [
    "hotel_index",  # Use indexed columns
    "lead_time",
    "stays_in_weekend_nights",
    "stays_in_week_nights",
    "adults",
    "children",
    "babies",
    "meal_index",  # Use indexed columns
    "country_index",  # Use indexed columns
    "distribution_channel_index",  # Use indexed columns
    "is_repeated_guest",
    "previous_cancellations",
    "previous_bookings_not_canceled",
    "reserved_room_type_index",  # Use indexed columns
    "assigned_room_type_index",  # Use indexed columns
    "booking_changes",
    "deposit_type_index",  # Use indexed columns
    "days_in_waiting_list",
    "customer_type_index",  # Use indexed columns
    "adr",
    "required_car_parking_spaces",
    "total_of_special_requests",
    "Distribution_Id"
]

# Assemble the feature columns into a single vector and keep only what the
# model needs: the raw feature vector and the target renamed to "label"
assembler = VectorAssembler(inputCols=selected_features, outputCol="features")
data = assembler.transform(data).select("features", col("is_canceled").alias("label"))

# Train-test split. This happens BEFORE the scaler is fitted so that the
# scaling statistics are learned from the training rows only; fitting on the
# full dataset would leak information about the test rows into training.
train_unscaled, test_unscaled = data.randomSplit([0.7, 0.3], seed=42)

# Standardize the features (default StandardScaler settings), fitted on train only
scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures")
scaler_model = scaler.fit(train_unscaled)

# Apply the same fitted scaler to both splits
train_data = scaler_model.transform(train_unscaled).select("scaledFeatures", "label")
test_data = scaler_model.transform(test_unscaled).select("scaledFeatures", "label")

# Keep `data` with its previous final schema (scaledFeatures, label) for any
# later cell that still refers to the full, scaled dataset
data = scaler_model.transform(data).select("scaledFeatures", "label")









In [5]:
from pyspark.ml.classification import DecisionTreeClassifier

# ---------------------------------------------------------------------------
# Decision tree on the prepared splits: train, score the test split, and list
# the per-feature importances in descending order.
# ---------------------------------------------------------------------------

# Evaluator used to score predictions. It was referenced here without being
# defined anywhere in the notebook, which raises NameError on a fresh kernel.
# metricName is set explicitly to "accuracy" to match the printed label
# (the evaluator's documented default metric is f1, not accuracy).
evaluator = MulticlassClassificationEvaluator(
    labelCol='label', predictionCol='prediction', metricName='accuracy'
)

# Train Decision Tree model
dt = DecisionTreeClassifier(featuresCol='scaledFeatures', labelCol='label')
dt_model = dt.fit(train_data)

# Make predictions on the held-out split
predictions = dt_model.transform(test_data)

# Evaluate the model
dt_accuracy = evaluator.evaluate(predictions)
print(f"Decision Tree Accuracy: {dt_accuracy}")

# Feature Importance: pair each importance with its feature name. Position i
# of the importance vector corresponds to selected_features[i] because that
# list was the VectorAssembler's inputCols.
dt_feature_importances = dt_model.featureImportances
dt_feature_importance = [
    (feature_name, float(weight))
    for feature_name, weight in zip(selected_features, dt_feature_importances.toArray())
]
# sorted() is stable, so features with equal importance keep their original order
dt_sorted_importance = sorted(dt_feature_importance, key=lambda pair: pair[1], reverse=True)

print("\nFeature Importance (Decision Tree):")
for feature, importance in dt_sorted_importance:
    print(f"{feature}: {importance}")


Decision Tree Accuracy: 0.7924387910569394

Feature Importance (Decision Tree):
deposit_type_index: 0.4818659225068816
country_index: 0.15458354237858682
lead_time: 0.12731818293119956
hotel_index: 0.08647683544414661
customer_type_index: 0.06905293138495466
Distribution_Id: 0.04118918206424597
required_car_parking_spaces: 0.0377635526273485
previous_bookings_not_canceled: 0.0009115423287586655
adr: 0.0003284205198892327
previous_cancellations: 0.0002291277458405811
children: 0.0001427102646231762
stays_in_weekend_nights: 0.0001380498035244681
stays_in_week_nights: 0.0
adults: 0.0
babies: 0.0
meal_index: 0.0
distribution_channel_index: 0.0
is_repeated_guest: 0.0
reserved_room_type_index: 0.0
assigned_room_type_index: 0.0
booking_changes: 0.0
days_in_waiting_list: 0.0
total_of_special_requests: 0.0
