#### Data Exploration


#### 1. Load Data


In [0]:
import os

# booking.csv is read from the "data" directory located in the parent of the
# current working directory, addressed through a local "file:" URI.
project_root = os.path.dirname(os.getcwd())
booking_csv_uri = f"file:{project_root}/data/booking.csv"

# Read with the "header" and "inferSchema" options both enabled.
booking_df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv(booking_csv_uri)
)
display(booking_df)

In [0]:
# Print the schema (column names and data types) of the loaded bookings DataFrame
booking_df.printSchema()

#### 2. Data Quality Check


In [0]:
# Count the NULL entries in every column of the bookings DataFrame
from pyspark.sql.functions import col, count, when

# One aggregate per column: when() has no otherwise(), so it yields NULL for rows
# where the column is present, and count() skips NULLs. The result is therefore the
# number of rows in which the column itself is NULL.
null_count_exprs = [
    count(when(col(name).isNull(), name)).alias(name)
    for name in booking_df.columns
]

missing_values_df = booking_df.select(*null_count_exprs)
display(missing_values_df)

#### 3. Feature Analysis


In [0]:
# Histogram each key numerical column to inspect its distribution
import matplotlib.pyplot as plt
import pandas as pd

# Collect the Spark DataFrame to the driver as a pandas DataFrame so it can be plotted
booking_pd_df = booking_df.toPandas()

# Columns treated as numerical for the plots in this notebook
numerical_columns = [
    "number of adults",
    "number of children",
    "number of weekend nights",
    "number of week nights",
    "lead time",
    "repeated",
    "P-C",
    "P-not-C",
    "average price",
    "special requests",
]

# One histogram panel per column, 15 bins each, drawn on a single 15x10 figure
numeric_features_pd = booking_pd_df[numerical_columns]
numeric_features_pd.hist(bins=15, figsize=(15, 10))
plt.show()

In [0]:
import seaborn as sns

# Frequency table for each categorical column, most common value first
categorical_columns = ["type of meal", "room type", "market segment type", "booking status"]

# The loop variable is deliberately not named `col`: that name is bound to
# pyspark.sql.functions.col by the missing-values cell, and rebinding it to a
# string here would make any later `col(...)` call fail with
# "'str' object is not callable".
for categorical_column in categorical_columns:
    display(
        booking_df.groupBy(categorical_column).count().orderBy("count", ascending=False)
    )

# Pairwise correlations between the numerical columns, drawn as an annotated heatmap
correlation_matrix = booking_pd_df[numerical_columns].corr()
plt.figure(figsize=(10, 8))
heatmap_ax = sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm")
heatmap_ax.set_title("Correlation between numerical booking features")
plt.show()