In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for file_name in files:
        full_path = os.path.join(root, file_name)
        print(full_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import plotly.express as px
import folium

# for ML:
from sklearn.model_selection import train_test_split, KFold, cross_validate, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
import eli5

In [None]:
# Location of the hotel bookings CSV, relative to the notebook's working directory.
path = "../input/hotel-booking-demand/hotel_bookings.csv"
# Raw bookings table; later cells work on copies of `df` (see `data` and `data1`).
df = pd.read_csv(path)

In [None]:
# Preview the first rows of the raw bookings table.
df.head()

In [None]:
# (number of rows, number of columns) of the raw bookings table.
df.shape

In [None]:
# Independent working copy for the EDA section, so `df` itself stays unmodified.
data = df.copy()

# 1. EDA

How much do guests pay for a room per night?

In [None]:
# Keep only bookings that were not canceled, so prices reflect actual stays.
# The explicit .copy() makes `data` an independent frame before a column is added.
data = data.loc[data["is_canceled"] != 1].copy()

# Paying guests per booking (adults + children). Missing `children` counts are
# treated as 0, and bookings with zero guests become NaN so that the per-person
# price below is never inf (division by zero).
paying_guests = data["adults"] + data["children"].fillna(0)
data["adr_pp"] = data["adr"] / paying_guests.where(paying_guests > 0)

# Per-person price by hotel and room type, ordered by room type for the x axis.
room_prices = data[["hotel", "reserved_room_type", "adr_pp"]].sort_values("reserved_room_type")

plt.figure(figsize=(12, 8))
sns.boxplot(x="reserved_room_type",
            y="adr_pp",
            hue="hotel",
            data=room_prices,
            hue_order=["City Hotel", "Resort Hotel"],
            fliersize=0)  # fliersize=0 hides outlier markers
# The plotted value is adr_pp (adr divided by guests), so label it as per person.
plt.title("Price of room types per night and person", fontsize=16)
plt.xlabel("Room type", fontsize=16)
plt.ylabel("Price per person", fontsize=16)
plt.legend(loc="upper right")
plt.ylim(0, 160)  # zoom in on the bulk of the distribution
plt.show()

How long do people stay at the hotels?

In [None]:
# Total length of each stay: weekend nights plus week nights.
data["total_nights"] = data["stays_in_weekend_nights"] + data["stays_in_week_nights"]

# Count bookings per stay length once and reuse the result for both names.
guests_days = data["total_nights"].value_counts()
days = guests_days.index

# Longest stay shown in the chart; longer stays are left out to keep it readable.
MAX_NIGHTS_SHOWN = 22

# Select the stays to display by their actual number of nights instead of cropping
# the categorical axis with plt.xlim, which cuts the first and last bars in half.
shown = guests_days.sort_index()
shown = shown[shown.index <= MAX_NIGHTS_SHOWN]

plt.figure(figsize=(16, 8))
sns.barplot(x=shown.index, y=shown.to_numpy())
plt.title("Length of stay", fontsize=16)
plt.xlabel("Number of nights", fontsize=16)
plt.ylabel("Guests", fontsize=16)
plt.show()

Which months are the busiest?

In [None]:
# Calendar order of the month names; groupby() alone would sort them alphabetically
# and the line would jump from April to August to December.
MONTH_ORDER = ["January", "February", "March", "April", "May", "June",
               "July", "August", "September", "October", "November", "December"]

# How many times each month occurs in the data set.
# NOTE(review): presumably July and August appear in three calendar years and all
# other months in two -- verify against the arrival year column.
years_covered = pd.Series(2.0, index=MONTH_ORDER)
years_covered[["July", "August"]] = 3.0

# Bookings per month, put into calendar order and averaged over the years covered.
# .div() returns a new float Series, so no floats are written into an integer Series.
guests_monthly = (
    data.groupby("arrival_date_month")["hotel"].count()
    .reindex(MONTH_ORDER)
    .div(years_covered)
)
month = guests_monthly.index

plt.figure(figsize=(12, 8))
# sort=False keeps the calendar order given above instead of re-sorting the x values.
sns.lineplot(x=month, y=guests_monthly.to_numpy(), sort=False)
plt.title("Average number of guests per month", fontsize=16)
plt.xlabel("Month", fontsize=16)
plt.xticks(rotation=45)
plt.ylabel("Number of guests", fontsize=16)
plt.show()

# 2. Predict cancellations

In [None]:
# Fresh copy of the raw table for the modelling section; unlike `data`, it still
# contains the canceled bookings, which are the prediction target.
data1 = df.copy()

In [None]:
# Number of missing values per column, before any cleaning.
data1.isnull().sum()

In [None]:
# Remove the `company` column and fill the remaining gaps with neutral defaults:
# no children, an "Unknown" country and agent id 0.
# errors="ignore" keeps the cell re-runnable: on a second execution `company` is
# already gone and a plain drop would raise KeyError.
data1 = (
    data1
    .drop(columns=["company"], errors="ignore")
    .fillna({"children": 0, "country": "Unknown", "agent": 0})
)

In [None]:
# Re-count missing values per column to check the result of the fill step.
data1.isnull().sum()

In [None]:
# Column dtypes; used to tell numeric columns from categorical (object) ones.
data1.dtypes

In [None]:
# Correlation of every numeric column with the target.
# Restricting to numeric/bool columns first keeps this working on pandas >= 2.0,
# where DataFrame.corr() no longer silently skips string columns and raises instead.
cancel_corr = data1.select_dtypes(include=["number", "bool"]).corr()["is_canceled"]

# Rank features by absolute correlation. The target is removed by label rather than
# by position, so the result does not depend on it sorting first.
cancel_corr.drop("is_canceled").abs().sort_values(ascending=False)

In [None]:
# Numeric predictors, passed through with constant-value imputation only.
num_features = [
    "lead_time",
    "arrival_date_week_number",
    "arrival_date_day_of_month",
    "stays_in_weekend_nights",
    "stays_in_week_nights",
    "adults",
    "children",
    "babies",
    "is_repeated_guest",
    "previous_cancellations",
    "previous_bookings_not_canceled",
    "agent",
    "required_car_parking_spaces",
    "total_of_special_requests",
    "adr",
]

# Categorical predictors, one-hot encoded below.
cat_features = [
    "hotel",
    "arrival_date_month",
    "meal",
    "market_segment",
    "distribution_channel",
    "reserved_room_type",
    "deposit_type",
    "customer_type",
]

features = num_features + cat_features

# Design matrix (selected predictors only) and binary target.
X = data1[features]
y = data1["is_canceled"]

# Numeric columns: fill gaps with the imputer's default constant.
num_transformer = SimpleImputer(strategy="constant")

# Categorical columns: fill gaps with "Unknown", then one-hot encode; categories
# not seen during fit are ignored at transform time.
cat_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="constant", fill_value="Unknown")),
    ("onehot", OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own transformer.
preprocessor = ColumnTransformer([
    ("num", num_transformer, num_features),
    ("cat", cat_transformer, cat_features),
])

In [None]:
# Candidate classifiers, all seeded so that results are reproducible.
# LogisticRegression gets a higher max_iter than the default of 100, which is
# often too few for the solver to converge on unscaled features; raise it further
# (or scale the numeric columns) if a ConvergenceWarning still appears.
base_models = [("DT_model", DecisionTreeClassifier(random_state=42)),
               ("RF_model", RandomForestClassifier(random_state=42, n_jobs=-1)),
               ("LR_model", LogisticRegression(random_state=42, n_jobs=-1, max_iter=1000)),
               ("XGB_model", XGBClassifier(random_state=42, n_jobs=-1))]

# Shuffled k-fold split with a fixed seed, shared by all models so their
# scores are computed on identical folds.
kfolds = 4
split = KFold(n_splits=kfolds, shuffle=True, random_state=42)

for name, model in base_models:
    # Preprocessing lives inside the pipeline, so it is fitted on the training
    # part of each fold only.
    model_steps = Pipeline(steps=[('preprocessor', preprocessor),
                                  ('model', model)])

    cv_results = cross_val_score(model_steps,
                                 X, y,
                                 cv=split,
                                 scoring="accuracy",
                                 n_jobs=-1)

    # Summary statistics over the per-fold accuracies.
    min_score = round(cv_results.min(), 4)
    max_score = round(cv_results.max(), 4)
    mean_score = round(cv_results.mean(), 4)
    std_dev = round(cv_results.std(), 4)
    print(f"{name} cross validation accuracy score: {mean_score} +/- {std_dev} (std) min: {min_score}, max: {max_score}")