In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
%matplotlib inline
# Load the booking records from the CSV file (path is relative to the notebook's
# working directory) into the working frame `p`, then preview the first rows.
p=pd.read_csv("hotelbookingdataset.csv")
p.head()

In [None]:
# Summary statistics of the bookings frame.
p.describe()

In [None]:
# (rows, columns) of the bookings frame.
p.shape

In [None]:
# Column dtypes and non-null counts.
p.info()

In [None]:
# Number of distinct values in each column.
p.nunique()

In [None]:
# Missing-value count per column of the frame as loaded.
p.isnull().sum()

# Data preprocessing: cleaning, reduction and transformation

In [7]:
# Length of stay in nights: weekend nights plus week nights, element-wise per booking.
p["total_days_of_staying"] = p["stays_in_weekend_nights"].add(p["stays_in_week_nights"])

In [None]:
# Party size per booking: adults plus children.
# A missing `children` value is counted as 0 so that a single NaN does not turn the
# whole party size into NaN (which would then leak into the model features).
# NOTE(review): assumes "missing" means "no children" - confirm against the data source.
p["total_customers"] = p["adults"] + p["children"].fillna(0)

# Column-wise totals of every numeric column (quick sanity check of the new column).
p.sum(numeric_only=True)


In [None]:
# Re-check missing values now that the derived total_* columns have been added.
p.isnull().sum()

# Exploratory data analysis (EDA)

In [None]:
# Booking counts per hotel, split by the is_canceled flag.
sns.set(style="darkgrid")
ax = sns.countplot(data=p, x="hotel", hue="is_canceled")
# Trailing semicolon keeps the Text repr out of the cell output.
ax.set_title("Cancelled or not by each hotel", fontdict={"fontsize": 20});

# Lead time of each hotel: the period of time (typically measured in calendar days) between when a guest makes the reservation and the actual check-in/arrival date.

In [None]:
# Lead time summed over all bookings of each hotel.
# NOTE(review): a sum grows with the number of bookings; if a per-booking figure is
# what the heading means, mean/median would be the statistic to use - confirm intent.
p.groupby('hotel')['lead_time'].sum()

# Customers who visited (not canceled) and did not visit (canceled) in each hotel

In [None]:
# Sum of total_customers for each (hotel, is_canceled) combination.
p.groupby(['hotel','is_canceled'])["total_customers"].sum()

# Actual revenue and lost revenue of each hotel in each year

In [None]:
# Sum of `adr` for every (hotel, is_canceled, arrival year, deposit type) combination.
# NOTE(review): this adds up the per-booking adr values as-is; it is not weighted by
# the number of nights stayed - confirm that this is the intended "revenue" figure.
revenue = (
    p.groupby(["hotel", "is_canceled", "arrival_date_year", "deposit_type"])["adr"]
    .sum()
)
revenue

In [None]:
# Number of bookings per arrival year, one bar colour per hotel.
plt.figure(figsize=(8, 8))
sns.countplot(data=p, x="arrival_date_year", hue="hotel", palette="husl", ax=plt.gca())
plt.gca().set_title("Arrivals per year in Both hotels ", fontweight="bold", size=20)


In [None]:
# Number of bookings per arrival month.
# Without an explicit `order`, the bars follow the order in which month labels first
# appear in the data, not the calendar, which makes seasonality hard to read.
calendar_months = ["January", "February", "March", "April", "May", "June",
                   "July", "August", "September", "October", "November", "December"]
# NOTE(review): assumes arrival_date_month holds full English month names - confirm.
# Labels outside that list (if any) are appended so no data is silently dropped.
seen_months = p["arrival_date_month"].dropna().unique().tolist()
month_order = ([m for m in calendar_months if m in seen_months]
               + [m for m in seen_months if m not in calendar_months])

plt.figure(figsize=(15, 8))
sns.countplot(data=p, x="arrival_date_month", order=month_order)
plt.title('Arrivals per month', fontweight="bold", size=20)

In [None]:
import plotly.express as px

# World map of visitor counts per country.
# A choropleth expects one row per location. Passing the booking-level frame gives
# every country many duplicate rows, so the colour of a country is not its total.
# Aggregate first; this also avoids sending one point per booking to the browser.
# NOTE(review): cancelled bookings are still included, as in the original cell.
visitors_by_country = p.groupby("country", as_index=False)["total_customers"].sum()

fig = px.choropleth(
    visitors_by_country,
    locations="country",  # NOTE(review): plotly's default location mode expects ISO-3 codes - confirm
    color="total_customers",
    scope="world",
    title="visitors around the world",
    hover_name="country",
)
fig.show()

# Customer satisfaction using Spearman correlation with reduced features

In [21]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from scipy import stats
try:
    from sklearn.metrics import confusion_matrix, roc_auc_score, accuracy_score, plot_confusion_matrix, classification_report
except ImportError:
    # plot_confusion_matrix was removed in scikit-learn 1.2; keep the other names
    # importable and leave a placeholder so the name still exists.
    from sklearn.metrics import confusion_matrix, roc_auc_score, accuracy_score, classification_report
    plot_confusion_matrix = None
from sklearn.metrics import ConfusionMatrixDisplay
import warnings
warnings.filterwarnings(action='ignore')

In [22]:
# Encode the hotel name as an integer code: "Resort Hotel" -> 0, "City Hotel" -> 1.
# The result is assigned back to the column rather than using replace(..., inplace=True)
# on `p["hotel"]`: that form is chained assignment on a selected column and can leave
# `p` unmodified when pandas copy-on-write is active.
# infer_objects() turns an object column holding only ints into a proper integer
# column. Re-running the cell is a no-op because the codes no longer match any key.
p["hotel"] = p["hotel"].replace({"Resort Hotel": 0, "City Hotel": 1}).infer_objects()

In [23]:
# Reduced feature frame: a copy of `p` without the columns listed below.
x = p.drop(
    columns=[
        # arrival date parts
        "arrival_date_year",
        "arrival_date_month",
        "arrival_date_week_number",
        "arrival_date_day_of_month",
        # stay length and party size components
        "stays_in_weekend_nights",
        "stays_in_week_nights",
        "adults",
        "children",
        "babies",
        # booking descriptors
        "meal",
        "country",
        "market_segment",
        "distribution_channel",
        "previous_cancellations",
        "previous_bookings_not_canceled",
        "reserved_room_type",
        "assigned_room_type",
        "deposit_type",
        "agent",
        "company",
        "days_in_waiting_list",
        "customer_type",
        "required_car_parking_spaces",
        # reservation outcome columns
        "reservation_status",
        "reservation_status_date",
    ]
)

In [None]:
# Display the reduced feature frame.
x

In [None]:
# Rows of the reduced frame whose hotel code equals 0.
Resort_hotel = x[x["hotel"].eq(0)]
Resort_hotel

In [None]:
# Spearman rank correlation between the columns of the resort-hotel subset.
corr = Resort_hotel.corr(method='spearman')

# Boolean mask hiding the upper triangle (diagonal included), so each pair of columns
# is shown once. The built-in `bool` is used because the `np.bool` alias was removed
# in NumPy 1.24 and now raises AttributeError.
mask = np.triu(np.ones_like(corr, dtype=bool))

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(20, 18))

# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, annot=True, mask=mask, cmap="YlGnBu", center=0,
            square=True, linewidths=.5)

In [None]:
# Rows of the reduced frame whose hotel code equals 1.
city_hotel = x[x["hotel"].eq(1)]
city_hotel

In [None]:
# Spearman rank correlation between the columns of the city-hotel subset.
corr = city_hotel.corr(method='spearman')

# Boolean mask hiding the upper triangle (diagonal included), so each pair of columns
# is shown once. The built-in `bool` is used because the `np.bool` alias was removed
# in NumPy 1.24 and now raises AttributeError.
mask = np.triu(np.ones_like(corr, dtype=bool))

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(20, 18))

# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, annot=True, mask=mask, cmap="YlGnBu", center=0,
            square=True, linewidths=.5)

# prediction of cancellation using supervised classifier

In [29]:
features = ['hotel', 'lead_time', "is_repeated_guest", "booking_changes",
            "adr", "total_of_special_requests", "total_days_of_staying", "total_customers"]
target = ['is_canceled']

# Rows with a missing feature or label are dropped before splitting; estimators that
# reject NaN would otherwise fail at fit time.
model_data = x[features + target].dropna()
y_all = model_data[target].to_numpy().ravel()  # 1-D labels, the shape estimators expect

# Hold out 20% of the bookings for evaluation. Training and testing on the same rows
# (as this cell did before) reports memorisation, not predictive performance.
# Stratifying keeps the cancelled / not-cancelled ratio equal in both parts, and the
# fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    model_data[features],
    y_all,
    test_size=0.2,
    random_state=42,
    stratify=y_all,
)

# Normalize Features
# The scaler learns mean/std from the training part only; the test part is transformed
# with those same statistics so no information from the test rows leaks into training.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [30]:
def run_model(model, X_train, y_train, X_test, y_test, verbose=True):
    """Fit a classifier, print its test metrics and plot its confusion matrix.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict`` (``predict_proba`` is used
        when available).
    X_train, y_train : training features and labels.
    X_test, y_test : evaluation features and labels.
    verbose : when ``False`` (or ``0``), ``verbose=0`` is forwarded to ``model.fit``;
        only pass this for estimators whose ``fit`` accepts that keyword.

    Returns
    -------
    (model, roc_auc) : the fitted estimator and its ROC AUC on the test data.
    """
    # Function-scope import keeps this cell self-contained.
    from sklearn.metrics import ConfusionMatrixDisplay

    # Same truth table as the original `verbose == False` comparison.
    if verbose in (False, 0):
        model.fit(X_train, y_train, verbose=0)
    else:
        model.fit(X_train, y_train)

    y_true = np.ravel(y_test)  # accept both (n,) and (n, 1) label arrays
    y_pred = np.ravel(model.predict(X_test))

    # ROC AUC is defined over ranking scores. Hard 0/1 predictions collapse the curve
    # to a single operating point, so use the positive-class probability if possible.
    # NOTE(review): column 1 is taken as the positive class (binary 0/1 target assumed).
    if hasattr(model, "predict_proba"):
        y_score = model.predict_proba(X_test)[:, 1]
    else:
        y_score = y_pred
    roc_auc = roc_auc_score(y_true, y_score)

    print("ROC_AUC = {}".format(roc_auc))
    print(classification_report(y_true, y_pred, digits=5))

    # plot_confusion_matrix was removed in scikit-learn 1.2; prefer the display API
    # and fall back to the old helper on versions that predate it.
    if hasattr(ConfusionMatrixDisplay, "from_predictions"):
        ConfusionMatrixDisplay.from_predictions(
            y_true, y_pred, cmap=plt.cm.Blues, normalize='all')
    else:
        plot_confusion_matrix(model, X_test, y_test, cmap=plt.cm.Blues, normalize='all')

    return model, roc_auc

# random forest - highest accuracy

In [None]:
# Random forest with default hyper-parameters. A fixed random_state makes the fitted
# forest, and therefore the reported metrics, reproducible across runs.
model_rf = RandomForestClassifier(random_state=42)
model_rf, roc_auc_rf = run_model(model_rf, X_train, y_train, X_test, y_test)

# lightgbm classifier

In [None]:
# LightGBM classifier with default hyper-parameters, fitted and evaluated via run_model.
import lightgbm as lgb
model_lgb = lgb.LGBMClassifier()
model_lgb, roc_auc_lgb = run_model(model_lgb, X_train, y_train, X_test, y_test)

# catboost classifier

In [None]:
# CatBoost classifier with default hyper-parameters, fitted and evaluated via run_model.
import catboost as cb

model_cb = cb.CatBoostClassifier()
# verbose=False makes run_model pass verbose=0 to fit().
model_cb, roc_auc_cb = run_model(model_cb, X_train, y_train, X_test, y_test, verbose=False)

# xgboost classifier

In [None]:
# XGBoost classifier with default hyper-parameters, fitted and evaluated via run_model.
import xgboost as xgb


model_xgb = xgb.XGBClassifier()
model_xgb, roc_auc_xgb = run_model(model_xgb, X_train, y_train, X_test, y_test)

In [None]:
# Display the reduced feature frame again for reference.
x

# hotel booking prediction

In [None]:
# A single hand-written booking, one value per model feature, wrapped in a one-row frame.
data = dict(
    hotel=[0],
    lead_time=[450],
    is_repeated_guest=[0],
    booking_changes=[1],
    adr=[85],
    total_of_special_requests=[0],
    total_days_of_staying=[2],
    total_customers=[2],
)
y = pd.DataFrame(data)
y


In [None]:
# Predict whether the hand-written booking in `y` would be cancelled.
featuresw = ['hotel', 'lead_time', "is_repeated_guest", "booking_changes",
             "adr", "total_of_special_requests", "total_days_of_staying", "total_customers"]
test = y[featuresw]
# Reuse the statistics the scaler learned from the training data. Calling
# fit_transform on this one-row frame would re-fit the scaler on a single sample,
# map every feature to 0 and overwrite the fitted scaler, so the prediction would
# no longer depend on the values entered above.
test = scaler.transform(test)
# A fixed random_state makes the prediction reproducible.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, np.ravel(y_train))  # ravel: accept (n,) or (n, 1) label arrays
y_pred = model.predict(test)
print(y_pred)