In [None]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# 1. Reading Dataset

In [None]:

# Load the raw bookings table from the Kaggle input directory.
csv_path = "../input/hotel-booking/hotel_booking.csv"
df = pd.read_csv(csv_path)

# 2. Inspecting Dataset

In [None]:
# Preview the first five rows to get a feel for the columns.
df.head(5)

In [None]:
# Summary statistics via describe(); with pandas defaults this covers the numeric columns only.
df.describe()

In [None]:
# List every column label available in the frame.
df.columns

# 3. Rows

In [None]:
# (rows, columns) of the loaded frame.
df.shape
# 119390 Rows , 36 Columns

# 4. Missing Data

In [None]:
# Total number of missing cells across the whole frame
# (per-column counts first, then summed over the columns).
df.isnull().sum().sum()

In [None]:
# Per-column count of missing values, largest first, so the column with the most
# gaps is the first row of the output. Summing the boolean mask replaces the
# mask-then-count round trip, which produced the same numbers less directly.
null_df = df.isnull()
null_df.sum().sort_values(ascending=False)
# Original note: "agent" has the most missing values — TODO confirm against the sorted output.

# 5. Removing Company

In [None]:
# Drop the "company" column. Reassigning (instead of inplace=True) together with
# errors="ignore" keeps this cell re-runnable: a second run is a no-op, not a KeyError.
df = df.drop(columns=["company"], errors="ignore")

# 6. Most travelers

In [None]:
# Five most frequent values of the "country" column.
df["country"].value_counts().head(5)


# 7. Name of the guest with the highest ADR

In [None]:
# Find the booking with the highest ADR and print its value with the guest name.
# idxmax() returns an index *label*, so the row is fetched with label-based .loc;
# positional .iloc only agrees with it while the index is the default 0..n-1 range.
num = df["adr"].idxmax()
price = df["adr"].max()
name = df.loc[num, "name"]
print(f"{price} - {name}")

# 8. Mean of ADR's

In [None]:
# Average ADR over all bookings, rounded to two decimals.
mean_adr = df["adr"].mean()
round(mean_adr, 2)

# 9. Stayed Nights

In [None]:
# Total nights per booking = week nights + weekend nights.
df["stays_nights"] = df["stays_in_week_nights"].add(df["stays_in_weekend_nights"])

In [None]:
# Average total nights per booking, rounded to two decimals.
mean_stay = df["stays_nights"].mean()
round(mean_stay, 2)

# 10. Special Requests

In [None]:
# Bookings that carry exactly five special requests.
specialReqs = df.loc[df["total_of_special_requests"].eq(5)]

In [None]:
# Guest names for the bookings selected above.
specialReqs.loc[:, "name"]

# 11. Family Names Frequency

In [None]:
# Second whitespace-separated token of each guest name, used here as the family name
# (assumes names look like "First Last" — TODO confirm for names with titles/suffixes).
# The vectorised .str accessor yields NaN for missing or single-word names,
# where the per-row lambda would raise instead.
names = df["name"].str.split().str.get(1)

In [None]:
# Attach the extracted family names as a new column.
df = df.assign(family_names=names)

In [None]:
# Five most common family names.
df["family_names"].value_counts().head(5)

# 12. Most Babies with Children

In [None]:
# Combined count of babies and children per booking.
df["babies_and_children"] = df["babies"] + df["children"]
# Select the booking(s) with the largest combined count. The maximum is derived
# from the data instead of being hard-coded as 10, so the cell stays correct if the data changes.
max_bc = df["babies_and_children"].max()
most_bc = df[df["babies_and_children"] == max_bc]
most_bc["name"]

# 13. Most Frequent Phone nums

In [None]:
# Three most common 3-character phone-number prefixes.
# .str[:3] slices every value in one vectorised step and propagates missing
# values as NaN, where the per-row lambda would raise on a non-string entry.
df["phone-number"].str[:3].value_counts().head(3)


# Data Visualization

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns


### Dropped some unrelated columns

In [None]:
# Remove identifying / unneeded columns before plotting. Reassigning with
# errors="ignore" (instead of inplace=True) makes the cell safe to re-run:
# columns that are already gone are skipped rather than raising KeyError.
unrelated_columns = ["phone-number", "email", "credit_card", "name", "reservation_status_date"]
df = df.drop(columns=unrelated_columns, errors="ignore")

In [None]:
# Check the frame after the columns were dropped.
df.head(5)

In [None]:
# Distribution summary of the number of special requests per booking.
special_requests = df["total_of_special_requests"]
special_requests.describe()

In [None]:
# Number of bookings per hotel type.
sns.countplot(data=df, x="hotel")

* City Hotel has more guests by a large margin

In [None]:
# Number of bookings per arrival year.
sns.countplot(data=df, x="arrival_date_year")
plt.show()

* More guests came in 2016 than in 2015 or 2017.
* After 2016 we can see a drop in the number of guests.
* After 2015 there was a huge jump in the number of guests.

In [None]:
# Bookings per arrival year, split by hotel type.
sns.countplot(data=df, x="arrival_date_year", hue="hotel")
plt.show()

* We can see the same trend in both hotels 
* In the 2015-2017 interval City hotel accepted more guests than Resort hotel
* The year-to-year change in the number of guests is greater for the City hotel than for the Resort hotel


### Matrix Heatmap On Year | Month


In [None]:
# Bookings per (year, month) as a heatmap. unstack() orders the month columns
# alphabetically (April, August, ...), which scrambles the time axis, so the
# columns are re-indexed into calendar order before plotting.
month_order = ["January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December"]
c = (
    df.groupby("arrival_date_year")["arrival_date_month"]
    .value_counts()
    .unstack()
    .reindex(columns=month_order)
    .fillna(0)  # year/month combinations with no bookings become 0
)
sns.heatmap(data=c, cmap="YlGnBu")
plt.show()

* More guests in July, June, March, May 
* October 2016 and May 2017 have more guests than the other months
* Some data is missing in 2015 and 2017 

In [None]:
# Mean ADR per meal type (barplot aggregates with the mean by default).
sns.barplot(data=df, x="meal", y="adr")
plt.show()

* Guests with HB meal pay more than the others
* Undefined meals pay less than the others

In [None]:
# Mean number of weekend nights per meal type.
sns.barplot(data=df, x="meal", y="stays_in_weekend_nights")
plt.show()

* Guests with HB meal tend to stay more weekend nights in the hotel
* Guests with SC (The most expensive one) stay fewer weekend nights in the hotel

In [None]:
# Mean number of required parking spaces per meal type.
sns.barplot(data=df, x="meal", y="required_car_parking_spaces")

### So far, guests with the HB meal use most of the amenities

In [None]:
# Mean number of previous cancellations by cancellation outcome, split by repeated-guest flag.
sns.barplot(data=df, x="is_canceled", y="previous_cancellations", hue="is_repeated_guest")

### Customers with a history of cancellations tend to cancel more often

### Interestingly repeated guests tend to cancel more

In [None]:
# Mean days on the waiting list by cancellation outcome.
sns.barplot(data=df, x="is_canceled", y="days_in_waiting_list")

### The more days on the waiting list, the higher the chance of cancellation

In [None]:
# Mean of the is_canceled flag (i.e. cancellation rate) per deposit type.
sns.barplot(data=df, x="deposit_type", y="is_canceled")

* Bookings with a Non Refundable deposit tend to be canceled more often, why?
* Refundable deposit_type canceled less than the others

In [None]:
# Mean ADR per arrival month on a wider canvas so the month labels fit.
fig = plt.figure(figsize=(10, 5))
sns.barplot(data=df, x="arrival_date_month", y="adr")

### Lower ADR in fall and winter
* August has the highest ADR of all months
* We see a U-shaped trend from July to June
* December's ADR is slightly higher than that of the adjacent months
* August, July and June are the top 3 in ADR, which could be because people go on vacation more in summer

In [None]:
# Mean number of children per booking for each arrival month.
sns.barplot(data=df, x="arrival_date_month", y="children")
plt.show()

### More children in July & August than any other month by a large margin
* In July and August more children go to hotels
* In November fewer children go to hotels (They're busy studying...)

In [None]:
# Mean number of babies per booking for each assigned room type.
sns.barplot(data=df, x="assigned_room_type", y="babies")

### Room C is more suitable for babies

In [None]:
# Mean number of children per booking for each assigned room type.
sns.barplot(data=df, x="assigned_room_type", y="children")

### Rooms G, F and H are more suitable for children

In [None]:
# Mean number of booking changes per country.
fig = plt.figure(figsize=(10, 5))
sns.barplot(data=df, x="country", y="booking_changes")

### Some countries tend to change their booking more

In [None]:
# Mean ADR per customer type.
sns.barplot(data=df, x="customer_type", y="adr")

### Transient Customers have more adr

In [None]:
# Mean number of special requests per customer type.
sns.barplot(data=df, x="customer_type", y="total_of_special_requests")

In [None]:
# Count of bookings per hotel type, drawn as a histogram over the categorical column.
sns.histplot(data=df, x="hotel")

In [None]:
# Mean of the is_canceled flag (cancellation rate) per hotel type.
sns.barplot(data=df, x="hotel", y="is_canceled")


### Customers of city hotel cancel more

In [None]:
# Mean of the is_repeated_guest flag (share of repeated guests) per hotel type.
sns.barplot(data=df, x="hotel", y="is_repeated_guest")


### Resort hotel has more repeated guests

In [None]:
# Mean ADR per arrival month, split by hotel type, on a wide canvas.
fig = plt.figure(figsize=(13, 5))
sns.barplot(data=df, x="arrival_date_month", y="adr", hue="hotel")

### Resort hotel has a higher ADR in summer and City hotel has a higher ADR in winter and fall

In [None]:
# Distribution of ADR with a KDE overlay; the x-axis is clipped to 0-400 for readability.
fig = plt.figure(figsize=(10, 5), dpi=100)
sns.histplot(data=df, x="adr", bins=300, kde=True)
plt.xlim(0, 400)


In [None]:
# ADR against number of babies, one point per booking.
sns.scatterplot(data=df, x="adr", y="babies")

In [None]:
# ADR of individual bookings per arrival year; the y-axis is clipped to 0-400.
sns.stripplot(data=df, x="arrival_date_year", y="adr")
plt.ylim(0, 400)


### This could be explained by inflation, or the quality simply got better between 2015 and 2017

In [None]:
# Week nights against weekend nights, one point per booking.
sns.scatterplot(data=df, x="stays_in_week_nights", y="stays_in_weekend_nights")

### More stays on week nights go together with more stays on weekend nights

In [None]:
# Bookings per meal type, split by hotel type.
# The column is passed by keyword: in current seaborn the first positional
# parameter of countplot is `data`, so a positional Series is no longer read as `x`.
sns.countplot(data=df, x="meal", hue="hotel")

### The BB meal is chosen more often at the City hotel than at the Resort hotel

In [None]:
# Bookings per market segment, split by hotel type, on a wider high-dpi canvas.
# The column is passed by keyword: in current seaborn the first positional
# parameter of countplot is `data`, so a positional Series is no longer read as `x`.
fig = plt.figure(figsize=(10, 5), dpi=100)
sns.countplot(data=df, x="market_segment", hue="hotel")

In [None]:
# Distribution of the number of adults per booking; the x-axis is clipped to 0-4.
sns.histplot(data=df, x="adults", bins=400)
plt.xlim(0, 4)


In [None]:
# Mean number of special requests per hotel type.
sns.barplot(data=df, x="hotel", y="total_of_special_requests")

### More special requests in Resort hotel

In [None]:
# Mean number of required parking spaces per hotel type.
sns.barplot(data=df, x="hotel", y="required_car_parking_spaces")

### Resort hotel guests require more car parking spaces