In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the raw hotel-bookings table. The path is relative to the notebook's working directory.
df=pd.read_csv('../input/hotel-bookings/hotel_bookings.csv')
df.head()

In [None]:
df.shape

In [None]:
df.isna().sum()

In [None]:
# Replace missing values:
# agent: If no agency is given, booking was most likely made without one.
# company: If none given, it was most likely private.
# The rest should be self-explanatory.

In [None]:
def data_clean(df):
    """Replace every missing value in ``df`` with 0 and report what is left.

    The frame is modified in place and nothing is returned. After filling, the
    per-column count of remaining nulls is printed.

    Note that the same fill value (integer 0) is applied to every column,
    whatever its dtype.
    """
    df.fillna(value=0, inplace=True)
    remaining_nulls = df.isna().sum()
    print(remaining_nulls)

In [None]:
data_clean(df)

In [None]:
# Columns that hold the number of guests on a booking.
# Named `guest_columns` rather than `list` so Python's builtin `list` is not shadowed
# for the rest of the session.
guest_columns = ['children', 'adults', 'babies']

# Show the distinct values of each guest-count column to spot implausible entries.
for column in guest_columns:
    print('{} has unique values as {}'.format(column, df[column].unique()))

In [None]:
### The data seems to be dirty: adults, babies and children cannot all be zero at the same time

In [None]:
df.shape

In [None]:
len(df[df['adults']==0])

In [None]:
# True for bookings where all three guest counts (children, adults, babies) are zero.
filter = df[['children', 'adults', 'babies']].eq(0).all(axis=1)
df[filter]

In [None]:
### Visualise Entire Dataframe where adult,children & babies are 0

In [None]:
pd.set_option('display.max_columns',32)

In [None]:
# Boolean mask of bookings that list no guests at all (0 children, 0 adults, 0 babies).
# The next cell uses `~filter` to drop these rows.
# NOTE(review): the name `filter` shadows Python's builtin `filter` from here on.
filter=(df['children']==0) & (df['adults']==0) & (df['babies']==0)
df[filter]

In [None]:
# Keep every booking except those flagged by `filter` (the zero-guest rows).
data = df.loc[~filter]

In [None]:
data.shape

In [None]:
data.head()

## Where do the guests come from? Let's perform spatial analysis

In [None]:
# Number of non-canceled bookings per home country, most frequent first.
country_wise_data = (
    data.loc[data['is_canceled'] == 0, 'country']
    .value_counts()
    .rename_axis('country')
    .reset_index(name='No of guests')
)
country_wise_data

In [None]:
import folium
from folium.plugins import HeatMap

In [None]:
# Empty folium map.
# NOTE(review): `basemap` is not referenced by any later cell visible in this notebook;
# the guest map further down is drawn with plotly instead.
basemap=folium.Map()

In [None]:
country_wise_data.dtypes

In [None]:
import plotly.express as px

In [None]:
# show on map
# World map coloured by the number of guests per country.
# Columns are referenced by name inside `country_wise_data`.
map_guest = px.choropleth(
    country_wise_data,
    locations='country',
    color='No of guests',
    hover_name='country',
    title="Home country of guests",
)
map_guest.show()

#### People from all over the world are staying in these two hotels. Most guests are from Portugal and other countries in Europe

## How much do guests pay for a room per night?

In [None]:
data.head()

#### Both hotels have different room types and different meal arrangements. Seasonal factors are also important. So the prices vary a lot. Since no currency information is given, but Portugal is part of the European Monetary Union, I assume that all prices are in EUR.

In [None]:
# Bookings that were not canceled; the box plot below is drawn from this subset.
data2=data[data['is_canceled']==0]

In [None]:
# Box plot of the nightly rate (`adr`) for each reserved room type, split by hotel.
plt.figure(figsize=(12, 8))
sns.boxplot(x="reserved_room_type",
            y="adr",
            hue="hotel",
            data=data2)
# `adr` is plotted as stored (it is not divided by the number of guests), so the title
# describes a price per night rather than a price per person.
plt.title("Price of room types per night", fontsize=16)
plt.xlabel("Room type", fontsize=16)
plt.ylabel("Price [EUR]", fontsize=16)
plt.legend(loc="upper right")
# Restrict the y-axis to 0-600; points outside that range are not shown.
plt.ylim(0, 600)
plt.show()

#### This figure shows the distribution of the price per room for each room type: each box marks the median and the quartiles, and the points beyond the whiskers are outliers. Note that due to data anonymization rooms with the same type letter may not necessarily be the same across hotels.

## How does the price per night vary over the year?

In [None]:
# Non-canceled bookings, one frame per hotel.
data_resort = data.query('hotel == "Resort Hotel" and is_canceled == 0')
data_city = data.query('hotel == "City Hotel" and is_canceled == 0')

In [None]:
data_resort.head()

In [None]:
# Mean nightly rate (`adr`) per arrival month for the resort hotel.
resort_hotel = data_resort.groupby('arrival_date_month', as_index=False)['adr'].mean()
resort_hotel

In [None]:
# Mean nightly rate (`adr`) per arrival month for the city hotel.
city_hotel = data_city.groupby('arrival_date_month', as_index=False)['adr'].mean()
city_hotel

In [None]:
# Put both hotels' monthly mean prices side by side, one row per month.
# The merge suffixes the two clashing `adr` columns with _x (resort) and _y (city).
final = pd.merge(resort_hotel, city_hotel, on='arrival_date_month').rename(
    columns={
        'arrival_date_month': 'month',
        'adr_x': 'price_for_resort',
        'adr_y': 'price_for_city_hotel',
    }
)
final

#### Notice that the month column is not in calendar order; if we plot it like this we will draw the wrong conclusions.
#### So first we have to put the month column into the right order.

In [None]:
!pip install sort-dataframeby-monthorweek

## Dependency package needs to be installed
!pip install sorted-months-weekdays

In [None]:
import sort_dataframeby_monthorweek as sd

In [None]:
def sort_data(df,colname):
    """Sort ``df`` by the month column ``colname`` using the month-sorting helper package.

    Thin wrapper: passes both arguments to ``sd.Sort_Dataframeby_Month`` and returns
    its result unchanged.

    Presumably the helper returns the rows in calendar order (January to December) --
    TODO confirm against the sort_dataframeby_monthorweek documentation.
    """
    return sd.Sort_Dataframeby_Month(df,colname)


In [None]:
final=sort_data(final,'month')
final

In [None]:
px.line(final, x='month', y=['price_for_resort','price_for_city_hotel'], title='Room price per night over the Months')

### This clearly shows that the prices in the Resort hotel are much higher during the summer (no surprise here). The price of the city hotel varies less and is most expensive during spring and autumn.

## Which are the busiest months, i.e. in which months is the number of guests highest?

In [None]:
data_resort.head()

In [None]:
# Number of non-canceled resort bookings per arrival month.
rush_resort = (
    data_resort['arrival_date_month']
    .value_counts()
    .rename_axis('month')
    .reset_index(name='no of guests')
)
rush_resort

In [None]:
# Number of non-canceled city-hotel bookings per arrival month.
rush_city = (
    data_city['arrival_date_month']
    .value_counts()
    .rename_axis('month')
    .reset_index(name='no of guests')
)
rush_city

In [None]:
# Monthly guest counts of both hotels in one table, one row per month.
# The merge suffixes the two clashing count columns with _x (resort) and _y (city).
final_rush = pd.merge(rush_resort, rush_city, on='month').rename(
    columns={
        'no of guests_x': 'no of guests in resort',
        'no of guests_y': 'no of guest in city hotel',
    }
)
final_rush

In [None]:
final_rush=sort_data(final_rush,'month')
final_rush

In [None]:
final_rush.dtypes

In [None]:
final_rush.columns

In [None]:
px.line(data_frame=final_rush, x='month', y=['no of guests in resort','no of guest in city hotel'], title='Total no of guests per Months')

### Conclusion
     The City hotel has more guests during spring and autumn, when the prices are also highest.
    In July and August there are less visitors, although prices are lower.

    Guest numbers for the Resort hotel go down slightly from June to September, which is also when the prices are highest.
    Both hotels have the fewest guests during the winter.

## How long do people stay at the hotels?

In [None]:
# Keep only the bookings that were not canceled.
# The mask gets its own name so that neither Python's builtin `filter` nor the earlier
# zero-guest mask is overwritten. `.copy()` makes `clean_data` an independent frame,
# so the column added to it below does not raise SettingWithCopyWarning.
not_canceled = data['is_canceled'] == 0
clean_data = data[not_canceled].copy()

In [None]:
clean_data.head()

In [None]:
# Total length of stay = weekend nights + week nights.
# assign() returns a new frame instead of writing a column into a filtered slice of
# `data`, which avoids SettingWithCopyWarning and keeps the cell safe to re-run.
clean_data = clean_data.assign(
    total_nights=clean_data["stays_in_weekend_nights"] + clean_data["stays_in_week_nights"]
)

In [None]:
clean_data.head()

In [None]:
# Number of bookings for every (length of stay, hotel) combination.
# Only `is_canceled` is counted: it is the count column the next cell renames to
# 'Number of stays'. Selecting it by name avoids counting every column of the frame and
# then relying on column position to pick the right one.
stay = (
    clean_data
    .groupby(['total_nights', 'hotel'])['is_canceled']
    .count()
    .reset_index()
)
stay.head()

In [None]:
stay=stay.rename(columns={'is_canceled':'Number of stays'})
stay.head()

In [None]:
# Bar chart: how many stays of each length, per hotel.
plt.figure(figsize=(20, 8))
sns.barplot(x="total_nights", y="Number of stays", hue="hotel",
            hue_order=["City Hotel", "Resort Hotel"], data=stay)
# Title and axis labels so the figure can be read on its own, in the same font size
# as the box plot earlier in the notebook.
plt.title("Length of stay", fontsize=16)
plt.xlabel("Number of nights", fontsize=16)
plt.ylabel("Number of stays", fontsize=16)
plt.show()

### Select important features using correlation

In [None]:
data.head()

In [None]:
# Pairwise correlation of the numeric columns.
# The string columns are dropped explicitly first: since pandas 2.0, DataFrame.corr()
# no longer skips them silently and raises on non-numeric data.
co_relation = data.select_dtypes(include='number').corr()
co_relation

In [None]:
# Correlation of every numeric column with the target `is_canceled`.
# String columns are dropped explicitly because DataFrame.corr() raises on them
# in pandas 2.0 and later.
co_relation = data.select_dtypes(include='number').corr()["is_canceled"]
co_relation

In [None]:
co_relation.abs().sort_values(ascending=False)

In [None]:
co_relation.abs().sort_values(ascending=False)[1:]

In [None]:
data.columns

    From this list it is apparent that lead_time, total_of_special_requests, required_car_parking_spaces, booking_changes and previous_cancellations are the 5 most important numerical features.
    However, to predict whether or not a booking will be canceled, the number of booking changes is a possible source of leakage, because this information can change over time.
    I will also not include days_in_waiting_list, booking_changes and arrival_date_year.

    The most important feature to exclude is the "reservation_status":

In [None]:
data.groupby("is_canceled")["reservation_status"].value_counts()

In [None]:
# Numeric columns left out of the model features.
# `booking_changes` is included because the notes above flag it as a possible source of
# leakage and say it will not be used; without it here it would still end up in
# `num_features`.
list_not=['days_in_waiting_list','arrival_date_year','booking_changes']

In [None]:
# Names of all non-object (i.e. not string-typed) columns, minus the excluded ones.
num_features = [
    name
    for name in data.select_dtypes(exclude='object').columns
    if name not in list_not
]
num_features

In [None]:
# Columns to leave out of the categorical features.
# This list is only checked against object-dtype columns in the next cell, so any name
# here that belongs to a non-object column has no effect on `cat_features`.
cat_not=['arrival_date_year', 'assigned_room_type', 'booking_changes', 'reservation_status', 'country','days_in_waiting_list']

In [None]:
# Names of all object-dtype (string) columns, minus the excluded ones.
cat_features = [
    name
    for name in data.select_dtypes(include='object').columns
    if name not in cat_not
]
cat_features

In [None]:
# Categorical feature columns as an independent frame.
# `.copy()` matters here: the following cells add, convert and drop columns on
# `data_cat`, and doing that on a plain slice of `data` raises SettingWithCopyWarning.
data_cat = data[cat_features].copy()

In [None]:
data_cat.head()

In [None]:
# Silence every Python warning from here on.
# NOTE(review): this is a blanket filter -- it also hides pandas chained-assignment
# warnings, deprecation notices and any other library warning raised by later cells.
import warnings
from warnings import filterwarnings
filterwarnings("ignore")

In [None]:
data_cat['reservation_status_date']=pd.to_datetime(data_cat['reservation_status_date'])

In [None]:
# Split the status date into separate numeric year / month / day columns.
# NOTE(review): presumably this date is only known once a booking has been canceled or
# completed -- confirm it is legitimate to use as a predictor.
status_date = data_cat['reservation_status_date'].dt
data_cat['year'] = status_date.year
data_cat['month'] = status_date.month
data_cat['day'] = status_date.day

In [None]:
data_cat.head()

In [None]:
data_cat.drop('reservation_status_date',axis=1,inplace=True)

In [None]:
data_cat['cancellation']=data['is_canceled']

In [None]:
data_cat.columns

### Feature Encoding

### Perform Mean Encoding Technique 

In [None]:
# The columns to mean-encode: the first eight columns of `data_cat`, picked by position.
# NOTE(review): assumes those eight are exactly the categorical (string) columns; the
# slice has to be revisited whenever the column list or its order changes.
cols=data_cat.columns[0:8]
cols

In [None]:
# Cancellation rate per category, for each column to be encoded.
for col in cols:
    rate_by_category = data_cat.groupby([col])['cancellation'].mean()
    # Same output as print(x) followed by print('\n'): the table plus two blank lines.
    print(rate_by_category, end='\n\n\n')

In [None]:
# The same cancellation rates as plain {category: rate} dictionaries.
for col in cols:
    rate_by_category = data_cat.groupby([col])['cancellation'].mean()
    # Same output as print(x) followed by print('\n'): the dict plus two blank lines.
    print(rate_by_category.to_dict(), end='\n\n\n')
    

In [None]:
# Snapshot of the categorical frame before its columns are replaced by encoded values.
# Stored under its own name so the raw bookings frame `df` loaded at the top of the
# notebook is not overwritten (earlier cells that read `df` stay re-runnable).
data_cat_unencoded = data_cat.copy()

In [None]:

# Mean (target) encoding: replace each category by the cancellation rate observed for it.
# NOTE(review): the rates are computed over all rows, before the train/test split made
# later in the notebook, so the encoded features carry target information from the rows
# that end up in the test set.
for col in cols:
    # Named `category_rate` rather than `dict` so the builtin `dict` is not shadowed.
    category_rate = data_cat.groupby([col])['cancellation'].mean().to_dict()
    data_cat[col] = data_cat[col].map(category_rate)
    

In [None]:
data_cat.head(20)

In [None]:
dataframe=pd.concat([data_cat,data[num_features]],axis=1)

In [None]:
dataframe.head()

In [None]:
dataframe.drop(['cancellation'],axis=1,inplace=True)

In [None]:
dataframe.shape

### Handle Outliers

In [None]:
# Distribution of lead_time before the log transform.
# histplot replaces distplot, which seaborn has deprecated; kde=True with a density
# scale gives the same kind of picture (histogram plus smoothed curve).
sns.histplot(dataframe['lead_time'], kde=True, stat='density')

In [None]:
import numpy as np

def handle_outlier(col):
    """Replace column ``col`` of the notebook-level ``dataframe`` with ``log(1 + x)``.

    The column is overwritten in place and nothing is returned, so calling this twice
    on the same column applies the transform twice.

    ``np.log1p`` is not defined for values below -1; such entries become NaN.
    """
    log_scaled = np.log1p(dataframe[col])
    dataframe[col] = log_scaled


In [None]:
handle_outlier('lead_time')

In [None]:
# Distribution of lead_time after the log transform (missing values left out).
# histplot replaces distplot, which seaborn has deprecated.
sns.histplot(dataframe['lead_time'].dropna(), kde=True, stat='density')

In [None]:
# Distribution of adr before the log transform.
# histplot replaces distplot, which seaborn has deprecated.
sns.histplot(dataframe['adr'], kde=True, stat='density')

In [None]:
handle_outlier('adr')

In [None]:
# Distribution of adr after the log transform (missing values left out).
# histplot replaces distplot, which seaborn has deprecated.
sns.histplot(dataframe['adr'].dropna(), kde=True, stat='density')

In [None]:
dataframe.isnull().sum()

In [None]:
dataframe.dropna(inplace=True)

In [None]:
## Separate the independent features (x) from the dependent target (y).
target_column = 'is_canceled'
x = dataframe.drop(columns=target_column)
y = dataframe[target_column]

### Feature Importance

In [None]:

from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel


In [None]:

# Choose the regularisation strength alpha (the weight of the L1 penalty).
# The larger alpha is, the fewer features are selected.
# NOTE(review): the selector is fitted on all rows of x and y, i.e. before the
# train/test split made further down, so the selection has seen the future test rows.

feature_sel_model = SelectFromModel(Lasso(alpha=0.005, random_state=0)) # random_state is fixed so reruns give the same selection
feature_sel_model.fit(x,y)

In [None]:
feature_sel_model.get_support()

In [None]:
cols=x.columns

In [None]:
# let's print the number of total and selected features

# this is how we can make a list of the selected features
selected_feat = cols[(feature_sel_model.get_support())]

In [None]:
# let's print some stats
print('total features: {}'.format((x.shape[1])))
print('selected features: {}'.format(len(selected_feat)))


In [None]:
selected_feat

In [None]:
x=x[selected_feat]

### splitting dataset & model Building

In [None]:

from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,train_size=0.75,random_state=0)

In [None]:
from sklearn.linear_model import LogisticRegression
logreg=LogisticRegression()
logreg.fit(x_train,y_train)

In [None]:
y_pred=logreg.predict(x_test)

In [None]:
from sklearn.metrics import confusion_matrix
cm=confusion_matrix(y_test,y_pred)
cm


In [None]:
from sklearn.metrics import accuracy_score
score=accuracy_score(y_test,y_pred)
score


### Cross validate your model

In [None]:
from sklearn.model_selection import cross_val_score
score=cross_val_score(logreg,x,y,cv=10)

In [None]:
score

In [None]:
score.mean()

### Play with multiple Algos

In [None]:
#fit naive bayes
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

In [None]:
### classifier models
# (display name, estimator) pairs that the next cell fits and scores one after another.
# The randomised estimators (random forest, decision tree) get random_state=0, the same
# seed the Lasso selector and train_test_split use above, so reruns give the same scores.
models = [
    ('LogisticRegression', LogisticRegression()),
    ('Naive Bayes', GaussianNB()),
    ('RandomForest', RandomForestClassifier(random_state=0)),
    ('Decision Tree', DecisionTreeClassifier(random_state=0)),
    ('KNN', KNeighborsClassifier(n_neighbors=5)),
]

In [None]:


# Imported once, ahead of the loop, instead of on every iteration.
from sklearn.metrics import accuracy_score, confusion_matrix

# Fit every candidate classifier on the training split and report its confusion matrix
# and accuracy on the held-out test split.
for name, model in models:
    print(name)
    model.fit(x_train, y_train)

    # Make predictions.
    predictions = model.predict(x_test)

    # Compute the error. scikit-learn's metrics take (y_true, y_pred) in that order, so
    # the rows of the matrix are the actual classes and the columns the predicted ones --
    # the same layout as the logistic-regression confusion matrix computed earlier.
    print(confusion_matrix(y_test, predictions))
    print(accuracy_score(y_test, predictions))
    print('\n')