In [None]:
!pip install sort-dataframeby-monthorweek
!pip install sorted-months-weekdays

In [None]:
from scipy import stats
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd  
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objs as go
import sort_dataframeby_monthorweek as sd

## 讀取資料

In [None]:
# Load the raw hotel-bookings CSV into a DataFrame.
ht_train = pd.read_csv(
    '../input/hotel-bookings/hotel_bookings.csv',
)

In [None]:
# Peek at the first five rows of the raw data.
ht_train.head(n=5)

## 資料前處理(異常值、缺失值、新增變數)

In [None]:
# Number of missing values in each column.
ht_train.isna().sum()

In [None]:
# Build the working frame with missing values filled, leaving `ht_train` untouched.
# `fillna` with a per-column dict returns a new DataFrame, which avoids calling
# `inplace=True` through attribute access (chained assignment: it warns on
# pandas 2.x and has no effect once Copy-on-Write is enabled).
ht_train_full = ht_train.fillna({
    'company': 0,                               # numeric id; 0 marks "no company"
    'children': 0,                              # few missing values; treat as no children
    'country': ht_train['country'].mode()[0],   # categorical; use the most frequent value
    'agent': 0,                                 # numeric id; 0 marks "no agent"
})

### 進行缺失值補值

1.company欄位因缺失值比例太高，但官方資料宣稱不得刪除，故因為是數值型變數，因而缺失值補0

2.children欄位因缺失值少且為數值型變數，故直接補0

3.country欄位因來自葡萄牙的人佔資料比的四成且為類別型變數，故採用眾數補值

4.agent欄位因考量到可能不經由旅行社代訂，故設定為0表示無旅行社

### 新增變數

1.新增總入住天數

2.新增入住人數

3.刪除無人的住宿紀錄

In [None]:
# Total nights per booking = weekend nights + week nights.
nums_stays = ht_train_full['stays_in_weekend_nights'].add(ht_train_full['stays_in_week_nights'])
# Placed at column position 10. `insert` raises if the column already exists,
# so this cell cannot be re-run without rebuilding `ht_train_full` first.
ht_train_full.insert(loc=10, column="stays_nights_total", value=nums_stays)

In [None]:
# Party size per booking = adults + children + babies.
nums_peoples = (
    ht_train_full['adults']
    .add(ht_train_full['children'])
    .add(ht_train_full['babies'])
)
# Placed at column position 13. `insert` raises if the column already exists.
ht_train_full.insert(loc=13, column="number_of_people", value=nums_peoples)

In [None]:
# Remove bookings that have no guests at all (no adults, no children, no babies).
# The previous mask tested `adults == 0` twice and never looked at `children`,
# and the filtered frame was only displayed, not assigned, so no row was removed.
filter_0 = (
    (ht_train_full['adults'] == 0)
    & (ht_train_full['children'] == 0)
    & (ht_train_full['babies'] == 0)
)
ht_train_full = ht_train_full[~filter_0]
# Show the remaining size instead of dumping the whole frame.
ht_train_full.shape

In [None]:
# Box plot of `adr` to spot extreme outliers.
# Uses the explicit Axes interface, labels the figure, and ends with `plt.show()`
# so the cell does not print the raw dict of matplotlib artists.
fig, ax = plt.subplots()
ax.boxplot(x=ht_train_full['adr'])
ax.set_title('adr distribution')
ax.set_ylabel('adr')
plt.show()

觀察上圖可發現存在離群值(>5000)，故直接刪除

In [None]:
# Keep only bookings whose adr is below 5000 (drops the outlier seen in the box plot).
ht_train_full = ht_train_full.loc[ht_train_full["adr"].lt(5000)]

In [None]:
# Bookings per hotel, plus each hotel's share of all bookings (4 decimals).
book_amount = (
    ht_train_full
    .groupby("hotel")["is_canceled"]
    .count()
    .rename("amount")
    .reset_index()
)
book_amount["book_rate"] = (book_amount["amount"] / book_amount["amount"].sum()).round(4)
book_amount

比較兩種飯店的訂房率，發現City Hotel的訂房率遠大於Resort Hotel，可能是價格因素所致。

## 視覺化分析(EDA)

In [None]:
# Number of bookings per (arrival month, hotel), then ordered by calendar month.
month_count = (
    ht_train_full
    .groupby(['arrival_date_month', 'hotel'])['arrival_date_month']
    .count()
    .reset_index(name='count')
    .rename(columns={'arrival_date_month': 'month'})
)
final_month_count = sd.Sort_Dataframeby_Month(month_count, 'month')

In [None]:
# Grouped bars: monthly booking counts, one bar per hotel.
px.bar(
    final_month_count,
    x="month",
    y='count',
    color='hotel',
    barmode='group',
    title='顧客住房日期(月)分布圖',
)

針對該柱狀圖，我們可以發現不論是何種飯店，在暑假(7、8月)時都會有最高的訂房量。預計是因為增加了許多家庭客以及孩子的數量。而 City Hotel有較多的訂房量，可以推論因為 City Hotel 有較便宜的房價，可以吸引一般家庭客或上班族，導致其訂房量增加。

In [None]:
# Mean adr per (arrival month, hotel), then ordered by calendar month.
# Note: this reuses the names `month_count` / `final_month_count` from the
# booking-count cell, replacing their earlier contents.
month_count = (
    ht_train_full
    .groupby(['arrival_date_month', 'hotel'])['adr']
    .mean()
    .reset_index()
    .rename(columns={'arrival_date_month': 'month'})
)
final_month_count = sd.Sort_Dataframeby_Month(month_count, 'month')

In [None]:
# Grouped bars: monthly mean adr, one bar per hotel.
px.bar(
    final_month_count,
    x="month",
    y='adr',
    color='hotel',
    barmode='group',
    title='顧客平均消費(月)分布圖',
)

透過每月平均消費的柱狀圖，可以觀察到暑假時消費相對較多，尤其是 Resort Hotel。推論該旅館因為是較高消費的度假旅館，在花費上會較多。且搭配前面的「顧客住房日期(月)分布圖」可以發現，即便 Resort Hotel的訂房量較少，但因為其房價高所以會讓每月的平均消費大於 City Hotel。

In [None]:
# Split bookings by outcome: `_y` = canceled (is_canceled == 1), `_n` = not canceled.
ht_train_full_y = ht_train_full.loc[ht_train_full['is_canceled'].eq(1)]
ht_train_full_n = ht_train_full.loc[ht_train_full['is_canceled'].eq(0)]

# Box plot of adr per arrival month for the non-canceled bookings, months in calendar order.
final_ht_train_full_n = sd.Sort_Dataframeby_Month(ht_train_full_n, 'arrival_date_month')
px.box(
    final_ht_train_full_n,
    x="arrival_date_month",
    y='adr',
    color='hotel',
    title='顧客消費(月)盒狀圖',
)

觀察到顧客消費的盒狀圖，可以發現 Resort Hotel的離群值會比 City Hotel來的多且平均消費較高，代表較多的有錢人會去度假飯店，追求好的品質跟服務，同時造成了較多的花費。且 Resort Hotel 會在旅遊淡季時獲得比較多的收入，可利用這個特點在淡季時做一定的促銷增加更多的來客量。

In [None]:
# Number of bookings per (assigned room type, hotel).
room_type_count = (
    ht_train_full
    .groupby(['assigned_room_type', 'hotel'])['assigned_room_type']
    .count()
    .reset_index(name='count')
    .rename(columns={'assigned_room_type': 'room_type'})
)

In [None]:
# Grouped bars: bookings per assigned room type, one bar per hotel.
px.bar(
    room_type_count,
    x="room_type",
    y='count',
    color='hotel',
    barmode='group',
    title='顧客訂房類型',
)

觀察到顧客的訂房類型，發現不論是哪種類型的飯店，大多都選擇 A、D、E 三種房型。我們可以透過該結果建議飯店可以將較沒人使用的 K、P、L 的房型改建成其他顧客較喜好的房型，增加訂房量。

In [None]:
# Number of bookings per (total nights stayed, hotel).
stays_nights_count = (
    ht_train_full
    .groupby(['stays_nights_total', 'hotel'])['stays_nights_total']
    .count()
    .reset_index(name='count')
    .rename(columns={'stays_nights_total': 'stays_nights'})
)


In [None]:
# Grouped bars: bookings per length of stay, one bar per hotel.
px.bar(
    stays_nights_count,
    x="stays_nights",
    y='count',
    color='hotel',
    barmode='group',
    title='顧客住房天數分布圖',
)

大多顧客的住房天數都落在1~3天，代表裡面有許多商務客或是家庭客或以短期旅遊為主的顧客。唯獨在 Resort Hotel 上有許多住宿7天的顧客，下面會進一步分析該客群。

In [None]:
# Resort Hotel bookings that lasted exactly 7 nights.
stays_nights_7 = ht_train_full.loc[
    lambda df: df['hotel'].eq('Resort Hotel') & df['stays_nights_total'].eq(7)
]
# How those bookings split across reserved room types.
stays_nights_7.groupby('reserved_room_type')['hotel'].count()

In [None]:
# 7-night Resort bookings per country, largest first.
(
    stays_nights_7
    .groupby('country')['hotel']
    .count()
    .reset_index(name='counts')
    .sort_values(by='counts', ascending=False)
)

In [None]:
# 7-night Resort bookings per customer type.
customer_7 = (
    stays_nights_7
    .groupby('customer_type')['hotel']
    .count()
    .reset_index(name='counts')
)

In [None]:
# Share of each customer type among the 7-night Resort bookings.
px.pie(
    customer_7,
    names='customer_type',
    values='counts',
)

In [None]:
# 7-night Resort bookings per (customer type, country), keeping only
# combinations with more than 19 bookings.
customer_country_7 = (
    stays_nights_7
    .groupby(['customer_type', 'country'])['hotel']
    .count()
    .reset_index(name='counts')
    .loc[lambda df: df['counts'].gt(19)]
)

In [None]:
# Bars per customer type, coloured by country (7-night Resort bookings).
px.bar(
    customer_country_7,
    x='customer_type',
    y='counts',
    color='country',
    title='顧客類型&國家',
)

單獨看住宿7天的顧客，發現房型分布跟整體資料的房型分布並沒有差太多。如果以國籍來看，可以發現除了本地的葡萄牙人以外，又以德國人為多數。若以顧客類型來看，又以短期顧客為主，意外的不是以商務客為主，最後我們綜合國籍跟顧客類型，發現短期客依然以本地的葡萄牙人最多，商務客以英國人最多，短期團客則是英國、葡萄牙、伊朗為主。透過這些分析，我們可以考慮針對外國商務客如果有長期住宿需求的話給予一定的折扣，吸引更多外國商務客。

In [None]:
# Non-canceled bookings only.
ht_train_full1 = ht_train_full.loc[ht_train_full['is_canceled'].eq(0)]
# Box plot of adr per assigned room type, split by hotel.
px.box(
    ht_train_full1,
    x='assigned_room_type',
    y='adr',
    color='hotel',
    title='各房間類型顧客消費盒狀圖',
)

透過該盒狀圖可以發現，大多數房型都還是以高消費的 Resort Hotel 為收到最多旅客花費的飯店。特別的是，Resort Hotel 的 C 房型的消費比 City Hotel 還多。可建議 Resort Hotel 額外加強對於 G 房型的促銷，吸引較喜歡這飯店的高消費族群前來消費，進而增加業績。

In [None]:
# Non-canceled bookings, one frame per hotel.
ht_resort = ht_train_full.loc[
    lambda df: df['hotel'].eq('Resort Hotel') & df['is_canceled'].eq(0)
]
ht_city = ht_train_full.loc[
    lambda df: df['hotel'].eq('City Hotel') & df['is_canceled'].eq(0)
]

In [None]:
# Monthly guest totals (adults / children / babies) for non-canceled Resort bookings.
adults_count = (
    ht_resort
    .groupby(['arrival_date_month', 'hotel'])['adults']
    .sum()
    .reset_index()
    .rename(columns={'arrival_date_month': 'month'})
)
children_count = ht_resort.groupby(['arrival_date_month', 'hotel'])['children'].sum().reset_index()
babies_count = ht_resort.groupby(['arrival_date_month', 'hotel'])['babies'].sum().reset_index()

# All three frames come from the same group keys with a fresh default index,
# so the columns line up row by row.
adults_count = adults_count.assign(
    children=children_count['children'],
    babies=babies_count['babies'],
)
adults_count = sd.Sort_Dataframeby_Month(adults_count, 'month')

In [None]:
import plotly.graph_objects as go

# Stacked monthly bars, one trace per guest type (Resort Hotel).
fig = go.Figure(
    data=[
        go.Bar(x=adults_count['month'], y=adults_count[guest_type], name=guest_type)
        for guest_type in ('adults', 'children', 'babies')
    ]
)
fig.update_layout(
    barmode='stack',
    title=go.layout.Title(text="各月份顧客組成 (Resort Hotel)"),
)
fig.show()

In [None]:
# Monthly guest totals (adults / children / babies) for non-canceled City bookings.
adults_count_city = (
    ht_city
    .groupby(['arrival_date_month', 'hotel'])['adults']
    .sum()
    .reset_index()
    .rename(columns={'arrival_date_month': 'month'})
)
children_count_city = ht_city.groupby(['arrival_date_month', 'hotel'])['children'].sum().reset_index()
babies_count_city = ht_city.groupby(['arrival_date_month', 'hotel'])['babies'].sum().reset_index()

# Same group keys and default index in all three frames, so rows line up.
adults_count_city = adults_count_city.assign(
    children=children_count_city['children'],
    babies=babies_count_city['babies'],
)
adults_count_city = sd.Sort_Dataframeby_Month(adults_count_city, 'month')

In [None]:
import plotly.graph_objects as go

# Stacked monthly bars, one trace per guest type (City Hotel).
fig = go.Figure(
    data=[
        go.Bar(x=adults_count_city['month'], y=adults_count_city[guest_type], name=guest_type)
        for guest_type in ('adults', 'children', 'babies')
    ]
)
fig.update_layout(
    barmode='stack',
    title=go.layout.Title(text="各月份顧客組成 (City Hotel)"),
)
fig.show()

觀察 City Hotel 跟 Resort Hotel 可以發現，在暑假旅遊旺季的時候會因為多了許多小孩而有顯著的人數增加。可以藉由此現象，建議飯店可以推出許多關於親子的促銷活動，例如增加許多可以親子遊玩的飯店套裝行程，亦或是設立一些兒童專房，亦或是給予折扣。增加更多對於家庭客的拉力，進而增加飯店業績。

In [None]:
# Non-canceled bookings per (assigned room type, hotel, meal plan).
room_type_meal = (
    ht_train_full_n
    .groupby(['assigned_room_type', 'hotel', 'meal'])['meal']
    .count()
    .reset_index(name='meal_count')
    .rename(columns={'assigned_room_type': 'room_type'})
)
# One slice per hotel for the two charts that follow.
room_type_meal_resort = room_type_meal.loc[room_type_meal['hotel'].eq('Resort Hotel')]
room_type_meal_city = room_type_meal.loc[room_type_meal['hotel'].eq('City Hotel')]

In [None]:
# Grouped bars: meal plan counts per room type (Resort Hotel).
px.bar(
    room_type_meal_resort,
    x="room_type",
    y='meal_count',
    color='meal',
    barmode='group',
    title='顧客訂房類型&餐點 (Resort Hotel)',
)

In [None]:
# Grouped bars: meal plan counts per room type (City Hotel).
px.bar(
    room_type_meal_city,
    x="room_type",
    y='meal_count',
    color='meal',
    barmode='group',
    title='顧客訂房類型&餐點 (City Hotel)',
)

透過以上圖表可以發現，不論是哪種飯店和哪種房型，都以 BB 類型的餐點為顧客的主要選擇，不過我們也發現在某些餐點也是有一定的顧客支持。在後面會進一步討論各個餐點的顧客是來自於哪個國家。

In [None]:
# Non-canceled City Hotel bookings with the BB meal plan.
meal_city = ht_train_full.loc[
    lambda df: df['is_canceled'].eq(0) & df['hotel'].eq('City Hotel') & df['meal'].eq('BB')
]
# Bookings per (month, country); keep combinations with more than 80 bookings.
country_meal_city = (
    meal_city
    .groupby(['arrival_date_month', 'country'])['country']
    .count()
    .reset_index(name='counts')
    .rename(columns={'arrival_date_month': 'month'})
    .loc[lambda df: df['counts'].gt(80)]
)
country_meal_city = sd.Sort_Dataframeby_Month(country_meal_city, 'month')

In [None]:
# Monthly BB bookings at the City Hotel, coloured by country.
px.bar(
    country_meal_city,
    x="month",
    y='counts',
    color='country',
    title='顧客居住地(國家)&餐點類型 (City Hotel & meal = BB)',
)

發現 City Hotel 的顧客在喜好 BB 類型的餐點分布上來自葡萄牙跟法國特別喜好 BB 類型的餐點(英國、德國人也頗喜歡，八月份時西班牙人也很喜歡)

In [None]:
# Non-canceled City Hotel bookings with the HB meal plan.
meal_city = ht_train_full.loc[
    lambda df: df['is_canceled'].eq(0) & df['hotel'].eq('City Hotel') & df['meal'].eq('HB')
]
# Bookings per (month, country); keep combinations with more than 10 bookings.
country_meal_city = (
    meal_city
    .groupby(['arrival_date_month', 'country'])['country']
    .count()
    .reset_index(name='counts')
    .rename(columns={'arrival_date_month': 'month'})
    .loc[lambda df: df['counts'].gt(10)]
)
country_meal_city = sd.Sort_Dataframeby_Month(country_meal_city, 'month')

In [None]:
# Monthly HB bookings at the City Hotel, coloured by country.
px.bar(
    country_meal_city,
    x="month",
    y='counts',
    color='country',
    title='顧客居住地(國家)&餐點類型 (City Hotel & meal = HB)',
)

發現 City Hotel 的顧客在喜好 HB 類型的餐點分布上來自葡萄牙跟德國的特別喜好 HB 類型的餐點(法國以及義大利人也頗喜歡)

In [None]:
# Non-canceled City Hotel bookings with the SC meal plan.
meal_city = ht_train_full.loc[
    lambda df: df['is_canceled'].eq(0) & df['hotel'].eq('City Hotel') & df['meal'].eq('SC')
]
# Bookings per (month, country); keep combinations with more than 30 bookings.
country_meal_city = (
    meal_city
    .groupby(['arrival_date_month', 'country'])['country']
    .count()
    .reset_index(name='counts')
    .rename(columns={'arrival_date_month': 'month'})
    .loc[lambda df: df['counts'].gt(30)]
)
country_meal_city = sd.Sort_Dataframeby_Month(country_meal_city, 'month')

In [None]:
# Monthly SC bookings at the City Hotel, coloured by country.
px.bar(
    country_meal_city,
    x="month",
    y='counts',
    color='country',
    title='顧客居住地(國家)&餐點類型 (City Hotel & meal = SC)',
)

發現 City Hotel 的顧客在喜好 SC 類型的餐點分布上較平均但來自葡萄牙和法國以及英國特別喜好SC類型的餐點

In [None]:
# Non-canceled Resort Hotel bookings with the BB meal plan.
meal_city_resort = ht_train_full.loc[
    lambda df: df['is_canceled'].eq(0) & df['hotel'].eq('Resort Hotel') & df['meal'].eq('BB')
]
# Bookings per (month, country); keep combinations with more than 50 bookings.
country_meal_city_resort = (
    meal_city_resort
    .groupby(['arrival_date_month', 'country'])['country']
    .count()
    .reset_index(name='counts')
    .rename(columns={'arrival_date_month': 'month'})
    .loc[lambda df: df['counts'].gt(50)]
)
country_meal_city_resort = sd.Sort_Dataframeby_Month(country_meal_city_resort, 'month')

In [None]:
# Monthly BB bookings at the Resort Hotel, coloured by country.
px.bar(
    country_meal_city_resort,
    x="month",
    y='counts',
    color='country',
    title='顧客居住地(國家)&餐點類型 (Resort Hotel & meal = BB)',
)

發現 Resort Hotel 的顧客在喜好 BB 類型的餐點分布上來自葡萄牙和英國的特別喜好BB類型的餐點(伊朗人也頗喜歡)

In [None]:
# Non-canceled Resort Hotel bookings with the HB meal plan.
meal_city_resort = ht_train_full.loc[
    lambda df: df['is_canceled'].eq(0) & df['hotel'].eq('Resort Hotel') & df['meal'].eq('HB')
]
# Bookings per (month, country); keep combinations with more than 20 bookings.
country_meal_city_resort = (
    meal_city_resort
    .groupby(['arrival_date_month', 'country'])['country']
    .count()
    .reset_index(name='counts')
    .rename(columns={'arrival_date_month': 'month'})
    .loc[lambda df: df['counts'].gt(20)]
)
country_meal_city_resort = sd.Sort_Dataframeby_Month(country_meal_city_resort, 'month')

In [None]:
# Monthly HB bookings at the Resort Hotel, coloured by country.
px.bar(
    country_meal_city_resort,
    x="month",
    y='counts',
    color='country',
    title='顧客居住地(國家)&餐點類型 (Resort Hotel & meal = HB)',
)

發現 Resort Hotel 的顧客在喜好 HB 類型的餐點分布上來自葡萄牙和英國的特別喜好 HB 類型的餐點(7、8月時西班牙人以及9月的法國人也頗喜歡)

In [None]:
# Flag bookings whose distribution channel label equals the market segment label.
# `.copy()` makes `sale_data` an independent frame, so adding the `equal`
# column no longer assigns onto a slice of `ht_train_full`
# (SettingWithCopyWarning).
sale_data = ht_train_full[['hotel', 'distribution_channel', 'market_segment']].copy()
sale_data['equal'] = np.where(
    sale_data['distribution_channel'] == sale_data['market_segment'], 1, 0
)
sale_data['equal'].value_counts()


In [None]:
# Bookings per (hotel, channel-equals-segment flag).
sale_data.groupby(by=["hotel", "equal"])["market_segment"].count()

可以看出，預期銷售管道跟顧客實際使用的銷售管道有一定的落差

In [None]:
# Canceled bookings only.
ht_train_full2 = ht_train_full.loc[ht_train_full['is_canceled'].eq(1)]

In [None]:
# Channel-vs-segment flag for the bookings in `ht_train_full1`.
# NOTE(review): `ht_train_full1` holds the NON-canceled bookings
# (is_canceled == 0), while elsewhere in this notebook the `_y` suffix marks
# canceled ones (`ht_train_full_y`). Confirm which subset this table is meant
# to describe before reading conclusions off it.
# `.copy()` avoids assigning a new column onto a slice (SettingWithCopyWarning).
sale_data_y = ht_train_full1[
    ['hotel', 'customer_type', 'country', 'distribution_channel', 'market_segment']
].copy()
sale_data_y['equal'] = np.where(
    sale_data_y['distribution_channel'] == sale_data_y['market_segment'], 1, 0
)
sale_data_y['equal'].value_counts()

In [None]:
# Bookings per (hotel, channel-equals-segment flag) in `sale_data_y`.
sale_data_y.groupby(by=['hotel', 'equal'])['market_segment'].count()

In [None]:
# Bookings per (hotel, customer type, channel-equals-segment flag) in `sale_data_y`.
sale_data_y.groupby(by=['hotel', 'customer_type', 'equal'])['market_segment'].count()

In [None]:
# Channel-vs-segment flag for the bookings in `ht_train_full2`.
# NOTE(review): `ht_train_full2` holds the CANCELED bookings (is_canceled == 1),
# while elsewhere in this notebook the `_n` suffix marks non-canceled ones
# (`ht_train_full_n`). Confirm which subset this table is meant to describe.
# `.copy()` avoids assigning a new column onto a slice (SettingWithCopyWarning).
sale_data_n = ht_train_full2[
    ['hotel', 'customer_type', 'country', 'distribution_channel', 'market_segment']
].copy()
sale_data_n['equal'] = np.where(
    sale_data_n['distribution_channel'] == sale_data_n['market_segment'], 1, 0
)
sale_data_n['equal'].value_counts()

In [None]:
# Bookings per (hotel, channel-equals-segment flag) in `sale_data_n`.
sale_data_n.groupby(by=['hotel', 'equal'])['market_segment'].count()


In [None]:
# Bookings per (hotel, customer type, channel-equals-segment flag) in `sale_data_n`.
sale_data_n.groupby(by=['hotel', 'customer_type', 'equal'])['market_segment'].count()

透過這兩個表格，我們可以看到不論是哪個飯店類型，取消訂單的人所使用的銷售管道幾乎都跟我們預期的不同，可推論說銷售管道的錯誤，有一定機會導致取消訂單的問題發生

In [None]:
import seaborn as sns

# Summary statistics of `is_canceled` for every lead_time value, highest
# cancellation rate first.
ht_lead = (
    ht_train_full
    .groupby('lead_time')['is_canceled']
    .describe()
    .sort_values(by='mean', ascending=False)
)
# Keep lead_time values that have more than 10 bookings.
ht_lead_10 = ht_lead.loc[ht_lead['count'].gt(10)]
x = ht_lead_10.index
y = ht_lead_10['mean'].round(4) * 100   # cancellation rate in percent

# Scatter plot with a fitted regression line: lead time vs. cancellation rate.
plt.figure(figsize=(12, 12))
sns.regplot(x=x, y=y)
plt.title(' The relationship between booking and cancellation')
plt.xlabel('booking time')
plt.ylabel('Cancel(%)')

## 顧客EDA分析結論

### 住宿時間、房型

1. 我們發現，在7、8月時會有較多的成人、小孩，推論可能是暑假讓許多家庭願意帶小孩出門旅遊。藉由此現象，建議飯店可以推出許多關於親子的促銷活動，例如增加許多可以親子遊玩的飯店套裝行程，亦或是設立一些兒童專房，亦或是給予折扣。增加更多對於家庭客的拉力，進而增加飯店業績。

2. 而 City Hotel有較多的訂房量，可以推論因為 City Hotel 有較便宜的房價，可以吸引一般家庭客或上班族，導致其訂房量增加。

3. 觀察到顧客的平均消費，可以發現 Resort Hotel的離群值會比 City Hotel來的多且平均消費較高，代表較多的有經濟能力的人會去度假飯店，追求好的品質跟服務，同時造成了較多的花費。且 Resort Hotel 會在旅遊淡季時獲得比較多的收入，建議 Resort Hotel 可利用這個特點在淡季時做一定的促銷，例如給予訂房折扣或增加不同的套裝服務。進而增加更多的來客量。

4. 觀察到顧客的訂房類型，發現不論是哪種類型的飯店，大多都選擇 A、D、E 三種房型。經由結果我們建議飯店可以將較少人使用的 K、P、L 的房型改建成其他顧客較喜好的房型，增加訂房量。

5. 大多顧客的住房天數都落在1~3天，代表裡面有許多商務客或是家庭客或以短期旅遊為主的顧客。唯獨在 Resort Hotel 上有許多住宿7天的顧客，下面會進一步分析該客群。

    5.1  
    觀察住宿7天的族群，發現該族群的房型分布跟整體族群的房型分布並沒有差太多。如果以國籍來看，可以發現除了本地的葡萄牙人以外，又以德國人為多數。若以顧客類型來看，又以短期顧客為主，意外的不是以商務客為主。最後我們綜合國籍跟顧客類型，發現短期客依然以本地的葡萄牙人最多，商務客以英國人最多，短期團客則是英國、葡萄牙、伊朗為主。透過這些分析，我們可以考慮針對外國商務客如果有長期住宿需求的話給予一定的折扣，藉此吸引更多外國商務客。

    5.2 
    我們也發現大多數房型都還是以高消費的 Resort Hotel 為收到最多旅客花費的飯店。特別的是，City Hotel 的 C、H 房型的消費比 Resort Hotel 還多，而且該族群的消費能力也有不錯的表現。可建議 City Hotel 額外加強對於這兩種房型的促銷，吸引較喜歡這房型的高消費族群前來消費，進而增加業績。



### 住宿餐點

不論是哪種飯店和哪種房型，都以 BB 類型的餐點為顧客的主要選擇，不過我們也發現在某些餐點也是有一定的顧客支持。在後面會進一步討論各個餐點的顧客是來自於哪個國家。
#### City Hotel

1. 發現 City Hotel 的顧客在喜好 BB 類型的餐點分布上來自葡萄牙跟法國特別喜好 BB 類型的餐點(英國、德國人也頗喜歡，八月份時西班牙人也很喜歡)

2. 發現 City Hotel 的顧客在喜好 HB 類型的餐點分布上來自葡萄牙跟德國的特別喜好 HB 類型的餐點(法國以及義大利人也頗喜歡)

3. 發現 City Hotel 的顧客在喜好 SC 類型的餐點分布上較平均但來自葡萄牙和法國以及英國特別喜好SC類型的餐點

#### Resort Hotel

1. 發現 Resort Hotel 的顧客在喜好 BB 類型的餐點分布上來自葡萄牙和英國的特別喜好BB類型的餐點(伊朗人也頗喜歡)

2. 發現 Resort Hotel 的顧客在喜好 HB 類型的餐點分布上來自葡萄牙和英國的特別喜好 HB 類型的餐點(7、8月時西班牙人以及9月的法國人也頗喜歡)

針對不同餐點，我們建議可以在餐點上增加一些除了葡萄牙本地特色以外，該顧客族群母國的料理元素。讓顧客在用餐時也能有像在家吃飯的感覺，建立起顧客對於飯店的喜好，進而轉為忠誠的顧客。

### 取消原因

#### 銷售管道

我們發現，飯店所預期的銷售管道跟顧客實際使用的銷售管道是有一定的落差的。且可以觀察到取消訂單的有許多人都是錯誤的銷售管道，代表對顧客的精準推銷，是有助於確保訂單不被取消，但需要更努力的增加其精準度。我們也發現，透過旅行社等管道的精準銷售的比例也較高，取消訂單的人數也較少，因而可以增加對旅行社的促銷方案，增加業績。

#### 預定時間

我們透過預定時長跟取消率的迴歸圖(散佈圖加上迴歸線)觀察到兩者呈正相關，代表越早預訂的顧客會有較高機率取消。針對這個現象，我們可以規劃相關優惠方案，例如越早預訂就可以享有早鳥價，或是以訂金的方式，減少顧客想取消訂單的意願。


## 預測模型

In [None]:
from sklearn import metrics, linear_model
from sklearn.feature_extraction import DictVectorizer   
from sklearn.compose import ColumnTransformer  
from sklearn.pipeline import Pipeline  
from sklearn.impute import SimpleImputer 
from sklearn.preprocessing import StandardScaler, OneHotEncoder,LabelEncoder,FunctionTransformer  
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier 
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier,AdaBoostClassifier 
from sklearn.linear_model import LogisticRegression,Ridge, RidgeCV, Lasso, LassoCV
from sklearn.metrics import mean_absolute_error, accuracy_score, classification_report
from xgboost import XGBRegressor

In [None]:
# Variance of each numeric column before the log transform.
# `numeric_only=True` is required on pandas >= 2.0, where `var()` no longer
# silently skips string columns and raises instead.
ht_train_full.var(numeric_only=True)

In [None]:
# Compress the scale of high-variance columns with log(1 + x).
# One vectorised `np.log1p` call replaces seven copy-pasted statements;
# log1p is also the numerically accurate form of log(x + 1) for small x.
# NOTE: this overwrites the columns of `ht_train_full`, so running the cell
# twice applies the transform twice; re-run from the data-loading cell instead.
# NOTE(review): any value below -1 (e.g. a negative adr, if present) becomes
# NaN here, same as before; verify against the data.
log_cols = [
    'lead_time',
    'arrival_date_week_number',
    'arrival_date_day_of_month',
    'agent',
    'company',
    'days_in_waiting_list',
    'adr',
]
ht_train_full[log_cols] = np.log1p(ht_train_full[log_cols])

In [None]:
# Variance of each numeric column after the log transform.
# `numeric_only=True` is required on pandas >= 2.0, where `var()` no longer
# silently skips string columns and raises instead.
ht_train_full.var(numeric_only=True)

In [None]:
# Numeric model inputs.
num_feature = [
    'lead_time',
    'stays_nights_total',
    'stays_in_weekend_nights',
    'stays_in_week_nights',
    'number_of_people',
    'adults',
    'children',
    'babies',
    'is_repeated_guest',
    'previous_cancellations',
    'previous_bookings_not_canceled',
    'booking_changes',
    'agent',
    'days_in_waiting_list',
    'adr',
    'required_car_parking_spaces',
    'total_of_special_requests',
]

# Numeric preprocessing: fill missing values with the column median.
num_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
])

In [None]:
# Categorical model inputs.
cat_feature = [
    'hotel',
    'meal',
    'country',
    'market_segment',
    'distribution_channel',
    'reserved_room_type',
    'assigned_room_type',
    'deposit_type',
    'customer_type',
]
# Categorical preprocessing: fill missing values with the literal 'missing',
# then one-hot encode; categories unseen during fit are ignored at transform time.
cat_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

In [None]:
# Route numeric and categorical columns through their own preprocessing pipeline.
preprocessor = ColumnTransformer([
    ('num', num_transformer, num_feature),
    ('cat', cat_transformer, cat_feature),
])

In [None]:
# Model inputs (numeric + categorical features) and the target.
feature = [*num_feature, *cat_feature]
# `feature` does not contain 'is_canceled', so selecting these columns
# already leaves the target out.
X = ht_train_full[feature]
y = ht_train_full['is_canceled']

In [None]:
# 80/20 hold-out split.
# A fixed `random_state` makes the scores below reproducible across runs, and
# `stratify=y` keeps the canceled / not-canceled ratio equal in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0, stratify=y
)

In [None]:
# Peek at the training features (first rows only, not the whole frame).
X_train.head()

In [None]:
# Candidate models as (label, estimator) pairs.
# The `normalize=True` argument of RidgeCV / LassoCV was removed in
# scikit-learn 1.2 and now raises TypeError. Feature scaling is done explicitly
# with a StandardScaler step instead (with_mean=False so sparse one-hot input
# is accepted). This is the documented replacement, not a bit-for-bit
# equivalent of the old `normalize` behaviour.
# `alphas=None` was LassoCV's default and is dropped.
# NOTE(review): XGBRegressor, RidgeCV, LassoCV and ElasticNet are regressors;
# their `.score` is R^2, not accuracy, so their scores are not directly
# comparable with the classifiers' scores.
base_models = [
    ('LR_model', LogisticRegression()),
    ('DT_model', DecisionTreeClassifier()),
    ('GB_model', GradientBoostingClassifier()),
    ('ADB_model', AdaBoostClassifier(DecisionTreeClassifier())),
    ('RF_model', RandomForestClassifier()),
    ('XGB_model', XGBRegressor(booster='gbtree', learning_rate=0.1, max_depth=5, n_estimators=180)),
    ('RidgeCV_model', Pipeline(steps=[
        ('scale', StandardScaler(with_mean=False)),
        ('model', RidgeCV(alphas=[0.005])),
    ])),
    ('LassoCV_model', Pipeline(steps=[
        ('scale', StandardScaler(with_mean=False)),
        ('model', LassoCV(cv=10, max_iter=100000)),
    ])),
    ('E-Net_model', linear_model.ElasticNet(alpha=0.005, l1_ratio=0.1)),
]

In [None]:
# Fit every candidate behind the shared preprocessor and report hold-out accuracy.
# `Pipeline.score` returns accuracy for classifiers but R^2 for regressors, so
# the printed numbers were not on the same scale. Regressor outputs are now
# thresholded at 0.5 and every model is scored with `accuracy_score`.
from sklearn.base import is_regressor

for name, model in base_models:
    clf = Pipeline(steps=[('preprocessor', preprocessor),
                          ('classifier', model)])
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    if is_regressor(model):
        # continuous prediction -> class label
        y_pred = (y_pred >= 0.5).astype(int)
    score = accuracy_score(y_test, y_pred)
    print("%s score: %.3f" % (name, score))

由上面數據得知最好的模型是 RandomForest 後續會針對此模型做調參，提升模型準確率。

In [None]:
# Tuned RandomForest behind the shared preprocessor.
rf_model = RandomForestClassifier(n_estimators=160,
                                  max_features=0.4,
                                  oob_score=True,
                                  n_jobs=-1,
                                  random_state=0)
CLF = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', rf_model)])
CLF.fit(X_train, y_train)
# The former bare `CLF.score(...)` call is removed: its result was neither
# displayed nor stored, and the same accuracy is computed below.
y_pred_dtc = CLF.predict(X_test)

acc_dtc = accuracy_score(y_test, y_pred_dtc)
clf_report = classification_report(y_test, y_pred_dtc)
# The message used to say "Decision Tree" although the model is a RandomForest.
print(f"Accuracy Score of RandomForest is : {acc_dtc}")
print(f"Classification Report : \n{clf_report}")

0.890108049250356 n_estimators=160,

0.8900242901415529  n_estimators=180,max_features=0.3

0.8842449116341402 n_estimators=160,max_features='auto'

由上面的調參結果得知，我們最好的變數組合的正確率最多只能到89%。接著會透過其他的變數處理方法，期望能讓模型正確率有所進步。

In [None]:
# Second feature set: drop selected columns, derive year / month / day from
# `reservation_status_date`, and one-hot encode every remaining string column.
# The drop list no longer repeats labels, and the frame is built in one
# non-mutating chain instead of two in-place drops.
# NOTE(review): `reservation_status_date` may only be known after a booking's
# outcome is decided; if so, the year / month / day features leak the target
# and inflate the scores below. Confirm against the dataset documentation.
useless_col = ['days_in_waiting_list', 'arrival_date_year', 'assigned_room_type',
               'booking_changes', 'reservation_status', 'country']
test2 = (
    ht_train_full
    .copy()
    .assign(
        reservation_status_date=lambda df: pd.to_datetime(df['reservation_status_date']),
        year=lambda df: df['reservation_status_date'].dt.year,
        month=lambda df: df['reservation_status_date'].dt.month,
        day=lambda df: df['reservation_status_date'].dt.day,
    )
    .drop(columns=useless_col + ['reservation_status_date', 'arrival_date_month'])
)
test2_dum = pd.get_dummies(test2)

In [None]:
# Numeric feature list for the second experiment (redefines `num_feature`).
# NOTE: the model cells that follow in this notebook fit on `test2_dum`
# directly and do not reference this list or transformer.
num_feature = [
    'lead_time',
    'stays_nights_total',
    'stays_in_weekend_nights',
    'stays_in_week_nights',
    'number_of_people',
    'adults',
    'children',
    'babies',
    'is_repeated_guest',
    'previous_cancellations',
    'previous_bookings_not_canceled',
    'agent',
    'adr',
    'required_car_parking_spaces',
    'total_of_special_requests',
]

# Numeric preprocessing: fill missing values with the column median.
num_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
])

In [None]:
# Numeric-only preprocessor (redefines `preprocessor`).
# NOTE: the pipelines built in the following cells contain only a
# 'classifier' step and do not use this object.
preprocessor = ColumnTransformer([
    ('num', num_transformer, num_feature),
])

In [None]:
# Replace missing adr values with the column mean.
test2_dum = test2_dum.fillna({'adr': test2_dum['adr'].mean()})

In [None]:
# Features = every column of the dummy-encoded frame except the target.
X = test2_dum.drop(columns='is_canceled')
y = test2_dum['is_canceled']

In [None]:
# 80/20 hold-out split.
# A fixed `random_state` makes the scores below reproducible across runs, and
# `stratify=y` keeps the canceled / not-canceled ratio equal in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0, stratify=y
)

In [None]:
# Candidate models as (label, estimator) pairs.
# The `normalize=True` argument of RidgeCV / LassoCV was removed in
# scikit-learn 1.2 and now raises TypeError. Feature scaling is done explicitly
# with a StandardScaler step instead. This is the documented replacement, not
# a bit-for-bit equivalent of the old `normalize` behaviour.
# `alphas=None` was LassoCV's default and is dropped.
# NOTE(review): XGBRegressor, RidgeCV, LassoCV and ElasticNet are regressors;
# their `.score` is R^2, not accuracy, so their scores are not directly
# comparable with the classifiers' scores.
base_models = [
    ('LR_model', LogisticRegression()),
    ('DT_model', DecisionTreeClassifier()),
    ('GB_model', GradientBoostingClassifier()),
    ('ADB_model', AdaBoostClassifier(DecisionTreeClassifier())),
    ('RF_model', RandomForestClassifier()),
    ('XGB_model', XGBRegressor(booster='gbtree', learning_rate=0.1, max_depth=5, n_estimators=180)),
    ('RidgeCV_model', Pipeline(steps=[
        ('scale', StandardScaler(with_mean=False)),
        ('model', RidgeCV(alphas=[0.005])),
    ])),
    ('LassoCV_model', Pipeline(steps=[
        ('scale', StandardScaler(with_mean=False)),
        ('model', LassoCV(cv=10, max_iter=100000)),
    ])),
    ('E-Net_model', linear_model.ElasticNet(alpha=0.005, l1_ratio=0.1)),
]

In [None]:
# Fit every candidate on the dummy-encoded features and report hold-out accuracy.
# `Pipeline.score` returns accuracy for classifiers but R^2 for regressors, so
# the printed numbers were not on the same scale. Regressor outputs are now
# thresholded at 0.5 and every model is scored with `accuracy_score`.
from sklearn.base import is_regressor

for name, model in base_models:
    clf = Pipeline(steps=[('classifier', model)])
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    if is_regressor(model):
        # continuous prediction -> class label
        y_pred = (y_pred >= 0.5).astype(int)
    score = accuracy_score(y_test, y_pred)
    print("%s score: %.3f" % (name, score))

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Decision tree with default hyper-parameters, fitted on the dummy-encoded features.
dt_model = DecisionTreeClassifier()
CLF = Pipeline([('classifier', dt_model)]).fit(X_train, y_train)
CLF.score(X_test, y_test)  # value is not displayed; accuracy is reported below
y_pred_dtc = CLF.predict(X_test)

# Hold-out accuracy and per-class precision / recall / F1.
acc_dtc = accuracy_score(y_test, y_pred_dtc)
clf_report = classification_report(y_test, y_pred_dtc)
print(f"Accuracy Score of Decision Tree is : {acc_dtc}")
print(f"Classification Report : \n{clf_report}")

In [None]:
# Tuned RandomForest fitted on the dummy-encoded features.
rf_model = RandomForestClassifier(
    n_estimators=160,
    max_features=0.4,
    oob_score=True,
    n_jobs=-1,
    random_state=0,
)
CLF = Pipeline([('classifier', rf_model)]).fit(X_train, y_train)
CLF.score(X_test, y_test)  # value is not displayed; accuracy is reported below
# NOTE: the `_dtc` names are reused from the decision-tree cell; here they
# hold the RandomForest results.
y_pred_dtc = CLF.predict(X_test)

# Hold-out accuracy and per-class precision / recall / F1.
acc_dtc = accuracy_score(y_test, y_pred_dtc)
clf_report = classification_report(y_test, y_pred_dtc)
print(f"Accuracy Score of  RandomForest is : {acc_dtc}")
print(f"Classification Report : \n{clf_report}")

In [None]:
# AdaBoost over decision trees, fitted on the dummy-encoded features.
adb_model = AdaBoostClassifier(
    DecisionTreeClassifier(),
    n_estimators=350,
    learning_rate=0.1,
    random_state=0,
)
CLF = Pipeline([('classifier', adb_model)]).fit(X_train, y_train)
CLF.score(X_test, y_test)  # value is not displayed; accuracy is reported below
# NOTE: the `_dtc` names are reused from the decision-tree cell; here they
# hold the AdaBoost results.
y_pred_dtc = CLF.predict(X_test)

# Hold-out accuracy and per-class precision / recall / F1.
acc_dtc = accuracy_score(y_test, y_pred_dtc)
clf_report = classification_report(y_test, y_pred_dtc)
print(f"Accuracy Score of  AdaBoost is : {acc_dtc}")
print(f"Classification Report : \n{clf_report}")

In [None]:
import keras
from keras.layers import Dense, Dropout
from keras.models import Sequential

# Fully connected binary classifier: 100 -> 50 -> 50 -> 1 (sigmoid).
# The input width is taken from the training matrix instead of the former
# hard-coded 58, so the network still builds when the number of dummy
# columns changes.
model  = Sequential()
model.add(Dense(100, activation = 'relu', input_shape = (X_train.shape[1], )))
model.add(Dense(50, activation = 'relu'))
model.add(Dense(50, activation = 'relu'))
model.add(Dense(1, activation = 'sigmoid'))
model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])
# The hold-out split doubles as validation data while training for 100 epochs.
model_history = model.fit(X_train, y_train, validation_data = (X_test, y_test), epochs = 100)

In [None]:
# `summary()` prints the layer table itself and returns None, so wrapping it
# in print() only added a stray "None" line.
model.summary()

In [None]:
# Training vs. validation loss per epoch.
# The former `plt.figure(...)` call is removed: the chart is drawn with plotly,
# so it only produced an empty matplotlib figure.
train_loss = model_history.history['loss']
val_loss = model_history.history['val_loss']
# The epoch axis follows the recorded history instead of a hard-coded 1..100.
epoch = range(1, len(train_loss) + 1)

loss = pd.DataFrame({'train_loss' : train_loss, 'val_loss' : val_loss})

px.line(data_frame = loss, x = epoch, y = ['train_loss', 'val_loss'], title = 'Train and Val Loss')

In [None]:
# Training vs. validation accuracy per epoch.
# The former `plt.figure(...)` call is removed: the chart is drawn with plotly,
# so it only produced an empty matplotlib figure.
train_acc = model_history.history['accuracy']
val_acc = model_history.history['val_accuracy']
# The epoch axis follows the recorded history instead of a hard-coded 1..100.
epoch = range(1, len(train_acc) + 1)

accuracy = pd.DataFrame({'train_acc' : train_acc, 'val_acc' : val_acc})

px.line(data_frame = accuracy, x = epoch, y = ['train_acc', 'val_acc'], title = 'Train and Val Accuracy')

In [None]:
# `evaluate` returns [loss, accuracy] for this compiled model; keep the accuracy.
ann_metrics = model.evaluate(X_test, y_test)
accuracy_ann = ann_metrics[1]

print(f'Accuracy of model is {accuracy_ann}')

### 模型分析結論

透過對於類別型變數採dummy variable，將其變數處理成模型可使用的數值型變數。在經過不同的模型檢視後。我們發現我們的模型有顯著的進步，除了原先表現不錯的DecisionTreeClassifier、RandomForestClassifier以外，AdaBoostClassifier 和 XGBRegressor也都有一定的進步。也因為我們觀察到DecisionTreeClassifier、RandomForestClassifier以及AdaBoostClassifier 表現上差不多。因而針對三個模型調整最佳參數。

同時我也建立ANN的模型做預測，雖然預測正確率會高於機器學習，但其表現不穩定，大概會落在 88% ~ 97% 這個區間。後續也有透過修改 optimizer、增加Dense layer 以及加上 Dropout 等方法，但都無法有效的提升模型的準確率。

我們發現:

    1. DecisionTreeClassifier 在不設定任何參數，皆使用模型預設的參數表現為最好(0.952)

    2. RandomForestClassifier 在 n_estimators=160, max_features=0.4, oob_score = True, n_jobs=-1, random_state=0 的參數組合有最好的表現(0.967)

    3. AdaBoostClassifier 在 n_estimators=350, learning_rate=0.1, random_state=0 的參數組合有最好的表現(0.953)
    
    4. ANN 在設定4層的網路，且不加 Dropout 層的狀態下會有相對較好的表現，不過表現不穩定，正確率落在 88% ~ 98% 這個區間。
    
因此，如果最後選擇最佳模型，我會選擇使用 RandomForest 做最後的預測模型。因為該模型穩定，且時間成本低，在建模時會是較好的選擇。不過如果考慮要用ANN等相關深度模型的話，我會考慮改用 LSTM 或 ensemble 的方法，比較是否能夠比 RandomForest 或是 ANN 有更好的表現。