In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

%matplotlib inline

In [18]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

In [17]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [3]:
path = os.getcwd()
dataset_name = 'Hotel Reservations.csv'
datapath = os.path.join(path, 'data', dataset_name)

In [4]:
dataset = pd.read_csv(datapath)
dataset.head()

Unnamed: 0,Booking_ID,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,type_of_meal_plan,required_car_parking_space,room_type_reserved,lead_time,arrival_year,arrival_month,arrival_date,market_segment_type,repeated_guest,no_of_previous_cancellations,no_of_previous_bookings_not_canceled,avg_price_per_room,no_of_special_requests,booking_status
0,INN00001,2,0,1,2,Meal Plan 1,0,Room_Type 1,224,2017,10,2,Offline,0,0,0,65.0,0,Not_Canceled
1,INN00002,2,0,2,3,Not Selected,0,Room_Type 1,5,2018,11,6,Online,0,0,0,106.68,1,Not_Canceled
2,INN00003,1,0,2,1,Meal Plan 1,0,Room_Type 1,1,2018,2,28,Online,0,0,0,60.0,0,Canceled
3,INN00004,2,0,0,2,Meal Plan 1,0,Room_Type 1,211,2018,5,20,Online,0,0,0,100.0,0,Canceled
4,INN00005,2,0,1,1,Not Selected,0,Room_Type 1,48,2018,4,11,Online,0,0,0,94.5,0,Canceled


### Labeling data dan Drop Kolom
Labeling data perlu dilakukan, selain untuk merapikan dataset. Hal ini juga perlu dilakukan untuk mengubah nilai pada dataset menjadi integer agar model dapat mempelajari data tersebut dengan baik.

Selain itu, kita juga perlu menghapus beberapa kolom yang dikira kurang berguna seperti kolom Booling_ID yang tidak dibutuhkan

In [5]:
df = dataset.copy()

In [6]:
df['booking_status'].unique()

array(['Not_Canceled', 'Canceled'], dtype=object)

In [7]:
df['room_type_reserved'].unique()

array(['Room_Type 1', 'Room_Type 4', 'Room_Type 2', 'Room_Type 6',
       'Room_Type 5', 'Room_Type 7', 'Room_Type 3'], dtype=object)

In [8]:
df['type_of_meal_plan'].unique()

array(['Meal Plan 1', 'Not Selected', 'Meal Plan 2', 'Meal Plan 3'],
      dtype=object)

In [9]:
df['booking_status'] = df['booking_status'].replace(['Canceled', 'Not_Canceled'], [1, 0])

In [10]:
df['room_type_reserved'] = df['room_type_reserved'].replace(
    ['Room_Type 1', 'Room_Type 4', 'Room_Type 2', 'Room_Type 6', 'Room_Type 5', 'Room_Type 7', 'Room_Type 3'],
    [1, 4, 2, 6, 5, 7, 3]
)

In [11]:
df['type_of_meal_plan'] = df['type_of_meal_plan'].replace(
    ['Meal Plan 1', 'Not Selected', 'Meal Plan 2', 'Meal Plan 3'],
    [1, 0, 2, 3]
)

In [12]:
df.drop(['Booking_ID'], axis = 1, inplace = True)

In [13]:
df.head()

Unnamed: 0,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,type_of_meal_plan,required_car_parking_space,room_type_reserved,lead_time,arrival_year,arrival_month,arrival_date,market_segment_type,repeated_guest,no_of_previous_cancellations,no_of_previous_bookings_not_canceled,avg_price_per_room,no_of_special_requests,booking_status
0,2,0,1,2,1,0,1,224,2017,10,2,Offline,0,0,0,65.0,0,0
1,2,0,2,3,0,0,1,5,2018,11,6,Online,0,0,0,106.68,1,0
2,1,0,2,1,1,0,1,1,2018,2,28,Online,0,0,0,60.0,0,1
3,2,0,0,2,1,0,1,211,2018,5,20,Online,0,0,0,100.0,0,1
4,2,0,1,1,0,0,1,48,2018,4,11,Online,0,0,0,94.5,0,1


### Encoding dan Scaling Dataset
Pada proses encoding, saya akan menggunakan one hot encoder. One Hot Encoder merupakan salah satu tipe encoding yang digunakan untuk melakukan encoding pada data berjenis ordinal, yaitu data yang tidak memiliki peringkat value. Scaling juga diperlukan untuk menormalisasi nilai pada dataset agar tidak terjadi timpang data, kali ini saya menggunakan min max scaler.

In [14]:
X = df.drop('booking_status', axis = 1)
y = df['booking_status']

In [16]:
ord_col = ['type_of_meal_plan', 'type_of_meal_plan', 'room_type_reserved', 'market_segment_type']
num_col = X.drop(ord_col, axis = 1).columns

num_pipeline = Pipeline([
    ("Min-Max scaler", MinMaxScaler())
])

cat_pipeline = Pipeline([
    ("One-Hot", OneHotEncoder())
])

full_pipeline = ColumnTransformer([
    ('Categorical', cat_pipeline, ord_col),
    ('Numerical', num_pipeline, num_col)
])
    
X_prepared = full_pipeline.fit_transform(X)
X_prepared[0]

array([0.        , 1.        , 0.        , 0.        , 0.        ,
       1.        , 0.        , 0.        , 1.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 1.        , 0.        ,
       0.5       , 0.        , 0.14285714, 0.11764706, 0.        ,
       0.50564334, 0.        , 0.81818182, 0.03333333, 0.        ,
       0.        , 0.        , 0.12037037, 0.        ])

### Melakukan Modelling
Pada test pertama ini, akan saya lakukan modeling tanpa mengubah banyak data (data murni) menggunakan logistic regression, random forest terlebih dahulu.

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X_prepared, y, random_state = 1, test_size = 0.2)

In [27]:
lr = LogisticRegression()
rfc = RandomForestClassifier()

model = lr.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [29]:
model.score(X_test, y_test)

0.8009648518263267

In [30]:
modelrfc = rfc.fit(X_train, y_train)

modelrfc.score(X_test, y_test)

0.9055823569951758