## Preprocessing

In [98]:
import pandas as pd
import numpy as np

In [99]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
import category_encoders as ce

In [117]:
hotel = pd.read_csv('hotel_bookings.csv')
hotel.head()

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,...,No Deposit,,,0,Transient,75.0,0,0,Check-Out,2015-07-02
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,...,No Deposit,304.0,,0,Transient,75.0,0,0,Check-Out,2015-07-02
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,...,No Deposit,240.0,,0,Transient,98.0,0,1,Check-Out,2015-07-03


In [118]:
df = hotel[:5000] #mengambil 5000 data pertama

In [119]:
df.shape #cek bentuk df

(5000, 32)

In [121]:
df_baru = df[['hotel', 'is_canceled', 'adults', 'children', 'babies', 'meal', 'country', 'market_segment', 'distribution_channel', 'reserved_room_type', 'booking_changes', 'deposit_type', 'days_in_waiting_list', 'customer_type', 'required_car_parking_spaces', 'total_of_special_requests']]

In [127]:
df_baru.shape

(5000, 16)

In [122]:
df_baru.isna().sum()

hotel                          0
is_canceled                    0
adults                         0
children                       0
babies                         0
meal                           0
country                        2
market_segment                 0
distribution_channel           0
reserved_room_type             0
booking_changes                0
deposit_type                   0
days_in_waiting_list           0
customer_type                  0
required_car_parking_spaces    0
total_of_special_requests      0
dtype: int64

In [128]:
df_baru.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 16 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   hotel                        5000 non-null   object 
 1   is_canceled                  5000 non-null   int64  
 2   adults                       5000 non-null   int64  
 3   children                     5000 non-null   float64
 4   babies                       5000 non-null   int64  
 5   meal                         5000 non-null   object 
 6   country                      4998 non-null   object 
 7   market_segment               5000 non-null   object 
 8   distribution_channel         5000 non-null   object 
 9   reserved_room_type           5000 non-null   object 
 10  booking_changes              5000 non-null   int64  
 11  deposit_type                 5000 non-null   object 
 12  days_in_waiting_list         5000 non-null   int64  
 13  customer_type     

karena hanya terdapat 2 missing value saja dalam fitur yang digunakan, saya memutuskan untuk men-dropna missing valuetersebut

In [129]:
df_baru.dropna(inplace=True)

In [130]:
df_baru.isna().sum()

hotel                          0
is_canceled                    0
adults                         0
children                       0
babies                         0
meal                           0
country                        0
market_segment                 0
distribution_channel           0
reserved_room_type             0
booking_changes                0
deposit_type                   0
days_in_waiting_list           0
customer_type                  0
required_car_parking_spaces    0
total_of_special_requests      0
dtype: int64

In [131]:
df_baru.shape

(4998, 16)

In [132]:
transformer= ColumnTransformer([
    ('one_hot',OneHotEncoder(drop='first'),['hotel', 'meal', 'market_segment', 'country', 'distribution_channel', 'reserved_room_type', 'deposit_type', 'customer_type']),
    ('binary_encode', ce.BinaryEncoder(),['country'])
], remainder='passthrough')

menggunakan onehot encoder karena onehot encoder dapat melakukan encode terhadap data nominal maupun ordinal
untuk country saya menggunakan binary encoder karena country merupakan data nominal

## Splitting Data

In [133]:
from sklearn.model_selection import train_test_split

In [134]:
X=df_baru[['hotel', 'adults', 'children', 'babies', 'meal', 'country', 'market_segment', 'distribution_channel', 'reserved_room_type', 'booking_changes', 'deposit_type', 'days_in_waiting_list', 'customer_type', 'required_car_parking_spaces', 'total_of_special_requests']]
y=df_baru['is_canceled']

In [135]:
X_train, X_test, y_train, y_test= train_test_split(X,y,
    test_size=0.2,
    stratify=y,
    random_state=2020)

In [136]:
X_train.shape

(3998, 15)

## Model Benchmark

In [137]:
#library

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier


from sklearn.metrics import f1_score, classification_report
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.pipeline import Pipeline

In [None]:
# disini saya menggunkan evaluation matrix f1 karena saya menganggap bahwa kedua masalah (false positf, dan false negatif) memiliki pengaruh terhadap finansial hotel. 

# pada kasus false positif, Model memprediksi user akan cancel booking (membatalkan pesanan), padahal sebenarnya/realisasinya user tidak membatalkan pesanan. hal ini akan membuat nama hotel menjadi buruk karena pelayanan dan fasilitas yang seharusnya diberikan tidak diberikan sehingga banyak review negatif dan berujung pada penurunan pengunjung

# pada kasus false negatif, Model memprediksi user tidak membatalkan pesanan, padahal sebenarnya/realisasinya user cancel booking (membatalkan pesanan). hal ini akan membuat over budget bagi hotel karena telah menyediakan pelayanan dan fasilitas untuk pengunjung tetapi pengunjung membatalkan pesanan

In [165]:
models = [
    ['Log Regression', LogisticRegression()],
    ['DecisionTree', DecisionTreeClassifier()],
    ['KNeighbours', KNeighborsClassifier()]]

In [None]:
# membuat list didalam list yang terdiri dari nama, dan model ML
# disini saya menggunakan 3 model ML yaitu logistic regression, decision tree dan KNN
# penggunaan ML tersebut karena data target (y) merupakan data klasifikasi

In [166]:
def evaluate(models):
    model_name=[]
    f1_score=[]
    f1_std=[]
    # membuat function untuk melakukan pengujian thdp beberapa model

    for name, model in models:
        estimator=Pipeline([('preprocess',transformer),('model',model)])
        # penggunaan pipeline untuk melakukan beberapa proces menjadi 1 process yang urut.

        skfold= StratifiedKFold(n_splits=5)
        # penguijian cross validasi untuk mengecek tingkat ke stabilan model ML.

        model_name.append(name)
        f1_score.append(cross_val_score(estimator, X_train,y_train, cv=skfold,scoring='f1').mean())
        f1_std.append(cross_val_score(estimator, X_train,y_train, cv=skfold,scoring='f1').std())

    return pd.DataFrame({
        'model': model_name,
        'f1_score':f1_score,
        'f1_std':f1_std
        })


In [167]:
evaluate(models)

Unnamed: 0,model,f1_score,f1_std
0,Log Regression,0.849323,0.004637
1,DecisionTree,0.849093,0.012075
2,KNeighbours,0.833174,0.009642


logistic regresi model terbaik apabila dilihat dari stabilitasnya. dikarenakan hasil f1 score log reg dan model lain tidak berbeda jauh, saya memutuskan untuk memilih logreg sebagai model terbaik, dengan alasan lebih stabilnya model logistic regresi

## check benchmark preformance against test data

In [189]:
estimator=Pipeline([('preprocess',transformer),('model',LogisticRegression())])
estimator.fit(X_train,y_train)
f1_score(y_test, estimator.predict(X_test))

0.8255234297108675

untuk mengetahui performa model, lakukan pengujian ulang dengan menggunakkan pipeline. untuk melihat score digunakan f1 score

## Hyperparameter Tuning

In [142]:
from sklearn.model_selection import GridSearchCV

In [183]:
hyperparam_space = {
    'model__C':[0.001,0.01,0.1,1,10,100],
    'model__solver':['liblinear', 'sag', 'saga'],
}

di atas merupakan contoh parameter dari logistic regression yang saya gunakan dalam hyperparameter tuning

hyperparameter tuning digunakan untuk meihat parameter terbaik apa yang harus digunakan. Hyperparameter tuning ini dapat membatu untuk meningkatkan performa model terpilih

In [184]:
estimator=Pipeline([('preprocess',transformer),('model',LogisticRegression())])

skfold= StratifiedKFold(n_splits=5)

grid= GridSearchCV(estimator,param_grid=hyperparam_space,cv=skfold,scoring='f1')
grid.fit(X_train,y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
             estimator=Pipeline(steps=[('preprocess',
                                        ColumnTransformer(remainder='passthrough',
                                                          transformers=[('one_hot',
                                                                         OneHotEncoder(drop='first'),
                                                                         ['hotel',
                                                                          'meal',
                                                                          'market_segment',
                                                                          'distribution_channel',
                                                                          'reserved_room_type',
                                                                          'deposit_type',
                                                         

selanjutnya menggunakkan grid search untuk mendapatkan parameter terbaik, dan juga dalam proses ini juga dilakukan cross validation  (skfold) untuk melihat ke stabilan model

In [185]:
grid.best_params_

{'model__C': 100, 'model__solver': 'liblinear'}

In [186]:
grid.best_score_

0.8536539139056083

## check preformance model against test data after Hyperparameter Tuning

In [187]:
grid.best_estimator_.fit(X_train,y_train)
f1_score(y_test,grid.best_estimator_.predict(X_test))

0.8288822947576657

setelah melakukan hyperparameter tuning didapatkan bahwa terdapat peningkatan performa, meskipun tidak banyak dari 0.8255234297108675 menjadi 0.8288822947576657

## Ensamble Model

In [194]:
from sklearn.ensemble import RandomForestClassifier

In [191]:
estimator_1=Pipeline([('preprocess',transformer),('model',RandomForestClassifier())])
estimator_1.fit(X_train,y_train)
f1_score(y_test, estimator_1.predict(X_test))

0.8492307692307693

## Hyperparameter Tuning Ensamble Model

In [172]:
from sklearn.model_selection import RandomizedSearchCV

In [173]:
hyperparam_space = {
    'model__criterion':['gini','entropy'],
    'model__n_estimators':[2,5,10],
    'model__max_depth':[2,4,6,8,10],
    'model__min_samples_split':[1,2,4,6,8,10],
    'model__min_samples_leaf':[1,3,5]
}

In [176]:
estimator=Pipeline([('preprocess',transformer),('model',RandomForestClassifier())])

skfold= StratifiedKFold(n_splits=5)

randomized_1= RandomizedSearchCV(estimator,param_distributions=hyperparam_space,cv=skfold,scoring='f1')
randomized_1.fit(X_train,y_train)

RandomizedSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
                   estimator=Pipeline(steps=[('preprocess',
                                              ColumnTransformer(remainder='passthrough',
                                                                transformers=[('one_hot',
                                                                               OneHotEncoder(drop='first'),
                                                                               ['hotel',
                                                                                'meal',
                                                                                'market_segment',
                                                                                'distribution_channel',
                                                                                'reserved_room_type',
                                                                                'deposi

karena keterbatasan waktu saya menggunakan randomized search

In [178]:
randomized_1.best_params_

{'model__n_estimators': 10,
 'model__min_samples_split': 4,
 'model__min_samples_leaf': 5,
 'model__max_depth': 8,
 'model__criterion': 'gini'}

In [179]:
randomized_1.best_score_

0.8584045788464829

## check preformance model against test data after Hyperparameter Tuning Model Ensamble

In [180]:
randomized_1.best_estimator_.fit(X_train,y_train)
f1_score(y_test,randomized_1.best_estimator_.predict(X_test))

0.8499025341130604

## Summary

In [192]:
logreg_benchmark = f1_score(y_test, estimator.predict(X_test))
logreg_hyperparam = f1_score(y_test,grid.best_estimator_.predict(X_test))
randomforest_ensamble = f1_score(y_test, estimator_1.predict(X_test))
randomforest_hyperparam = f1_score(y_test,randomized_1.best_estimator_.predict(X_test))

In [193]:
score_list = [logreg_benchmark,logreg_hyperparam,randomforest_ensamble, randomforest_hyperparam] 
model_names = ['Logistic Regression','Logistic Regression Tuning','Random Forest Ensamble','Random Forest Ensamble Tuning' ]
df_summary = pd.DataFrame({
    'method':model_names,
    'score':score_list
})
df_summary

Unnamed: 0,method,score
0,Logistic Regression,0.825523
1,Logistic Regression Tuning,0.828882
2,Random Forest Ensamble,0.849231
3,Random Forest Ensamble Tuning,0.849903


# Model Terbaik adalah Model Random Forest Ensamble Tuning dengan score f1 0.849903