In [None]:
import numpy as np
import pandas as pd
# Read the hotel reservations CSV into a DataFrame and preview its first ten rows.
csv_path = '/kaggle/input/hotel-reservations-classification-dataset/Hotel Reservations.csv'
df = pd.read_csv(csv_path)
df.head(n=10)

In [None]:
# Count the missing (NaN/None) entries in every column.
df.isna().sum()

In [None]:
df.corr()

In [None]:
df.info()

In [None]:
# Number of distinct values in each column of the DataFrame.
df.nunique()

In [None]:
df.columns

In [None]:
# Split the frame by dtype: numeric columns vs. every other column.
numerikal = df.select_dtypes(include=np.number)
kategorikal = df.select_dtypes(exclude=np.number)

In [None]:
numerikal.columns

In [None]:
kategorikal.columns

In [None]:
df.duplicated().sum()

In [None]:
# Snapshot of the numeric columns; later cells read value counts from it.
numerical = df.select_dtypes(include=np.number)
numerical.columns

In [None]:
# Snapshot of the non-numeric columns, taken before any encoding is applied to df;
# later cells read the original category labels from it.
categorical = df.select_dtypes(exclude=np.number)
categorical.columns

In [None]:
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt

# Bar chart of the number of rows per booking_status value.
status_counts = df['booking_status'].value_counts()
status_counts.plot(kind='bar')
plt.xlabel('booking_status')
plt.ylabel('Count')
plt.tight_layout()
plt.show()

# Same counts as a table, read from the categorical snapshot.
categorical['booking_status'].value_counts()

In [None]:
df.head(10)

In [None]:
# Widen the default figure and enable automatic layout. This changes the global
# rcParams, so it also affects every figure drawn after this cell.
plt.rcParams.update({"figure.figsize": [17, 7], "figure.autolayout": True})

# One box per listed column, all drawn on a shared axis.
boxplot_columns = [
    'no_of_adults', 'no_of_children', 'no_of_weekend_nights',
    'no_of_week_nights', 'required_car_parking_space', 'lead_time',
    'arrival_year', 'arrival_month', 'arrival_date', 'repeated_guest',
    'no_of_previous_cancellations', 'no_of_previous_bookings_not_canceled',
    'avg_price_per_room', 'no_of_special_requests',
]
ax = df[boxplot_columns].plot(kind='box', title='boxplot')
plt.show()

In [None]:
sns.kdeplot(data=df, x="no_of_previous_bookings_not_canceled", hue="booking_status", multiple="stack")

In [None]:
sns.kdeplot(data=df, x="lead_time", hue="booking_status", multiple="stack")

In [None]:
sns.kdeplot(data=df, x="avg_price_per_room", hue="booking_status", multiple="stack")

In [None]:
# Recode the target as integers: 1 where booking_status is 'Canceled', 0 otherwise.
# NOTE: not idempotent - re-running on the already recoded column yields all zeros.
is_canceled = df['booking_status'] == 'Canceled'
df['booking_status'] = is_canceled.astype(int)

In [None]:
# Sum booking_status per (arrival_month, arrival_year) and draw one bar series per year.
# With the 0/1 recoding of booking_status, each sum is the number of canceled bookings.
status_by_month = df.groupby(['arrival_month', 'arrival_year'])['booking_status'].sum()
status_by_month.unstack().plot(kind='bar')
plt.title('Booking Status Count in 2017 to 2018')
plt.xlabel('Month (January - December)')
plt.ylabel('Count')
plt.show()

In [None]:
# Sum of booking_status for every reserved room type, as a bar chart.
status_by_room = df.groupby(['room_type_reserved'])['booking_status'].sum()
status_by_room.plot(kind='bar')
plt.title('Room Type Booked Count')
plt.xlabel('Room Type')
plt.ylabel('Count')
plt.show()

# Row counts per room type, read from the categorical snapshot.
categorical['room_type_reserved'].value_counts()

In [None]:
# Import the preprocessing module that provides LabelEncoder.
from sklearn import preprocessing
# LabelEncoder maps each distinct label to an integer code.
# NOTE: this single encoder object is re-fitted on other columns in later cells,
# so its fitted state only reflects the most recently encoded column.
label_encoder = preprocessing.LabelEncoder()
# Replace the 'room_type_reserved' labels with their integer codes.
df['room_type_reserved']= label_encoder.fit_transform(df['room_type_reserved']) 

In [None]:
# Repeat the room-type bar chart now that room_type_reserved holds integer codes.
encoded_room_totals = df.groupby(['room_type_reserved'])['booking_status'].sum()
encoded_room_totals.plot.bar()
plt.title('Room Type Booked Count')
plt.xlabel('Room Type')
plt.ylabel('Count')
plt.show()

# The categorical snapshot still carries the original (unencoded) labels.
categorical['room_type_reserved'].value_counts()

In [None]:
# Share of the booking_status sum contributed by each meal plan, as a pie chart.
meal_plan_totals = df.groupby(['type_of_meal_plan'])['booking_status'].sum()
df_pie = meal_plan_totals.reset_index()
plt.figure(figsize=(8, 8))
plt.title('Meal Plans')
plt.pie(
    df_pie['booking_status'],
    labels=df_pie['type_of_meal_plan'],
    autopct='%1.2f%%',
)
plt.show()

# Row counts per meal plan, read from the categorical snapshot.
categorical['type_of_meal_plan'].value_counts()

In [None]:
# Replace the meal plan labels with integer codes (re-fits the shared encoder).
meal_plan_codes = label_encoder.fit_transform(df['type_of_meal_plan'])
df['type_of_meal_plan'] = meal_plan_codes

# Redraw the meal plan pie chart; slices are now labelled with the integer codes.
df_pie = df.groupby(['type_of_meal_plan'])['booking_status'].sum().reset_index()
plt.figure(figsize=(8, 8))
plt.title('Meal Plans')
plt.pie(
    df_pie['booking_status'],
    labels=df_pie['type_of_meal_plan'],
    autopct='%1.2f%%',
)
plt.show()

# The categorical snapshot still carries the original labels.
categorical['type_of_meal_plan'].value_counts()

In [None]:
# Share of the booking_status sum contributed by each market segment, as a pie chart.
segment_totals = df.groupby(['market_segment_type'])['booking_status'].sum()
df_pie = segment_totals.reset_index()
plt.figure(figsize=(8, 8))
plt.title('market_segment_type')
plt.pie(
    df_pie['booking_status'],
    labels=df_pie['market_segment_type'],
    autopct='%1.2f%%',
)
plt.show()

# Row counts per market segment, read from the categorical snapshot.
categorical['market_segment_type'].value_counts()

In [None]:
# Replace the market segment labels with integer codes (re-fits the shared encoder).
segment_codes = label_encoder.fit_transform(df['market_segment_type'])
df['market_segment_type'] = segment_codes

# Redraw the market segment pie chart; slices are now labelled with the integer codes.
df_pie = df.groupby(['market_segment_type'])['booking_status'].sum().reset_index()
plt.figure(figsize=(8, 8))
plt.title('market_segment_type')
plt.pie(
    df_pie['booking_status'],
    labels=df_pie['market_segment_type'],
    autopct='%1.2f%%',
)
plt.show()

# The categorical snapshot still carries the original labels.
categorical['market_segment_type'].value_counts()

In [None]:
# Count bookings per required_car_parking_space value, largest group first.
parking_counts = (
    df.groupby('required_car_parking_space')['Booking_ID']
    .agg(['count'])
    .sort_values(by='count', ascending=False)
)
# Pie chart of those counts.
parking_counts.plot(
    kind='pie',
    autopct='%1.2f%%',
    subplots=True,
    title='Required car parking space',
    figsize=(9, 9),
)
# The same counts as a table, read from the numeric snapshot.
numerical['required_car_parking_space'].value_counts()

In [None]:
# Sum of booking_status for every distinct number of week nights, as a bar chart.
df.groupby(['no_of_week_nights'])['booking_status'].sum().plot(kind='bar')
plt.title('no_of_week_nights')
# The x-axis shows the number of week nights; the previous 'Room Type' label
# was copied over from the room-type chart.
plt.xlabel('Number of week nights')
plt.ylabel('Count')
plt.show()

# Row counts per number of week nights, read from the numeric snapshot.
numerical.no_of_week_nights.value_counts()

In [None]:
df.corr()

In [None]:
sns.heatmap(df.corr())

In [None]:
# Remove rows that are IQR outliers in any of the listed columns.
# A row is kept only if, for every column, its value lies within
# [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR], with the quartiles taken from the unfiltered data.
print(f'Jumlah Baris Sebelum Outlier Dihapus: {len(df)}')
filtered_entries = np.array([True] * len(df))
for col in ['lead_time', 'no_of_previous_bookings_not_canceled',
            'avg_price_per_room']:

    q1 = df[col].quantile(0.25)
    q3 = df[col].quantile(0.75)
    iqr = q3 - q1

    min_IQR = q1 - (1.5 * iqr)
    max_IQR = q3 + (1.5 * iqr)

    # Accumulate the per-column masks; the frame itself is left untouched inside
    # the loop so every mask has the same length and index as df.
    filtered_entries = ((df[col] >= min_IQR) & (df[col] <= max_IQR)) & filtered_entries

# Apply the combined mask once. Filtering inside the loop shrank df between
# iterations, which misaligned the accumulated mask and made later quartiles
# depend on the order of the columns.
df = df[filtered_entries]

# Row count after outlier removal ("Setelah" = after).
print(f'Jumlah Baris Setelah Outlier Dihapus: {len(df)}')

In [None]:
df.drop('Booking_ID', axis=1, inplace=True)

In [None]:
# Feature matrix: every column of df except the booking_status target.
X = df.drop('booking_status', axis=1)
X.head()

In [None]:
# Target vector: the booking_status column.
y = df.loc[:, 'booking_status']
y.head()

In [None]:
from sklearn.model_selection import train_test_split,cross_validate
# Hold out 20% of the rows as a test set; random_state fixes the shuffle so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.neighbors import KNeighborsClassifier

# Fit a k-nearest-neighbours classifier (k = 11) on the training split;
# fit() returns the estimator itself, so construction and fitting can be chained.
knn_model = KNeighborsClassifier(n_neighbors=11).fit(X_train, y_train)

# Predict the held-out rows and measure accuracy on them.
predicted_knn = knn_model.predict(X_test)
accuracy_knn = knn_model.score(X_test, y_test)

# Accuracy expressed as a percentage.
akurasi_knn = 100 * accuracy_score(y_test, predicted_knn)
print("KNN accuracy:", akurasi_knn)

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, roc_curve
from sklearn.metrics import classification_report

# Hold-out evaluation of the KNN model: confusion matrix, accuracy and per-class report.
print("confussion matrix")
print(confusion_matrix(y_test, predicted_knn))
print("-------------------------------------------")
print("Accuracy of KNN: {:.2f}%".format(akurasi_knn))
print(classification_report(y_test,predicted_knn))

# ROC AUC measures how well the model ranks samples, so it must be computed from
# the predicted probability of the positive class (label 1). Hard 0/1 predictions
# reduce the ROC curve to a single operating point.
knn_scores = knn_model.predict_proba(X_test)[:, 1]
print(f'ROC_AUC score: {roc_auc_score(y_test, knn_scores)}.')

In [None]:
# 10-fold cross-validation of the KNN model on the full feature matrix.
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict

separator = "-------------------------------------------"

# Per-fold accuracy.
scores_knn = cross_val_score(knn_model, X, y, cv=10, scoring='accuracy')
print('Cross-validation scores_knn:{}'.format(scores_knn))

print(separator)

# Mean and standard deviation of the fold accuracies, as percentages.
mean_pct = scores_knn.mean() * 100.0
std_pct = scores_knn.std() * 100.0
print("Accuracy: %.3f%% (%.3f%%)" % (mean_pct, std_pct))

print(separator)

# Confusion matrix built from the out-of-fold predictions.
y_pred = cross_val_predict(knn_model, X, y, cv=10)
print("CONFUSION MATRIX")
cnf_matrix = confusion_matrix(y, y_pred)
print(cnf_matrix)


In [None]:
from sklearn import svm

# Fit a support vector classifier with default settings on the training split;
# fit() returns the estimator itself.
clf = svm.SVC().fit(X_train, y_train)

# Predict the held-out rows and measure accuracy on them.
predicted_svm = clf.predict(X_test)
accuracy_svm = clf.score(X_test, y_test)

# Accuracy expressed as a percentage.
akurasi_svm = 100 * accuracy_score(y_test, predicted_svm)
print("SVM accuracy:", akurasi_svm)

In [None]:
# Hold-out evaluation of the SVM: confusion matrix, accuracy and per-class report.
print("confussion matrix")
print(confusion_matrix(y_test, predicted_svm))
print("-------------------------------------------")
print("Accuracy of SVM: {:.2f}%".format(akurasi_svm))
print(classification_report(y_test,predicted_svm))

# ROC AUC needs a continuous score per sample. SVC() is fitted without
# probability=True, so use the signed distance to the decision boundary
# instead of the hard 0/1 predictions.
svm_scores = clf.decision_function(X_test)
print(f'ROC_AUC score: {roc_auc_score(y_test, svm_scores)}.')

In [None]:
# # Applying 10-Fold Cross Validation
# from sklearn.model_selection import cross_val_score
# scores_svm = cross_val_score(clf, X, y, cv = 10, scoring='accuracy')
# print('Cross-validation scores_svm:{}'.format(scores_svm))

# print("-------------------------------------------")

# # compute Average cross-validation score
# print("Accuracy: %.3f%% (%.3f%%)" % (scores_svm.mean()*100.0, scores_svm.std()*100.0))  

# print("-------------------------------------------")

# # Confusion Matrix
# from sklearn.model_selection import cross_val_predict
# y_pred = cross_val_predict(clf, X, y, cv=10)
# print("CONFUSION MATRIX")
# cnf_matrix=confusion_matrix(y, y_pred)
# print(cnf_matrix)

In [None]:
# Import the model we are using. The regressor import is kept so the name stays
# available to any other cell that may reference it.
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
# booking_status is a binary class label (0/1), so fit a classifier rather than
# a regressor. 1000 trees; random_state fixes the bootstrap/feature sampling.
rf = RandomForestClassifier(n_estimators = 1000, random_state = 42)
# Train the model on training data (trailing ';' suppresses the estimator repr).
rf.fit(X_train, y_train);

In [None]:
y_pred=clf.predict(X_test)

In [None]:
# Import the scikit-learn metrics module for the accuracy calculation.
from sklearn import metrics
# Fraction of test rows whose prediction in y_pred matches the true label in y_test.
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

In [None]:
# Hold-out evaluation of the random forest `rf`. The previous version printed the
# stale SVM accuracy (akurasi_svm) under an "SVM" label in this section.
# Scores are taken from predict_proba when the fitted estimator provides it,
# otherwise from the averaged regression output of predict().
if hasattr(rf, "predict_proba"):
    rf_scores = rf.predict_proba(X_test)[:, 1]
else:
    rf_scores = rf.predict(X_test)
# Hard 0/1 labels for the label-based metrics.
rf_labels = (rf_scores > 0.5).astype(int)
akurasi_rf = accuracy_score(y_test, rf_labels) * 100

print("confussion matrix")
print(confusion_matrix(y_test, rf_labels))
print("-------------------------------------------")
print("Accuracy of Random Forest: {:.2f}%".format(akurasi_rf))
print(classification_report(y_test, rf_labels))
# ROC AUC is computed from the continuous scores, not the thresholded labels.
print(f'ROC_AUC score: {roc_auc_score(y_test, rf_scores)}.')

In [None]:
# Print the random forest's importance for every feature as a percentage,
# in the column order of X.
importance = rf.feature_importances_
columns = X.columns

for feature_name, weight in zip(columns, importance):
    print(f"The Importance of feature '{feature_name}' is {round(weight*100,2)}%.")